In [28]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import requests
from itertools import chain

from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np

'''
Selenium web scraper for Sports Reference.

Setup: import the scraper in a Jupyter notebook
------------------------------------------------
from scraper import football_scraper

url = "https://www.pro-football-reference.com"
options = Options()
options.headless = True
b = webdriver.Chrome(options=options)

scraper = football_scraper(b, url)
------------------------------------------------


'''

class football_scraper:
    """Scrape season-level stat tables from a Sports Reference football site.

    On construction the ``/years/`` index page is loaded through the Selenium
    driver and every season link found there is stored in ``self.links``
    (columns ``year`` and ``link``). The other methods use the integer row
    labels of that frame to choose which seasons to download.
    """

    # Seconds to wait for a ``requests`` response before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, driver, home_url):
        """Store the driver and base URL, then load the season index.

        Args:
            driver: Selenium WebDriver used for pages fetched via ``driver.get``.
            home_url: Site root without a trailing slash,
                e.g. ``"https://www.pro-football-reference.com"``.
        """
        self.driver = driver
        self.home_url = home_url
        self.get_links()

    # ------------------------------------------------------------------
    # Private helpers shared by the public scraping methods
    # ------------------------------------------------------------------
    def _fetch_soup(self, url):
        """Download ``url`` with requests and parse it with ``html.parser``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail here on an HTTP error instead of later on a missing table.
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    @staticmethod
    def _table_to_frame(table, year, source, header_row=0):
        """Convert a parsed ``<table>`` element into a DataFrame.

        Args:
            table: Parsed table element, or ``None`` when the lookup failed.
            year: Value written to the added ``year`` column.
            source: URL the table came from; only used in the error message.
            header_row: Index of the ``<tr>`` holding the column names; rows
                before it are discarded.

        Returns:
            DataFrame whose columns are the header row's cell texts plus
            ``year``. The header row itself is not part of the data.

        Raises:
            ValueError: if ``table`` is ``None``.
        """
        if table is None:
            raise ValueError("no matching table found at {}".format(source))

        rows = [[cell.text for cell in row.find_all(["th", "td"])]
                for row in table.find_all("tr")][header_row:]
        df = pd.DataFrame(rows)
        # Promote the first remaining row to column names, then remove it.
        df.columns = df.iloc[0, :]
        df = df.drop(index=0)
        df['year'] = year
        return df

    @staticmethod
    def _drop_repeated_headers(df):
        """Drop body rows whose ``Rk`` cell is the literal header text 'Rk'.

        Frames without an ``Rk`` column are returned unchanged.
        """
        if 'Rk' not in df.columns:
            return df
        rank = df['Rk']
        if isinstance(rank, pd.DataFrame):
            # Duplicate 'Rk' column names: decide on the first one.
            rank = rank.iloc[:, 0]
        return df[rank != 'Rk']

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def get_links(self):
        """Load ``/years/`` via the driver and fill ``self.links``.

        ``self.links`` gets one row per anchor found inside a ``<th>`` of the
        element with id ``div_years``: ``year`` is the anchor text and
        ``link`` its ``href``.

        Raises:
            ValueError: if the page has no element with id ``div_years``.
        """
        url = self.home_url + '/years/'
        self.driver.get(url)
        page = BeautifulSoup(self.driver.page_source, features="lxml")
        table = page.find(id="div_years")
        if table is None:
            raise ValueError("no element with id 'div_years' found at {}".format(url))

        rows = [[anchor.get_text(), anchor['href']]
                for th in table.find_all("th")
                for anchor in th.find_all("a", href=True)]
        self.links = pd.DataFrame(rows, columns=['year', 'link'])

    def get_passing_data(self, link):
        """Return the first table of ``<link>passing.htm`` as a DataFrame.

        Args:
            link: Season path as stored in ``self.links['link']``.

        Returns:
            DataFrame of the table body; the ``year`` column holds ``link``
            itself (the path, not just the year number).
        """
        url = self.home_url + link + 'passing.htm'
        soup = self._fetch_soup(url)
        return self._table_to_frame(soup.find("table"), link, url)

    def giant_concat(self, collection):
        """Stack frames row-wise and turn blank strings into NaN.

        Args:
            collection: Iterable of DataFrames.

        Returns:
            One DataFrame with a fresh 0..n-1 index containing every row of
            every input frame; cells that are empty or whitespace-only become
            NaN.
        """
        records = chain.from_iterable(
            frame.T.to_dict().values() for frame in collection
        )
        combined = pd.DataFrame(list(records))
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        return combined.replace(r'^\s*$', np.nan, regex=True)

    def get_individual_passing(self, num_years, label):
        """Download passing tables and write them to ``<label>.csv``.

        Visits the rows of ``self.links`` labelled ``1 .. num_years - 1``.
        NOTE(review): the upper bound is exclusive here but inclusive in
        ``get_data_tables`` — confirm which is intended.
        """
        collection = [
            self.get_passing_data(self.links.loc[i]['link'])
            for i in range(1, num_years)
        ]
        combined = self.giant_concat(collection)
        combined.to_csv(label + '.csv')

    def get_team_stats_helper(self, link, idd):
        """Load a season page via the driver and return one table by id.

        Args:
            link: Season path as stored in ``self.links['link']``.
            idd: HTML id of the table to extract.

        Returns:
            DataFrame with a fresh 0..n-1 index; the ``year`` column holds
            ``link`` itself.

        Raises:
            ValueError: if the page has no element with id ``idd``.
        """
        url = self.home_url + link
        self.driver.get(url)
        # Presumably gives the page time to finish rendering — TODO confirm.
        time.sleep(3)
        page = BeautifulSoup(self.driver.page_source, features="lxml")
        df = self._table_to_frame(page.find(id=idd), link, url)
        return df.reset_index(drop=True)

    def get_team_stats(self, num_years, label, idd):
        '''
        Download one table per season page and write them to ``<label>.csv``.

        Visits the rows of ``self.links`` labelled ``1 .. num_years - 1``
        (exclusive upper bound, as in ``get_individual_passing``).

        idds:
            - "passing": passing stats 
            - "team_stats": summary of team offense (not recommended at the moment)
        '''
        collection = []
        for i in range(1, num_years):
            print("iteration: ", i)
            link = self.links.loc[i]['link']
            collection.append(self.get_team_stats_helper(link, idd))

        combined = self.giant_concat(collection)
        combined.to_csv(label + '.csv')

    def get_data_tables(self, category, start, num_years):
        """Download ``<season>/<category>.htm`` tables and concatenate them.

        Args:
            category: Page name without extension, e.g. ``"receiving"``.
            start: First row label of ``self.links`` to visit.
            num_years: Last row label of ``self.links`` to visit (inclusive).

        Returns:
            Concatenation of the first table of every visited page, with a
            ``year`` column taken from the season path and with rows that
            merely repeat the header (``Rk == 'Rk'``) removed.

        Raises:
            ValueError: if a page has no table, or if the range is empty.
        """
        collection = []
        for i in range(start, num_years + 1):
            link = self.links.loc[i]['link']
            page = self.home_url + link + '{}.htm'.format(category)
            print(page)
            soup = self._fetch_soup(page)
            df = self._table_to_frame(soup.find('table'), link.split("/")[2], page)
            # The original called df.query(...) here and discarded the result,
            # so repeated header rows were never actually dropped.
            collection.append(self._drop_repeated_headers(df))

        return pd.concat(collection)

    def get_advanced_data_tables_passing(self, category, num_years, tbl_id):
        """Download one table per season from ``passing_advanced.htm``.

        Args:
            category: Unused; kept so existing callers keep working.
            num_years: Last row label of ``self.links`` to visit (inclusive);
                the first label visited is always 1.
            tbl_id: HTML id of the table to extract.

        Returns:
            Concatenation of the selected table of every visited page. The
            first ``<tr>`` of each table is skipped and the second is used as
            the header; rows repeating the header (``Rk == 'Rk'``) are removed.

        Raises:
            ValueError: if a page has no element with id ``tbl_id``.
        """
        collection = []
        for i in range(1, num_years + 1):
            link = self.links.loc[i]['link']
            page = self.home_url + link + 'passing_advanced.htm'
            print(page)
            soup = self._fetch_soup(page)
            df = self._table_to_frame(
                soup.find(id=tbl_id), link.split("/")[2], page, header_row=1
            )
            collection.append(self._drop_repeated_headers(df))

        return pd.concat(collection)

In [29]:
# Build a headless Chrome driver and the scraper that uses it.
url = "https://www.pro-football-reference.com"
options = Options()
# The `Options.headless` property was deprecated and later removed in
# Selenium 4; on newer versions assigning it is silently ignored. The
# command-line flag works across versions.
options.add_argument("--headless")
b = webdriver.Chrome(options=options)

# Constructing the scraper immediately loads the /years/ index via the driver.
scraper = football_scraper(b, url)

In [30]:
# Show the season-index row whose year text is '1970' (reveals its row label).
scraper.links.loc[scraper.links['year'] == '1970']

Unnamed: 0,year,link
50,1970,/years/1970/


In [33]:
# Rows 41..50 of scraper.links; the recorded output shows these are 1979 down to 1970.
df = scraper.get_data_tables(category='receiving', start=41, num_years=50)

https://www.pro-football-reference.com/years/1979/receiving.htm
https://www.pro-football-reference.com/years/1978/receiving.htm
https://www.pro-football-reference.com/years/1977/receiving.htm
https://www.pro-football-reference.com/years/1976/receiving.htm
https://www.pro-football-reference.com/years/1975/receiving.htm
https://www.pro-football-reference.com/years/1974/receiving.htm
https://www.pro-football-reference.com/years/1973/receiving.htm
https://www.pro-football-reference.com/years/1972/receiving.htm
https://www.pro-football-reference.com/years/1971/receiving.htm
https://www.pro-football-reference.com/years/1970/receiving.htm


In [38]:
# Keep only rows whose Rk cell is not the literal header text 'Rk'.
df = df.loc[df['Rk'] != 'Rk']
df

Unnamed: 0,Rk,Player,Tm,Age,Pos,G,GS,Rec,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb,year
1,1,Joe Washington*,BAL,26,RB,15,15,82,750,9.1,3,43,,5.5,50.0,8,1979
2,2,Ahmad Rashad*,MIN,30,WR,16,16,80,1156,14.5,9,52,,5.0,72.3,2,1979
3,3,Wallace Francis,ATL,28,WR,16,16,74,1013,13.7,8,42,,4.6,63.3,3,1979
4,4,Charlie Joiner*,SDG,32,WR,16,16,72,1008,14.0,4,39,,4.5,63.0,1,1979
5,5,Rickey Young,MIN,26,RB,16,16,72,519,7.2,4,18,,4.5,32.4,2,1979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,291,Bob Shaw,NOR,23,,4,0,1,49,49.0,0,49,,0.3,12.3,0,1970
301,292,Wayne Stewart,NYJ,23,,7,0,1,7,7.0,0,7,,0.1,1.0,0,1970
302,293,Billy Walik,PHI,23,,14,0,1,0,0.0,0,0,,0.1,0.0,3,1970
303,294,Ernie Wheelwright,NOR,31,,4,0,1,7,7.0,0,7,,0.3,1.8,1,1970


In [39]:
# Persist the filtered 1970-79 receiving table.
df.to_csv(path_or_buf="receiving_70_79.csv")

In [41]:
# Rows 1..50 of scraper.links; the recorded output starts at the 2019 season.
df = scraper.get_data_tables(category="receiving", start=1, num_years=50)

https://www.pro-football-reference.com/years/2019/receiving.htm
https://www.pro-football-reference.com/years/2018/receiving.htm
https://www.pro-football-reference.com/years/2017/receiving.htm
https://www.pro-football-reference.com/years/2016/receiving.htm
https://www.pro-football-reference.com/years/2015/receiving.htm
https://www.pro-football-reference.com/years/2014/receiving.htm
https://www.pro-football-reference.com/years/2013/receiving.htm
https://www.pro-football-reference.com/years/2012/receiving.htm
https://www.pro-football-reference.com/years/2011/receiving.htm
https://www.pro-football-reference.com/years/2010/receiving.htm
https://www.pro-football-reference.com/years/2009/receiving.htm
https://www.pro-football-reference.com/years/2008/receiving.htm
https://www.pro-football-reference.com/years/2007/receiving.htm
https://www.pro-football-reference.com/years/2006/receiving.htm
https://www.pro-football-reference.com/years/2005/receiving.htm
https://www.pro-football-reference.com/y

In [48]:
# Same header-row filter as used on the 1970-79 frame earlier in the notebook.
# NOTE(review): the recorded output of this cell is `KeyError: 'Rk'`, i.e. `df`
# had no 'Rk' column when it ran. Execution counts jump from 41 to 48, so `df`
# may have been rebound in between — TODO confirm on a fresh kernel.
df = df[df['Rk'] != 'Rk']
df

KeyError: 'Rk'

In [50]:
# Persist whatever `df` currently holds as the 1970-2019 receiving file.
df.to_csv(path_or_buf="receiving_70_19.csv")