In [1]:
import re
from io import StringIO
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
pd.options.display.max_columns = None
pd.set_option('display.max_colwidth', None)

# Scrape Awards

In [12]:
from io import StringIO

def scrape_pfr(id_, url, colname):
    """Download an awards table from a web page and flag every row as a win.

    Parameters
    ----------
    id_ : str
        HTML ``id`` attribute of the ``<table>`` element to extract.
    url : str
        Address of the page that contains the table.
    colname : str
        Name of the indicator column to add; it is set to 1 on every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Player``, ``Tm`` and ``colname``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (e.g. rate limiting).
    ValueError
        If the page has no table with the requested id.
    KeyError
        If the parsed table lacks one of ``Year``, ``Player`` or ``Tm``.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors here instead of parsing an error page as if it were data.
    response.raise_for_status()

    soup = bs(response.text, 'lxml')
    table = soup.find('table', id=id_)
    if table is None:
        raise ValueError("No table with id {!r} found at {}".format(id_, url))

    df = pd.read_html(StringIO(str(table)))[0]
    # .copy() gives an independent frame, so adding the flag column below does
    # not write into a slice of the parsed table (SettingWithCopyWarning).
    df = df[['Year', 'Player', 'Tm']].copy()
    df[colname] = 1
    return df

# MVP

In [13]:
# AP Most Valuable Player winners; every returned row carries MVPs == 1.
mvp = scrape_pfr(
    url='https://www.pro-football-reference.com/awards/ap-nfl-mvp-award.htm',
    id_='awards',
    colname='MVPs',
)

In [4]:
# Save the MVP frame as CSV. The target path has no file extension, and the
# DataFrame index is written as the first column because index=False is not passed.
mvp.to_csv('individual_awards/MVP_season') 

# OPOY

In [5]:
# AP Offensive Player of the Year winners; every row carries OPOYs == 1.
opoy = scrape_pfr(
    id_='awards',
    url='https://www.pro-football-reference.com/awards/ap-offensive-player-of-the-year.htm',
    colname='OPOYs',
)
# Preview the first ten rows.
opoy.head(10)


Unnamed: 0,Year,Player,Tm,OPOYs
0,2022,Justin Jefferson,Minnesota Vikings,1
1,2021,Cooper Kupp,Los Angeles Rams,1
2,2020,Derrick Henry,Tennessee Titans,1
3,2019,Michael Thomas,New Orleans Saints,1
4,2018,Patrick Mahomes,Kansas City Chiefs,1
5,2017,Todd Gurley,Los Angeles Rams,1
6,2016,Matt Ryan,Atlanta Falcons,1
7,2015,Cam Newton,Carolina Panthers,1
8,2014,DeMarco Murray,Dallas Cowboys,1
9,2013,Peyton Manning,Denver Broncos,1


In [6]:
# Save the OPOY frame as CSV (extension-less path; the index is written as well).
opoy.to_csv('individual_awards/OPOY_season') 

# DPOY

In [7]:
# AP Defensive Player of the Year winners; every row carries DPOYS == 1.
dpoy = scrape_pfr(
    id_='awards',
    url='https://www.pro-football-reference.com/awards/ap-defensive-player-of-the-year.htm',
    colname='DPOYS',
)
# Preview the first ten rows.
dpoy.head(10)


Unnamed: 0,Year,Player,Tm,DPOYS
0,2022,Nick Bosa,San Francisco 49ers,1
1,2021,T.J. Watt,Pittsburgh Steelers,1
2,2020,Aaron Donald,Los Angeles Rams,1
3,2019,Stephon Gilmore,New England Patriots,1
4,2018,Aaron Donald,Los Angeles Rams,1
5,2017,Aaron Donald,Los Angeles Rams,1
6,2016,Khalil Mack,Oakland Raiders,1
7,2015,J.J. Watt,Houston Texans,1
8,2014,J.J. Watt,Houston Texans,1
9,2013,Luke Kuechly,Carolina Panthers,1


In [8]:
# Save the DPOY frame as CSV (extension-less path; the index is written as well).
dpoy.to_csv('individual_awards/DPOY_season') 

# ROY

In [9]:
def clean_roy(df, colname):
    """Return a cleaned copy of a Rookie of the Year table.

    The input frame is left untouched, and calling the function again on its
    own output yields the same result, so the calling cell can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Player`` column and, optionally, ``Season``, ``Team``,
        ``Position`` and ``Ref`` columns.
    colname : str
        Name of the indicator column to add; it is set to 1 on every row.

    Returns
    -------
    pandas.DataFrame
        New frame without ``Position``/``Ref``, with ``Season`` renamed to
        ``Year`` and ``Team`` renamed to ``Tm``, the indicator column added,
        and any parenthesised text (plus the whitespace before it) removed
        from ``Player``.
    """
    cleaned = (
        # errors='ignore' lets an already-cleaned frame pass through unchanged.
        df.drop(columns=['Position', 'Ref'], errors='ignore')
          .rename(columns={'Team': 'Tm', 'Season': 'Year'})
    )
    cleaned[colname] = 1
    # The match is greedy: it runs from the first "(" to the last ")" in the name.
    cleaned['Player'] = cleaned['Player'].str.replace(r"\s+\(.*\)", "", regex=True)
    return cleaned

In [10]:
# Fetch the Wikipedia page for the AP Rookie of the Year award and parse its
# "wikitable" tables.
roy_url = 'https://en.wikipedia.org/wiki/Associated_Press_NFL_Rookie_of_the_Year_Award'
res = requests.get(roy_url, timeout=30)
# Stop on an HTTP error rather than parsing an error page.
res.raise_for_status()
soup = bs(res.text, 'html.parser')
table = soup.find_all('table', class_ = 'wikitable')
# Parse the markup once (it was previously parsed twice) and wrap it in
# StringIO, since passing literal HTML to read_html is deprecated.
roy_tables = pd.read_html(StringIO(str(table)))
# NOTE(review): assumes the first wikitable is the offensive award and the
# second the defensive one - confirm if the page layout changes.
roy_offense = roy_tables[0]
roy_defense = roy_tables[1]

In [11]:
# Clean both rookie-award tables; each gets its own indicator column
# (ROY_OFFENSE / ROY_DEFENSE).
roy_offense = clean_roy(roy_offense, 'ROY_OFFENSE')
roy_defense = clean_roy(roy_defense, 'ROY_DEFENSE')

In [12]:
roy_defense.tail(10)

Unnamed: 0,Year,Player,Tm,ROY_DEFENSE
46,2013,Sheldon Richardson,New York Jets,1
47,2014,Aaron Donald,St. Louis Rams,1
48,2015,Marcus Peters,Kansas City Chiefs,1
49,2016,Joey Bosa,San Diego Chargers,1
50,2017,Marshon Lattimore,New Orleans Saints,1
51,2018,Shaquille Leonard,Indianapolis Colts,1
52,2019,Nick Bosa,San Francisco 49ers,1
53,2020,Chase Young,Washington Football Team,1
54,2021,Micah Parsons,Dallas Cowboys,1
55,2022,Sauce Gardner,New York Jets,1


In [13]:
roy_offense.tail(10)

Unnamed: 0,Year,Player,Tm,ROY_OFFENSE
56,2013,Eddie Lacy,Green Bay Packers,1
57,2014,Odell Beckham Jr.,New York Giants,1
58,2015,Todd Gurley,St. Louis Rams,1
59,2016,Dak Prescott,Dallas Cowboys,1
60,2017,Alvin Kamara,New Orleans Saints,1
61,2018,Saquon Barkley,New York Giants,1
62,2019,Kyler Murray,Arizona Cardinals,1
63,2020,Justin Herbert,Los Angeles Chargers,1
64,2021,Ja'Marr Chase,Cincinnati Bengals,1
65,2022,Garrett Wilson,New York Jets,1


In [14]:
# Save both rookie-award frames as CSV (extension-less paths; the index is written as well).
roy_offense.to_csv('individual_awards/ROYoffense')
roy_defense.to_csv('individual_awards/ROYdefense')

# CPOY

In [15]:
# AP Comeback Player of the Year winners; every row carries CPOY == 1.
cpoy = scrape_pfr(
    id_='awards',
    url='https://www.pro-football-reference.com/awards/ap-comeback-player-award.htm',
    colname='CPOY',
)
# Preview the first ten rows.
cpoy.head(10)

Unnamed: 0,Year,Player,Tm,CPOY
0,2022,Geno Smith,Seattle Seahawks,1
1,2021,Joe Burrow,Cincinnati Bengals,1
2,2020,Alex Smith,Washington Football Team,1
3,2019,Ryan Tannehill,Tennessee Titans,1
4,2018,Andrew Luck,Indianapolis Colts,1
5,2017,Keenan Allen,Los Angeles Chargers,1
6,2016,Jordy Nelson,Green Bay Packers,1
7,2015,Eric Berry,Kansas City Chiefs,1
8,2014,Rob Gronkowski,New England Patriots,1
9,2013,Philip Rivers,San Diego Chargers,1


In [16]:
# Save the CPOY frame as CSV (extension-less path; the index is written as well).
cpoy.to_csv('individual_awards/CPOY')

# All pro

In [17]:
# Count AP first-team and second-team All-Pro selections per team and season,
# reading one locally saved page per year (all_pro/<year>.html).
df = []
for year in range(2000, 2023):  # seasons 2000 through 2022 inclusive
    with open("all_pro/{}.html".format(year)) as f:
        page = f.read()

    soup = bs(page, 'html.parser')
    table = soup.find('table', id = 'all_pro')
    if table is None:
        # Name the offending file instead of failing later with a generic parse error.
        raise ValueError("No table with id 'all_pro' in all_pro/{}.html".format(year))
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    table_stats = pd.read_html(StringIO(str(table)))[0]
    table_stats['Year'] = year
    df.append(table_stats)

final_df = pd.concat(df)
# Drop the listed position codes (kicking / return / special-teams roles,
# going by the abbreviations).
final_df = final_df[~(final_df['Pos'].isin(['K','P','LS','KR','PR','PK','ST']))]

# na=False treats a missing "All-pro teams" value as "no match"; without it a
# NaN in the mask makes the boolean indexing raise.
first_team_allpro = final_df[final_df["All-pro teams"].str.contains('AP: 1st Tm', na=False)]
second_team_allpro = final_df[final_df["All-pro teams"].str.contains('AP: 2nd Tm', na=False)]

# .assign builds a new frame, so the flag column is not written into a
# filtered slice of final_df (SettingWithCopyWarning).
first_team_allpro = first_team_allpro[['Tm','Year']].assign(**{'1stTeam': 1})
second_team_allpro = second_team_allpro[['Tm','Year']].assign(**{'2ndTeam': 1})

# One row per (Year, Tm) holding the number of selections.
first_team_allpro = first_team_allpro.groupby(['Year', 'Tm'], as_index=False).sum()
second_team_allpro = second_team_allpro.groupby(['Year', 'Tm'], as_index=False).sum()


In [19]:
first_team_allpro.tail(13)

Unnamed: 0,Year,Tm,1stTeam
310,2022,2TM,1
311,2022,BUF,1
312,2022,CLE,1
313,2022,DAL,2
314,2022,DEN,1
315,2022,KAN,3
316,2022,LVR,2
317,2022,MIA,1
318,2022,MIN,1
319,2022,NYJ,2


In [20]:
# Save the per-team All-Pro counts as CSV (the index is written as well).
first_team_allpro.to_csv('individual_awards/1stTeam.csv')
second_team_allpro.to_csv('individual_awards/2ndTeam.csv')

# Pro Bowls

In [21]:
# Load the locally saved Pro Bowl roster pages (probowl/<year>.html), one
# frame per season, each tagged with its year.
df = []
for year in range(2000, 2023):  # seasons 2000 through 2022 inclusive
    with open('probowl/{}.html'.format(year)) as f:
        page = f.read()
    soup = bs(page, 'html.parser')
    table = soup.find('table', id = 'pro_bowl')
    if table is None:
        # Name the offending file instead of failing later with a generic parse error.
        raise ValueError("No table with id 'pro_bowl' in probowl/{}.html".format(year))
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    pb_table = pd.read_html(StringIO(str(table)))[0]
    pb_table['Year'] = year
    df.append(pb_table)

In [22]:
# Stack the per-season roster frames collected in `df` into a single frame.
probowls = pd.concat(df)

In [23]:
# Keep only the columns used below.
probowls = probowls.loc[:,['Tm', 'Year', 'Pos', 'Player']]
# Drop rows whose Player cell is the literal 'Player' (presumably header rows
# repeated inside the table body). .copy() makes the result an independent
# frame so the column assignments below do not write into a filtered slice.
probowls = probowls[probowls['Player'] != 'Player'].copy()
probowls['ProBowl'] = 1
# Remove the '%' and '+' markers attached to player names.
probowls['Player'] = probowls['Player'].replace(r'(\%|\+)','',regex=True)
# Remove commas, apostrophes, periods, hyphens and the Jr/Sr/III/IV tokens.
# NOTE(review): 'III ' and 'IV ' only match when followed by a space, so a
# suffix at the very end of a name is left in place - confirm this is intended.
pat = r'(\,|\'|\.|Jr|Sr|III |IV |-)'
probowls['Player'] = probowls['Player'].replace(pat, '', regex=True)

# Quarterbacks are kept as player-level rows; every other position is
# collapsed to a per-team, per-season count.
probowl_qb = probowls[probowls['Pos'] == 'QB']
probowls = probowls[probowls['Pos'] != 'QB']
# Sum only the ProBowl flag. A bare .sum() also runs over the string columns
# Pos and Player, which recent pandas concatenates into long strings (or
# raises on when a value is missing).
probowls = probowls.groupby(['Year', 'Tm'], as_index=False)['ProBowl'].sum()
probowl_qb

Unnamed: 0,Tm,Year,Pos,Player,ProBowl
2,OAK,2000,QB,Rich Gannon,1
42,MIN,2000,QB,Daunte Culpepper,1
62,IND,2000,QB,Peyton Manning,1
65,DEN,2000,QB,Brian Griese,1
66,SFO,2000,QB,Jeff Garcia,1
...,...,...,...,...,...
5,JAX,2022,QB,Trevor Lawrence,1
6,MIN,2022,QB,Kirk Cousins,1
7,LVR,2022,QB,Derek Carr,1
8,BAL,2022,QB,Tyler Huntley,1


In [24]:
# Save the non-QB Pro Bowl counts as CSV (extension-less path; the index is written as well).
probowls.to_csv('individual_awards/nonqb_probowls')

In [25]:
# Collapse the quarterback rows to one count per team and season. Only the
# ProBowl flag is summed: a bare .sum() would also aggregate the string
# columns (Pos, Player), which recent pandas concatenates into long strings.
probowl_qb = probowl_qb.groupby(['Year', 'Tm'], as_index=False)['ProBowl'].sum()
# Save the QB counts as CSV (extension-less path; the index is written as well).
probowl_qb.to_csv('individual_awards/QB_probowl')