In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [6]:
games = pd.read_csv('')
players = pd.read_csv('')
teams = pd.read_csv('')
team_info = pd.read_csv('')

print(f"Games: {len(games)} records")
print(f"Players: {len(players)} records")
print(f"Teams: {len(teams)} records")
print(f"Team info: {len(team_info)} records")

Games: 1408 records
Players: 3103 records
Teams: 160 records
Team info: 36 records


In [7]:
games.head()

Unnamed: 0,game_id,season,game_type,week,gameday,weekday,gametime,away_team,away_score,home_team,...,wind,away_qb_id,home_qb_id,away_qb_name,home_qb_name,away_coach,home_coach,referee,stadium_id,stadium
0,2020_01_HOU_KC,2020,REG,1,2020-09-10,Thursday,20:20,HOU,20.0,KC,...,7.0,00-0033537,00-0033873,Deshaun Watson,Patrick Mahomes,Bill O'Brien,Andy Reid,Clete Blakeman,KAN00,Arrowhead Stadium
1,2020_01_SEA_ATL,2020,REG,1,2020-09-13,Sunday,13:00,SEA,38.0,ATL,...,,00-0029263,00-0026143,Russell Wilson,Matt Ryan,Pete Carroll,Dan Quinn,Shawn Hochuli,ATL97,Mercedes-Benz Stadium
2,2020_01_CLE_BAL,2020,REG,1,2020-09-13,Sunday,13:00,CLE,6.0,BAL,...,5.0,00-0034855,00-0034796,Baker Mayfield,Lamar Jackson,Kevin Stefanski,John Harbaugh,Ronald Torbert,BAL00,M&T Bank Stadium
3,2020_01_NYJ_BUF,2020,REG,1,2020-09-13,Sunday,13:00,NYJ,17.0,BUF,...,15.0,00-0034869,00-0034857,Sam Darnold,Josh Allen,Adam Gase,Sean McDermott,Shawn Smith,BUF00,New Era Field
4,2020_01_LV_CAR,2020,REG,1,2020-09-13,Sunday,13:00,LV,34.0,CAR,...,5.0,00-0031280,00-0031237,Derek Carr,Teddy Bridgewater,Jon Gruden,Matt Rhule,Brad Allen,CAR00,Bank of America Stadium


In [8]:
players.head()

Unnamed: 0,player_id,season,season_type,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,...,yac_sh,wopr_y,ry_sh,rtd_sh,rfd_sh,rtdfd_sh,dom,w8dom,yptmpa,ppr_sh
0,00-0019596,2020,REG,401,610,4633.0,40,12.0,21.0,143.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197091
1,00-0019596,2021,REG,485,719,5316.0,43,12.0,22.0,144.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.195705
2,00-0019596,2022,REG,490,733,4694.0,25,9.0,22.0,160.0,...,0.0,0.004477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.175012
3,00-0020531,2020,REG,275,390,2942.0,24,6.0,13.0,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17111
4,00-0022127,2020,REG,0,0,0.0,0,0.0,0.0,0.0,...,0.014804,0.10183,0.025247,0.095238,0.061069,0.065789,0.060243,0.039245,0.204142,0.032303


In [9]:
teams.head()

Unnamed: 0,recent_team,season,completions,attempts,passing_yards,passing_tds,interceptions,sacks,rushing_yards,rushing_tds
0,ARI,2020,387,575,4102.0,27,13.0,29.0,2237.0,22
1,ARI,2021,434,625,4756.0,27,13.0,41.0,2137.0,24
2,ARI,2022,433,664,3966.0,17,17.0,46.0,1873.0,15
3,ARI,2023,355,555,3430.0,18,12.0,42.0,2365.0,17
4,ARI,2024,374,543,3859.0,21,11.0,30.0,2451.0,18


In [13]:
teams['recent_team'].unique()


array(['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL',
       'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV',
       'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF',
       'TB', 'TEN', 'WAS'], dtype=object)

In [17]:
teams.query("recent_team == 'ATL'")

Unnamed: 0,recent_team,season,completions,attempts,passing_yards,passing_tds,interceptions,sacks,rushing_yards,rushing_tds
5,ATL,2020,408,628,4620.0,27,11.0,41.0,1532.0,13
6,ATL,2021,377,573,3987.0,20,15.0,40.0,1451.0,11
7,ATL,2022,257,415,2927.0,17,9.0,37.0,2718.0,17
8,ATL,2023,327,530,3775.0,17,17.0,40.0,2159.0,14
9,ATL,2024,364,559,4283.0,21,19.0,32.0,2219.0,18


In [28]:
stats = pd.read_csv('historical/player_stats_seasonal_2010-2012.csv')
rosters = pd.read_csv('rosters/rosters_2010-2012.csv')

stats_ids = set(stats['player_id'])
roster_ids = set(rosters['player_id'])

print(f"Stats has {len(stats_ids)} unique player ids")
print(f"Rosters has {len(roster_ids)} unique player ids")
print(f"Matching ids: {len(stats_ids & roster_ids)}")
print(f"stats ids not in rosters: {len(stats_ids - roster_ids)}")

Stats has 919 unique player ids
Rosters has 2934 unique player ids
Matching ids: 919
stats ids not in rosters: 0


In [30]:
player_names = rosters[['player_id', 'player_name', 'position']].drop_duplicates()

joined = stats.merge(player_names, on='player_id', how='left')

print(f"stats records: {len(stats)}")
print(f"joined records: {len(joined)}")



stats records: 1763
joined records: 1832


In [31]:
joined.head()

Unnamed: 0,player_id,season,season_type,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,...,ry_sh,rtd_sh,rfd_sh,rtdfd_sh,dom,w8dom,yptmpa,ppr_sh,player_name,position
0,00-0000108,2011,REG,1,1,14.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047313,David Akers,K
1,00-0000865,2010,REG,29,49,352.0,3,3.0,4.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135845,Charlie Batch,QB
2,00-0000865,2011,REG,15,24,208.0,0,1.0,2.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02427,Charlie Batch,QB
3,00-0000865,2012,REG,45,70,475.0,1,4.0,3.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119048,Charlie Batch,QB
4,00-0001231,2010,REG,0,0,0.0,0,0.0,0.0,0.0,...,0.0,,0.0,0.0,,,0.0,0.0,Josh Bidwell,P
