In [1]:
# imports

import requests
import pandas as pd
import json
import html
import os.path
import numpy as np
from scipy import stats

In [2]:
def retrieveCfbData(endpoint, team, year, week):
    """Return one team's regular-season data for a single week.

    A cached JSON file under ``data/`` is preferred; when none exists the
    data is requested from the College Football Data API.

    Args:
        endpoint: API endpoint name. ``'plays'`` is cached under the ``pbp``
            folder with a ``pbp`` file prefix; any other endpoint uses its
            own name as the folder and the name minus its last character
            as the file prefix.
        team: Team name; lower-cased with spaces replaced by ``-`` for the
            cache file name, sent unmodified to the API.
        year: Season year.
        week: Week number.

    Returns:
        str: The cached file's path when it exists, otherwise the API
        response serialised as a JSON string.
        NOTE(review): presumably both forms are handed to something like
        ``pd.read_json`` by callers -- confirm.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    if endpoint == 'plays':
        folder = prefix = 'pbp'
    else:
        folder, prefix = endpoint, endpoint[:-1]
    team_slug = team.lower().replace(' ', '-')
    file_path = f"data/{folder}/{prefix}-data-{team_slug}-{year}-wk{week}.json"
    if os.path.exists(file_path):
        return file_path

    # Let requests URL-encode the query. The previous html.escape() call
    # produced HTML entities ("A&M" -> "A&amp;M"), which corrupts the query
    # string for any team name containing '&'.
    res = requests.get(
        f"https://api.collegefootballdata.com/{endpoint}",
        params={'seasonType': 'regular', 'year': year, 'team': team, 'week': week},
    )
    # Fail loudly instead of serialising an error payload as if it were data.
    res.raise_for_status()
    content = res.json()
    # Writing the response back to the cache is left disabled, as before:
    #     with open(file_path, 'w') as f:
    #         json.dump(content, f)
    return json.dumps(content)

def retrieveRemoteCfbGame(game_id, year):
    """Return the data for one regular-season game.

    A cached JSON file at ``data/games/game-data-<game_id>.json`` is
    preferred; when it does not exist the game is requested from the
    College Football Data API.

    Args:
        game_id: Game identifier, used in the cache file name and sent as
            the ``id`` query parameter.
        year: Season year, sent as the ``year`` query parameter.

    Returns:
        str: The cached file's path when it exists, otherwise the API
        response serialised as a JSON string.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    file_path = f"data/games/game-data-{game_id}.json"
    if os.path.exists(file_path):
        return file_path

    res = requests.get(
        "https://api.collegefootballdata.com/games",
        params={'year': year, 'seasonType': 'regular', 'id': game_id},
    )
    # Fail loudly instead of serialising an error payload as if it were data.
    res.raise_for_status()
    content = res.json()
    # Writing the response back to the cache is left disabled, as before:
    #     with open(file_path, 'w') as f:
    #         json.dump(content, f)
    return json.dumps(content)

# Initial data setup: read the team list from data/teams/2018.csv and create
# three separate, empty placeholder frames for drives, games and
# play-by-play data.
teams = pd.read_csv("data/teams/2018.csv", encoding='latin-1')

base_drives, games, pbp_data = (pd.DataFrame() for _ in range(3))

def retrieveCfbDataFile(endpoint, year):
    """Read the locally stored CSV for one endpoint and season.

    Args:
        endpoint: Sub-folder of ``data/`` holding the files (the notebook
            uses 'drives', 'games' and 'pbp').
        year: Season year; the file read is ``data/<endpoint>/<year>.csv``.

    Returns:
        pandas.DataFrame: The file's contents, decoded as latin-1.
    """
    csv_path = f"data/{endpoint}/{year}.csv"
    season_frame = pd.read_csv(csv_path, encoding='latin-1')
    return season_frame

# Load the drives, games and play-by-play CSVs for the 2016-2019 seasons
# (range(2016, 2020)), tagging every row with its season in a 'year' column.
#
# Frames are collected in lists and concatenated once: DataFrame.append in a
# loop copies the accumulated data on every iteration, was removed in
# pandas 2.0, and (when appending to a column-less frame) triggered the
# "sort" FutureWarning. sort=False keeps the columns in file order.
# As with append, each season's original row labels are kept, so the
# resulting index contains repeated labels.
drive_frames, game_frames, play_frames = [], [], []

for i in range(2016, 2020):
    drive = retrieveCfbDataFile('drives', i)
    drive['year'] = i
    drive_frames.append(drive)

    gm = retrieveCfbDataFile('games', i)
    gm['year'] = i
    game_frames.append(gm)

    plys = retrieveCfbDataFile('pbp', i)
    plys['year'] = i
    play_frames.append(plys)

base_drives = pd.concat(drive_frames, sort=False)
games = pd.concat(game_frames, sort=False)
pbp_data = pd.concat(play_frames, sort=False)

print(f"Total Games: {len(games)}")
print(f"Total Drives: {len(base_drives)}")
print(f"Total Plays: {len(pbp_data)}")

# print(f"2016 Drives: {len(base_drives[base_drives.game_id == 400868979])}")
# base_drives[base_drives.offense == 'Ole Miss']

# The concatenated frames carry repeated per-season row labels; replace them
# in place with a fresh 0..n-1 index. The old labels are kept as an 'index'
# column.
for frame in (games, pbp_data, base_drives):
    frame.reset_index(inplace=True)

# Remove drives whose result is 'Uncategorized' and discard the columns
# listed below. The steps are chained rather than calling drop(inplace=True)
# on the filtered slice, which is the pattern behind pandas'
# SettingWithCopyWarning.
unused_drive_cols = [
    'offense_conference', 'defense_conference',
    'start_time.minutes', 'start_time.seconds',
    'end_time.minutes', 'end_time.seconds',
    'elapsed.minutes', 'elapsed.seconds',
]
base_drives = (
    base_drives[~base_drives.drive_result.isin(['Uncategorized'])]
    .drop(columns=unused_drive_cols)
)

# Attach each game's home/away team to its drives. Both frames have an 'id'
# column, so the merge suffixes them: id_x is the drive's id, id_y the game's
# (identical to game_id for matched rows). The right join keeps a row for
# every game; dropna() then removes every row with any missing value, which
# includes games that matched no drive.
drives = (
    pd.merge(
        base_drives,
        games[['id', 'away_team', 'home_team']],
        left_on='game_id',
        right_on='id',
        how='right',
    )
    .rename(columns={'id_x': 'drive_id'})
    .drop(columns=['id_y'])
    .dropna()
)
print(f"Clean Drives: {len(drives)}")

# Mirror the yardlines (x -> 100 - x) of drives where the away team has the
# ball. NOTE(review): presumably this puts both teams' field positions on
# the same scale -- confirm against the data source's yardline convention.
away_on_offense = drives.offense == drives.away_team
yardline_cols = ['start_yardline', 'end_yardline']
drives.loc[away_on_offense, yardline_cols] = 100 - drives.loc[away_on_offense, yardline_cols]
# Play types excluded from the analysis: special-teams plays and
# non-play events (timeouts, end of half/game, uncategorized).
# Penalty plays are deliberately kept.
ignore_types = ["Defensive 2pt Conversion","Blocked Field Goal","Blocked Punt","Missed Field Goal Return","Blocked Punt Touchdown","Missed Field Goal Return Touchdown","Extra Point Missed","Extra Point Good","Timeout","End of Half","End of Game","Uncategorized","Kickoff","Kickoff Return (Offense)","Kickoff Return Touchdown","Punt", "Field Goal Good","Field Goal Missed","Safety"]

bad_types = ["Interception","Pass Interception Return","Interception Return Touchdown",'Fumble Recovery (Opponent)','Sack','Fumble Return Touchdown']

# Drop plays recorded with down == 0, make 'distance' a float and remove the
# conference columns. The steps are chained so that nothing is assigned into
# (or dropped in place from) a filtered slice.
pbp_data = (
    pbp_data[pbp_data.down != 0]
    .astype({'distance': float})
    .drop(columns=['offense_conference', 'defense_conference'])
)

# .copy() makes the filtered frame independent, so the .loc assignment below
# cannot raise SettingWithCopyWarning.
pbp_data = pbp_data[~pbp_data.play_type.isin(ignore_types)].copy()

# Set yards_gained to 0 for every bad play type except sacks. This is the
# same selection as "in bad_types and play_type does not contain 'Sack'",
# but it does not break when play_type has missing values.
zeroed_types = [play_type for play_type in bad_types if 'Sack' not in play_type]
pbp_data.loc[pbp_data.play_type.isin(zeroed_types), 'yards_gained'] = 0

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Total Games: 3289
Total Drives: 84218
Total Plays: 595530
Clean Drives: 83699


In [17]:
# Penalty plays: either the play text mentions "penalty" (case-insensitive)
# or the play type is "Penalty".
# na=False treats a missing play_text as "no match". The previous
# .astype(bool) converted NaN to True, so plays without any text were
# classified as penalties.
mentions_penalty = pbp_data.play_text.str.lower().str.contains("penalty", na=False)
typed_as_penalty = pbp_data.play_type.isin(["Penalty"])
penalties = pbp_data[mentions_penalty | typed_as_penalty]
penalties.head()

Unnamed: 0,index,away,clock.minutes,clock.seconds,defense,defense_score,distance,down,drive_id,home,id,offense,offense_score,period,play_text,play_type,ppa,yard_line,yards_gained,year
12,12,Northwestern State,12,45,Northwestern State,0,1.0,3,4008688762,Baylor,400868876101875403,Baylor,0,1,"Linwood,Shock rush for 3 yards to the NWST47 (...",Penalty,,50,-10,2016
14,14,Northwestern State,10,46,Baylor,7,15.0,2,4008688763,Baylor,400868876101895305,Northwestern State,0,1,Brooks Haack pass complete to Shakeir Ryan for...,Pass Reception,,80,-12,2016
16,16,Northwestern State,10,46,Baylor,7,27.0,4,4008688763,Baylor,400868876101895307,Northwestern State,0,1,"NORTHWESTERN ST Penalty, false start (-4 Yards...",Penalty,,92,-4,2016
19,19,Northwestern State,9,29,Northwestern State,0,2.0,3,4008688764,Baylor,400868876101907005,Baylor,7,1,"Williams,T. rush for 1 yard to the NWST1 (Krol...",Penalty,,98,-10,2016
47,47,Northwestern State,3,22,Northwestern State,0,6.0,2,4008688768,Baylor,400868876101967705,Baylor,17,1,"BAYLOR Penalty, false start (-5 Yards) to the ...",Penalty,,75,-5,2016


In [18]:
# Keep only penalty rows without any missing value. Rebinding the name
# (instead of dropna(inplace=True) on a slice of pbp_data) avoids the
# SettingWithCopyWarning.
# NOTE(review): dropna() also removes rows whose `ppa` is missing; the
# previews suggest that leaves only 2019 plays -- confirm this is intended.
penalties = penalties.dropna()

# Personal fouls: penalty plays whose text mentions "personal foul"
# (case-insensitive).
personal_fouls = penalties[penalties.play_text.str.lower().str.contains("personal foul")]
personal_fouls.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,index,away,clock.minutes,clock.seconds,defense,defense_score,distance,down,drive_id,home,id,offense,offense_score,period,play_text,play_type,ppa,yard_line,yards_gained,year
455744,75,Portland State,13,9,Portland State,3,10.0,1,4011107212,Arkansas,401110721101869007,Arkansas,0,1,Ben Hicks pass complete to Mike Woods for 5 yd...,Pass Reception,0.571121,60,20,2019
455786,117,Portland State,12,29,Arkansas,10,7.0,3,4011107217,Arkansas,401110721102877003,Portland State,6,2,Davis Alexander run for 7 yds to the Ark 45 fo...,Rush,3.327652,62,22,2019
456132,463,Ole Miss,0,53,Memphis,7,10.0,2,4011107267,Memphis,401110726101994601,Ole Miss,0,1,Matt Corral pass complete to Elijah Moore for ...,Pass Reception,3.097714,59,34,2019
456188,519,Mississippi State,3,10,Louisiana,7,8.0,2,4011107279,Louisiana,401110727101968904,Mississippi State,7,1,Tommy Stevens run for 3 yds to the MisSt 33 fo...,Rush,2.35765,70,18,2019
456285,616,South Carolina,5,57,North Carolina,3,10.0,1,4011107293,North Carolina,401110729101944201,South Carolina,3,1,Jake Bentley pass complete to Dakereon Joyner ...,Pass Reception,1.767563,71,19,2019
