In [163]:
# imports

import requests
import pandas as pd
import json
import html
import os.path
import numpy as np
from scipy import stats

In [164]:
def retrieveCfbData(endpoint, team, year, week):
    file_path = f"data/{endpoint if (endpoint != 'plays') else 'pbp'}/{endpoint[:-1] if (endpoint != 'plays') else 'pbp'}-data-{team.lower().replace(' ','-')}-{year}-wk{week}.json"
    if (os.path.exists(file_path)):
        return file_path
    res = requests.get(f"https://api.collegefootballdata.com/{endpoint}?seasonType=regular&year={year}&team={html.escape(team)}&week={week}")
    content = res.json()
#     with open(file_path, 'w') as f:
#         json.dump(content, f)
    return json.dumps(content)

def retrieveRemoteCfbGame(game_id, year):
    file_path = f"data/games/game-data-{game_id}.json"
    if (os.path.exists(file_path)):
        return file_path
    res = requests.get(f"https://api.collegefootballdata.com/games?year={year}&seasonType=regular&id={game_id}")
    content = res.json()
#     with open(file_path, 'w') as f:
#         json.dump(content, f)
    return json.dumps(content)
    

In [165]:
# init data retrieval
teams = pd.read_csv("data/teams/2018.csv", encoding = 'latin-1')

base_drives = pd.DataFrame()
games = pd.DataFrame()
pbp_data = pd.DataFrame()

def retrieveCfbDataFile(endpoint, year):
    return pd.read_csv(f"data/{endpoint}/{year}.csv", encoding='latin-1')

for i in range(2016, 2020):
    drive = retrieveCfbDataFile('drives',i)
    drive['year'] = i
    base_drives = base_drives.append(drive)
    
    gm = retrieveCfbDataFile('games',i)
    gm['year'] = i
    games = games.append(gm)
    
    plys = retrieveCfbDataFile('pbp',i)
    plys['year'] = i
    pbp_data = pbp_data.append(plys)

In [166]:
print(f"Total Games: {len(games)}")
print(f"Total Drives: {len(base_drives)}")
print(f"Total Plays: {len(pbp_data)}")

# print(f"2016 Drives: {len(base_drives[base_drives.game_id == 400868979])}")
# base_drives[base_drives.offense == 'Ole Miss']

Total Games: 3350
Total Drives: 84218
Total Plays: 595530


In [167]:
# Data cleaning

games.reset_index(inplace = True) 
pbp_data.reset_index(inplace = True) 
base_drives.reset_index(inplace = True) 

base_drives = base_drives[
    (~base_drives.drive_result.isin(['Uncategorized']))
]
base_drives.drop(['offense_conference','start_time.minutes','start_time.seconds','end_time.minutes','end_time.seconds','defense_conference','elapsed.seconds','elapsed.minutes'], axis = 1, inplace=True) 
drives = pd.merge(base_drives, games[['id','away_team','home_team']], left_on='game_id', right_on='id', how='right')
drives.rename(columns={'id_x':'drive_id'}, inplace=True)
drives.drop(['id_y'], axis = 1, inplace=True)
drives.dropna(inplace=True)
print(f"Clean Drives: {len(drives)}")

drives.loc[
    drives.offense == drives.away_team, ['start_yardline']
] = 100 - drives.start_yardline
drives.loc[
    drives.offense == drives.away_team, ['end_yardline']
] = 100 - drives.end_yardline
pbp_data = pbp_data[
    (pbp_data.down != 0)
]

Clean Drives: 83699


In [168]:
pbp_data.distance = pbp_data.distance.astype(float)

pbp_data.drop(['offense_conference','defense_conference'], axis = 1, inplace=True) 
# Ignore some types of plays cause they're special teams and weird
ignore_types = ["Defensive 2pt Conversion","Blocked Field Goal","Blocked Punt","Missed Field Goal Return","Blocked Punt Touchdown","Missed Field Goal Return Touchdown","Extra Point Missed","Extra Point Good","Timeout","End of Half","End of Game","Uncategorized","Penalty","Kickoff","Kickoff Return (Offense)","Kickoff Return Touchdown","Punt", "Field Goal Good","Field Goal Missed","Safety"]
pbp_data = pbp_data[~(pbp_data.play_type.isin(ignore_types)) & ~(pbp_data.play_text.str.contains("Penalty").astype(bool))]

bad_types = ["Interception","Pass Interception Return","Interception Return Touchdown",'Fumble Recovery (Opponent)','Sack','Fumble Return Touchdown']
pbp_data.loc[
    ((pbp_data.play_type.isin(bad_types))
     & (~pbp_data.play_type.str.contains('Sack'))) ,['yards_gained']] = 0

In [169]:
def verify_division(num1, num2):
    if num2 == 0:
        return 0
    else:
        return num1 / num2
    
def calculate_success_in_scoring_opps(pbp, opps, team):
    opp_ids = opps.drive_id.unique()
    success = 0
    total = 0
    for opp_id in opp_ids:
        opp_set = pbp[(pbp.drive_id == opp_id) & (pbp.offense == team)]
        opp_s_rate = verify_division(len(opp_set[opp_set.play_successful == True]), len(opp_set))
        success += len(opp_set[opp_set.play_successful == True])
        total += len(opp_set)
    s_rate = 0 if total == 0 else (success / total)
    return s_rate
    
def is_successful(down, distance, yards_gained, play_type):
    if (play_type in bad_types):
        return False 
    if ((down == 1) & (yards_gained >= (0.5 * distance))):
        return True
    elif ((down == 2)) & (yards_gained >= (0.7 * distance)):
        return True
    elif ((down == 3) & (yards_gained >= distance)):
        return True
    elif ((down == 4) & (yards_gained >= distance)):
        return True
    else:
        return False
    
def is_successful_vector(play):
    if (play.play_type in bad_types):
        return False 
    if ((play.down == 1) & (play.yards_gained >= (0.5 * play.distance))):
        return True
    elif ((play.down == 2)) & (play.yards_gained >= (0.7 * play.distance)):
        return True
    elif ((play.down == 3) & (play.yards_gained >= play.distance)):
        return True
    elif ((play.down == 4) & (play.yards_gained >= play.distance)):
        return True
    else:
        return False

def is_explosive(yards_gained):
    if (yards_gained >= 15):
        return True
    else:
        return False
    
pbp_data['play_explosive'] = np.vectorize(is_explosive)(pbp_data.yards_gained)
pbp_data['play_successful'] = np.vectorize(is_successful)(pbp_data.down, pbp_data.distance, pbp_data.yards_gained, pbp_data.play_type)
    
def calculate_success_rate(pbp, exclude_types):
    return verify_division(len(pbp[(pbp.play_successful == True) & (~pbp.play_type.isin(exclude_types))]), len(pbp[(~pbp.play_type.isin(exclude_types))]))
    
def calculate_exp_rate(pbp, exclude_types):
    return verify_division(len(pbp[(pbp.play_explosive == True) & (~pbp.play_type.isin(exclude_types))]), len(pbp[(~pbp.play_type.isin(exclude_types))]))
    
standard_downs = pbp_data[
    (pbp_data.down == 1)
    | ((pbp_data.down == 2) & (pbp_data.distance <= 7))
    | ((pbp_data.down == 3) & (pbp_data.distance <= 4))
    | ((pbp_data.down == 4) & (pbp_data.distance <= 4)) 
]

passing_downs = pbp_data[
    ((pbp_data.down == 2) & (pbp_data.distance >= 8))
    | ((pbp_data.down == 3) & (pbp_data.distance >= 5))
    | ((pbp_data.down == 4) & (pbp_data.distance >= 5)) 
]

pass_types = ["Pass Reception","Pass Incompletion","Passing Touchdown","Interception","Pass Interception Return","Interception Return Touchdown","Sack"]
rush_types = ["Rush","Rushing Touchdown",'Fumble Recovery (Opponent)','Fumble Return Touchdown']

In [170]:
def generate_team_play_stats(pbp, team):
    team_off_plays = pbp[pbp.offense == team]
    off_sr = calculate_success_rate(team_off_plays, [])
    off_er = calculate_exp_rate(team_off_plays, [])
    ypp = verify_division(sum(team_off_plays.yards_gained), len(team_off_plays))
    return pd.DataFrame({
        'team': [team],
        "OffSR": [off_sr],
        "OffER" : [off_er],
        "YPP" : [ypp]
    })

def generate_team_drive_stats(drvs, pbp, gm, points, team):
    team_drives = drvs[drvs.offense == team]
    scoring_opps = team_drives[
        ((team_drives.start_yardline + team_drives.yards) >= 60)
    ]
    avg_fp = verify_division(sum(team_drives.start_yardline), len(team_drives))
    ppd = verify_division(points, len(team_drives))
    opp_effcy = verify_division(len(scoring_opps[scoring_opps.scoring == True]), len(scoring_opps))
    opp_rate = verify_division(len(scoring_opps), len(team_drives))
    opp_sr = calculate_success_in_scoring_opps(pbp, scoring_opps, team)
    return pd.DataFrame({
        'team': [team],
        'FP': [avg_fp],
        'PPD': [ppd],
        'OppEff': [opp_effcy],
        'OppRate': [opp_rate],
        'OppSR': [opp_sr]
    })

def generate_team_turnover_stats(pbp, team):
    adj_turnover_plays = pbp[
        (pbp.play_type.str.contains('Interception', regex=False))
        | ((pbp.play_type == 'Pass Incompletion')
        & (pbp.play_text.str.contains('broken up', regex=False)))
        | (pbp.play_type.str.contains('Fumble', regex=False))
    ]

    fum_plays = adj_turnover_plays[
        (adj_turnover_plays.play_type.str.contains('Fumble', regex=False))
    ]

    # away_team Adj Turnovers
    team_tos = adj_turnover_plays[
        (adj_turnover_plays.offense == team)
        | (adj_turnover_plays.defense == team)
    ]

    team_ints_off = team_tos[
       (team_tos.play_type.str.contains('Interception', regex=False))
        & (team_tos.offense == team)
    ]

    team_pds = team_tos[
       (team_tos.play_type == 'Pass Incompletion')
        & (team_tos.play_text.str.contains('broken up', regex=False))
        & (team_tos.offense == team)
    ]
    
    exp_to = (0.22 * (len(team_pds) + len(team_ints_off))) + (0.49 * len(fum_plays))

    return pd.DataFrame({
        'team' : [team],
        'ExpTO': [exp_to]
    })

def stringify_entry(team_entry):
    return team_entry.tolist()[0]

def calculate_five_factors_rating(team_stat_pack):
    return (.35 * team_stat_pack.OffSR) + (.25 * (team_stat_pack.OffER * team_stat_pack.YPP)) + (.15 * (team_stat_pack.FP / 50.0)) + (.15 * ((team_stat_pack.PPD + team_stat_pack.OppSR + team_stat_pack.OppEff)/3)) + (.10 * team_stat_pack.ExpTODiff)

def calculate_box_score(game_id, year):
    game_data = games[games.id == game_id]
    
    if (len(game_data) == 0):
        print(f"Could not find basic game data for game_id {game_id} locally, checking CFB Data API")
        game_data = pd.read_json(retrieveRemoteCfbGame(game_id, year))
        if (len(game_data) == 0):
            print(f"Could not find basic game data for game_id {game_id} on CFB Data API, bailing out")
            return None
    
    home_team = stringify_entry(game_data.home_team)
    away_team = stringify_entry(game_data.away_team)
    home_score = stringify_entry(game_data.home_points)
    away_score = stringify_entry(game_data.away_points)
    
    game_year = stringify_entry(game_data.season)
    game_week = stringify_entry(game_data.week)
    
    game_drives = drives[drives.game_id == game_id]
    if ((len(game_drives) == 0)):
        print(f"Could not find drive data for game_id {game_id} locally, checking CFB Data API")
        if (year == 2016):
            print(f"Could not find drive data for game_id {game_id} bc of issues with 2016 data source, bailing out")
            return None
        else:
            game_drives = pd.read_json(retrieveCfbData('drives', home_team, game_year, game_week))
            if (len(game_drives) == 0):
                print(f"Could not find drive data for game_id {game_id} on CFB Data API, bailing out")
                return None
            else:
                game_drives = pd.merge(game_drives, game_data[['id','away_team','home_team']], left_on='game_id', right_on='id', how='right')
                game_drives.rename(columns={'id_x':'drive_id'}, inplace=True)
                game_drives.drop(['id_y'], axis = 1, inplace=True)
                game_drives.dropna(inplace=True)

                game_drives.loc[
                    game_drives.offense == game_drives.away_team, ['start_yardline']
                ] = 100 - game_drives.start_yardline
                game_drives.loc[
                    game_drives.offense == game_drives.away_team, ['end_yardline']
                ] = 100 - game_drives.end_yardline
    
    game_pbp = pbp_data[pbp_data.drive_id.isin(game_drives.drive_id.tolist())]
    if (len(game_pbp) == 0):
        print(f"Could not find play by play data for game_id {game_id} locally, checking CFB Data API")
        game_pbp = pd.read_json(retrieveCfbData('plays', home_team, game_year, game_week))
        if (len(game_pbp) == 0):
            print(f"Could not find play by play data for game_id {game_id} on CFB Data API, bailing out")
            return None
    
    if 'play_explosive' not in game_pbp.columns:
        game_pbp['play_explosive'] = game_pbp.apply(lambda x: x.yards_gained >= 15, axis=1)
    if 'play_successful' not in game_pbp.columns:
        game_pbp['play_successful'] = game_pbp.apply(lambda x: is_successful_vector(x), axis=1)
    
    home_team_play_stats = generate_team_play_stats(game_pbp, home_team)
    away_team_play_stats = generate_team_play_stats(game_pbp, away_team)
    
    home_team_drv_stats = generate_team_drive_stats(game_drives, game_pbp, game_data, home_score, home_team)
    away_team_drv_stats = generate_team_drive_stats(game_drives, game_pbp, game_data, away_score, away_team)
    
    home_team_stats = pd.merge(home_team_play_stats, home_team_drv_stats, left_on="team", right_on="team", how='right')
    away_team_stats = pd.merge(away_team_play_stats, away_team_drv_stats, left_on="team", right_on="team", how='right')
    
    home_team_tos = generate_team_turnover_stats(game_pbp, home_team)
    away_team_tos = generate_team_turnover_stats(game_pbp, away_team)
    
    home_team_stats = pd.merge(home_team_stats, home_team_tos, left_on="team", right_on="team", how='right')
    away_team_stats = pd.merge(away_team_stats, away_team_tos, left_on="team", right_on="team", how='right')
    home_team_stats['ExpTODiff'] = home_team_stats['ExpTO'] - away_team_stats['ExpTO']
    away_team_stats['ExpTODiff'] = away_team_stats['ExpTO'] - home_team_stats['ExpTO']
    
    home_team_stats['5FR'] = calculate_five_factors_rating(home_team_stats)
    away_team_stats['5FR'] = calculate_five_factors_rating(away_team_stats)
    home_team_stats['5FRDiff'] = home_team_stats['5FR'] - away_team_stats['5FR']
    away_team_stats['5FRDiff'] = away_team_stats['5FR'] - home_team_stats['5FR']
    
    comb_stat_pack = away_team_stats.append(home_team_stats)
    
    box = pd.DataFrame({
        "team" : [away_team, home_team],
        "Pts" : [away_score, home_score],
        "PtsDiff" : [away_score - home_score, home_score - away_score],
        "CfbDataWinProb" : [stringify_entry(game_data.away_post_win_prob),stringify_entry(game_data.home_post_win_prob)]
    })
    
    box = pd.merge(box, comb_stat_pack, left_on="team", right_on="team", how="right")
    box.rename(columns={"team": "Team"}, inplace=True)
    
    return box
calculate_box_score(401013183, 2018)

Unnamed: 0,Team,Pts,PtsDiff,CfbDataWinProb,OffSR,OffER,YPP,FP,PPD,OppEff,OppRate,OppSR,ExpTO,ExpTODiff,5FR,5FRDiff
0,Virginia,31.0,-3.0,0.875665,0.396825,0.111111,6.507937,26.0,2.214286,0.833333,0.428571,0.567568,1.15,-0.66,0.512424,-0.052568
1,Virginia Tech,34.0,3.0,0.124335,0.368421,0.078947,5.513158,31.4,2.266667,0.625,0.533333,0.44898,1.81,0.66,0.564992,0.052568


In [None]:
stored_game_boxes = pd.DataFrame()
game_ids = games.id.unique()
team_list = teams.school.tolist()
for i, row in games.iterrows():
    gameId = row.id
    print(f"[{i+1}/{len(game_ids)}] Getting game information for ESPN game_id: {gameId}")
    print(f"[{i+1}/{len(game_ids)}] Started processing game information for ESPN game_id: {gameId}")
    if ((row.home_team in team_list) & (row.away_team in team_list)):
        box_score = calculate_box_score(gameId, row.season)
        if (box_score is not None):
            game_tms = box_score.Team.tolist()
            print(f"[{i+1}/{len(game_ids)}] Completed processing game information for ESPN game_id: {gameId}")
            stored_game_boxes = stored_game_boxes.append(box_score)
            print(f"[{i+1}/{len(game_ids)}] Aggreggating game_id {gameId} to master data copy")
        else:
            print(f"[{i+1}/{len(game_ids)}] Got 'None' for game_id {gameId}'s box score, skipping processing")
    else:
        print(f"[{i+1}/{len(game_ids)}] Skipping checking game_id {gameId} bc one of the teams isn't FBS")

[1/3350] Getting game information for ESPN game_id: 400869090
[1/3350] Started processing game information for ESPN game_id: 400869090
[1/3350] Completed processing game information for ESPN game_id: 400869090
[1/3350] Aggreggating game_id 400869090 to master data copy
[2/3350] Getting game information for ESPN game_id: 400869421
[2/3350] Started processing game information for ESPN game_id: 400869421
[2/3350] Completed processing game information for ESPN game_id: 400869421
[2/3350] Aggreggating game_id 400869421 to master data copy
[3/3350] Getting game information for ESPN game_id: 400869257
[3/3350] Started processing game information for ESPN game_id: 400869257
[3/3350] Skipping checking game_id 400869257 bc one of the teams isn't FBS
[4/3350] Getting game information for ESPN game_id: 400869502
[4/3350] Started processing game information for ESPN game_id: 400869502
[4/3350] Skipping checking game_id 400869502 bc one of the teams isn't FBS
[5/3350] Getting game information for ES

[35/3350] Completed processing game information for ESPN game_id: 400868976
[35/3350] Aggreggating game_id 400868976 to master data copy
[36/3350] Getting game information for ESPN game_id: 400869507
[36/3350] Started processing game information for ESPN game_id: 400869507
[36/3350] Completed processing game information for ESPN game_id: 400869507
[36/3350] Aggreggating game_id 400869507 to master data copy
[37/3350] Getting game information for ESPN game_id: 400869260
[37/3350] Started processing game information for ESPN game_id: 400869260
[37/3350] Completed processing game information for ESPN game_id: 400869260
[37/3350] Aggreggating game_id 400869260 to master data copy
[38/3350] Getting game information for ESPN game_id: 400869509
[38/3350] Started processing game information for ESPN game_id: 400869509
[38/3350] Skipping checking game_id 400869509 bc one of the teams isn't FBS
[39/3350] Getting game information for ESPN game_id: 400869267
[39/3350] Started processing game infor

[71/3350] Completed processing game information for ESPN game_id: 400868973
[71/3350] Aggreggating game_id 400868973 to master data copy
[72/3350] Getting game information for ESPN game_id: 400868957
[72/3350] Started processing game information for ESPN game_id: 400868957
[72/3350] Completed processing game information for ESPN game_id: 400868957
[72/3350] Aggreggating game_id 400868957 to master data copy
[73/3350] Getting game information for ESPN game_id: 400869624
[73/3350] Started processing game information for ESPN game_id: 400869624
[73/3350] Skipping checking game_id 400869624 bc one of the teams isn't FBS
[74/3350] Getting game information for ESPN game_id: 400868969
[74/3350] Started processing game information for ESPN game_id: 400868969
[74/3350] Completed processing game information for ESPN game_id: 400868969
[74/3350] Aggreggating game_id 400868969 to master data copy
[75/3350] Getting game information for ESPN game_id: 400868887
[75/3350] Started processing game infor

[107/3350] Completed processing game information for ESPN game_id: 400869184
[107/3350] Aggreggating game_id 400869184 to master data copy
[108/3350] Getting game information for ESPN game_id: 400869101
[108/3350] Started processing game information for ESPN game_id: 400869101
[108/3350] Completed processing game information for ESPN game_id: 400869101
[108/3350] Aggreggating game_id 400869101 to master data copy
[109/3350] Getting game information for ESPN game_id: 400869277
[109/3350] Started processing game information for ESPN game_id: 400869277
[109/3350] Completed processing game information for ESPN game_id: 400869277
[109/3350] Aggreggating game_id 400869277 to master data copy
[110/3350] Getting game information for ESPN game_id: 400869433
[110/3350] Started processing game information for ESPN game_id: 400869433
[110/3350] Skipping checking game_id 400869433 bc one of the teams isn't FBS
[111/3350] Getting game information for ESPN game_id: 400868877
[111/3350] Started proces

[142/3350] Completed processing game information for ESPN game_id: 400868987
[142/3350] Aggreggating game_id 400868987 to master data copy
[143/3350] Getting game information for ESPN game_id: 400869612
[143/3350] Started processing game information for ESPN game_id: 400869612
[143/3350] Completed processing game information for ESPN game_id: 400869612
[143/3350] Aggreggating game_id 400869612 to master data copy
[144/3350] Getting game information for ESPN game_id: 400868981
[144/3350] Started processing game information for ESPN game_id: 400868981
[144/3350] Completed processing game information for ESPN game_id: 400868981
[144/3350] Aggreggating game_id 400868981 to master data copy
[145/3350] Getting game information for ESPN game_id: 400869357
[145/3350] Started processing game information for ESPN game_id: 400869357
[145/3350] Completed processing game information for ESPN game_id: 400869357
[145/3350] Aggreggating game_id 400869357 to master data copy
[146/3350] Getting game inf

[176/3350] Completed processing game information for ESPN game_id: 400869281
[176/3350] Aggreggating game_id 400869281 to master data copy
[177/3350] Getting game information for ESPN game_id: 400869519
[177/3350] Started processing game information for ESPN game_id: 400869519
[177/3350] Completed processing game information for ESPN game_id: 400869519
[177/3350] Aggreggating game_id 400869519 to master data copy
[178/3350] Getting game information for ESPN game_id: 400869520
[178/3350] Started processing game information for ESPN game_id: 400869520
[178/3350] Completed processing game information for ESPN game_id: 400869520
[178/3350] Aggreggating game_id 400869520 to master data copy
[179/3350] Getting game information for ESPN game_id: 400869001
[179/3350] Started processing game information for ESPN game_id: 400869001
[179/3350] Completed processing game information for ESPN game_id: 400869001
[179/3350] Aggreggating game_id 400869001 to master data copy
[180/3350] Getting game inf

[209/3350] Completed processing game information for ESPN game_id: 400868993
[209/3350] Aggreggating game_id 400868993 to master data copy
[210/3350] Getting game information for ESPN game_id: 400868996
[210/3350] Started processing game information for ESPN game_id: 400868996
[210/3350] Completed processing game information for ESPN game_id: 400868996
[210/3350] Aggreggating game_id 400868996 to master data copy
[211/3350] Getting game information for ESPN game_id: 400869521
[211/3350] Started processing game information for ESPN game_id: 400869521
[211/3350] Completed processing game information for ESPN game_id: 400869521
[211/3350] Aggreggating game_id 400869521 to master data copy
[212/3350] Getting game information for ESPN game_id: 400869522
[212/3350] Started processing game information for ESPN game_id: 400869522
[212/3350] Completed processing game information for ESPN game_id: 400869522
[212/3350] Aggreggating game_id 400869522 to master data copy
[213/3350] Getting game inf

[239/3350] Completed processing game information for ESPN game_id: 400869634
[239/3350] Aggreggating game_id 400869634 to master data copy
[240/3350] Getting game information for ESPN game_id: 400869363
[240/3350] Started processing game information for ESPN game_id: 400869363
[240/3350] Completed processing game information for ESPN game_id: 400869363
[240/3350] Aggreggating game_id 400869363 to master data copy
[241/3350] Getting game information for ESPN game_id: 400869204
[241/3350] Started processing game information for ESPN game_id: 400869204
[241/3350] Completed processing game information for ESPN game_id: 400869204
[241/3350] Aggreggating game_id 400869204 to master data copy
[242/3350] Getting game information for ESPN game_id: 400869203
[242/3350] Started processing game information for ESPN game_id: 400869203
[242/3350] Completed processing game information for ESPN game_id: 400869203
[242/3350] Aggreggating game_id 400869203 to master data copy
[243/3350] Getting game inf

[270/3350] Completed processing game information for ESPN game_id: 400869364
[270/3350] Aggreggating game_id 400869364 to master data copy
[271/3350] Getting game information for ESPN game_id: 400869816
[271/3350] Started processing game information for ESPN game_id: 400869816
[271/3350] Completed processing game information for ESPN game_id: 400869816
[271/3350] Aggreggating game_id 400869816 to master data copy
[272/3350] Getting game information for ESPN game_id: 400869365
[272/3350] Started processing game information for ESPN game_id: 400869365
[272/3350] Completed processing game information for ESPN game_id: 400869365
[272/3350] Aggreggating game_id 400869365 to master data copy
[273/3350] Getting game information for ESPN game_id: 400868913
[273/3350] Started processing game information for ESPN game_id: 400868913
[273/3350] Completed processing game information for ESPN game_id: 400868913
[273/3350] Aggreggating game_id 400868913 to master data copy
[274/3350] Getting game inf

[301/3350] Completed processing game information for ESPN game_id: 400869635
[301/3350] Aggreggating game_id 400869635 to master data copy
[302/3350] Getting game information for ESPN game_id: 400869011
[302/3350] Started processing game information for ESPN game_id: 400869011
[302/3350] Skipping checking game_id 400869011 bc one of the teams isn't FBS
[303/3350] Getting game information for ESPN game_id: 400869620
[303/3350] Started processing game information for ESPN game_id: 400869620
[303/3350] Completed processing game information for ESPN game_id: 400869620
[303/3350] Aggreggating game_id 400869620 to master data copy
[304/3350] Getting game information for ESPN game_id: 400869454
[304/3350] Started processing game information for ESPN game_id: 400869454
[304/3350] Completed processing game information for ESPN game_id: 400869454
[304/3350] Aggreggating game_id 400869454 to master data copy
[305/3350] Getting game information for ESPN game_id: 400869294
[305/3350] Started proces

[333/3350] Completed processing game information for ESPN game_id: 400869372
[333/3350] Aggreggating game_id 400869372 to master data copy
[334/3350] Getting game information for ESPN game_id: 400869371
[334/3350] Started processing game information for ESPN game_id: 400869371
[334/3350] Completed processing game information for ESPN game_id: 400869371
[334/3350] Aggreggating game_id 400869371 to master data copy
[335/3350] Getting game information for ESPN game_id: 400869010
[335/3350] Started processing game information for ESPN game_id: 400869010
[335/3350] Completed processing game information for ESPN game_id: 400869010
[335/3350] Aggreggating game_id 400869010 to master data copy
[336/3350] Getting game information for ESPN game_id: 400869374
[336/3350] Started processing game information for ESPN game_id: 400869374
[336/3350] Completed processing game information for ESPN game_id: 400869374
[336/3350] Aggreggating game_id 400869374 to master data copy
[337/3350] Getting game inf

[365/3350] Completed processing game information for ESPN game_id: 400869021
[365/3350] Aggreggating game_id 400869021 to master data copy
[366/3350] Getting game information for ESPN game_id: 400868951
[366/3350] Started processing game information for ESPN game_id: 400868951
[366/3350] Completed processing game information for ESPN game_id: 400868951
[366/3350] Aggreggating game_id 400868951 to master data copy
[367/3350] Getting game information for ESPN game_id: 400869458
[367/3350] Started processing game information for ESPN game_id: 400869458
[367/3350] Completed processing game information for ESPN game_id: 400869458
[367/3350] Aggreggating game_id 400869458 to master data copy
[368/3350] Getting game information for ESPN game_id: 400869300
[368/3350] Started processing game information for ESPN game_id: 400869300
[368/3350] Completed processing game information for ESPN game_id: 400869300
[368/3350] Aggreggating game_id 400869300 to master data copy
[369/3350] Getting game inf

[397/3350] Completed processing game information for ESPN game_id: 400869134
[397/3350] Aggreggating game_id 400869134 to master data copy
[398/3350] Getting game information for ESPN game_id: 400869130
[398/3350] Started processing game information for ESPN game_id: 400869130
[398/3350] Completed processing game information for ESPN game_id: 400869130
[398/3350] Aggreggating game_id 400869130 to master data copy
[399/3350] Getting game information for ESPN game_id: 400869215
[399/3350] Started processing game information for ESPN game_id: 400869215
[399/3350] Completed processing game information for ESPN game_id: 400869215
[399/3350] Aggreggating game_id 400869215 to master data copy
[400/3350] Getting game information for ESPN game_id: 400869218
[400/3350] Started processing game information for ESPN game_id: 400869218
[400/3350] Completed processing game information for ESPN game_id: 400869218
[400/3350] Aggreggating game_id 400869218 to master data copy
[401/3350] Getting game inf

In [None]:
stored_game_boxes.head()

In [None]:
%matplotlib inline
import numpy as np
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import matplotlib.pyplot as plt
f, ax = plt.subplots(figsize=(15,8))
ax.scatter(stored_game_boxes['5FRDiff'], stored_game_boxes.PtsDiff);
ax.set_xlabel("Five Factors Rating Difference")
ax.set_ylabel("Point Differential");

In [None]:
# Eliminate outliers
# basis = stored_game_boxes[stored_game_boxes['5FRDiff'] != 0]
stored_game_boxes['5fr_z_score'] = np.abs(stats.zscore(stored_game_boxes['5FRDiff']))
stored_game_boxes['pts_z_score'] = np.abs(stats.zscore(stored_game_boxes['PtsDiff']))
outliers = stored_game_boxes[(stored_game_boxes['5fr_z_score'] >= 3) | (stored_game_boxes['pts_z_score'] >= 3)]
basis = stored_game_boxes[(stored_game_boxes['5fr_z_score'] < 3) & (stored_game_boxes['pts_z_score'] < 3)]
msk = np.random.rand(len(basis)) < 0.80
train_data = basis[msk]
test_data = basis[~msk]

In [None]:
outliers

In [None]:
train_data.head()

In [None]:
# Linear Regression Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(train_data['5FRDiff'][:, np.newaxis], train_data.PtsDiff)

f, ax = plt.subplots(figsize=(15,8))
ax.scatter(basis['5FRDiff'], basis.PtsDiff)
ax.set_xlabel("Five Factors Rating Difference")
ax.set_ylabel("Point Differential")

xfit = test_data['5FRDiff']
yfit = model.predict(xfit[:, np.newaxis])
ax.plot(xfit, yfit, color='red', label='Linear Regression');

In [None]:
print(f'Linear Regression: y = {model.coef_[0]:.5f}x + {model.intercept_:.5f}')

test = pd.DataFrame({
    'ActPtsDiff':test_data['PtsDiff'],
    'PredPtsDiff' : yfit
})

test.corr()

In [None]:
def generate_win_prob(game_id, year):
    sample_box = calculate_box_score(game_id, year)
    mu = yfit.mean()
    std = yfit.std()

    max_box_row = sample_box[sample_box['PtsDiff'] == max(sample_box['PtsDiff'])]
    print(f"Actual Winner: {stringify_entry(max_box_row.Team)}")
    print(f"MOV: {stringify_entry(max_box_row.Team)} by {stringify_entry(max_box_row.PtsDiff)}")
    print(f"5FRDiff for {stringify_entry(max_box_row.Team)}: {stringify_entry(max_box_row['5FRDiff'])}")
    print("")
    proj_point_diff = model.coef_[0] * stringify_entry(max_box_row['5FRDiff']) + model.intercept_
    print(f"Proj MOV by 5FRDiff: {stringify_entry(max_box_row.Team)} by {round(proj_point_diff)} (exact value: {proj_point_diff})")
    z = (proj_point_diff - mu) / std
    print(f"Z score: {z}")
    print(f"Win Prob for {stringify_entry(max_box_row.Team)}: {(100 * stats.norm.cdf(z)):.2f}%")

In [None]:
generate_win_prob(401013183, 2018)  # 2018 UVA at VT for sample (this should be in the dataset, so not ideal)
calculate_box_score(401013183, 2018)

In [None]:
generate_win_prob(401112488, 2019)  # 2019 GT at MIA
calculate_box_score(401112488, 2019)

In [None]:
generate_win_prob(401112513, 2019)  # 2019 NCST at GT
calculate_box_score(401112513, 2019)

In [None]:
generate_win_prob(401110863, 2019)  # 2019 Ole Miss at MSST
calculate_box_score(401110863, 2019)

In [None]:
generate_win_prob(401012356, 2018) # 2018 LSU vs TAMU  (this should be in the dataset, so not ideal)
calculate_box_score(401012356, 2018)