In [10]:
# group plays by game
# process each game for five factors diff
# get df of ff diff and point diff
# run lin reg
# check against other more recent games

import requests
import pandas as pd
import json
import html
import os.path
import numpy as np
from scipy import stats
    
# Seasons to load, most recent first (the order the files were originally stacked in).
SEASONS = (2018, 2017, 2016)

teams = pd.read_csv("data/teams/2018.csv", encoding = 'latin-1')

# DataFrame.append is deprecated (and removed in pandas 2.0), so every
# multi-season frame is assembled with a single pd.concat instead.
# sort=False keeps the column order of the first frame when later frames carry
# different columns; ignore_index=True avoids duplicated row labels.
games = pd.concat(
    [pd.read_csv(f"data/games/{season}.csv", encoding = 'latin-1') for season in SEASONS],
    ignore_index=True,
    sort=False,
)

pbp_data = pd.concat(
    [pd.read_csv(f"data/pbp/{season}.csv", encoding = 'latin-1') for season in SEASONS],
    ignore_index=True,
    sort=False,
)

# NOTE(review): the 2018 drives are read from JSON while 2017/2016 come from
# CSV -- confirm that this mix of formats is intended.
base_drives = pd.concat(
    [
        pd.read_json("data/drives/2018.json"),
        pd.read_csv("data/drives/2017.csv", encoding = 'latin-1'),
        pd.read_csv("data/drives/2016.csv", encoding = 'latin-1'),
    ],
    ignore_index=True,
    sort=False,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [11]:
# Data cleaning
# Every step below builds a new frame (or an explicit copy) rather than
# mutating a filtered slice in place, so the cell can be re-run safely and
# pandas does not emit SettingWithCopyWarning.

# Drop drives whose result is 'Uncategorized'.
base_drives = base_drives[~base_drives.drive_result.isin(['Uncategorized'])]

# Attach each game's home/away teams to its drives. Order matters: dropna()
# runs BEFORE the unused columns are removed, so a missing value in any column
# (including the ones discarded afterwards) still removes the row.
drives = (
    pd.merge(base_drives, games[['id','away_team','home_team']], left_on='game_id', right_on='id', how='right')
    .rename(columns={'id_x':'drive_id'})
    .drop(['id_y'], axis = 1)
    .dropna()
    .drop(['offense_conference','start_time','end_time','defense_conference','elapsed'], axis = 1)
)

# Mirror the yardlines of drives where the away team has the ball.
away_on_offense = drives.offense == drives.away_team
drives.loc[away_on_offense, 'start_yardline'] = 100 - drives.loc[away_on_offense, 'start_yardline']
drives.loc[away_on_offense, 'end_yardline'] = 100 - drives.loc[away_on_offense, 'end_yardline']

# Plays recorded with down == 0 are discarded.
pbp_data = pbp_data[pbp_data.down != 0].copy()

pbp_data['distance'] = pbp_data.distance.astype(float)

# errors='ignore' keeps the cell re-runnable once the columns are already gone.
pbp_data = pbp_data.drop(['offense_conference','defense_conference'], axis = 1, errors='ignore')
# Ignore some types of plays cause they're special teams and weird
ignore_types = ["Defensive 2pt Conversion","Blocked Field Goal","Blocked Punt","Missed Field Goal Return","Blocked Punt Touchdown","Missed Field Goal Return Touchdown","Extra Point Missed","Extra Point Good","Timeout","End of Half","End of Game","Uncategorized","Penalty","Kickoff","Kickoff Return (Offense)","Kickoff Return Touchdown","Punt", "Field Goal Good","Field Goal Missed","Safety"]
pbp_data = pbp_data[~(pbp_data.play_type.isin(ignore_types))].copy()

# Play types that can never count as successful (also read by is_successful).
bad_types = ["Interception","Pass Interception Return","Interception Return Touchdown",'Fumble Recovery (Opponent)','Sack','Fumble Return Touchdown']
# Zero out the yardage on every bad play type except sacks, which keep their
# recorded yards_gained.
zero_yardage_mask = (
    (pbp_data.play_type.isin(bad_types))
    & (~pbp_data.play_type.str.contains('Sack'))
)
pbp_data.loc[zero_yardage_mask, 'yards_gained'] = 0

In [12]:
def verify_division(num1, num2):
    """Return ``num1 / num2``, or 0 when ``num2`` is zero (no exception raised)."""
    return 0 if num2 == 0 else num1 / num2
    
def calculate_success_in_scoring_opps(pbp, opps, team):
    """Return ``team``'s play success rate across its scoring-opportunity drives.

    Args:
        pbp: play-by-play frame with ``drive_id``, ``offense`` and
            ``play_successful`` columns.
        opps: drives frame whose ``drive_id`` column identifies the
            scoring-opportunity drives.
        team: only plays where ``offense == team`` are counted.

    Returns:
        Successful plays divided by total plays over those drives, or 0 when
        there are no matching plays.
    """
    # One vectorized filter replaces the per-drive loop. Missing drive ids are
    # dropped first because ``isin`` would match NaN against NaN, whereas the
    # equality test used previously never did.
    opp_ids = opps.drive_id.dropna().unique()
    opp_plays = pbp[pbp.drive_id.isin(opp_ids) & (pbp.offense == team)]

    total = len(opp_plays)
    if total == 0:
        return 0
    success = len(opp_plays[opp_plays.play_successful == True])
    return success / total
    
def is_successful(down, distance, yards_gained, play_type):
    """Decide whether a single play counts as successful.

    A play whose ``play_type`` is listed in the module-level ``bad_types`` is
    never successful. Otherwise the play must gain at least 50% of ``distance``
    on 1st down, 70% on 2nd down, or the full ``distance`` on 3rd/4th down.
    Any other ``down`` value yields False.
    """
    if play_type in bad_types:
        return False

    if down == 1:
        required = 0.5 * distance
    elif down == 2:
        required = 0.7 * distance
    elif down == 3 or down == 4:
        required = distance
    else:
        return False

    return bool(yards_gained >= required)

def is_explosive(yards_gained):
    """Return True when the play gained at least 15 yards, otherwise False."""
    return bool(yards_gained >= 15)
    
# Flag every play once up front. otypes is passed explicitly because
# np.vectorize cannot infer the output dtype from zero-length input and raises
# a ValueError instead; with otypes set, an empty pbp_data yields empty bool
# columns.
pbp_data['play_explosive'] = np.vectorize(is_explosive, otypes=[bool])(pbp_data.yards_gained)
pbp_data['play_successful'] = np.vectorize(is_successful, otypes=[bool])(pbp_data.down, pbp_data.distance, pbp_data.yards_gained, pbp_data.play_type)
    
def calculate_success_rate(pbp, exclude_types):
    """Return the share of successful plays in ``pbp``.

    Plays whose ``play_type`` is in ``exclude_types`` are left out of both the
    numerator and the denominator. Returns 0 when no plays remain.
    """
    eligible = pbp[~pbp.play_type.isin(exclude_types)]
    successful = eligible[eligible.play_successful == True]
    return verify_division(len(successful), len(eligible))
    
def calculate_exp_rate(pbp, exclude_types):
    """Return the share of explosive plays in ``pbp``.

    Plays whose ``play_type`` is in ``exclude_types`` are left out of both the
    numerator and the denominator. Returns 0 when no plays remain.
    """
    eligible = pbp[~pbp.play_type.isin(exclude_types)]
    explosive = eligible[eligible.play_explosive == True]
    return verify_division(len(explosive), len(eligible))
    
# Standard downs: any 1st down, 2nd down with at most 7 to go,
# and 3rd/4th down with at most 4 to go.
standard_down_mask = (
    (pbp_data.down == 1)
    | ((pbp_data.down == 2) & (pbp_data.distance <= 7))
    | (pbp_data.down.isin([3, 4]) & (pbp_data.distance <= 4))
)
standard_downs = pbp_data.loc[standard_down_mask]

# Passing downs: 2nd down with at least 8 to go,
# and 3rd/4th down with at least 5 to go.
passing_down_mask = (
    ((pbp_data.down == 2) & (pbp_data.distance >= 8))
    | (pbp_data.down.isin([3, 4]) & (pbp_data.distance >= 5))
)
passing_downs = pbp_data.loc[passing_down_mask]

# Play types grouped as passes and as rushes.
pass_types = [
    "Pass Reception",
    "Pass Incompletion",
    "Passing Touchdown",
    "Interception",
    "Pass Interception Return",
    "Interception Return Touchdown",
    "Sack",
]
rush_types = [
    "Rush",
    "Rushing Touchdown",
    'Fumble Recovery (Opponent)',
    'Fumble Return Touchdown',
]

In [13]:
def generate_team_play_stats(pbp, team):
    """Build a one-row frame with ``team``'s offensive play rates.

    Only plays where ``offense == team`` are considered and no play types are
    excluded. Columns: ``team``, ``OffSR`` (success rate) and ``OffER``
    (explosive-play rate).
    """
    offensive_plays = pbp[pbp.offense == team]
    success_rate = calculate_success_rate(offensive_plays, [])
    explosive_rate = calculate_exp_rate(offensive_plays, [])
    columns = {
        'team': [team],
        "OffSR": [success_rate],
        "OffER" : [explosive_rate]
    }
    return pd.DataFrame(columns)

def generate_team_drive_stats(drvs, pbp, gm, points, team):
    """Build a one-row frame of drive-level stats for ``team``.

    Args:
        drvs: drives frame; only rows with ``offense == team`` are used.
        pbp: play-by-play frame, used for the scoring-opportunity success rate.
        gm: accepted for call compatibility; not read by this function.
        points: points scored by ``team``, used for points per drive.
        team: the offense being summarized.

    Returns:
        DataFrame with columns ``team``, ``FP`` (mean starting yardline),
        ``YPP`` (yards per play), ``PPD`` (points per drive), ``OppEff``
        (share of scoring opportunities that scored), ``OppRate`` (share of
        drives that became scoring opportunities) and ``OppSR`` (play success
        rate inside scoring opportunities). Every ratio falls back to 0 when
        its denominator is 0.
    """
    possessions = drvs[drvs.offense == team]
    drive_count = len(possessions)

    # A drive is a scoring opportunity when its start yardline plus the yards
    # it gained reaches 60 (presumably the opponent's 40 -- confirm against the
    # yardline convention of the data).
    furthest_point = possessions.start_yardline + possessions.yards
    opportunities = possessions[furthest_point >= 60]
    opportunity_count = len(opportunities)
    converted_count = len(opportunities[opportunities.scoring == True])

    columns = {
        'team': [team],
        'FP': [verify_division(sum(possessions.start_yardline), drive_count)],
        'YPP': [verify_division(sum(possessions.yards), sum(possessions.plays))],
        'PPD': [verify_division(points, drive_count)],
        'OppEff': [verify_division(converted_count, opportunity_count)],
        'OppRate': [verify_division(opportunity_count, drive_count)],
        'OppSR': [calculate_success_in_scoring_opps(pbp, opportunities, team)]
    }
    return pd.DataFrame(columns)

def generate_team_turnover_stats(pbp, team):
    """Build a one-row frame holding ``team``'s expected-turnover figure.

    ``ExpTO`` is 0.22 times the number of interceptions and broken-up
    incompletions thrown with ``team`` on offense, plus 0.49 times the number
    of fumble plays.

    NOTE(review): the fumble count covers every fumble play in ``pbp`` no
    matter which team was on offense, unlike the interception and pass-breakup
    counts -- confirm this asymmetry is intended.

    Returns:
        DataFrame with columns ``team`` and ``ExpTO``.
    """
    # Narrow the plays to interceptions, broken-up incompletions and fumbles.
    is_interception = pbp.play_type.str.contains('Interception', regex=False)
    is_pass_breakup = (
        (pbp.play_type == 'Pass Incompletion')
        & (pbp.play_text.str.contains('broken up', regex=False))
    )
    is_fumble = pbp.play_type.str.contains('Fumble', regex=False)
    candidates = pbp[is_interception | is_pass_breakup | is_fumble]

    candidate_type = candidates.play_type
    team_on_offense = candidates.offense == team

    fumble_count = len(candidates[candidate_type.str.contains('Fumble', regex=False)])
    interception_count = len(candidates[
        (candidate_type.str.contains('Interception', regex=False))
        & team_on_offense
    ])
    breakup_count = len(candidates[
        (candidate_type == 'Pass Incompletion')
        & (candidates.play_text.str.contains('broken up', regex=False))
        & team_on_offense
    ])

    exp_to = (0.22 * (breakup_count + interception_count)) + (0.49 * fumble_count)

    return pd.DataFrame({
        'team' : [team],
        'ExpTO': [exp_to]
    })

def stringify_entry(team_entry):
    """Return the first element of ``team_entry`` after converting it with ``tolist()``.

    Raises IndexError when ``team_entry`` is empty.
    """
    values = team_entry.tolist()
    return values[0]

def calculate_five_factors_rating(team_stat_pack):
    """Combine a team's stats into the weighted five-factors rating.

    rating = 35*OffER + 25*(OffSR + YPP) + 15*FP
             + 15*(PPD + OppSR + OppEff) + 10*ExpTODiff

    ``team_stat_pack`` is any object exposing those fields as attributes.
    """
    explosive_term = 35 * team_stat_pack.OffER
    efficiency_term = 25 * (team_stat_pack.OffSR + team_stat_pack.YPP)
    field_position_term = 15 * team_stat_pack.FP
    drive_term = 15 * (team_stat_pack.PPD + team_stat_pack.OppSR + team_stat_pack.OppEff)
    turnover_term = 10 * team_stat_pack.ExpTODiff
    # The grouping mirrors the original expression so floating-point results
    # stay bit-for-bit identical.
    return explosive_term + (efficiency_term + field_position_term + drive_term + turnover_term)

def calculate_box_score(game_id):
    """Build the two-row (away first, then home) five-factors box score for a game.

    Reads the module-level ``games``, ``drives`` and ``pbp_data`` frames.

    Args:
        game_id: value matched against ``games.id`` and ``drives.game_id``.

    Returns:
        DataFrame with one row per team and the columns ``Team``, ``Pts``,
        ``PtsDiff``, ``CfbDataWinProb``, the play, drive and turnover stats
        (``OffSR``, ``OffER``, ``FP``, ``YPP``, ``PPD``, ``OppEff``,
        ``OppRate``, ``OppSR``, ``ExpTO``, ``ExpTODiff``) plus ``5FR`` and
        ``5FRDiff``.

    Raises:
        IndexError: when ``game_id`` is not present in ``games`` (there is no
            first entry to read the teams and scores from).
    """
    game_data = games[games.id == game_id]

    home_team = stringify_entry(game_data.home_team)
    away_team = stringify_entry(game_data.away_team)
    home_score = stringify_entry(game_data.home_points)
    away_score = stringify_entry(game_data.away_points)

    game_drives = drives[drives.game_id == game_id]
    game_pbp = pbp_data[pbp_data.drive_id.isin(game_drives.drive_id.tolist())]

    def _team_stats(team, score):
        # Play-level and drive-level stats for one team, joined on the team name.
        play_stats = generate_team_play_stats(game_pbp, team)
        drive_stats = generate_team_drive_stats(game_drives, game_pbp, game_data, score, team)
        return pd.merge(play_stats, drive_stats, on="team", how='right')

    home_team_stats = _team_stats(home_team, home_score)
    away_team_stats = _team_stats(away_team, away_score)

    home_team_tos = generate_team_turnover_stats(game_pbp, home_team)
    away_team_tos = generate_team_turnover_stats(game_pbp, away_team)

    # Both turnover frames are single-row with the same index, so the
    # subtraction lines the two teams up row against row.
    home_team_tos['ExpTODiff'] = home_team_tos['ExpTO'] - away_team_tos['ExpTO']
    away_team_tos['ExpTODiff'] = away_team_tos['ExpTO'] - home_team_tos['ExpTO']

    home_team_stats = pd.merge(home_team_stats, home_team_tos, on="team", how='right')
    home_team_stats['5FR'] = calculate_five_factors_rating(home_team_stats)
    away_team_stats = pd.merge(away_team_stats, away_team_tos, on="team", how='right')
    away_team_stats['5FR'] = calculate_five_factors_rating(away_team_stats)

    home_team_stats['5FRDiff'] = home_team_stats['5FR'] - away_team_stats['5FR']
    away_team_stats['5FRDiff'] = away_team_stats['5FR'] - home_team_stats['5FR']

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    comb_stat_pack = pd.concat([away_team_stats, home_team_stats])

    box = pd.DataFrame({
        "team" : [away_team, home_team],
        "Pts" : [away_score, home_score],
        "PtsDiff" : [away_score - home_score, home_score - away_score],
        "CfbDataWinProb" : [stringify_entry(game_data.away_post_win_prob),stringify_entry(game_data.home_post_win_prob)]
    })

    box = pd.merge(box, comb_stat_pack, on="team", how="right")
    box.rename(columns={"team": "Team"}, inplace=True)

    return box

In [None]:
# Compute a box score for every game and keep those where both teams appear in
# the `teams` list. Boxes are collected in a plain list and concatenated once
# at the end: growing a DataFrame inside the loop re-copies every earlier row
# on each iteration, and DataFrame.append no longer exists in pandas 2.0.
game_ids = games.id.unique()
team_list = teams.school.tolist()
game_boxes = []
for i, gameId in enumerate(game_ids):
    print(f"[{i+1}/{len(game_ids)}] Getting game information for ESPN game_id: {gameId}")
    box_score = calculate_box_score(gameId)
    game_tms = box_score.Team.tolist()
    if (game_tms[0] in team_list) and (game_tms[1] in team_list):
        print(f"[{i+1}/{len(game_ids)}] Started processing game information for ESPN game_id: {gameId}")
        game_boxes.append(box_score)
        print(f"[{i+1}/{len(game_ids)}] Completed processing game information for ESPN game_id: {gameId}")
    else:
        print(f"[{i+1}/{len(game_ids)}] Skipping processing for game_id {gameId} bc one of the teams isn't FBS")

# pd.concat raises on an empty list, so fall back to an empty frame.
stored_game_boxes = pd.concat(game_boxes) if game_boxes else pd.DataFrame()

[1/2511] Getting game information for ESPN game_id: 401013357
[1/2511] Skipping processing for game_id 401013357 bc one of the teams isn't FBS
[2/2511] Getting game information for ESPN game_id: 401014972
[2/2511] Skipping processing for game_id 401014972 bc one of the teams isn't FBS
[3/2511] Getting game information for ESPN game_id: 401022510
[3/2511] Started processing game information for ESPN game_id: 401022510
[3/2511] Completed processing game information for ESPN game_id: 401022510
[4/2511] Getting game information for ESPN game_id: 401013437
[4/2511] Started processing game information for ESPN game_id: 401013437
[4/2511] Completed processing game information for ESPN game_id: 401013437
[5/2511] Getting game information for ESPN game_id: 401020671
[5/2511] Skipping processing for game_id 401020671 bc one of the teams isn't FBS
[6/2511] Getting game information for ESPN game_id: 401019470
[6/2511] Started processing game information for ESPN game_id: 401019470
[6/2511] Complet

[48/2511] Started processing game information for ESPN game_id: 401012862
[48/2511] Completed processing game information for ESPN game_id: 401012862
[49/2511] Getting game information for ESPN game_id: 401012247
[49/2511] Skipping processing for game_id 401012247 bc one of the teams isn't FBS
[50/2511] Getting game information for ESPN game_id: 401012255
[50/2511] Skipping processing for game_id 401012255 bc one of the teams isn't FBS
[51/2511] Getting game information for ESPN game_id: 401012684
[51/2511] Started processing game information for ESPN game_id: 401012684
[51/2511] Completed processing game information for ESPN game_id: 401012684
[52/2511] Getting game information for ESPN game_id: 401012678
[52/2511] Started processing game information for ESPN game_id: 401012678
[52/2511] Completed processing game information for ESPN game_id: 401012678
[53/2511] Getting game information for ESPN game_id: 401014973
[53/2511] Skipping processing for game_id 401014973 bc one of the teams

[96/2511] Started processing game information for ESPN game_id: 401013367
[96/2511] Completed processing game information for ESPN game_id: 401013367
[97/2511] Getting game information for ESPN game_id: 401012271
[97/2511] Started processing game information for ESPN game_id: 401012271
[97/2511] Completed processing game information for ESPN game_id: 401012271
[98/2511] Getting game information for ESPN game_id: 401012266
[98/2511] Started processing game information for ESPN game_id: 401012266
[98/2511] Completed processing game information for ESPN game_id: 401012266
[99/2511] Getting game information for ESPN game_id: 401013100
[99/2511] Started processing game information for ESPN game_id: 401013100
[99/2511] Completed processing game information for ESPN game_id: 401013100
[100/2511] Getting game information for ESPN game_id: 401013104
[100/2511] Started processing game information for ESPN game_id: 401013104
[100/2511] Completed processing game information for ESPN game_id: 40101

[143/2511] Skipping processing for game_id 401014987 bc one of the teams isn't FBS
[144/2511] Getting game information for ESPN game_id: 401012852
[144/2511] Started processing game information for ESPN game_id: 401012852
[144/2511] Completed processing game information for ESPN game_id: 401012852
[145/2511] Getting game information for ESPN game_id: 401013329
[145/2511] Started processing game information for ESPN game_id: 401013329
[145/2511] Completed processing game information for ESPN game_id: 401013329
[146/2511] Getting game information for ESPN game_id: 401012262
[146/2511] Skipping processing for game_id 401012262 bc one of the teams isn't FBS
[147/2511] Getting game information for ESPN game_id: 401014992
[147/2511] Skipping processing for game_id 401014992 bc one of the teams isn't FBS
[148/2511] Getting game information for ESPN game_id: 401012263
[148/2511] Started processing game information for ESPN game_id: 401012263
[148/2511] Completed processing game information for

[187/2511] Started processing game information for ESPN game_id: 401014996
[187/2511] Completed processing game information for ESPN game_id: 401014996
[188/2511] Getting game information for ESPN game_id: 401012747
[188/2511] Skipping processing for game_id 401012747 bc one of the teams isn't FBS
[189/2511] Getting game information for ESPN game_id: 401012283
[189/2511] Started processing game information for ESPN game_id: 401012283
[189/2511] Completed processing game information for ESPN game_id: 401012283
[190/2511] Getting game information for ESPN game_id: 401012890
[190/2511] Started processing game information for ESPN game_id: 401012890
[190/2511] Completed processing game information for ESPN game_id: 401012890
[191/2511] Getting game information for ESPN game_id: 401020679
[191/2511] Started processing game information for ESPN game_id: 401020679
[191/2511] Completed processing game information for ESPN game_id: 401020679
[192/2511] Getting game information for ESPN game_id:

[231/2511] Started processing game information for ESPN game_id: 401012740
[231/2511] Completed processing game information for ESPN game_id: 401012740
[232/2511] Getting game information for ESPN game_id: 401012737
[232/2511] Skipping processing for game_id 401012737 bc one of the teams isn't FBS
[233/2511] Getting game information for ESPN game_id: 401019489
[233/2511] Started processing game information for ESPN game_id: 401019489
[233/2511] Completed processing game information for ESPN game_id: 401019489
[234/2511] Getting game information for ESPN game_id: 401019490
[234/2511] Started processing game information for ESPN game_id: 401019490
[234/2511] Completed processing game information for ESPN game_id: 401019490
[235/2511] Getting game information for ESPN game_id: 401012807
[235/2511] Started processing game information for ESPN game_id: 401012807
[235/2511] Completed processing game information for ESPN game_id: 401012807
[236/2511] Getting game information for ESPN game_id:

In [None]:
# Display the box scores accumulated by the loop above (one row per team per kept game).
stored_game_boxes

In [None]:
%matplotlib inline
import numpy as np
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import matplotlib.pyplot as plt
# Scatter each box-score row's five-factors rating gap against its point margin.
f, ax = plt.subplots(figsize=(15,8))
rating_gap = stored_game_boxes['5FRDiff']
point_margin = stored_game_boxes.PtsDiff
ax.scatter(rating_gap, point_margin)
ax.set(xlabel="Five Factors Rating Difference", ylabel="Point Differential");

In [None]:
# Eliminate outliers
stored_game_boxes['5fr_z_score'] = np.abs(stats.zscore(stored_game_boxes['5FRDiff']))
stored_game_boxes['pts_z_score'] = np.abs(stats.zscore(stored_game_boxes['PtsDiff']))
basis = stored_game_boxes[(stored_game_boxes['5fr_z_score'] < 3) & (stored_game_boxes['pts_z_score'] < 3)]
msk = np.random.rand(len(basis)) < 0.80
train_data = basis[msk]
test_data = basis[~msk]

In [None]:
# Preview the first rows of the training split.
train_data.head()

In [None]:
# Linear Regression Model
# Fit point differential as a linear function of the five-factors rating gap.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# Multi-dimensional indexing on a Series (`series[:, np.newaxis]`) was
# deprecated in pandas 1.0 and later removed, so the column vector is built
# from the underlying NumPy array instead.
model.fit(train_data['5FRDiff'].values[:, np.newaxis], train_data.PtsDiff)

f, ax = plt.subplots(figsize=(15,8))
ax.scatter(basis['5FRDiff'], basis.PtsDiff)
ax.set_xlabel("Five Factors Rating Difference")
ax.set_ylabel("Point Differential")

# Overlay the fitted line, evaluated on the held-out rows.
xfit = test_data['5FRDiff']
yfit = model.predict(xfit.values[:, np.newaxis])
ax.plot(xfit, yfit, color='red', label='Linear Regression')
# Without a legend the label set on the line above is never displayed.
ax.legend();

In [None]:
# Report the fitted line: slope (coefficient on 5FRDiff) and intercept, to 5 decimals.
print(f'Linear Regression: y = {model.coef_[0]:.5f}x + {model.intercept_:.5f}')

In [None]:
sample_box = calculate_box_score(401013183) # 2018 UVA at VT for sample
# The projected margin is in points, so it is standardized against the
# observed point-differential distribution. The mean/std of 5FRDiff used
# previously are in rating units and cannot be compared with a point margin.
mu = basis['PtsDiff'].mean()
std = basis['PtsDiff'].std()

# Row of the team with the larger point differential, i.e. the actual winner.
max_box_row = sample_box[sample_box['PtsDiff'] == max(sample_box['PtsDiff'])]
winner = stringify_entry(max_box_row.Team)
winner_margin = stringify_entry(max_box_row.PtsDiff)
winner_5fr_diff = stringify_entry(max_box_row['5FRDiff'])
print(f"Actual Winner: {winner}")
print(f"MOV: {winner} by {winner_margin}")
print(f"5FRDiff for {winner}: {winner_5fr_diff}")
print("")
# Projected margin from the fitted line, then its z-score mapped through the
# normal CDF to give a win probability.
proj_point_diff = model.coef_[0] * winner_5fr_diff + model.intercept_
print(f"Proj MOV by 5FRDiff: {winner} by {proj_point_diff}")
z = (proj_point_diff - mu) / std
print(f"Win Prob for {winner}: {100 * stats.norm.cdf(z)}%")