Importing libraries

In [27]:
import pandas as pd
import numpy as np

import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait

import glob

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor

import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

Importing existing data to be augmented with new features

In [28]:
directory = '/users/blaizelahman/Desktop/CFB Model/Original Data'
pattern = os.path.join(directory, '*model*.csv')

teamFiles = glob.glob(pattern)

teamDict = {}

for file in teamFiles:

    teamDF = pd.read_csv(file)

    if teamDF.shape[0] >= 56:

        # throwing out first 20 rows due to missing rolling_sum data
        teamDF = teamDF.drop(teamDF.index[:20])
        teamDF = teamDF.reset_index(drop = True)
    
        key = teamDF['School'][0]
        teamDict[key] = teamDF
    
        print(f'Added: {key}')


    

Added: Louisiana Tech
Added: Southern Mississippi
Added: Arizona State
Added: Auburn
Added: Texas Tech
Added: Minnesota
Added: NC State
Added: Georgia
Added: USC
Added: South Florida
Added: Wisconsin
Added: Miami
Added: Mississippi State
Added: Houston
Added: San José State
Added: Oklahoma State
Added: UCLA
Added: Rice
Added: Texas State
Added: Iowa
Added: Western Michigan
Added: Charlotte
Added: Florida International
Added: Ole Miss
Added: UTEP
Added: Boston College
Added: Troy
Added: Wake Forest
Added: Baylor
Added: Bowling Green
Added: TCU
Added: Virginia
Added: Utah State
Added: Appalachian State
Added: Michigan
Added: New Mexico State
Added: SMU
Added: South Carolina
Added: Georgia Southern
Added: Vanderbilt
Added: California
Added: Ohio State
Added: Louisiana Monroe
Added: Louisiana
Added: Eastern Michigan
Added: UMass
Added: Alabama
Added: Miami (OH)
Added: Iowa State
Added: Arizona
Added: UAB
Added: Akron
Added: Cincinnati
Added: Virginia Tech
Added: Tulane
Added: Texas
Added: 

Grabbing team talent composite rankings from collegefootballdata.com
Stat is made by 24/7 Sports and is based off of recruit/transfer player rankings

In [3]:
# setting the download directory and Chrome settings
directory = "/Users/blaizelahman/Desktop/CFBData"
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory": directory}
chromeOptions.add_experimental_option("prefs", prefs)

# creating Chrome driver
driver = webdriver.Chrome(service = Service(ChromeDriverManager().install()), options = chromeOptions)

# making a dictionary to hold team talent rankings
talentRatings = {}

# downloading team talent composite ratings for years 2015-2023 from collegefootballdata.com
for year in range(2015, 2024):

    # skipping 2020 because it has bad data
    if year == 2020: 
        continue
        
    try:
        url = f'https://collegefootballdata.com/exporter/talent?year={year}'
        driver.get(url)
        time.sleep(4) 
            
        # clicking the query button
        query = driver.find_element(By.XPATH, "//button[contains(span/text(), 'Query')]")
        query.click()
        time.sleep(3) 
            
        # clicking the export button
        export = driver.find_element(By.XPATH, "//button[contains(span/text(), 'Export')]")
        export.click()
        time.sleep(3)
        
        key = str(year)
            
        # grabs files from CFBData folder
        files = os.listdir(directory)
        
        # grab the file paths for all files ending in .csv
        filePaths = [os.path.join(directory, name) for name in files if name.endswith('.csv')]

        # grabbing the most recently made file out of those in paths
        file = max(filePaths, key=os.path.getctime)
            
        # loading csv file
        talentRatings[key] = pd.read_csv(file)

        # deleting the file after it has been added
        os.remove(file)

    except Exception as e:
        print(f'Cannot grab data for {year}. Error: {e}')

    print(f'Successfully grabbed data for {year}')

driver.quit()

Successfully grabbed data for 2015
Successfully grabbed data for 2016
Successfully grabbed data for 2017
Successfully grabbed data for 2018
Successfully grabbed data for 2019
Successfully grabbed data for 2021
Successfully grabbed data for 2022
Successfully grabbed data for 2023


In [4]:
talentRatings['2023'].head(10)

Unnamed: 0,Year,School,Talent
0,2023,Alabama,1015.43
1,2023,Georgia,977.87
2,2023,Ohio State,974.79
3,2023,Texas A&M,925.92
4,2023,Clemson,917.88
5,2023,Texas,913.24
6,2023,LSU,899.31
7,2023,USC,896.41
8,2023,Oklahoma,884.97
9,2023,Oregon,874.74


Will need to keep in mind that military teams do not have talent ratings

Now we need to augment original team's dataframe with the ratings

In [5]:
for team, teamDF in teamDict.items():

    for year in teamDF['Year'].unique():
        
        # checks if year is in talentRating
        if str(year) in talentRatings.keys():

            # grabs the dataframe from that year and the talent rating of that team in that year
            talentDF = talentRatings[str(year)]
            talent = talentDF.loc[talentDF['School'] == team, 'Talent']

            # assigns talent column with the corresponding talent ratings for games in the given year
            teamDF.loc[teamDF['Year'] == year, 'talent'] = talent.values[0]

    teamDict[team] = teamDF

        

In [6]:
teamDict['Alabama']['talent'].unique()

array([    nan,  981.9 ,  982.66,  997.57,  978.54,  984.96, 1004.04,
       1016.79, 1015.43])

Now let's merge the opponent's talent ratings 

In [7]:
def mergeTalent(team):

    # making a copy of the teams dataframe from teamDict
    teamDF = teamDict[team].copy() 

    teamDF['talent_opp'] = np.nan

    # going through the dataframe and merging the opponent's talent rating column row by row
    for index, row in teamDF.iterrows():

        # grabbing year and each opponent's name to access their talent rating in talentRatings
        year = row['Year']
        oppName = row['School_opp']

        if str(year) in talentRatings.keys():

            yearDF = talentRatings[str(year)]

            
            # getting the opponent's talent rating column from the game they played the given team
            oppRow = yearDF[yearDF['School'] == oppName]

            if not oppRow.empty:

                oppTalent = oppRow.iloc[0]['Talent']
    
                # merging the opponent's talent column on the row the team plays them
                teamDF.loc[index, 'talent_opp'] = oppTalent 
        
    return teamDF

In [8]:
for team in teamDict.keys():
    
    updatedDF = mergeTalent(team)
    teamDict[team] = updatedDF
    print(f'Merged: {team}')

Merged: Boston College
Merged: Rutgers
Merged: Auburn
Merged: North Texas
Merged: Nebraska
Merged: Oklahoma State
Merged: Arizona State
Merged: Eastern Michigan
Merged: Louisiana
Merged: Colorado State
Merged: Idaho
Merged: Illinois
Merged: Air Force
Merged: Kent State
Merged: Louisiana Monroe
Merged: Iowa
Merged: Akron
Merged: Ohio
Merged: Georgia
Merged: South Alabama
Merged: Georgia Tech
Merged: Western Kentucky
Merged: Maryland
Merged: Arizona
Merged: Minnesota
Merged: Pittsburgh
Merged: Marshall
Merged: Louisiana Tech
Merged: Virginia Tech
Merged: California
Merged: Georgia Southern
Merged: Rice
Merged: Missouri
Merged: UCF
Merged: Kansas State
Merged: Appalachian State
Merged: Michigan
Merged: Clemson
Merged: Oregon
Merged: Florida Atlantic
Merged: Central Michigan
Merged: Louisville
Merged: Navy
Merged: Washington State
Merged: Tennessee
Merged: Arkansas State
Merged: Kansas
Merged: Miami
Merged: Alabama
Merged: Vanderbilt
Merged: USC
Merged: Indiana
Merged: UMass
Merged: Utah S

In [9]:
teamDict['Alabama'][-10:-1]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Game Id,School,Conference,HomeAway,Points,Week,Year,completionAttempts,defensiveTDs,firstDowns,fourthDownEff,fumblesLost,fumblesRecovered,interceptionTDs,interceptionYards,interceptions,kickReturnTDs,kickReturnYards,kickReturns,kickingPoints,netPassingYards,passesDeflected,passesIntercepted,passingTDs,possessionTime,puntReturnTDs,puntReturnYards,puntReturns,qbHurries,rushingAttempts,rushingTDs,rushingYards,sacks,tackles,tacklesForLoss,thirdDownEff,totalFumbles,totalPenaltiesYards,totalYards,turnovers,yardsPerPass,yardsPerRushAttempt,totalTDs,School_opp,Conference_opp,HomeAway_opp,Points_opp,Week_opp,Year_opp,completionAttempts_opp,defensiveTDs_opp,firstDowns_opp,fourthDownEff_opp,fumblesLost_opp,fumblesRecovered_opp,interceptionTDs_opp,interceptionYards_opp,interceptions_opp,kickReturnTDs_opp,kickReturnYards_opp,kickReturns_opp,kickingPoints_opp,netPassingYards_opp,passesDeflected_opp,passesIntercepted_opp,passingTDs_opp,possessionTime_opp,puntReturnTDs_opp,puntReturnYards_opp,puntReturns_opp,qbHurries_opp,rushingAttempts_opp,rushingTDs_opp,rushingYards_opp,sacks_opp,tackles_opp,tacklesForLoss_opp,thirdDownEff_opp,totalFumbles_opp,totalPenaltiesYards_opp,totalYards_opp,turnovers_opp,yardsPerPass_opp,yardsPerRushAttempt_opp,totalTDs_opp,scoreDiff,pointTotal,Win,rolling_sum_Points20,rolling_sum_firstDowns20,rolling_sum_fumblesLost20,rolling_sum_fumblesRecovered20,rolling_sum_interceptions20,rolling_sum_kickReturnYards20,rolling_sum_kickingPoints20,rolling_sum_netPassingYards20,rolling_sum_passesDeflected20,rolling_sum_passesIntercepted20,rolling_sum_passingTDs20,rolling_sum_puntReturns20,rolling_sum_qbHurries20,rolling_sum_rushingAttempts20,rolling_sum_rushingTDs20,rolling_sum_rushingYards20,rolling_sum_sacks20,rolling_sum_tacklesForLoss20,rolling_sum_totalFumbles20,rolling_sum_totalPenaltiesYards20,rolling_sum_totalYards20,rolling_sum_turnovers20,rolling_sum_yardsPerPass20,rolling_sum_yardsPerRushAttempt20,rolling_sum_totalTDs20,rolling_sum_Points8,rolling_sum_firstDowns8,rolling_sum_fumblesLost8,rolling_sum_fumblesRecovered8,rolling_sum_interceptions8,rolling_sum_kickReturnYards8,rolling_sum_kickingPoints8,rolling_sum_netPassingYards8,rolling_sum_passesDeflected8,rolling_sum_passesIntercepted8,rolling_sum_passingTDs8,rolling_sum_puntReturns8,rolling_sum_qbHurries8,rolling_sum_rushingAttempts8,rolling_sum_rushingTDs8,rolling_sum_rushingYards8,rolling_sum_sacks8,rolling_sum_tacklesForLoss8,rolling_sum_totalFumbles8,rolling_sum_totalPenaltiesYards8,rolling_sum_totalYards8,rolling_sum_turnovers8,rolling_sum_yardsPerPass8,rolling_sum_yardsPerRushAttempt8,rolling_sum_totalTDs8,rolling_sum_Points20_opp,rolling_sum_firstDowns20_opp,rolling_sum_fumblesLost20_opp,rolling_sum_fumblesRecovered20_opp,rolling_sum_interceptions20_opp,rolling_sum_kickReturnYards20_opp,rolling_sum_kickingPoints20_opp,rolling_sum_netPassingYards20_opp,rolling_sum_passesDeflected20_opp,rolling_sum_passesIntercepted20_opp,rolling_sum_passingTDs20_opp,rolling_sum_puntReturns20_opp,rolling_sum_qbHurries20_opp,rolling_sum_rushingAttempts20_opp,rolling_sum_rushingTDs20_opp,rolling_sum_rushingYards20_opp,rolling_sum_sacks20_opp,rolling_sum_tacklesForLoss20_opp,rolling_sum_totalFumbles20_opp,rolling_sum_totalPenaltiesYards20_opp,rolling_sum_totalYards20_opp,rolling_sum_turnovers20_opp,rolling_sum_yardsPerPass20_opp,rolling_sum_yardsPerRushAttempt20_opp,rolling_sum_totalTDs20_opp,rolling_sum_Points8_opp,rolling_sum_firstDowns8_opp,rolling_sum_fumblesLost8_opp,rolling_sum_fumblesRecovered8_opp,rolling_sum_interceptions8_opp,rolling_sum_kickReturnYards8_opp,rolling_sum_kickingPoints8_opp,rolling_sum_netPassingYards8_opp,rolling_sum_passesDeflected8_opp,rolling_sum_passesIntercepted8_opp,rolling_sum_passingTDs8_opp,rolling_sum_puntReturns8_opp,rolling_sum_qbHurries8_opp,rolling_sum_rushingAttempts8_opp,rolling_sum_rushingTDs8_opp,rolling_sum_rushingYards8_opp,rolling_sum_sacks8_opp,rolling_sum_tacklesForLoss8_opp,rolling_sum_totalFumbles8_opp,rolling_sum_totalPenaltiesYards8_opp,rolling_sum_totalYards8_opp,rolling_sum_turnovers8_opp,rolling_sum_yardsPerPass8_opp,rolling_sum_yardsPerRushAttempt8_opp,rolling_sum_totalTDs8_opp,talent,talent_opp
183,203,893,401520250,Alabama,SEC,home,24,4,2023,17.21,0.0,20.0,0.0,0.0,0.0,0.0,37.0,1.0,,,,10.0,225.0,4.0,1.0,1.0,34.23,0.0,4.0,1.0,8.0,45.0,1.0,131.0,5.0,40.0,10.0,6.13,1.0,6.6,356.0,1.0,10.7,2.9,2.0,Ole Miss,SEC,away,10,4,2023,21.36,0.0,17.0,3.4,0.0,0.0,0.0,0.0,1.0,0.0,26.0,3.0,4.0,245.0,0.0,1.0,0.0,25.37,,,,1.0,29.0,1.0,56.0,4.0,38.0,9.0,3.14,2.0,8.69,301.0,1.0,6.8,1.9,1.0,14,34,True,772.0,467.0,12.0,14.0,11.0,454.0,186.0,5866.0,82.0,11.0,54.0,48.0,81.0,708.0,37.0,3435.0,68.0,152.0,29.0,159.462,9301.0,23.0,8.48,4.705,95.0,271.0,173.0,4.0,8.0,5.0,218.0,77.0,1993.0,39.0,4.0,16.0,12.0,31.0,289.0,16.0,1225.0,25.0,55.0,9.0,52.412,3218.0,9.0,7.9375,4.075,32.0,696.0,484.0,9.0,23.0,13.0,683.0,168.0,5187.0,81.0,16.0,34.0,24.0,76.0,861.0,49.0,4417.0,62.0,131.0,30.0,147.017,9604.0,22.0,8.67,4.935,85.0,272.0,185.0,5.0,9.0,3.0,195.0,72.0,2143.0,27.0,4.0,16.0,7.0,27.0,338.0,15.0,1709.0,24.0,56.0,10.0,57.726,3852.0,8.0,8.6875,4.7,32.0,1015.43,756.04
184,204,897,401520285,Alabama,SEC,away,40,5,2023,10.13,1.0,17.0,0.0,0.0,0.0,1.0,41.0,0.0,0.0,73.0,3.0,16.0,164.0,2.0,3.0,0.0,30.1,0.0,0.4,1.0,5.0,43.0,3.0,193.0,4.0,20.0,8.0,5.12,2.0,6.4,357.0,0.0,12.6,4.5,4.0,Mississippi State,SEC,home,17,5,2023,15.27,0.0,15.0,0.3,0.0,0.0,,,3.0,0.0,110.0,5.0,5.0,107.0,0.0,,1.0,29.5,,,,1.0,35.0,1.0,154.0,4.0,21.0,7.0,5.13,,3.15,261.0,3.0,4.0,4.4,2.0,23,57,True,776.0,471.0,11.0,13.0,12.0,432.0,194.0,5789.0,80.0,11.0,53.0,46.0,81.0,727.0,37.0,3560.0,68.0,154.0,27.0,156.422,9349.0,23.0,8.605,4.84,94.0,265.0,172.0,4.0,7.0,6.0,218.0,75.0,1957.0,28.0,5.0,15.0,13.0,32.0,307.0,16.0,1327.0,26.0,59.0,7.0,55.812,3284.0,10.0,8.3875,4.3,31.0,651.0,443.0,14.0,17.0,11.0,1289.0,139.0,6095.0,59.0,19.0,55.0,32.0,47.0,486.0,23.0,1943.0,45.0,128.0,24.0,121.757,8038.0,25.0,6.8,3.89,84.0,239.0,149.0,4.0,5.0,6.0,445.0,59.0,1932.0,21.0,9.0,17.0,12.0,19.0,225.0,10.0,943.0,17.0,57.0,7.0,38.12,2875.0,10.0,6.6375,4.0125,30.0,1015.43,746.29
185,205,901,401520310,Alabama,SEC,away,26,6,2023,21.33,0.0,16.0,0.1,1.0,0.0,0.0,0.0,1.0,0.0,80.0,3.0,6.0,321.0,2.0,1.0,3.0,29.09,0.0,0.1,1.0,7.0,26.0,0.0,23.0,5.0,32.0,6.0,4.12,4.0,14.99,344.0,2.0,9.7,0.9,3.0,Texas A&M,SEC,home,20,6,2023,14.25,0.0,16.0,0.1,0.0,1.0,0.0,15.0,1.0,0.0,26.0,3.0,8.0,239.0,2.0,1.0,1.0,30.51,0.0,46.0,2.0,3.0,35.0,1.0,67.0,6.0,34.0,8.0,4.12,2.0,4.2,306.0,1.0,9.6,1.9,2.0,6,46,True,757.0,458.0,10.0,12.0,12.0,490.0,199.0,5613.0,77.0,14.0,48.0,42.0,82.0,729.0,37.0,3506.0,65.0,150.0,28.0,158.452,9119.0,22.0,8.775,4.765,90.0,274.0,164.0,4.0,7.0,5.0,291.0,78.0,1793.0,28.0,8.0,14.0,14.0,36.0,320.0,17.0,1383.0,24.0,56.0,9.0,52.292,3176.0,9.0,9.1625,4.2875,32.0,562.0,404.0,16.0,22.0,11.0,781.0,152.0,4697.0,76.0,7.0,38.0,47.0,52.0,637.0,24.0,2909.0,55.0,140.0,31.0,131.35,7606.0,27.0,7.435,4.455,65.0,271.0,171.0,8.0,6.0,4.0,268.0,79.0,2000.0,29.0,3.0,18.0,23.0,21.0,287.0,11.0,1356.0,29.0,65.0,14.0,52.8,3356.0,12.0,8.4875,4.6375,31.0,1015.43,925.92
186,206,905,401520316,Alabama,SEC,home,24,7,2023,10.21,0.0,18.0,0.0,0.0,0.0,,,0.0,,,,6.0,238.0,1.0,,2.0,29.49,0.0,11.0,1.0,4.0,42.0,1.0,177.0,4.0,35.0,7.0,6.14,1.0,5.45,415.0,0.0,11.3,4.2,3.0,Arkansas,SEC,away,21,7,2023,14.24,0.0,13.0,0.0,0.0,0.0,,,0.0,0.0,36.0,2.0,7.0,150.0,3.0,,2.0,30.11,0.0,5.0,2.0,1.0,36.0,0.0,100.0,5.0,36.0,5.0,4.14,,2.13,250.0,0.0,6.3,2.8,2.0,3,45,True,741.0,445.0,10.0,11.0,13.0,570.0,195.0,5375.0,77.0,15.0,46.0,41.0,84.0,719.0,37.0,3417.0,67.0,146.0,30.0,167.992,8792.0,23.0,8.56,4.655,88.0,270.0,162.0,4.0,5.0,6.0,340.0,72.0,1905.0,22.0,9.0,14.0,15.0,40.0,310.0,17.0,1298.0,26.0,55.0,12.0,61.832,3203.0,10.0,9.5875,4.025,32.0,611.0,422.0,12.0,16.0,12.0,374.2,145.0,4494.0,70.0,15.0,40.0,29.0,60.0,848.0,31.0,3623.0,60.0,124.0,31.0,138.917,8117.0,24.0,8.46,4.185,76.0,236.0,148.0,2.0,4.0,7.0,249.0,64.0,1690.0,28.0,7.0,17.0,12.0,16.0,299.0,7.0,882.0,23.0,55.0,13.0,52.955,2572.0,9.0,7.8375,2.875,28.0,1015.43,730.93
187,207,909,401520333,Alabama,SEC,home,34,8,2023,14.21,0.0,20.0,0.0,1.0,1.0,,,1.0,0.0,21.0,1.0,10.0,220.0,4.0,,2.0,32.19,0.0,0.7,1.0,4.0,42.0,,138.0,3.0,45.0,8.0,5.13,1.0,1.5,358.0,2.0,10.5,3.3,2.0,Tennessee,SEC,away,20,8,2023,28.41,0.0,22.0,0.3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0,271.0,1.0,1.0,2.0,27.41,0.0,5.0,1.0,3.0,38.0,0.0,133.0,4.0,35.0,5.0,8.18,1.0,8.55,404.0,1.0,6.6,3.5,2.0,14,54,True,741.0,445.0,10.0,11.0,12.0,557.0,193.0,5296.0,76.0,14.0,46.0,41.0,86.0,724.0,38.0,3523.0,65.0,138.0,31.0,162.313,8819.0,22.0,8.82,4.77,89.0,260.0,153.0,2.0,4.0,5.0,332.0,68.0,1879.0,19.0,7.0,14.0,14.0,42.0,307.0,16.0,1212.0,30.0,58.0,11.0,62.982,3091.0,7.0,9.9,3.825,31.0,834.0,485.0,13.0,16.0,7.0,448.0,184.0,5690.0,70.0,17.0,49.0,33.0,112.0,801.0,55.0,4275.0,57.0,149.0,26.0,165.059,9965.0,20.0,8.955,5.45,108.0,277.0,166.0,4.0,4.0,4.0,149.0,67.0,1697.0,29.0,7.0,13.0,17.0,32.0,305.0,19.0,1883.0,31.0,70.0,10.0,65.015,3580.0,8.0,6.6875,6.375,35.0,1015.43,821.07
188,208,913,401520362,Alabama,SEC,home,42,10,2023,15.23,0.0,28.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,88.0,4.0,6.0,219.0,6.0,1.0,0.0,33.33,,,,2.0,46.0,6.0,288.0,1.0,29.0,1.0,11.14,,6.5,507.0,0.0,9.5,6.3,6.0,LSU,SEC,away,28,10,2023,20.34,0.0,21.0,0.2,0.0,0.0,,,1.0,0.0,13.0,1.0,4.0,272.0,3.0,,2.0,26.27,,,,2.0,24.0,2.0,206.0,2.0,39.0,3.0,4.9,,8.6,478.0,1.0,8.0,8.6,4.0,14,70,True,734.0,440.0,11.0,12.0,13.0,578.0,192.0,5095.0,76.0,12.0,45.0,42.0,85.0,740.0,37.0,3546.0,65.0,140.0,31.0,155.963,8641.0,24.0,8.865,4.715,86.0,245.0,147.0,3.0,3.0,5.0,237.0,71.0,1756.0,22.0,7.0,13.0,13.0,38.0,315.0,12.0,1177.0,29.0,58.0,11.0,53.38,2933.0,8.0,9.7875,3.6,26.0,739.0,506.0,12.0,12.0,8.0,539.0,151.0,5864.0,67.0,14.0,42.0,15.0,62.0,739.0,53.0,3888.0,38.0,99.0,18.0,133.52,9752.0,20.0,8.895,5.245,96.0,383.0,213.0,1.0,2.0,3.0,222.0,79.0,2646.0,32.0,8.0,27.0,2.0,23.0,292.0,22.0,1796.0,16.0,42.0,4.0,56.19,4442.0,4.0,10.775,6.275,50.0,1015.43,899.31
189,209,917,401520386,Alabama,SEC,away,49,11,2023,16.25,0.0,23.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,103.0,4.0,7.0,285.0,4.0,1.0,3.0,32.06,0.0,22.0,2.0,7.0,39.0,4.0,159.0,3.0,25.0,9.0,7.11,1.0,6.64,444.0,2.0,11.4,4.1,7.0,Kentucky,SEC,home,21,11,2023,17.31,0.0,11.0,0.1,1.0,1.0,0.0,0.0,1.0,0.0,30.0,2.0,3.0,158.0,2.0,1.0,1.0,27.54,,,,3.0,24.0,2.0,95.0,0.0,34.0,6.0,3.11,1.0,4.19,253.0,2.0,5.1,4.0,3.0,28,70,True,721.0,438.0,11.0,12.0,12.0,648.0,185.0,5033.0,79.0,13.0,39.0,39.0,83.0,754.0,42.0,3556.0,66.0,136.0,31.0,156.063,8589.0,23.0,8.99,4.595,85.0,231.0,152.0,3.0,2.0,5.0,302.0,69.0,1749.0,27.0,7.0,10.0,7.0,34.0,321.0,13.0,1260.0,27.0,52.0,10.0,57.69,3009.0,8.0,9.8,3.75,24.0,493.0,357.0,10.0,13.0,17.0,760.0,115.0,4156.0,51.0,17.0,36.0,22.0,49.0,631.0,22.0,2686.0,36.0,106.0,19.0,141.698,6842.0,27.0,7.52,4.32,62.0,219.0,137.0,4.0,2.0,6.0,240.0,45.0,1528.0,21.0,8.0,14.0,2.0,24.0,221.0,12.0,1135.0,17.0,49.0,7.0,53.378,2663.0,10.0,6.6375,4.9125,29.0,1015.43,740.54
190,210,921,401520401,Alabama,SEC,home,66,12,2023,19.24,1.0,21.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,58.0,3.0,12.0,259.0,5.0,1.0,3.0,26.29,1.0,96.0,3.0,3.0,33.0,5.0,315.0,0.0,31.0,2.0,6.9,1.0,3.25,574.0,0.0,10.8,9.5,9.0,Chattanooga,Southern,away,10,12,2023,10.21,0.0,11.0,0.0,0.0,0.0,,,1.0,0.0,56.0,3.0,4.0,107.0,1.0,,0.0,33.31,,,,1.0,35.0,1.0,126.0,1.0,34.0,1.0,4.14,,5.25,233.0,1.0,5.1,3.6,1.0,56,76,True,750.0,445.0,12.0,13.0,13.0,715.0,184.0,5105.0,80.0,14.0,41.0,39.0,89.0,769.0,45.0,3554.0,66.0,138.0,32.0,147.603,8659.0,25.0,9.285,4.465,90.0,256.0,157.0,4.0,3.0,4.0,387.0,66.0,1779.0,26.0,8.0,11.0,8.0,38.0,325.0,17.0,1312.0,30.0,59.0,11.0,53.43,3091.0,8.0,10.05,3.875,29.0,577.0,434.0,6.0,18.0,19.0,679.0,152.0,5135.0,5.0,17.0,36.0,22.0,3.0,763.0,29.0,2904.0,4.0,6.0,16.0,90.46,8039.0,25.0,8.815,3.795,67.0,224.0,165.0,3.0,7.0,7.0,313.0,70.0,2002.0,1.0,7.0,12.0,6.0,1.0,300.0,11.0,1047.0,1.0,1.0,3.0,43.11,3049.0,10.0,8.6,3.5125,24.0,1015.43,201.79
191,211,925,401520427,Alabama,SEC,away,27,13,2023,16.24,0.0,21.0,2.2,0.0,1.0,0.0,22.0,0.0,0.0,17.0,1.0,9.0,259.0,2.0,2.0,2.0,32.43,0.0,0.3,2.0,1.0,44.0,1.0,192.0,4.0,43.0,6.0,6.16,0.0,8.72,451.0,0.0,10.8,4.4,3.0,Auburn,SEC,home,24,13,2023,6.17,0.0,18.0,0.0,1.0,0.0,,,2.0,0.0,67.0,4.0,6.0,93.0,3.0,,1.0,27.17,0.0,5.0,2.0,2.0,42.0,2.0,244.0,1.0,45.0,5.0,4.12,2.0,5.28,337.0,3.0,5.5,5.8,3.0,3,51,True,753.0,443.0,12.0,13.0,11.0,716.0,187.0,5128.0,82.0,14.0,41.0,33.0,88.0,763.0,47.0,3596.0,62.0,131.0,30.0,144.253,8724.0,23.0,9.235,4.59,90.0,305.0,163.0,3.0,2.0,4.0,423.0,73.0,1931.0,28.0,8.0,14.0,10.0,40.0,316.0,20.0,1424.0,25.0,51.0,11.0,51.33,3355.0,7.0,10.8125,4.4625,36.0,529.0,360.0,13.0,16.0,17.0,917.0,141.0,3146.0,68.0,16.0,24.0,32.0,52.0,806.0,36.0,4155.0,45.0,114.0,29.0,143.715,7301.0,30.0,6.345,5.005,63.0,199.0,142.0,1.0,3.0,8.0,376.0,53.0,1192.0,30.0,6.0,12.0,13.0,21.0,312.0,11.0,1590.0,17.0,49.0,6.0,44.74,2782.0,9.0,6.8,4.9,24.0,1015.43,770.94


We can see if it's a valuable test statistic, let's build on our earlier model to include talent as a feature

First let's recreate our predictMissing function, this time adding in a set random state for better reproducibility

In [30]:
# slightly modifying predictMissing to take fixCols as an argument so that it's easier to add columns
def predictMissing(modelTeam, fixCols, randomState):
                   
    for col in fixCols:

        # replacing zeroes with null values 
        modelTeam[col].replace(0, np.nan, inplace=True) 
        
        # creating a boolean series to show which indeces are non-nulls
        nonNulls = ~modelTeam[col].isnull()
    
        if nonNulls.any():

            # setting up X and y data for making a linear model based off the existing data
            X = modelTeam.loc[nonNulls, ['scoreDiff']].values.reshape(-1, 1)
            y = modelTeam.loc[nonNulls, col].values.reshape(-1, 1)
            
            # fit linear regression model on known data
            lr = LinearRegression()
            lr.fit(X, y)
            
            # grabbing rows where scoreDiff exists but the predictor is missing
            missingVals = modelTeam.loc[modelTeam[col].isnull(), ['scoreDiff']].values.reshape(-1, 1)
            
            if missingVals.shape[0] > 0:

                 # predicting values for the missing rows based on scoreDiff
                preds = lr.predict(missingVals).flatten()
                
                # grabbing the standard deviation of the existing data
                std = modelTeam[col].std(skipna = True)

                # setting the random state that the sampleVals will be predicted from
                random = np.random.default_rng(seed = randomState)

                # using the predicted values to generate a random distribution based off of the predictions
                # and spread of the existing data
                sampleVals = random.normal(loc = preds, scale = std, size = len(preds))
                
                # replacing the null values with generated sample values
                modelTeam.loc[modelTeam[col].isnull(), col] = sampleVals



In [31]:
# picking out the columns with missing data (now including talent)
rollingCols = [col for col in teamDict['Florida State'].columns if 'rolling_sum' in col]
fixCols = [col for col in rollingCols if any(word in col for word in ['tackle', 'sacks', 'Deflected', 'defensive', 'qbHurries', 'fumbles', 'kickReturn', 'penalty', 'Fumble'])]
fixCols.extend(['talent','talent_opp'])

Making a model building function that will also include bootstrap aggregation when predicting results. Also using grid search to find the best number of n estimators.

In [81]:
def buildModel(modelTeam, randomState):

    # throwing out first 20 rows due to missing rolling_sum data
    modelTeam = modelTeam.drop(modelTeam.index[:20])
    modelTeam = modelTeam.reset_index(drop = True)

    # calling function to predict the missing values
    predictMissing(modelTeam, fixCols, randomState)

    features = [col for col in modelTeam.columns if 'rolling_sum' in col or 'talent' in col]
    
    # setting up training and test sets
    X = modelTeam[features]
    y = modelTeam[['scoreDiff']].values.ravel()


    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.2)

    # setting up pipeline
    pipe = Pipeline([
        ('imputer', SimpleImputer(strategy = 'constant', fill_value = 0)),
        ('scaler', StandardScaler()),
        ('regressor', Lasso())
    ])

    # setting up and performing grid search to find best alpha level
    paramGrid = {
        'regressor__alpha': np.logspace(-4, 4, 50)
    }

    gridSearch = GridSearchCV(pipe, paramGrid, cv = 5, scoring = 'neg_mean_absolute_error')

    gridSearch.fit(X, y)

    bestAlpha = gridSearch.best_params_['regressor__alpha']

    print(f'Best alpha level: {bestAlpha}')

    # setting up new pipeline with the ideal alpha level and a bagging regressor
    betterPipe = Pipeline([
        ('imputer', SimpleImputer(strategy = 'constant', fill_value = 0)),
        ('scaler', StandardScaler()),
        ('regressor', BaggingRegressor(estimator = Lasso(alpha = bestAlpha), n_estimators = 50, n_jobs = -1))
    ])

    betterPipe.fit(xTrain, yTrain)

    testPreds = betterPipe.predict(xTest)

    # grabbing mean MAE
    cvScore = cross_val_score(betterPipe, xTrain, yTrain, cv = 5, scoring = 'neg_mean_absolute_error')
    print('Mean MAE: ', -cvScore.mean())

    # getting R2 scores from model
    trainPreds = betterPipe.predict(xTrain)
    testPreds = betterPipe.predict(xTest)
    print('Training R2 score: ' + str(r2_score(yTrain, trainPreds)))
    print('Test R2 score: ' + str(r2_score(yTest, testPreds)))


In [82]:
FSU = teamDict['Florida State']
buildModel(FSU, 50)

Best alpha level: 0.8286427728546842
Mean MAE:  13.436705082603051
Training R2 score: 0.673470779133823
Test R2 score: 0.51973491054485


Can see that MAE and R2 score improved, let's see if this improved it across all teams

Adapting build function to just give MAE and R2 scores

In [87]:
def buildModelScore(modelTeam, features, randomState):

    # throwing out first 20 rows due to missing rolling_sum data
    modelTeam = modelTeam.drop(modelTeam.index[:20])
    modelTeam = modelTeam.reset_index(drop = True)

    # calling function to predict the missing values
    predictMissing(modelTeam, fixCols, randomState)
    
    # setting up training and test sets
    X = modelTeam[features]
    y = modelTeam[['scoreDiff']].values.ravel()

    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2)

    # setting up pipeline
    pipe = Pipeline([
        ('imputer', SimpleImputer(strategy = 'constant', fill_value = 0)),
        ('scaler', StandardScaler()),
        ('regressor', Lasso(max_iter = 3000))
    ])

    # setting up and performing grid search to find best alpha level
    paramGrid = {
        'regressor__alpha': np.logspace(-4, 4, 50)
    }

    gridSearch = GridSearchCV(pipe, paramGrid, cv = 5, scoring = 'neg_mean_absolute_error')

    gridSearch.fit(X, y)
    
    bestAlpha = gridSearch.best_params_['regressor__alpha']

    # setting up new pipeline with the ideal alpha level and a bagging regressor
    betterPipe = Pipeline([
        ('imputer', SimpleImputer(strategy = 'constant', fill_value = 0)),
        ('scaler', StandardScaler()),
        ('regressor', BaggingRegressor(estimator = Lasso(alpha = bestAlpha, max_iter = 3000), n_estimators = 50, n_jobs = -1))
    ])

    betterPipe.fit(xTrain, yTrain)

    testPreds = betterPipe.predict(xTest)

    # grabbing mean MAE
    cvScore = cross_val_score(betterPipe, xTrain, yTrain, cv = 5, scoring = 'neg_mean_absolute_error')

    # grabbing r2 score
    r2 = r2_score(yTest, testPreds)

    modelScores = [-cvScore.mean(), r2]
    
    return modelScores

Adding talent to the list of features considered

In [46]:
features = [col for col in teamDict['Florida State'].columns if 'rolling_sum' in col]
features.extend(['talent', 'talent_opp'])

In [88]:
# dictionary to hold each team and their model's score with talent as the newest added feature
modelScoresTalent = {}

for team, teamDF in teamDict.items():

    scores = buildModelScore(teamDF, features, 50)
    modelScoresTalent[team] = scores
    print(f'{team}: {scores}')    


Louisiana Tech: [11.7092185648993, 0.2834725914463776]
Southern Mississippi: [12.512968356634767, 0.48142385675241517]
Arizona State: [10.0131904137656, 0.6343732756819935]
Auburn: [11.731518085150114, 0.600949284264241]
Texas Tech: [13.703953160132935, 0.46141379191486054]
Minnesota: [13.847382247912865, 0.28179605448064493]
NC State: [10.15960675046146, 0.6712633087841854]
Georgia: [10.345260547194272, 0.7817468758716233]
USC: [11.477034034723705, 0.359913427598067]
South Florida: [10.90741580824148, 0.5526473384296506]
Wisconsin: [12.295591566380098, 0.39575580055653137]
Miami: [13.167654795573913, 0.5001615858925539]
Mississippi State: [11.858168884437877, 0.5427012177994603]
Houston: [13.962928192432992, 0.3687665797069921]
San José State: [10.71094670316715, 0.36816342964095605]
Oklahoma State: [14.753525526364934, 0.47806039020283564]
UCLA: [14.166815021695546, 0.4506349125954098]
Rice: [10.353438722081396, 0.3622511171305137]
Texas State: [13.593413140781314, 0.308035534421234]

In [90]:
maeScoresTalent = [value[0] for value in modelScoresTalent.values()]
r2ScoresTalent = [value[1] for value in modelScoresTalent.values()]
print(f'Mean MAE: {np.mean(maeScoresTalent)} \nMean R2 Score: {np.mean(r2ScoresTalent)}')

Mean MAE: 12.502987643519452 
Mean R2 Score: 0.4236140104064987


Some improvement overall, next let's add S&P+ 

In [18]:
# setting the download directory and Chrome settings
directory = "/Users/blaizelahman/Desktop/CFBData"
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory": directory}
chromeOptions.add_experimental_option("prefs", prefs)

# creating Chrome driver
driver = webdriver.Chrome(service = Service(ChromeDriverManager().install()), options = chromeOptions)

# making a dictionary to hold team S&P+ ratings
spRatings = {}

# downloading team S&P+ ratings for years 2005-2023 from collegefootballdata.com
for year in range(2005, 2024):

    # skipping 2020 because it has bad data
    if year == 2020: 
        continue
        
    try:
        url = f'https://collegefootballdata.com/exporter/ratings/sp?year={year}'
        driver.get(url)
        time.sleep(4) 
            
        # clicking the query button
        query = driver.find_element(By.XPATH, "//button[contains(span/text(), 'Query')]")
        query.click()
        time.sleep(3) 
            
        # clicking the export button
        export = driver.find_element(By.XPATH, "//button[contains(span/text(), 'Export')]")
        export.click()
        time.sleep(3)
        
        key = str(year)
            
        # grabs files from CFBData folder
        files = os.listdir(directory)

       # grab the file paths for all files ending in .csv
        filePaths = [os.path.join(directory, name) for name in files if name.endswith('.csv')]

        # grabbing the most recently made file out of those in paths
        file = max(filePaths, key=os.path.getctime)
            
        # loading csv file
        spRatings[key] = pd.read_csv(file)

        # deleting the file after it has been added
        os.remove(file)

    except Exception as e:
        print(f'Cannot grab data for {year}. Error: {e}')

    print(f'Successfully grabbed data for {year}')

driver.quit()

Successfully grabbed data for 2005
Successfully grabbed data for 2006
Successfully grabbed data for 2007
Successfully grabbed data for 2008
Successfully grabbed data for 2009
Successfully grabbed data for 2010
Successfully grabbed data for 2011
Successfully grabbed data for 2012
Successfully grabbed data for 2013
Successfully grabbed data for 2014
Successfully grabbed data for 2015
Successfully grabbed data for 2016
Successfully grabbed data for 2017
Successfully grabbed data for 2018
Successfully grabbed data for 2019
Successfully grabbed data for 2021
Successfully grabbed data for 2022
Successfully grabbed data for 2023


In [19]:
spRatings['2023'][:10]

Unnamed: 0,Year,Team,Conference,Rating,Ranking,SecondOrderWins,Sos,Offense Ranking,Offense Rating,Offense Success,Offense Explosiveness,Offense Rushing,Offense Passing,Offense StandardDowns,Offense PassingDowns,Offense RunRate,Offense Pace,Defense Ranking,Defense Rating,Defense Success,Defense Explosiveness,Defense Rushing,Defense Passing,Defense StandardDowns,Defense PassingDowns,Defense Havoc Total,Defense Havoc FrontSeven,Defense Havoc Db,SpecialTeams Rating
0,2023,Michigan,Big Ten,31.3,1.0,,,12.0,36.6,,,,,,,,,1.0,7.2,,,,,,,,,,1.8
1,2023,Georgia,SEC,31.2,2.0,,,4.0,41.5,,,,,,,,,5.0,12.2,,,,,,,,,,1.9
2,2023,Oregon,Big Ten,26.2,3.0,,,1.0,45.0,,,,,,,,,16.0,18.2,,,,,,,,,,-0.5
3,2023,Ohio State,Big Ten,25.2,4.0,,,34.0,31.5,,,,,,,,,2.0,7.8,,,,,,,,,,1.5
4,2023,Penn State,Big Ten,23.5,5.0,,,29.0,32.7,,,,,,,,,4.0,10.5,,,,,,,,,,1.3
5,2023,Texas,SEC,23.2,6.0,,,6.0,39.3,,,,,,,,,12.0,17.7,,,,,,,,,,1.6
6,2023,Alabama,SEC,23.1,7.0,,,11.0,36.8,,,,,,,,,8.0,15.6,,,,,,,,,,2.0
7,2023,Notre Dame,FBS Independents,20.1,8.0,,,14.0,36.3,,,,,,,,,10.0,16.7,,,,,,,,,,0.5
8,2023,Florida State,ACC,19.4,9.0,,,22.0,34.1,,,,,,,,,9.0,16.5,,,,,,,,,,1.8
9,2023,Missouri,SEC,19.3,10.0,,,13.0,36.5,,,,,,,,,14.0,18.0,,,,,,,,,,0.8


Augmenting orignal team's dataframe with S&P their rating

In [20]:
for team, teamDF in teamDict.items():

    for year in teamDF['Year'].unique():
        
        # checks if year and team is in talentRating
        if str(year) in spRatings.keys():
            
            if team in spRatings[str(year)]['Team'].values:
        
                # grabs the dataframe from that year and the talent rating of that team in that year
                spDF = spRatings[str(year)]
                SP = spDF.loc[spDF['Team'] == team, 'Rating']
    
                # assigns talent column with the corresponding talent ratings for games in the given year
                teamDF.loc[teamDF['Year'] == year, 'SP'] = SP.values[0]

            else:

                teamDF.loc[teamDF['Year'] == year, 'SP'] = np.nan

    teamDict[team] = teamDF

Function to merge opposing S&P ratings by game

In [21]:
def mergeSP(team):

    # making a copy of the teams dataframe from teamDict
    teamDF = teamDict[team].copy() 

    teamDF['SP_opp'] = np.nan

    # going through the dataframe and merging the opponent's talent rating column row by row
    for index, row in teamDF.iterrows():

        # grabbing year and each opponent's name to access their talent rating in talentRatings
        year = row['Year']
        oppName = row['School_opp']

        if str(year) in spRatings.keys():

            yearDF = spRatings[str(year)]

            
            # getting the opponent's talent rating column from the game they played the given team
            oppRow = yearDF[yearDF['Team'] == oppName]

            if not oppRow.empty:

                oppSP = oppRow.iloc[0]['Rating']
    
                # merging the opponent's talent column on the row the team plays them
                teamDF.loc[index, 'SP_opp'] = oppSP
        
    return teamDF

Augmenting dataframes with opponent S&P ratings

In [22]:
for team in teamDict.keys():
    
    updatedDF = mergeSP(team)
    teamDict[team] = updatedDF

Adding S&P ratings to the features list

In [23]:
features.extend(['SP','SP_opp'])

Testing how the addition of S&P scores effects the scores of all models

In [91]:
# dictionary to hold each team and their model's score with SP as the newest added feature
modelScoresSP = {}

for team, teamDF in teamDict.items():

    scores = buildModelScore(teamDF, features, 50)
    modelScoresSP[team] = scores
    print(f'{team}: {scores}')    

Louisiana Tech: [12.01436637354632, 0.24617953196651332]
Southern Mississippi: [11.916612195695507, 0.6868037048590575]
Arizona State: [10.465522042797682, 0.5789267509877005]
Auburn: [12.436568199510976, 0.7469288697038222]
Texas Tech: [13.81077731377107, 0.4161022944470657]
Minnesota: [14.552701960080771, 0.4952782072434171]
NC State: [11.417637953141835, 0.5288325991106793]
Georgia: [9.805222125846514, 0.7213179373070939]
USC: [11.77838175479626, 0.4928799469224523]
South Florida: [11.46152056930923, 0.43673714579050227]
Wisconsin: [12.774878550274389, 0.354456366282226]
Miami: [12.398711803248862, 0.5192683705116107]
Mississippi State: [12.254733483570726, 0.6395378297067174]
Houston: [14.575750182981583, 0.4171501931773456]
San José State: [12.287435183795125, 0.5931538885581595]
Oklahoma State: [14.262810374811613, 0.33078099503752756]
UCLA: [13.47893722419, 0.2171831124482676]
Rice: [11.06922263122651, 0.48555292386747406]
Texas State: [13.408034320526554, 0.2546039171384651]
Io

In [92]:
maeScoresSP = [value[0] for value in modelScoresSP.values()]
r2ScoresSP = [value[1] for value in modelScoresSP.values()]
print(f'Mean MAE: {np.mean(maeScoresSP)} \nMean R2 Score: {np.mean(r2ScoresSP)}')

Mean MAE: 12.595896319163796 
Mean R2 Score: 0.444865597469117


It seems that adding S&P ratings substantially improved the model. Going forward we will add more advanced stats and try out different modeling techniques, but for now we will see how our model stacks up against Vegas in terms of predicting score differentials. In order to do that, we need to augment our data with betting data so that we can see how our model performs against Vegas spreads.

Importing betting data

In [3]:
# setting the download directory and Chrome settings
directory = "/Users/blaizelahman/Desktop/CFBData"
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory": directory}
chromeOptions.add_experimental_option("prefs", prefs)

# creating Chrome driver
driver = webdriver.Chrome(service = Service(ChromeDriverManager().install()), options = chromeOptions)

# making a dictionary to hold team betting data
bettingDict = {}

# downloading betting data for years 2035-2023 from collegefootballdata.com
for year in range(2013, 2024):

    # skipping 2020 because it has bad data
    if year == 2020: 
        continue
        
    try:
        url = f'https://collegefootballdata.com/exporter/lines?year={year}&seasonType=regular'
        driver.get(url)
        time.sleep(4) 
            
        # clicking the query button
        query = driver.find_element(By.XPATH, "//button[contains(span/text(), 'Query')]")
        query.click()
        time.sleep(3) 
            
        # clicking the export button
        export = driver.find_element(By.XPATH, "//button[contains(span/text(), 'Export')]")
        export.click()
        time.sleep(3)

        key = str(year)

            
        # grabs files from CFBData folder
        files = os.listdir(directory)
        
        # grab the file paths for all files ending in .csv
        filePaths = [os.path.join(directory, name) for name in files if name.endswith('.csv')]

        # grabbing the most recently made file out of those in paths
        file = max(filePaths, key=os.path.getctime)
            
        # loading csv file
        bettingDict[key] = pd.read_csv(file)

        # deleting the file after it has been added
        os.remove(file)

    except Exception as e:
        print(f'Cannot grab data for {year}. Error: {e}')

    print(f'Successfully grabbed data for {year}')

driver.quit()

Successfully grabbed data for 2013
Successfully grabbed data for 2014
Successfully grabbed data for 2015
Successfully grabbed data for 2016
Successfully grabbed data for 2017
Successfully grabbed data for 2018
Successfully grabbed data for 2019
Successfully grabbed data for 2021
Successfully grabbed data for 2022
Successfully grabbed data for 2023


Augmenting our already existing data with the betting data from preferred line providers (sportsbooks) 

In [28]:
# setting our preferred line providers
lineProviders = ['DraftKings', 'consensus', 'Bovada']

# combining betting dataframes
combinedBetting = pd.concat(bettingDict.values(), ignore_index = True)

# function to get line from most preferred available line
def getPreferredLine(group, lineProviders):

    # go through preferred lines and if our preferred providers are there, return the line
    for provider in lineProviders:
        preferredLine = group[group['LineProvider'] == provider]

        # if preferred line provider has line, return
        if not preferredLine.empty:
            return preferredLine.iloc[0]

    # returning first available line if none of our preferred ones are available
    return group.iloc[0] 

# getting the preferred line for each individual game
preferredLines = combinedBetting.groupby('Id').apply(lambda x: getPreferredLine(x, lineProviders)).reset_index(drop = True)

# merging the preferred lines with each dataframe in teamDict
for team, teamDF in teamDict.items():
    
    # merging based on Game Id
    teamDict[team] = teamDF.merge(preferredLines, left_on = 'Game Id', right_on = 'Id', how = 'left')

Looking at new dataframe

In [None]:
teamDict['Florida State'][teamDict['Florida State']['Year'] == 2023]

Let's correct the 'Spread' column by getting it into the format of our predictions so that they can be easily compared.

In [None]:
# function that will change the sign of the spread if the given team is 
# the home team, which will standardize the column
def correctSpread(row, team):
    if row['HomeTeam'] == team:
        row['Spread'] = float(row['Spread']) * -1
    return row

In [None]:
# applying this correction to all dataframes
for team, teamDF in teamDict.items(): 
    
    teamDict[team] = teamDF.apply(correctSpread, axis = 1, team = team)

Now we'll save these updated dataframes as csv files to be used in analyzing model success against Vegas 

In [26]:
for key, team in teamDict.items():
    if '2023' in team['Year'].astype(str).values:
        name = key.replace(' ', '_') + '_updated_model.csv'
        team.to_csv(name)
        print('CSV: ' + name)

CSV: Boston_College_updated_model.csv
CSV: Rutgers_updated_model.csv
CSV: Auburn_updated_model.csv
CSV: North_Texas_updated_model.csv
CSV: Nebraska_updated_model.csv
CSV: Oklahoma_State_updated_model.csv
CSV: Arizona_State_updated_model.csv
CSV: Eastern_Michigan_updated_model.csv
CSV: Louisiana_updated_model.csv
CSV: Colorado_State_updated_model.csv
CSV: Idaho_updated_model.csv
CSV: Illinois_updated_model.csv
CSV: Air_Force_updated_model.csv
CSV: Kent_State_updated_model.csv
CSV: Louisiana_Monroe_updated_model.csv
CSV: Iowa_updated_model.csv
CSV: Akron_updated_model.csv
CSV: Ohio_updated_model.csv
CSV: Georgia_updated_model.csv
CSV: South_Alabama_updated_model.csv
CSV: Georgia_Tech_updated_model.csv
CSV: Western_Kentucky_updated_model.csv
CSV: Maryland_updated_model.csv
CSV: Arizona_updated_model.csv
CSV: Minnesota_updated_model.csv
CSV: Pittsburgh_updated_model.csv
CSV: Marshall_updated_model.csv
CSV: Louisiana_Tech_updated_model.csv
CSV: Virginia_Tech_updated_model.csv
CSV: Californi