In [1]:
import cfbd
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
configuration = cfbd.Configuration()
configuration.api_key['Authorization'] = 'JOo7NPQiRwoWx1kDmv1VDiujh+ra/nfYLRGy0TVt2KhLM+Yiu6BpXOC7W4YtjBdq'
configuration.api_key_prefix['Authorization'] = 'Bearer'

In [3]:
api_instance = cfbd.GamesApi(cfbd.ApiClient(configuration))

In [4]:
def fetch_games_data_for_years(start_year, end_year):
    dataframes = {}
    for year in range(start_year, end_year + 1):
        games = api_instance.get_games(year=year)
        big_12_games = [game for game in games if game.home_conference == 'Big 12' or game.away_conference == 'Big 12']
        
        game_data = pd.DataFrame.from_records(
            [dict(game_id=game.id,
                  year=game.season,
                  home_team=game.home_team,
                  away_team=game.away_team,
                  home_points=game.home_points,
                  away_points=game.away_points) for game in big_12_games]
        )
        dataframes[year] = game_data
    return dataframes

years_data = fetch_games_data_for_years(2016, 2020)


In [5]:
print(years_data)

{2016:       game_id  year       home_team               away_team  home_points  \
0   400868876  2016          Baylor      Northwestern State           55   
1   400869094  2016        Stanford            Kansas State           26   
2   400868976  2016   West Virginia                Missouri           26   
3   400869507  2016         Houston                Oklahoma           33   
4   400869619  2016  Oklahoma State  Southeastern Louisiana           61   
..        ...   ...             ...                     ...          ...   
70  400869604  2016    Kansas State                  Kansas           34   
71  400869596  2016      Iowa State           West Virginia           19   
72  400869611  2016             TCU            Kansas State            6   
73  400869618  2016        Oklahoma          Oklahoma State           38   
74  400868886  2016   West Virginia                  Baylor           24   

    away_points  
0             7  
1            13  
2            11  
3       

In [6]:
combined_data = pd.concat(years_data.values(), ignore_index=True)


In [7]:
def calculate_point_differential(df):
    df['point_differential'] = df['home_points'] - df['away_points']

calculate_point_differential(combined_data)


In [8]:
api_instance = cfbd.StatsApi(cfbd.ApiClient(configuration))
advanced_stats_data = {}
for year in range(2016, 2020):
    advanced_stats = api_instance.get_advanced_team_game_stats(year=year)
    advanced_stats_data[year] = pd.DataFrame.from_records(
        [dict(game_id=stat.game_id,
              team=stat.team,
              opponent=stat.opponent,
              line_yards_total=stat.offense.line_yards_total,
              second_level_yards_total=stat.offense.second_level_yards_total,
              open_field_yards_total=stat.offense.open_field_yards_total,
              total_points=stat.offense.total_ppa,
              total_plays=stat.offense.plays) for stat in advanced_stats]
    )

In [9]:
advanced_stats_combined = pd.concat(advanced_stats_data.values(), ignore_index=True)
combined_data_with_advanced_stats = combined_data.merge(advanced_stats_combined, left_on=['game_id', 'home_team'], right_on=['game_id', 'team'], how='inner')
combined_data_with_advanced_stats.drop('team', axis=1, inplace=True)
df = combined_data_with_advanced_stats.dropna()

In [10]:
X = df[['line_yards_total', 'second_level_yards_total', 'open_field_yards_total', 'total_points', 'total_plays']]
y = df['point_differential']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [11]:
# convert training and testing sets to dataframes
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

print(train_df, test_df)

     line_yards_total  second_level_yards_total  open_field_yards_total  \
138              91.0                      20.0                    24.0   
140             118.0                      37.0                    30.0   
180              90.0                      20.0                    10.0   
222              76.0                      51.0                    44.0   
21               96.0                      62.0                    70.0   
..                ...                       ...                     ...   
257             137.0                      52.0                    44.0   
196              95.0                      49.0                    75.0   
120             117.0                      32.0                     6.0   
48               80.0                      33.0                    20.0   
176              99.0                      36.0                    93.0   

     total_points  total_plays  point_differential  
138     11.046009           73                

In [12]:
model = LinearRegression()
model.fit(X_train, y_train)

In [13]:
# Predict point differentials for the test dataset
y_pred = model.predict(X_test)

# Calculate mean squared error and R^2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print("Mean Squared Error: {:.2f}".format(mse))
print("R^2 Score: {:.2f}".format(r2))


Mean Squared Error: 315.22
R^2 Score: 0.45
