Import requirements

In [21]:
import os
import pandas as pd
from datetime import datetime as dt
from sklearn.linear_model import LinearRegression
import unittest
import sklearn.preprocessing
from sklearn.model_selection import train_test_split

Here, we create the CFB Model Dataframes

In [22]:
class CFBDataframe:
    def __init__(self):
        # list of dfs values[0] and empty init df values[1]
        self.data_map = {"drives": [[], pd.DataFrame()], "games": [[], pd.DataFrame()], "lines": [[], pd.DataFrame()],
                         "player_game_stats": [[], pd.DataFrame()], "player_season_stats": [[], pd.DataFrame()],
                         "player_usage": [[], pd.DataFrame()], "recruiting_groups": [[], pd.DataFrame()],
                         "recruiting_players": [[], pd.DataFrame()], "recruiting_teams": [[], pd.DataFrame()],
                         "venues": [[], pd.DataFrame()]}

    @staticmethod
    def append_dfs(df, df_type):
        # append df to list of dfs
        df_type[0].append(df)
        # concat list of dfs
        df_type[1] = pd.concat(df_type[0])

    @staticmethod
    def impute_df(file):
        # import data with Windows encoding
        try:
            df = pd.read_csv(file, encoding='ANSI')  # , na_values='?')
            # df.fillna(pd.Series.mean(df))
        # if not Windows, do Mac encoding
        except LookupError:
            try:
                df = pd.read_csv(file, encoding='ISO-8859-1')  # , na_values='?')
                # df.fillna(pd.Series.mean(df))
            except pd.errors.EmptyDataError:
                print(file, "is empty")
                return
        # if empty data, skip file
        except pd.errors.EmptyDataError:
            print(file, "is empty")
            return
        # set first row as headers
        df.rename(columns=df.iloc[0])
        return df

    def csv_to_df(self, args):
        # not time based
        if len(args) == 1:
            file = "../data/{i}/{i}.csv".format(i=args[0])
        # season based
        elif len(args) == 2:
            file = "../data/{i}/{j}_{i}.csv".format(i=args[0], j=args[1])
        # game based
        else:
            file = "../data/{i}/{k}_week_{j}_{i}.csv".format(i=args[0], j=args[1], k=args[2])
        # import csv to df
        df = self.impute_df(file)
        # append non-empty dfs
        if type(df) is not None:
            self.append_dfs(df, self.data_map[args[0]])

Build the CFB dataframe with the scraped game data

In [23]:
cfb_df = CFBDataframe()
for directory in next(os.walk('../data'))[1]:
    # non time specific dfs
    if directory in ["recruiting_groups", "recruiting_teams", "venues"]:
        try:
            cfb_df.csv_to_df([directory])
        except FileNotFoundError:
            print("Missing data from", directory)
    else:
        # season specific dfs
        for year in range(2010, dt.today().year):
            # week specific df
            if directory == "player_game_stats":
                for week in range(1, 17):
                    try:
                        cfb_df.csv_to_df([directory, week, year])
                    except FileNotFoundError:
                        print("Missing game data for", year, "week", week)
            elif directory == "__pycache__":
                continue
            else:
                try:
                    cfb_df.csv_to_df([directory, year])
                except FileNotFoundError:
                    print("Missing", year, "data for", directory)

Missing 2010 data for lines
Missing 2011 data for lines
Missing 2012 data for lines
Missing game data for 2010 week 16
Missing game data for 2011 week 16
Missing game data for 2012 week 16
Missing game data for 2015 week 16
Missing game data for 2016 week 16
Missing game data for 2017 week 16
Missing game data for 2018 week 16
Missing game data for 2019 week 16
Missing 2010 data for player_usage
Missing 2011 data for player_usage
Missing 2012 data for player_usage


Here, we initialize attributes and methods used to manipulate CFB Model

In [24]:
class CFBModel:
    def __init__(self, df):
        # dict of dfs
        self.data = {k: df[k][1] for k in df}

    def home_favored(self):
        mean_home_points = pd.Series.mean(self.data["games"]["_home_points"])
        mean_away_points = pd.Series.mean(self.data["games"]["_away_points"])
        return mean_home_points - mean_away_points

    def regression_predict(self, predictors):
        regression_model = LinearRegression()
        indep_vars = [self.data['games']['_home_post_win_prob'], self.data['games']['_home_points']]
        dep_vars = self.data['games']['_home_points'] - self.data['games']['_away_points']
        regression_model.fit(indep_vars, dep_vars)
        prediction = regression_model.predict(predictors)
        return prediction

    def spread_prediction(self):
        regression_model = LinearRegression()
        game_data = self.data['games']

        # Select columns
        x = game_data[['_season','_week','_season_type','_neutral_site','_conference_game','_venue_id','_home_id','_away_id','_home_conference','_away_conference']]
        y = game_data['_home_points'] - game_data['_away_points']

        # Data Transforms
        x.loc[:,'_season_type'] = sklearn.preprocessing.LabelEncoder().fit_transform(x['_season_type'])
        x.loc[:,'_home_conference'] = sklearn.preprocessing.LabelEncoder().fit_transform(x['_home_conference'])
        x.loc[:,'_away_conference'] = sklearn.preprocessing.LabelEncoder().fit_transform(x['_away_conference'])
        x.loc[:,'_venue_id'] = sklearn.preprocessing.LabelEncoder().fit_transform(x['_venue_id'])
        x.loc[:,'_home_id'] = sklearn.preprocessing.LabelEncoder().fit_transform(x['_home_id'])
        x.loc[:,'_away_id'] = sklearn.preprocessing.LabelEncoder().fit_transform(x['_away_id'])

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=0)
        regression_model.fit(x_train, y_train)
        return regression_model.score(x_test, y_test)

Create and perform operations on model

In [25]:
cfb_model = CFBModel(cfb_df.data_map)
home_favor = cfb_model.home_favored()
print("Home teams win by an average of {:.2f} points".format(home_favor))
num_cells = sum(cfb_model.data[k].size for k in cfb_model.data)
print("We have {} cells of data to work with 👀".format(num_cells))
# print("Prediction", cfb_model.regression_predict([0.77, 34]))

Home teams win by an average of 7.18 points
We have 12507246 cells of data to work with 👀


In [26]:
spread_regression_model = CFBModel(cfb_df.data_map)
spread_model_score = spread_regression_model.spread_prediction()
print(spread_model_score)

0.13046156139719112


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['_home_conference'] = sklearn.preprocessing.LabelEncoder().fit_transform(x['_home_conference'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['_away_conference'] = sklearn.preprocessing.LabelEncoder

Unit testing

In [27]:
class TestRanking(unittest.TestCase):

    def test_dfs(self):
        self.assertGreater(num_cells, 0)
        self.assertEqual(type(cfb_model.data), dict)
        self.assertEqual(len(cfb_model.data), 10)


unittest.main(argv=[''], verbosity=0, exit=False)

----------------------------------------------------------------------
Ran 1 test in 0.000s

OK


<unittest.main.TestProgram at 0x1fc1121d2e0>