Import requirements

In [56]:
import os
import pandas as pd
from datetime import datetime as dt
from sklearn.linear_model import LinearRegression
import unittest

Here, we define the `CFBDataframe` class, which loads the scraped CSV files into one dataframe per dataset

In [57]:
class CFBDataframe:
    """Loads the scraped CSV files into one dataframe per dataset.

    Every dataset attribute (``drives``, ``games``, ...) is a two-item list
    ``[frames, combined]``: ``frames`` is the list of per-file dataframes
    loaded so far and ``combined`` is their concatenation (an empty dataframe
    until the first file has been loaded).
    """

    # one attribute per dataset; csv_to_df looks the attribute up by this name
    _DATASETS = (
        "drives", "games", "lines", "player_game_stats", "player_season_stats",
        "player_usage", "recruiting_groups", "recruiting_players",
        "recruiting_teams", "venues",
    )

    def __init__(self):
        for name in self._DATASETS:
            # list of dfs and empty init df
            setattr(self, name, [[], pd.DataFrame()])

    @staticmethod
    def append_dfs(df, df_type):
        """Add ``df`` to a dataset and refresh its combined dataframe.

        :param df: dataframe to add; ``None`` is ignored
        :param df_type: the ``[frames, combined]`` list of the target dataset,
            updated in place
        """
        if df is None:
            return
        # append df to list of dfs
        df_type[0].append(df)
        # concat list of dfs
        df_type[1] = pd.concat(df_type[0])

    @staticmethod
    def impute_df(file):
        """Read one CSV file into a dataframe (no imputation is performed).

        The 'ANSI' encoding is tried first; where Python does not know that
        codec name (``LookupError``) the file is read as ISO-8859-1 instead.
        The first line of the file is used as the header by ``read_csv``.

        :param file: path of the CSV file
        :return: the dataframe, or ``None`` when the file contains no data
        :raises FileNotFoundError: when ``file`` does not exist
        """
        try:
            try:
                return pd.read_csv(file, encoding='ANSI')
            except LookupError:
                return pd.read_csv(file, encoding='ISO-8859-1')
        # if empty data, skip file
        except pd.errors.EmptyDataError:
            print(file, "is empty")
            return None

    def csv_to_df(self, args):
        """Load the CSV file described by ``args`` into its dataset.

        :param args: ``[dataset]`` for a file that is not time based,
            ``[dataset, year]`` for a season file, or
            ``[dataset, week, year]`` for a weekly file; ``dataset`` is both
            the directory name under ``../data`` and the attribute name
        :raises FileNotFoundError: when the file does not exist
        :raises AttributeError: when ``dataset`` is not a known dataset
        """
        # not time based
        if len(args) == 1:
            file = "../data/{i}/{i}.csv".format(i=args[0])
        # season based
        elif len(args) == 2:
            file = "../data/{i}/{j}_{i}.csv".format(i=args[0], j=args[1])
        # game based
        else:
            file = "../data/{i}/{k}_week_{j}_{i}.csv".format(i=args[0], j=args[1], k=args[2])
        # import csv to df
        df = self.impute_df(file)
        # append non-empty dfs only; impute_df returns None for an empty file
        if df is not None:
            self.append_dfs(df, getattr(self, args[0]))

Build the CFB dataframe with the scraped game data

In [58]:
cfb_df = CFBDataframe()
# only the immediate sub-directories of ../data are visited, one per dataset
data_dirs = next(os.walk('../data'))[1]
for directory in data_dirs:
    # datasets kept in a single file, independent of season
    if directory in ("recruiting_groups", "recruiting_teams", "venues"):
        try:
            cfb_df.csv_to_df([directory])
        except FileNotFoundError:
            print("Missing data from", directory)
        continue
    # every other dataset has one file per season (or per week of a season)
    for year in range(2010, dt.today().year):
        if directory != "player_game_stats":
            try:
                cfb_df.csv_to_df([directory, year])
            except FileNotFoundError:
                print("Missing", year, "data for", directory)
            continue
        # player game stats are stored per week: weeks 1 through 16
        for week in range(1, 17):
            try:
                cfb_df.csv_to_df([directory, week, year])
            except FileNotFoundError:
                print("Missing game data for", year, "week", week)

../data/player_game_stats/2010_week_16_player_game_stats.csv is empty
../data/player_game_stats/2011_week_16_player_game_stats.csv is empty
../data/player_game_stats/2012_week_16_player_game_stats.csv is empty
../data/player_game_stats/2015_week_16_player_game_stats.csv is empty
../data/player_game_stats/2016_week_16_player_game_stats.csv is empty
../data/player_game_stats/2017_week_16_player_game_stats.csv is empty
../data/player_game_stats/2018_week_16_player_game_stats.csv is empty
../data/player_game_stats/2019_week_16_player_game_stats.csv is empty
Missing 2010 data for player_usage
Missing 2011 data for player_usage
Missing 2012 data for player_usage
Missing 2010 data for lines
Missing 2011 data for lines
Missing 2012 data for lines


Here, we define the `CFBModel` class, whose attributes and methods are used to analyze the loaded data

In [59]:
class CFBModel:
    """Analysis helpers on top of the combined dataframes of a loader object."""

    def __init__(self, df):
        """Collect the combined dataframe of every dataset.

        :param df: object whose dataset attributes (``drives``, ``games``, ...)
            are ``[frames, combined]`` lists; item 1 of each is stored in
            ``self.data`` under the dataset name
        """
        names = ("drives", "games", "lines", "player_game_stats", "player_season_stats",
                 "player_usage", "recruiting_groups", "recruiting_players",
                 "recruiting_teams", "venues")
        # dict of dfs
        self.data = {name: getattr(df, name)[1] for name in names}

    def home_favored(self):
        """Return mean home points minus mean away points over all games."""
        games = self.data["games"]
        return games["_home_points"].mean() - games["_away_points"].mean()

    def regression_predict(self, predictors):
        """Fit a linear regression on the games data and predict with it.

        The features are ``_home_post_win_prob`` and ``_home_points``; the
        target is ``_home_points - _away_points``. Games with a missing value
        in any of those columns are left out of the fit.

        :param predictors: one ``[win_prob, home_points]`` row, or a sequence
            of such rows
        :return: the model's predictions, one per row
        """
        feature_cols = ['_home_post_win_prob', '_home_points']
        # the estimator rejects NaN, so only complete rows are used
        games = self.data['games'].dropna(subset=feature_cols + ['_away_points'])
        # rows are samples and columns are features, as fit() requires
        indep_vars = games[feature_cols]
        dep_vars = games['_home_points'] - games['_away_points']
        rows = list(predictors)
        # a flat [win_prob, home_points] list is a single sample
        if rows and not pd.api.types.is_list_like(rows[0]):
            rows = [rows]
        regression_model = LinearRegression()
        regression_model.fit(indep_vars, dep_vars)
        # same column names as the training frame
        return regression_model.predict(pd.DataFrame(rows, columns=feature_cols))

Create and perform operations on model

In [60]:
cfb_model = CFBModel(cfb_df)
home_favor = cfb_model.home_favored()
print("Home teams win by an average of {:.2f} points".format(home_favor))
# total number of cells (rows * columns) over every dataframe held by the model
num_cells = sum(frame.size for frame in cfb_model.data.values())
print("We have {} cells of data to work with 👀".format(num_cells))
# print("Prediction", cfb_model.regression_predict([0.77, 34]))

Home teams win by an average of 7.18 points
We have 12507246 cells of data to work with 👀


Unit testing

In [61]:
class TestRanking(unittest.TestCase):

    def test_dfs(self):
        self.assertEqual(len(cfb_model.data), 10)


unittest.main(argv=[''], verbosity=0, exit=False)

----------------------------------------------------------------------
Ran 1 test in 0.000s

OK


<unittest.main.TestProgram at 0x1416ec3d0>