In [2]:
import pandas as pd
import numpy as np

# PreProcess Data for ML Model 

In [3]:
df = pd.read_csv('./cfb.csv')
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win
0,2014,1,South Carolina,28.0,Texas A&M,52.0,2014-08-28 18:00:00,False
1,2014,1,Texas A&M,52.0,South Carolina,28.0,2014-08-28 18:00:00,True
2,2014,1,Akron,41.0,Howard,0.0,2014-08-28 19:00:00,True
3,2014,1,Central Michigan,20.0,Chattanooga,16.0,2014-08-28 19:00:00,True
4,2014,1,Presbyterian,3.0,Northern Illinois,55.0,2014-08-28 19:00:00,False
...,...,...,...,...,...,...,...,...
14797,2023,15,Florida State,16.0,Louisville,6.0,2023-12-02 20:00:00,True
14798,2023,15,Louisville,6.0,Florida State,16.0,2023-12-02 20:00:00,False
14799,2023,15,Iowa,0.0,Michigan,26.0,2023-12-02 20:00:00,False
14800,2023,16,Army,17.0,Navy,11.0,2023-12-09 15:00:00,True


In [4]:
# Add a Target column (shows if they win the next week)
def add_target(team):
    team['Target'] = team["Win"].shift(-1)
    return team

df = df.groupby("Team", group_keys=False).apply(add_target)

In [5]:
# Check a certain team
df[df['Team'] == "Virginia Tech"]

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target
100,2014,1,Virginia Tech,34.0,William & Mary,9.0,2014-08-30 16:00:00,True,True
279,2014,2,Virginia Tech,35.0,Ohio State,21.0,2014-09-06 20:00:00,True,False
321,2014,3,Virginia Tech,21.0,East Carolina,28.0,2014-09-13 12:00:00,False,False
426,2014,4,Virginia Tech,24.0,Georgia Tech,27.0,2014-09-20 12:00:00,False,True
553,2014,5,Virginia Tech,35.0,Western Michigan,17.0,2014-09-27 12:30:00,True,True
...,...,...,...,...,...,...,...,...,...
14157,2023,10,Virginia Tech,38.0,Syracuse,10.0,2023-10-26 19:30:00,True,False
14320,2023,11,Virginia Tech,3.0,Louisville,34.0,2023-11-04 15:30:00,False,True
14421,2023,12,Virginia Tech,48.0,Boston College,22.0,2023-11-11 12:00:00,True,False
14593,2023,13,Virginia Tech,28.0,North Carolina State,35.0,2023-11-18 15:30:00,False,True


In [6]:
# Replace NaN values with 2 instead of Nan
df['Target'][pd.isnull(df['Target'])] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Target'][pd.isnull(df['Target'])] = 2


In [7]:
# Convert Target values to numbers (0 or 1) 
df['Target'] = df['Target'].astype(int, errors='ignore')
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target
0,2014,1,South Carolina,28.0,Texas A&M,52.0,2014-08-28 18:00:00,False,1
1,2014,1,Texas A&M,52.0,South Carolina,28.0,2014-08-28 18:00:00,True,1
2,2014,1,Akron,41.0,Howard,0.0,2014-08-28 19:00:00,True,0
3,2014,1,Central Michigan,20.0,Chattanooga,16.0,2014-08-28 19:00:00,True,1
4,2014,1,Presbyterian,3.0,Northern Illinois,55.0,2014-08-28 19:00:00,False,0
...,...,...,...,...,...,...,...,...,...
14797,2023,15,Florida State,16.0,Louisville,6.0,2023-12-02 20:00:00,True,2
14798,2023,15,Louisville,6.0,Florida State,16.0,2023-12-02 20:00:00,False,2
14799,2023,15,Iowa,0.0,Michigan,26.0,2023-12-02 20:00:00,False,2
14800,2023,16,Army,17.0,Navy,11.0,2023-12-09 15:00:00,True,2


In [8]:
# Check the value counts of target
df['Target'].value_counts()

1    7308
0    7248
2     246
Name: Target, dtype: int64

In [9]:
# Check for nulls
nulls = pd.isnull(df)
nulls = nulls.sum()
nulls

Season            0
Wk                0
Team              0
Points Scored     0
Opponent          0
Points Allowed    0
DateTime          0
Win               0
Target            0
dtype: int64

# Model

In [10]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score


rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=5, direction='forward', cv=split)

In [11]:
# Columns we do not scale 
removed_columns = ['Season', 'Wk', 'Day', 'Team', 'Opponent', 'DateTime', 'Win', 'Target']
# Columns we do scale (points scored and points allowed)
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [12]:
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target
0,2014,1,South Carolina,0.337349,Texas A&M,0.626506,2014-08-28 18:00:00,False,1
1,2014,1,Texas A&M,0.626506,South Carolina,0.337349,2014-08-28 18:00:00,True,1
2,2014,1,Akron,0.493976,Howard,0.000000,2014-08-28 19:00:00,True,0
3,2014,1,Central Michigan,0.240964,Chattanooga,0.192771,2014-08-28 19:00:00,True,1
4,2014,1,Presbyterian,0.036145,Northern Illinois,0.662651,2014-08-28 19:00:00,False,0
...,...,...,...,...,...,...,...,...,...
14797,2023,15,Florida State,0.192771,Louisville,0.072289,2023-12-02 20:00:00,True,2
14798,2023,15,Louisville,0.072289,Florida State,0.192771,2023-12-02 20:00:00,False,2
14799,2023,15,Iowa,0.000000,Michigan,0.313253,2023-12-02 20:00:00,False,2
14800,2023,16,Army,0.204819,Navy,0.132530,2023-12-09 15:00:00,True,2


In [13]:
# Select predictors 
predictors = list(df[['Points Scored', 'Points Allowed']])
predictors

['Points Scored', 'Points Allowed']

In [14]:
# Create a function to make predictions
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    # All seasons in our dataset
    seasons = sorted(data["Season"].unique()) 

    for i in range(start, len(seasons), step):
        season = seasons[i]

        # Train on all data before our current season
        train = data[data["Season"] < season]
        # Test on our current season data
        test = data[data["Season"] == season]

        # Fit model 
        model.fit(train[predictors], train['Target'])

        # Generate predictions
        preds = model.predict(test[predictors])
        # Convert to pandas series instead of numpy array
        preds = pd.Series(preds, index=test.index)

        # Combine the Target and Prediction values
        combined = pd.concat([test['Target'], preds], axis=1)
        combined.columns = ['Actual', 'Prediction']
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [15]:
predictions = backtest(df, rr, predictors)
predictions

Unnamed: 0,Actual,Prediction
3362,1,0
3363,1,1
3364,1,1
3365,0,1
3366,0,0
...,...,...
14797,2,1
14798,2,0
14799,2,0
14800,2,1


In [16]:
# Accuracy of our model
predictions = predictions[predictions["Actual"] != 2]
accuracy_score(predictions['Actual'], predictions['Prediction'])

0.5790978115230013

# Improve Model

In [17]:
# Use rolling averages to improve model
df_rolling = df[['Season', 'Team', 'Points Scored', 'Points Allowed', 'Win']]
df_rolling

Unnamed: 0,Season,Team,Points Scored,Points Allowed,Win
0,2014,South Carolina,0.337349,0.626506,False
1,2014,Texas A&M,0.626506,0.337349,True
2,2014,Akron,0.493976,0.000000,True
3,2014,Central Michigan,0.240964,0.192771,True
4,2014,Presbyterian,0.036145,0.662651,False
...,...,...,...,...,...
14797,2023,Florida State,0.192771,0.072289,True
14798,2023,Louisville,0.072289,0.192771,False
14799,2023,Iowa,0.000000,0.313253,False
14800,2023,Army,0.204819,0.132530,True


In [18]:
def find_team_averages(team):
    # Group rows by previous 2 rows + current row averages
    rolling = team[['Points Scored', 'Points Allowed']].rolling(3).mean()
    return rolling

df_rolling = df_rolling.groupby(['Season', 'Team'], group_keys=False).apply(find_team_averages)
df_rolling

Unnamed: 0,Points Scored,Points Allowed
0,,
1,,
2,,
3,,
4,,
...,...,...
14797,0.393574,0.136546
14798,0.301205,0.341365
14799,0.112450,0.196787
14800,0.248996,0.184739


In [19]:
# Rename the rolling cols so we can merge with original df
rolling_cols = [f'{col}_3' for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Concatenate
df = pd.concat([df, df_rolling], axis=1)
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target,Points Scored_3,Points Allowed_3
0,2014,1,South Carolina,0.337349,Texas A&M,0.626506,2014-08-28 18:00:00,False,1,,
1,2014,1,Texas A&M,0.626506,South Carolina,0.337349,2014-08-28 18:00:00,True,1,,
2,2014,1,Akron,0.493976,Howard,0.000000,2014-08-28 19:00:00,True,0,,
3,2014,1,Central Michigan,0.240964,Chattanooga,0.192771,2014-08-28 19:00:00,True,1,,
4,2014,1,Presbyterian,0.036145,Northern Illinois,0.662651,2014-08-28 19:00:00,False,0,,
...,...,...,...,...,...,...,...,...,...,...,...
14797,2023,15,Florida State,0.192771,Louisville,0.072289,2023-12-02 20:00:00,True,2,0.393574,0.136546
14798,2023,15,Louisville,0.072289,Florida State,0.192771,2023-12-02 20:00:00,False,2,0.301205,0.341365
14799,2023,15,Iowa,0.000000,Michigan,0.313253,2023-12-02 20:00:00,False,2,0.112450,0.196787
14800,2023,16,Army,0.204819,Navy,0.132530,2023-12-09 15:00:00,True,2,0.248996,0.184739


In [20]:
# Drop rows with missing rows
df = df.dropna()
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target,Points Scored_3,Points Allowed_3
301,2014,3,Louisiana Tech,0.506024,North Texas,0.253012,2014-09-11 20:00:00,True,0,0.425703,0.357430
302,2014,3,Houston,0.301205,Brigham Young,0.397590,2014-09-11 21:00:00,False,1,0.317269,0.240964
303,2014,3,Brigham Young,0.397590,Houston,0.301205,2014-09-11 21:00:00,True,1,0.437751,0.168675
307,2014,3,Buffalo,0.253012,Baylor,0.759036,2014-09-12 20:00:00,False,1,0.393574,0.554217
309,2014,3,Boise State,0.457831,Connecticut,0.253012,2014-09-13 12:00:00,True,1,0.353414,0.321285
...,...,...,...,...,...,...,...,...,...,...,...
14797,2023,15,Florida State,0.192771,Louisville,0.072289,2023-12-02 20:00:00,True,2,0.393574,0.136546
14798,2023,15,Louisville,0.072289,Florida State,0.192771,2023-12-02 20:00:00,False,2,0.301205,0.341365
14799,2023,15,Iowa,0.000000,Michigan,0.313253,2023-12-02 20:00:00,False,2,0.112450,0.196787
14800,2023,16,Army,0.204819,Navy,0.132530,2023-12-09 15:00:00,True,2,0.248996,0.184739


In [21]:
# Add who next opponent is to improve algorithm

# Shift to the next value given the team and column name
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

# Add a column to the dataframe applying the shift column function
def add_col(df, col_name):
    return df.groupby('Team', group_keys=False).apply(lambda x: shift_col(x, col_name))

# Add a next opponent and next date column
df['Next Opponent'] = add_col(df, 'Opponent')
df['Next Date'] = add_col(df, 'DateTime')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next Opponent'] = add_col(df, 'Opponent')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next Date'] = add_col(df, 'DateTime')


In [22]:
# Make a copy of the dataframe to avoid issues
df = df.copy()
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target,Points Scored_3,Points Allowed_3,Next Opponent,Next Date
301,2014,3,Louisiana Tech,0.506024,North Texas,0.253012,2014-09-11 20:00:00,True,0,0.425703,0.357430,Northwestern State,2014-09-20 19:00:00
302,2014,3,Houston,0.301205,Brigham Young,0.397590,2014-09-11 21:00:00,False,1,0.317269,0.240964,Nevada-Las Vegas,2014-09-20 20:00:00
303,2014,3,Brigham Young,0.397590,Houston,0.301205,2014-09-11 21:00:00,True,1,0.437751,0.168675,Virginia,2014-09-20 15:30:00
307,2014,3,Buffalo,0.253012,Baylor,0.759036,2014-09-12 20:00:00,False,1,0.393574,0.554217,Norfolk State,2014-09-20 15:30:00
309,2014,3,Boise State,0.457831,Connecticut,0.253012,2014-09-13 12:00:00,True,1,0.353414,0.321285,Louisiana,2014-09-20 22:40:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14797,2023,15,Florida State,0.192771,Louisville,0.072289,2023-12-02 20:00:00,True,2,0.393574,0.136546,,
14798,2023,15,Louisville,0.072289,Florida State,0.192771,2023-12-02 20:00:00,False,2,0.301205,0.341365,,
14799,2023,15,Iowa,0.000000,Michigan,0.313253,2023-12-02 20:00:00,False,2,0.112450,0.196787,,
14800,2023,16,Army,0.204819,Navy,0.132530,2023-12-09 15:00:00,True,2,0.248996,0.184739,,


In [23]:
# Get rolling data for opponent 
full = df.merge(df[rolling_cols + ['Next Opponent', 'Next Date', 'Team']], 
                left_on=['Team', 'Next Date'], 
                right_on=['Next Opponent', 'Next Date']
                )
full

Unnamed: 0,Season,Wk,Team_x,Points Scored,Opponent,Points Allowed,DateTime,Win,Target,Points Scored_3_x,Points Allowed_3_x,Next Opponent_x,Next Date,Points Scored_3_y,Points Allowed_3_y,Next Opponent_y,Team_y
0,2014,3,Houston,0.301205,Brigham Young,0.397590,2014-09-11 21:00:00,False,1,0.317269,0.240964,Nevada-Las Vegas,2014-09-20 20:00:00,0.240964,0.473896,Houston,Nevada-Las Vegas
1,2014,3,Boise State,0.457831,Connecticut,0.253012,2014-09-13 12:00:00,True,1,0.353414,0.321285,Louisiana,2014-09-20 22:40:00,0.321285,0.441767,Boise State,Louisiana
2,2014,3,Pittsburgh,0.506024,Florida International,0.301205,2014-09-13 12:00:00,True,0,0.538153,0.180723,Iowa,2014-09-20 12:00:00,0.261044,0.224900,Pittsburgh,Iowa
3,2014,3,Vanderbilt,0.409639,Massachusetts,0.373494,2014-09-13 12:00:00,True,0,0.176707,0.437751,South Carolina,2014-09-20 19:30:00,0.397590,0.441767,Vanderbilt,South Carolina
4,2014,3,West Virginia,0.481928,Maryland,0.445783,2014-09-13 12:00:00,True,0,0.469880,0.281124,Oklahoma,2014-09-20 19:30:00,0.538153,0.132530,West Virginia,Oklahoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10971,2023,14,Appalachian State,0.662651,Georgia Southern,0.325301,2023-11-25 15:30:00,True,0,0.493976,0.257028,Troy,2023-12-02 16:00:00,0.445783,0.220884,Appalachian State,Troy
10972,2023,14,Washington,0.289157,Washington State,0.253012,2023-11-25 16:00:00,True,1,0.325301,0.277108,Oregon,2023-12-01 20:00:00,0.465863,0.188755,Washington,Oregon
10973,2023,14,New Mexico State,0.240964,Jacksonville State,0.204819,2023-11-25 16:00:00,True,0,0.357430,0.224900,Liberty,2023-12-01 19:00:00,0.518072,0.253012,New Mexico State,Liberty
10974,2023,14,Florida State,0.289157,Florida,0.180723,2023-11-25 19:00:00,True,1,0.437751,0.192771,Louisville,2023-12-02 20:00:00,0.401606,0.373494,Florida State,Louisville


In [24]:
# Visualize the merge
full[['Team_x', 'Next Opponent_x', 'Team_y', 'Next Opponent_y', 'Next Date']]

Unnamed: 0,Team_x,Next Opponent_x,Team_y,Next Opponent_y,Next Date
0,Houston,Nevada-Las Vegas,Nevada-Las Vegas,Houston,2014-09-20 20:00:00
1,Boise State,Louisiana,Louisiana,Boise State,2014-09-20 22:40:00
2,Pittsburgh,Iowa,Iowa,Pittsburgh,2014-09-20 12:00:00
3,Vanderbilt,South Carolina,South Carolina,Vanderbilt,2014-09-20 19:30:00
4,West Virginia,Oklahoma,Oklahoma,West Virginia,2014-09-20 19:30:00
...,...,...,...,...,...
10971,Appalachian State,Troy,Troy,Appalachian State,2023-12-02 16:00:00
10972,Washington,Oregon,Oregon,Washington,2023-12-01 20:00:00
10973,New Mexico State,Liberty,Liberty,New Mexico State,2023-12-01 19:00:00
10974,Florida State,Louisville,Louisville,Florida State,2023-12-02 20:00:00


In [25]:
# Use sequential feature selector to find features
# Get columns that have the object datatype (our model cannot use them)
removed_columns = list(full.columns[full.dtypes == 'object']) + removed_columns
removed_columns

['Team_x',
 'Opponent',
 'DateTime',
 'Next Opponent_x',
 'Next Date',
 'Next Opponent_y',
 'Team_y',
 'Season',
 'Wk',
 'Day',
 'Team',
 'Opponent',
 'DateTime',
 'Win',
 'Target']

In [26]:
# Get columns that are not in our removed_columns list
selected_columns = full.columns[~full.columns.isin(removed_columns)]
selected_columns

Index(['Points Scored', 'Points Allowed', 'Points Scored_3_x',
       'Points Allowed_3_x', 'Points Scored_3_y', 'Points Allowed_3_y'],
      dtype='object')

In [27]:
# Going to use all selected_columns as features rather than doing a feature selector

# sfs.fit(full[selected_columns], full['Target'])

In [28]:
# predictors = list(selected_columns[sfs.get_support()])
# predictors

In [29]:
# Predictions
predictions = backtest(full, rr, selected_columns)
accuracy_score(predictions['Actual'], predictions['Prediction'])

0.6423582336924352

In [30]:
predictions

Unnamed: 0,Actual,Prediction
2529,1,1
2530,1,1
2531,1,1
2532,0,0
2533,0,1
...,...,...
10971,0,1
10972,1,0
10973,0,0
10974,1,1


In [33]:
# Predict future winners function
def predict_winner(model, data, team1, team2, date, predictors):
    # Get latest statistics for each team
    team1_stats = data[data['Team'] == team1].tail(1)
    team2_stats = data[data['Team'] == team2].tail(1)

    # Combine datasets
    combined = pd.concat([team1_stats, team2_stats], ignore_index=True)
    combined = combined.copy()

    # Add next opponent and next date to the dataset
    combined['Next Opponent'][0] = team2
    combined['Next Opponent'][1] = team1
    combined['Next Date'] = date

    predict_data = combined.merge(combined[['Points Scored_3', 'Points Allowed_3', 'Next Opponent', 'Next Date', 'Team']], 
                left_on=['Team', 'Next Date'], 
                right_on=['Next Opponent', 'Next Date']
                )

    # Make the prediction
    prediction = model.predict(predict_data[predictors])
    
    # Get predicted winner and loser
    winner = team1 if prediction[0] == 1 else team2
    loser = team2 if prediction[0] == 1 else team1
    return {'Winner': winner, 'Loser': loser}

# Change team names and then display predicted winner and loser
winner_loser = predict_winner(rr, df, 'Virginia Tech', 'Tulane', '2023-12-30 20:00:00', selected_columns)
winner_loser

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][0] = team2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][1] = team1


{'Winner': 'Virginia Tech', 'Loser': 'Tulane'}

In [32]:
# Save full dataframe to use in different models
full.to_csv('fulldata.csv', index=False, na_rep='NA', encoding='utf-8')