In [150]:
# Import Packages 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf

In [151]:
# Import dataset
df = pd.read_csv('./cfb.csv')
df.head()

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win
0,2014,1,South Carolina,28.0,Texas A&M,52.0,2014-08-28 18:00:00,False
1,2014,1,Texas A&M,52.0,South Carolina,28.0,2014-08-28 18:00:00,True
2,2014,1,Akron,41.0,Howard,0.0,2014-08-28 19:00:00,True
3,2014,1,Central Michigan,20.0,Chattanooga,16.0,2014-08-28 19:00:00,True
4,2014,1,Presbyterian,3.0,Northern Illinois,55.0,2014-08-28 19:00:00,False


In [152]:
# Convert Win Boolean to a numeric (0 or 1) 
df['Win'] = df['Win'].astype(int, errors='ignore')
df.head()

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win
0,2014,1,South Carolina,28.0,Texas A&M,52.0,2014-08-28 18:00:00,0
1,2014,1,Texas A&M,52.0,South Carolina,28.0,2014-08-28 18:00:00,1
2,2014,1,Akron,41.0,Howard,0.0,2014-08-28 19:00:00,1
3,2014,1,Central Michigan,20.0,Chattanooga,16.0,2014-08-28 19:00:00,1
4,2014,1,Presbyterian,3.0,Northern Illinois,55.0,2014-08-28 19:00:00,0


In [153]:
# Add a Next Score column (shows if the score of the next week)
def add_next_score(team):
    team['Next Score'] = team['Points Scored'].shift(-1)
    return team

df = df.groupby("Team", group_keys=False).apply(add_next_score)

# Add rolling win-loss ratio
df = df.sort_values(by=['Season', 'Wk', 'Team'])
df['Cumulative Wins'] = df.groupby(['Season', 'Team'])['Win'].cumsum()
df['Cumulative Games'] = df.groupby(['Season', 'Team']).cumcount() + 1
df['Win-Loss'] = df['Cumulative Wins'] / df['Cumulative Games']

# Check a certain team
df[df['Team'] == "Virginia Tech"]

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Cumulative Wins,Cumulative Games,Win-Loss
100,2014,1,Virginia Tech,34.0,William & Mary,9.0,2014-08-30 16:00:00,1,35.0,1,1,1.000000
279,2014,2,Virginia Tech,35.0,Ohio State,21.0,2014-09-06 20:00:00,1,21.0,2,2,1.000000
321,2014,3,Virginia Tech,21.0,East Carolina,28.0,2014-09-13 12:00:00,0,24.0,2,3,0.666667
426,2014,4,Virginia Tech,24.0,Georgia Tech,27.0,2014-09-20 12:00:00,0,35.0,2,4,0.500000
553,2014,5,Virginia Tech,35.0,Western Michigan,17.0,2014-09-27 12:30:00,1,34.0,3,5,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...
14157,2023,10,Virginia Tech,38.0,Syracuse,10.0,2023-10-26 19:30:00,1,3.0,4,8,0.500000
14320,2023,11,Virginia Tech,3.0,Louisville,34.0,2023-11-04 15:30:00,0,48.0,4,9,0.444444
14421,2023,12,Virginia Tech,48.0,Boston College,22.0,2023-11-11 12:00:00,1,28.0,5,10,0.500000
14593,2023,13,Virginia Tech,28.0,North Carolina State,35.0,2023-11-18 15:30:00,0,55.0,5,11,0.454545


In [154]:
# Replace NaN values with 2 instead of Nan
df['Next Score'][pd.isnull(df['Next Score'])] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next Score'][pd.isnull(df['Next Score'])] = -1


In [155]:
# Use rolling averages to improve model
df_rolling = df[['Season', 'Team', 'Points Scored', 'Points Allowed', 'Win']]

def find_team_averages(team):
    # Group rows by previous 2 rows + current row averages
    rolling = team[['Points Scored', 'Points Allowed']].rolling(3).mean()
    return rolling

df_rolling = df_rolling.groupby(['Season', 'Team'], group_keys=False).apply(find_team_averages)

# Rename the rolling cols so we can merge with original df
rolling_cols = [f'{col}_3' for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Concatenate
df = pd.concat([df, df_rolling], axis=1)
# Drop rows with missing rows
df = df.dropna()
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Cumulative Wins,Cumulative Games,Win-Loss,Points Scored_3,Points Allowed_3
332,2014,3,Air Force,48.0,Georgia State,38.0,2014-09-13 14:00:00,1,28.0,2,3,0.666667,35.000000,23.666667
368,2014,3,Alabama,52.0,Southern Mississippi,12.0,2014-09-13 18:00:00,1,42.0,3,3,1.000000,42.000000,11.666667
351,2014,3,Alabama-Birmingham,41.0,Alabama A&M,14.0,2014-09-13 15:30:00,1,20.0,2,3,0.666667,41.000000,23.666667
413,2014,3,Arizona,35.0,Nevada,28.0,2014-09-13 23:00:00,1,49.0,3,3,1.000000,39.666667,21.333333
408,2014,3,Arizona State,38.0,Colorado,24.0,2014-09-13 22:00:00,1,27.0,3,3,1.000000,47.000000,20.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14795,2023,15,Troy,49.0,Appalachian State,23.0,2023-12-02 16:00:00,1,-1.0,11,13,0.846154,38.333333,21.333333
14790,2023,15,Tulane,14.0,Southern Methodist,26.0,2023-12-02 16:00:00,0,-1.0,11,13,0.846154,22.333333,16.666667
14782,2023,15,Washington,34.0,Oregon,31.0,2023-12-01 20:00:00,1,-1.0,13,13,1.000000,26.666667,24.000000
14800,2023,16,Army,17.0,Navy,11.0,2023-12-09 15:00:00,1,-1.0,6,12,0.500000,20.666667,15.333333


In [156]:
# Shift to the next value given the team and column name
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

# Add a column to the dataframe applying the shift column function
def add_col(df, col_name):
    return df.groupby('Team', group_keys=False).apply(lambda x: shift_col(x, col_name))

# Add a next opponent and next date column
df['Next Opponent'] = add_col(df, 'Opponent')
df['Next Date'] = add_col(df, 'DateTime')
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Cumulative Wins,Cumulative Games,Win-Loss,Points Scored_3,Points Allowed_3,Next Opponent,Next Date
332,2014,3,Air Force,48.0,Georgia State,38.0,2014-09-13 14:00:00,1,28.0,2,3,0.666667,35.000000,23.666667,Boise State,2014-09-27 19:00:00
368,2014,3,Alabama,52.0,Southern Mississippi,12.0,2014-09-13 18:00:00,1,42.0,3,3,1.000000,42.000000,11.666667,Florida,2014-09-20 15:30:00
351,2014,3,Alabama-Birmingham,41.0,Alabama A&M,14.0,2014-09-13 15:30:00,1,20.0,2,3,0.666667,41.000000,23.666667,Florida International,2014-09-27 15:30:00
413,2014,3,Arizona,35.0,Nevada,28.0,2014-09-13 23:00:00,1,49.0,3,3,1.000000,39.666667,21.333333,California,2014-09-20 22:00:00
408,2014,3,Arizona State,38.0,Colorado,24.0,2014-09-13 22:00:00,1,27.0,3,3,1.000000,47.000000,20.333333,UCLA,2014-09-25 22:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14795,2023,15,Troy,49.0,Appalachian State,23.0,2023-12-02 16:00:00,1,-1.0,11,13,0.846154,38.333333,21.333333,,
14790,2023,15,Tulane,14.0,Southern Methodist,26.0,2023-12-02 16:00:00,0,-1.0,11,13,0.846154,22.333333,16.666667,,
14782,2023,15,Washington,34.0,Oregon,31.0,2023-12-01 20:00:00,1,-1.0,13,13,1.000000,26.666667,24.000000,,
14800,2023,16,Army,17.0,Navy,11.0,2023-12-09 15:00:00,1,-1.0,6,12,0.500000,20.666667,15.333333,,


In [157]:
# Get rolling data for opponent 
full = df.merge(df[rolling_cols + ['Next Opponent', 'Next Date', 'Team']], 
                left_on=['Team', 'Next Date'], 
                right_on=['Next Opponent', 'Next Date']
                )
full

Unnamed: 0,Season,Wk,Team_x,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Cumulative Wins,Cumulative Games,Win-Loss,Points Scored_3_x,Points Allowed_3_x,Next Opponent_x,Next Date,Points Scored_3_y,Points Allowed_3_y,Next Opponent_y,Team_y
0,2014,3,Air Force,48.0,Georgia State,38.0,2014-09-13 14:00:00,1,28.0,2,3,0.666667,35.000000,23.666667,Boise State,2014-09-27 19:00:00,36.333333,18.000000,Air Force,Boise State
1,2014,3,Alabama-Birmingham,41.0,Alabama A&M,14.0,2014-09-13 15:30:00,1,20.0,2,3,0.666667,41.000000,23.666667,Florida International,2014-09-27 15:30:00,20.666667,26.333333,Alabama-Birmingham,Florida International
2,2014,3,Arizona State,38.0,Colorado,24.0,2014-09-13 22:00:00,1,27.0,3,3,1.000000,47.000000,20.333333,UCLA,2014-09-25 22:00:00,30.000000,24.000000,Arizona State,UCLA
3,2014,3,Arkansas,49.0,Texas Tech,28.0,2014-09-13 15:30:00,1,52.0,2,3,0.666667,47.666667,26.666667,Northern Illinois,2014-09-20 19:00:00,42.000000,17.333333,Arkansas,Northern Illinois
4,2014,3,Arkansas State,20.0,Miami (FL),41.0,2014-09-13 15:30:00,0,21.0,1,3,0.333333,25.333333,28.333333,Utah State,2014-09-20 19:00:00,27.666667,27.333333,Arkansas State,Utah State
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10971,2023,14,Texas,57.0,Texas Tech,7.0,2023-11-24 19:30:00,1,49.0,11,12,0.916667,37.333333,16.333333,Oklahoma State,2023-12-02 12:00:00,28.666667,36.333333,Texas,Oklahoma State
10972,2023,14,Toledo,32.0,Central Michigan,17.0,2023-11-24 12:00:00,1,14.0,11,12,0.916667,37.666667,23.666667,Miami (OH),2023-12-02 12:00:00,19.666667,8.333333,Toledo,Miami (OH)
10973,2023,14,Troy,35.0,Southern Mississippi,17.0,2023-11-25 12:00:00,1,49.0,10,12,0.833333,37.000000,18.333333,Appalachian State,2023-12-02 16:00:00,41.000000,21.333333,Troy,Appalachian State
10974,2023,14,Tulane,29.0,Texas-San Antonio,16.0,2023-11-24 15:30:00,1,14.0,11,12,0.916667,25.666667,15.333333,Southern Methodist,2023-12-02 16:00:00,47.333333,23.000000,Tulane,Southern Methodist


In [158]:
# Get columns with object datatype (model can't use them)
removed_columns = list(full.columns[full.dtypes == 'object'])
removed_columns

['Team_x',
 'Opponent',
 'DateTime',
 'Next Opponent_x',
 'Next Date',
 'Next Opponent_y',
 'Team_y']

In [183]:
# Features and labels 
X = full[['Points Scored', 'Points Allowed','Cumulative Wins', 'Win-Loss',
          'Points Scored_3_x', 'Points Allowed_3_x', 'Points Scored_3_y', 'Points Allowed_3_y']]
y = full['Next Score']
X

Unnamed: 0,Points Scored,Points Allowed,Cumulative Wins,Win-Loss,Points Scored_3_x,Points Allowed_3_x,Points Scored_3_y,Points Allowed_3_y
0,48.0,38.0,2,0.666667,35.000000,23.666667,36.333333,18.000000
1,41.0,14.0,2,0.666667,41.000000,23.666667,20.666667,26.333333
2,38.0,24.0,3,1.000000,47.000000,20.333333,30.000000,24.000000
3,49.0,28.0,2,0.666667,47.666667,26.666667,42.000000,17.333333
4,20.0,41.0,1,0.333333,25.333333,28.333333,27.666667,27.333333
...,...,...,...,...,...,...,...,...
10971,57.0,7.0,11,0.916667,37.333333,16.333333,28.666667,36.333333
10972,32.0,17.0,11,0.916667,37.666667,23.666667,19.666667,8.333333
10973,35.0,17.0,10,0.833333,37.000000,18.333333,41.000000,21.333333
10974,29.0,16.0,11,0.916667,25.666667,15.333333,47.333333,23.000000


In [184]:
# Split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [186]:
# Create the model
ann_model = tf.keras.models.Sequential()
# Input layer
ann_model.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(8, )))
#Hidden layers
ann_model.add(tf.keras.layers.Dense(64, activation='relu'))
ann_model.add(tf.keras.layers.Dense(32, activation='relu'))
# Output layer (1 for regression)
ann_model.add(tf.keras.layers.Dense(1, activation='linear'))
# Compile model
ann_model.compile(optimizer='adam', loss='mean_squared_error', metrics='mean_absolute_error')

# Train the model
ann_model.fit(X_train, y_train, epochs=50, batch_size=50, validation_data=(X_test, y_test))

# Make predictions
predictions = ann_model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error (RMSE): {rmse}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Root Mean Squared Error (RMSE): 13.171400698277964


In [162]:
# Save the model
ann_model.save('ann_model.h5')

  saving_api.save_model(


In [199]:
# Load model
ann_model = tf.keras.models.load_model('ann_model.h5')

score_predictors = ['Points Scored', 'Points Allowed', 'Cumulative Wins',
                    'Win-Loss', 'Points Scored_3_x', 'Points Allowed_3_x', 'Points Scored_3_y', 'Points Allowed_3_y']
# Predict future scores function
def predict_score(model, data, team1, team2, date, predictors):
    # Get latest statistics for each team
    team1_stats = data[data['Team'] == team1].tail(1)
    team2_stats = data[data['Team'] == team2].tail(1)

    # Combine datasets
    combined = pd.concat([team1_stats, team2_stats], ignore_index=True)
    combined = combined.copy()

    # Add next opponent and next date to the dataset
    combined['Next Opponent'][0] = team2
    combined['Next Opponent'][1] = team1
    combined['Next Date'] = date

    predict_data = combined.merge(combined[['Points Scored_3', 'Points Allowed_3', 'Next Opponent', 'Next Date', 'Team']], 
                left_on=['Team', 'Next Date'], 
                right_on=['Next Opponent', 'Next Date']
                )
    # Make the prediction
    prediction = model.predict(predict_data[predictors])
    
    # Get predicted score
    team1_score = prediction[0]
    team2_score = prediction[1]

    return {'Team1': {'Name': team1, 'Predicted_Score': round(team1_score[0])},
        'Team2': {'Name': team2, 'Predicted_Score': round(team2_score[0])}}

# Change team names and then display predicted winner and loser
score_prediction = predict_score(ann_model, df, 'James Madison', 'Texas', '2023-12-30 20:00:00', score_predictors)
score_prediction

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][0] = team2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][1] = team1




{'Team1': {'Name': 'James Madison', 'Predicted_Score': 33},
 'Team2': {'Name': 'Texas', 'Predicted_Score': 35}}