In [2]:
# Import Packages 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [3]:
# Load the per-team game results from the CSV that sits next to the notebook.
# NOTE(review): the 'Unnamed: 0' column in the displayed output suggests the
# CSV was saved together with its index -- confirm before relying on it.
df = pd.read_csv(filepath_or_buffer='./cfb.csv')
# Show the most recent rows as a quick sanity check
df.tail()

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win
14797,2023,15,Florida State,16.0,Louisville,6.0,2023-12-02 20:00:00,True
14798,2023,15,Louisville,6.0,Florida State,16.0,2023-12-02 20:00:00,False
14799,2023,15,Iowa,0.0,Michigan,26.0,2023-12-02 20:00:00,False
14800,2023,16,Army,17.0,Navy,11.0,2023-12-09 15:00:00,True
14801,2023,16,Navy,11.0,Army,17.0,2023-12-09 15:00:00,False


In [4]:
# Recode the boolean Win flag as an integer (False -> 0, True -> 1).
# errors='ignore' hands back the column unchanged if the cast fails.
win_flag = df['Win']
df['Win'] = win_flag.astype(int, errors='ignore')
df.head()

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win
0,2014,1,South Carolina,28.0,Texas A&M,52.0,2014-08-28 18:00:00,0
1,2014,1,Texas A&M,52.0,South Carolina,28.0,2014-08-28 18:00:00,1
2,2014,1,Akron,41.0,Howard,0.0,2014-08-28 19:00:00,1
3,2014,1,Central Michigan,20.0,Chattanooga,16.0,2014-08-28 19:00:00,1
4,2014,1,Presbyterian,3.0,Northern Illinois,55.0,2014-08-28 19:00:00,0


In [5]:
# Add a Next Score column (the points the team scores in its following game)
def add_next_score(team):
    """Return one team's games with a look-ahead ``Next Score`` column.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows belonging to a single team. Must contain ``Points Scored``.
        The rows are used in the order given; nothing is sorted here, so the
        caller is responsible for chronological ordering.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index plus ``Next Score``, which
        holds the ``Points Scored`` value of the following row. The last row
        has no following game and therefore gets NaN.
    """
    # Build a new frame rather than assigning into `team`: the object handed
    # over by groupby.apply should not be mutated, and the caller's data stays
    # untouched when the function is used on its own.
    return team.assign(**{'Next Score': team['Points Scored'].shift(-1)})

# Attach the look-ahead score one team at a time; group_keys=False keeps the
# team name out of the resulting index.
df = df.groupby(by="Team", group_keys=False).apply(func=add_next_score)

# Spot-check a single team's schedule
is_virginia_tech = df['Team'] == "Virginia Tech"
df[is_virginia_tech]

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score
100,2014,1,Virginia Tech,34.0,William & Mary,9.0,2014-08-30 16:00:00,1,35.0
279,2014,2,Virginia Tech,35.0,Ohio State,21.0,2014-09-06 20:00:00,1,21.0
321,2014,3,Virginia Tech,21.0,East Carolina,28.0,2014-09-13 12:00:00,0,24.0
426,2014,4,Virginia Tech,24.0,Georgia Tech,27.0,2014-09-20 12:00:00,0,35.0
553,2014,5,Virginia Tech,35.0,Western Michigan,17.0,2014-09-27 12:30:00,1,34.0
...,...,...,...,...,...,...,...,...,...
14157,2023,10,Virginia Tech,38.0,Syracuse,10.0,2023-10-26 19:30:00,1,3.0
14320,2023,11,Virginia Tech,3.0,Louisville,34.0,2023-11-04 15:30:00,0,48.0
14421,2023,12,Virginia Tech,48.0,Boston College,22.0,2023-11-11 12:00:00,1,28.0
14593,2023,13,Virginia Tech,28.0,North Carolina State,35.0,2023-11-18 15:30:00,0,55.0


In [6]:
# Replace missing Next Score values with the sentinel -1 (a team's last
# recorded game has no following game, so the shift left NaN there).
# Assign the whole column back instead of using chained indexing
# (df[col][mask] = ...): the chained form writes through a temporary object,
# raises SettingWithCopyWarning, and has no effect under pandas Copy-on-Write.
# NOTE(review): presumably the sentinel exists so that the later df.dropna()
# keeps each team's latest row -- confirm that -1 never ends up as a training
# label.
df['Next Score'] = df['Next Score'].fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next Score'][pd.isnull(df['Next Score'])] = -1


In [7]:
# Rolling averages give the model a short-term form signal.
# Keep only the grouping keys and the per-game stats for the rolling step.
rolling_source_cols = ['Season', 'Team', 'Points Scored', 'Points Allowed', 'Win']
df_rolling = df[rolling_source_cols]

def find_team_averages(team):
    """Return 3-row rolling means of points scored and allowed for one group.

    Each window covers the current row and the two rows before it, so the
    first two rows of the group come back as NaN. Only the
    ``Points Scored`` and ``Points Allowed`` columns are returned.
    """
    scoring_stats = team[['Points Scored', 'Points Allowed']]
    return scoring_stats.rolling(window=3).mean()

# Compute the rolling means inside each (Season, Team) group so a window
# never mixes games from different seasons or teams.
season_team_groups = df_rolling.groupby(['Season', 'Team'], group_keys=False)
df_rolling = season_team_groups.apply(find_team_averages)

# Tag the rolling columns with a _3 suffix so they don't collide with the
# raw per-game columns once both live in the same frame.
rolling_cols = [f'{col}_3' for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Put the rolling columns next to the original ones (aligned on the row index)
df = pd.concat([df, df_rolling], axis='columns')
# Drop every row that still has a missing value in any column
df = df.dropna()
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Points Scored_3,Points Allowed_3
301,2014,3,Louisiana Tech,42.0,North Texas,21.0,2014-09-11 20:00:00,1,27.0,35.333333,29.666667
302,2014,3,Houston,25.0,Brigham Young,33.0,2014-09-11 21:00:00,0,47.0,26.333333,20.000000
303,2014,3,Brigham Young,33.0,Houston,25.0,2014-09-11 21:00:00,1,41.0,36.333333,14.000000
307,2014,3,Buffalo,21.0,Baylor,63.0,2014-09-12 20:00:00,0,36.0,32.666667,46.000000
309,2014,3,Boise State,38.0,Connecticut,21.0,2014-09-13 12:00:00,1,34.0,29.333333,26.666667
...,...,...,...,...,...,...,...,...,...,...,...
14797,2023,15,Florida State,16.0,Louisville,6.0,2023-12-02 20:00:00,1,-1.0,32.666667,11.333333
14798,2023,15,Louisville,6.0,Florida State,16.0,2023-12-02 20:00:00,0,-1.0,25.000000,28.333333
14799,2023,15,Iowa,0.0,Michigan,26.0,2023-12-02 20:00:00,0,-1.0,9.333333,16.333333
14800,2023,16,Army,17.0,Navy,11.0,2023-12-09 15:00:00,1,-1.0,20.666667,15.333333


In [8]:
# Look one row ahead within a single team's rows for the given column
def shift_col(team, col_name):
    """Return ``team[col_name]`` moved up by one row.

    Each position receives the value of the following row; the final
    position has nothing after it and becomes missing. The index is kept.
    """
    column = team[col_name]
    return column.shift(periods=-1)

# Build a per-team look-ahead version of a column
def add_col(df, col_name):
    """Return ``df[col_name]`` shifted up by one row within each team.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Team`` column and ``col_name``. Rows are used in
        their existing order inside each team; nothing is sorted here.
    col_name : str
        Column whose next-row value should be looked up.

    Returns
    -------
    pandas.Series
        Indexed like ``df``. Each row holds the value ``col_name`` takes in
        the same team's following row; a team's last row gets a missing
        value.
    """
    # Use the built-in grouped shift instead of groupby.apply with a lambda:
    # apply can return a wide DataFrame rather than a Series when only one
    # team is present, and it takes pandas' deprecated path of passing the
    # grouping column to the applied function.
    return df.groupby('Team')[col_name].shift(-1)

# Add the upcoming opponent and kickoff time for every row.
# NOTE(review): these look-aheads run on the frame after dropna() removed
# rows, whereas 'Next Score' was computed before that drop -- for some rows
# the three "next" columns may therefore describe different games; verify.
for new_col, source_col in (('Next Opponent', 'Opponent'), ('Next Date', 'DateTime')):
    df[new_col] = add_col(df, source_col)
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Points Scored_3,Points Allowed_3,Next Opponent,Next Date
301,2014,3,Louisiana Tech,42.0,North Texas,21.0,2014-09-11 20:00:00,1,27.0,35.333333,29.666667,Northwestern State,2014-09-20 19:00:00
302,2014,3,Houston,25.0,Brigham Young,33.0,2014-09-11 21:00:00,0,47.0,26.333333,20.000000,Nevada-Las Vegas,2014-09-20 20:00:00
303,2014,3,Brigham Young,33.0,Houston,25.0,2014-09-11 21:00:00,1,41.0,36.333333,14.000000,Virginia,2014-09-20 15:30:00
307,2014,3,Buffalo,21.0,Baylor,63.0,2014-09-12 20:00:00,0,36.0,32.666667,46.000000,Norfolk State,2014-09-20 15:30:00
309,2014,3,Boise State,38.0,Connecticut,21.0,2014-09-13 12:00:00,1,34.0,29.333333,26.666667,Louisiana,2014-09-20 22:40:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14797,2023,15,Florida State,16.0,Louisville,6.0,2023-12-02 20:00:00,1,-1.0,32.666667,11.333333,,
14798,2023,15,Louisville,6.0,Florida State,16.0,2023-12-02 20:00:00,0,-1.0,25.000000,28.333333,,
14799,2023,15,Iowa,0.0,Michigan,26.0,2023-12-02 20:00:00,0,-1.0,9.333333,16.333333,,
14800,2023,16,Army,17.0,Navy,11.0,2023-12-09 15:00:00,1,-1.0,20.666667,15.333333,,


In [9]:
# Pull in the upcoming opponent's rolling stats.
# Every row is matched with the row whose 'Next Opponent' is this row's team
# and whose 'Next Date' is the same, i.e. the opponent's row that points at
# the same upcoming game. Overlapping columns get the _x (own) / _y
# (opponent) suffixes; rows without a match are left out (inner join).
opponent_side = df[rolling_cols + ['Next Opponent', 'Next Date', 'Team']]
full = pd.merge(
    df,
    opponent_side,
    left_on=['Team', 'Next Date'],
    right_on=['Next Opponent', 'Next Date'],
)
full

Unnamed: 0,Season,Wk,Team_x,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Points Scored_3_x,Points Allowed_3_x,Next Opponent_x,Next Date,Points Scored_3_y,Points Allowed_3_y,Next Opponent_y,Team_y
0,2014,3,Houston,25.0,Brigham Young,33.0,2014-09-11 21:00:00,0,47.0,26.333333,20.000000,Nevada-Las Vegas,2014-09-20 20:00:00,20.000000,39.333333,Houston,Nevada-Las Vegas
1,2014,3,Boise State,38.0,Connecticut,21.0,2014-09-13 12:00:00,1,34.0,29.333333,26.666667,Louisiana,2014-09-20 22:40:00,26.666667,36.666667,Boise State,Louisiana
2,2014,3,Pittsburgh,42.0,Florida International,25.0,2014-09-13 12:00:00,1,20.0,44.666667,15.000000,Iowa,2014-09-20 12:00:00,21.666667,18.666667,Pittsburgh,Iowa
3,2014,3,Vanderbilt,34.0,Massachusetts,31.0,2014-09-13 12:00:00,1,34.0,14.666667,36.333333,South Carolina,2014-09-20 19:30:00,33.000000,36.666667,Vanderbilt,South Carolina
4,2014,3,West Virginia,40.0,Maryland,37.0,2014-09-13 12:00:00,1,33.0,39.000000,23.333333,Oklahoma,2014-09-20 19:30:00,44.666667,11.000000,West Virginia,Oklahoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10971,2023,14,Appalachian State,55.0,Georgia Southern,27.0,2023-11-25 15:30:00,1,23.0,41.000000,21.333333,Troy,2023-12-02 16:00:00,37.000000,18.333333,Appalachian State,Troy
10972,2023,14,Washington,24.0,Washington State,21.0,2023-11-25 16:00:00,1,34.0,27.000000,23.000000,Oregon,2023-12-01 20:00:00,38.666667,15.666667,Washington,Oregon
10973,2023,14,New Mexico State,20.0,Jacksonville State,17.0,2023-11-25 16:00:00,1,35.0,29.666667,18.666667,Liberty,2023-12-01 19:00:00,43.000000,21.000000,New Mexico State,Liberty
10974,2023,14,Florida State,24.0,Florida,15.0,2023-11-25 19:00:00,1,16.0,36.333333,16.000000,Louisville,2023-12-02 20:00:00,33.333333,31.000000,Florida State,Louisville


In [10]:
# List the object-dtype columns; the model cannot consume them directly
removed_columns = [name for name, dtype in full.dtypes.items() if dtype == 'object']
removed_columns

['Team_x',
 'Opponent',
 'DateTime',
 'Next Opponent_x',
 'Next Date',
 'Next Opponent_y',
 'Team_y']

In [11]:
# Feature matrix: the current game's result plus the rolling stats of the
# team (_x) and of its upcoming opponent (_y). Label: points in the next game.
feature_columns = [
    'Points Scored',
    'Points Allowed',
    'Win',
    'Points Scored_3_x',
    'Points Allowed_3_x',
    'Points Scored_3_y',
    'Points Allowed_3_y',
]
X = full[feature_columns]
y = full['Next Score']
X

Unnamed: 0,Points Scored,Points Allowed,Win,Points Scored_3_x,Points Allowed_3_x,Points Scored_3_y,Points Allowed_3_y
0,25.0,33.0,0,26.333333,20.000000,20.000000,39.333333
1,38.0,21.0,1,29.333333,26.666667,26.666667,36.666667
2,42.0,25.0,1,44.666667,15.000000,21.666667,18.666667
3,34.0,31.0,1,14.666667,36.333333,33.000000,36.666667
4,40.0,37.0,1,39.000000,23.333333,44.666667,11.000000
...,...,...,...,...,...,...,...
10971,55.0,27.0,1,41.000000,21.333333,37.000000,18.333333
10972,24.0,21.0,1,27.000000,23.000000,38.666667,15.666667
10973,20.0,17.0,1,29.666667,18.666667,43.000000,21.000000
10974,24.0,15.0,1,36.333333,16.000000,33.333333,31.000000


In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle reproducible.
# NOTE(review): the split is random rather than chronological, so later
# games can land in the training set -- confirm this is intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Wrap the train and test splits in XGBoost's DMatrix container
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Booster hyperparameters
params = dict(
    objective='reg:squarederror',  # Regression with squared-error loss
    eval_metric='rmse',            # Evaluate with root mean squared error
    eta=0.1,                       # Learning rate
    max_depth=3,                   # Maximum depth of each tree
    subsample=0.8,                 # Fraction of rows sampled per tree
    colsample_bytree=0.8,          # Fraction of features sampled per tree
    seed=42,                       # Fixed seed for reproducibility
)

In [14]:
# Fit the booster for 100 boosting rounds
xgb_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

In [15]:
# Score the held-out rows with the trained booster
predictions = xgb_model.predict(data=dtest)
predictions

array([27.639788, 27.230093, 33.761345, ..., 37.65855 , 22.305431,
       39.589214], dtype=float32)

In [16]:
# Measure test error as RMSE (same unit as the label: points)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

Root Mean Squared Error (RMSE): 12.979249398385122


In [41]:
# Feature columns handed to the model when predicting a single matchup
score_predictors = [
    'Points Scored',
    'Points Allowed',
    'Win',
    'Points Scored_3_x',
    'Points Allowed_3_x',
    'Points Scored_3_y',
    'Points Allowed_3_y',
]
# Predict future scores function
def predict_score(model, data, team1, team2, date, predictors):
    """Predict the score of an upcoming game between two teams.

    Parameters
    ----------
    model : object
        Trained model exposing ``predict(dmatrix)``; it is called with an
        ``xgb.DMatrix`` built from the ``predictors`` columns.
    data : pandas.DataFrame
        Game rows with at least ``Team``, ``Points Scored_3``,
        ``Points Allowed_3`` and every un-suffixed column named in
        ``predictors``. The last row of each team is taken as its latest
        game, so rows are expected in chronological order. ``data`` is not
        modified.
    team1, team2 : str
        Names of the two teams as they appear in ``data['Team']``.
    date : str
        Date/time of the matchup; written to ``Next Date`` and used as a
        join key only.
    predictors : list of str
        Feature columns to feed to the model, named as they appear after the
        self-merge (``_x`` = the team itself, ``_y`` = its opponent).

    Returns
    -------
    dict
        ``{'Team1': {'Name', 'Predicted_Score'}, 'Team2': {...}}`` with the
        predicted scores rounded to whole points.

    Raises
    ------
    ValueError
        If both names are the same team, or a team has no rows in ``data``.
    """
    if team1 == team2:
        raise ValueError(f'team1 and team2 must be different teams, got {team1!r} twice')

    # Get latest statistics for each team
    team1_stats = data[data['Team'] == team1].tail(1)
    team2_stats = data[data['Team'] == team2].tail(1)

    # Fail early with a clear message; otherwise an unknown team only shows
    # up later as an empty merge and an out-of-range prediction index.
    missing = [name for name, stats in ((team1, team1_stats), (team2, team2_stats)) if stats.empty]
    if missing:
        raise ValueError(f'No rows found in data for team(s): {missing}')

    # Combine datasets (concat returns a new frame, so `data` stays untouched)
    combined = pd.concat([team1_stats, team2_stats], ignore_index=True)

    # Point each team at the other one for the upcoming game. Assign whole
    # columns instead of combined[col][i] = ...: chained indexing triggers
    # SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
    combined['Next Opponent'] = [team2, team1]
    combined['Next Date'] = date

    # Self-merge so each row carries its opponent's rolling stats (_y suffix);
    # the row order of the left frame is kept, so row 0 is team1.
    predict_data = combined.merge(
        combined[['Points Scored_3', 'Points Allowed_3', 'Next Opponent', 'Next Date', 'Team']],
        left_on=['Team', 'Next Date'],
        right_on=['Next Opponent', 'Next Date'],
    )

    # Create DMatrix for XGBoost
    dmatrix = xgb.DMatrix(predict_data[predictors])
    # Make the prediction
    prediction = model.predict(dmatrix)

    # Convert to built-in float before rounding so the result is a plain int
    team1_score = round(float(prediction[0]))
    team2_score = round(float(prediction[1]))

    return {'Team1': {'Name': team1, 'Predicted_Score': team1_score},
            'Team2': {'Name': team2, 'Predicted_Score': team2_score}}

# Example matchup: edit the two team names (and the date) to predict another game
score_prediction = predict_score(
    model=xgb_model,
    data=df,
    team1='Hawaii',
    team2='UCLA',
    date='2023-12-30 20:00:00',
    predictors=score_predictors,
)
score_prediction

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][0] = team2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][1] = team1


{'Team1': {'Name': 'Hawaii', 'Predicted_Score': 25},
 'Team2': {'Name': 'UCLA', 'Predicted_Score': 25}}