In [108]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier # ML model that picks up NON-LINEARITIES in data
from sklearn.metrics import precision_score # metrics to test precision

In [109]:
matches = pd.read_csv("matches.csv", index_col = 0) # reading matches.csv

# creating columns of wanted predictors
matches["date"] = pd.to_datetime(matches["date"])
matches["venue_code"] = matches["home/away"].astype("category").cat.codes # creating venue codes for 0 (home) -1 (away)
matches["opp_code"] = matches["opponent"].astype("category").cat.codes # same thing with opponent column
matches["hour"] = matches["time"].str.replace(r':.*[AP]M', '', regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek
matches["target"] = (matches["w/l"] == "W").astype("int") # creates target for model to look for

# list of initial predictors
predictors = ["venue_code", "opp_code", "hour", "day_code", "target"]

In [110]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split = 10, random_state=1)
# random forest has series of decision trees
# each decision trees has different parameters
# higher n_estimators -> longer algorithm -> more accurate
# min_samples_split: # samples in each leaf, higher -> less likely to overfit -> lower accuracy
# random_state: same result if run multiple times

In [111]:
# creating rolling averages column
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean() # take previous 3 weeks
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

cols = ["yds_passing", "yds_rushing", "yds_tot_off", "pct_passing"]
new_cols = [f"{c}_rolling" for c in cols]

rolling_averages(group, cols, new_cols)

Unnamed: 0,g,date,time,day,school,home/away,opponent,conf,w/l,pts,...,team,venue_code,opp_code,hour,day_code,target,yds_passing_rolling,yds_rushing_rolling,yds_tot_off_rolling,pct_passing_rolling
3,4,2023-09-16,7:00 PM,Sat,San Jose State,@,Toledo,MAC,L,17,...,san jose state,0,54,7,5,0,211.666667,173.666667,385.333333,59.5
4,5,2023-09-22,10:30 PM,Fri,San Jose State,,Air Force,MWC,L,20,...,san jose state,-1,14,10,4,0,221.666667,121.0,342.666667,61.066667
5,6,2023-10-07,8:00 PM,Sat,San Jose State,@,Boise State,MWC,L,27,...,san jose state,0,19,8,5,0,224.0,140.666667,364.666667,64.1
6,7,2023-10-14,6:00 PM,Sat,San Jose State,@,New Mexico,MWC,W,52,...,san jose state,0,40,6,5,1,264.0,86.333333,350.333333,62.533333
7,8,2023-10-21,7:00 PM,Sat,San Jose State,,Utah State,MWC,W,42,...,san jose state,-1,58,7,5,1,278.666667,159.333333,438.0,58.733333
8,9,2023-10-29,12:00 AM,Sun,San Jose State,@,Hawaii,MWC,W,35,...,san jose state,0,28,12,6,1,238.666667,204.666667,443.333333,60.866667
9,10,2023-11-11,10:30 PM,Sat,San Jose State,,Fresno State,MWC,W,42,...,san jose state,-1,26,10,5,1,214.0,242.333333,456.333333,61.2
10,11,2023-11-18,10:30 PM,Sat,San Jose State,,San Diego State,MWC,W,24,...,san jose state,-1,48,10,5,1,172.0,260.333333,432.333333,62.166667
11,12,2023-11-25,3:00 PM,Sat,San Jose State,@,Nevada-Las Vegas,MWC,W,37,...,san jose state,0,39,3,5,1,190.333333,237.0,427.333333,62.166667
12,13,2023-12-23,10:30 PM,Sat,San Jose State,N,Coastal Carolina,Sun Belt,L,14,...,san jose state,1,22,10,5,0,189.666667,242.333333,432.0,66.033333


In [112]:
# create new table with matches_rolling
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols),include_groups=False)
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])

In [113]:
def make_predictions(data, predictors):
    # training first half of season to be able to predict second half of season
    train = data[data["date"] < '2023-10-24']
    test = data[data["date"] > '2023-10-24']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    precision = precision_score(test["target"], preds)
    return precision

In [114]:
# make prediction based on original predictors + rolling average
predictors += new_cols
precision = make_predictions(matches_rolling, predictors)

In [115]:
precision # 100% accuracy

np.float64(1.0)