In [53]:
import pandas as pd
# Load the raw match log from disk; the later cells add feature columns to this
# frame and train the model from it.
matches = pd.read_csv("matches.csv")


In [54]:
# Convert the "date" column to pandas datetimes so that the .dt accessor and the
# date comparisons used for the train/test split in later cells work on it.
matches["date"] = pd.to_datetime(matches["date"]) 

In [55]:
# Encode the venue (home/away) labels as integer category codes.
matches["venue_code"] = pd.Categorical(matches["venue"]).codes
# Give every opponent its own integer category code.
matches["opp_code"] = pd.Categorical(matches["opponent"]).codes
# Kick-off hour: strip everything from the first colon onward and keep the
# leading number as an integer.
matches["hour"] = (
    matches["time"]
    .str.replace(":.+", "", regex=True)
    .astype("int")
)
# Day of the week as an integer (Monday=0 ... Sunday=6).
matches["day_code"] = matches["date"].dt.weekday

In [56]:
# Binary label for the classifier: 1 when the result is a win ("W"), 0 otherwise.
matches["target"] = matches["result"].eq('W').astype("int")

In [57]:
#training model
from sklearn.ensemble import RandomForestClassifier

# 50 trees; a node must hold at least 10 samples before it may be split;
# random_state is fixed so repeated runs give the same forest.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

# Chronological split so the model is evaluated on matches it has not seen.
# Train is strictly before the split date and test is on-or-after it, so a match
# played exactly on the split date is no longer dropped from both sets.
SPLIT_DATE = "2024-01-01"
train = matches[matches["date"] < SPLIT_DATE]
test = matches[matches["date"] >= SPLIT_DATE]

predictors = ["venue_code", "opp_code", "hour", "day_code"]
rf.fit(train[predictors], train["target"])
predictions = rf.predict(test[predictors])

In [58]:
from sklearn.metrics import accuracy_score
# Fraction of test-set matches whose win / not-win label was predicted correctly.
acur = accuracy_score(y_true=test["target"], y_pred=predictions)

In [None]:
# Put the true labels next to the predictions so hits and misses can be compared
# row by row.
combined = pd.DataFrame(
    {
        "actual": test["target"],
        "prediction": predictions,
    }
)
# Cross-tabulation of actual vs. predicted labels (currently disabled):
# pd.crosstab(index=combined["actual"], columns=combined["prediction"])

In [None]:
from sklearn.metrics import precision_score
# Precision: of the matches predicted as wins, the share that really were wins.
precision_score(y_true=test["target"], y_pred=predictions)

In [65]:
# Extra predictors to improve the model: rolling averages that capture a team's
# recent form.
# Group the match log by team so form can be computed per team.
grouped_matches = matches.groupby("team")
# One team's matches on their own -- presumably kept as a sample group for trying
# out rolling_averages below.
group = grouped_matches.get_group("Manchester City")

def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of a team's previous matches as new columns.

    Parameters
    ----------
    group : pandas.DataFrame
        Matches of one team; must contain a "date" column and every column
        named in ``cols``.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the resulting columns, paired with ``cols`` by position.
    window : int, default 3
        Number of preceding matches to average over.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added. Rows that do
        not yet have a full window of earlier matches are dropped. The frame
        passed in is not modified.
    """
    # sort_values returns a new frame, so the caller's data is left untouched.
    group = group.sort_values("date")
    # closed="left" keeps the current match out of its own window, so each
    # average only uses matches played before it.
    rolling_stats = group[cols].rolling(window, closed="left").mean()
    group[new_cols] = rolling_stats
    # The first `window` rows have no complete history and come out as NaN.
    return group.dropna(subset=new_cols)


In [None]:
# Match statistics to average over each team's recent matches.
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

# Compute the rolling averages one team at a time and stack the per-team frames
# back into a single frame with a fresh 0..n-1 index.
# The groups are iterated explicitly instead of using groupby().apply(): apply()
# operating on the grouping column is deprecated since pandas 2.2, and the
# "team" column has to survive because it is selected again further down.
matches_rolling = pd.concat(
    [
        rolling_averages(team_matches, cols, new_cols)
        for _, team_matches in matches.groupby("team")
    ],
    ignore_index=True,
)



In [73]:
#new set of predictors
def make_predictions(data, predictors, model=None, split_date="2024-01-01"):
    """Fit a classifier on the earlier matches and score it on the later ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "date", "target" and every column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    model : estimator with ``fit`` / ``predict``, optional
        Defaults to the notebook-level ``rf`` classifier. The model is refitted
        in place.
    split_date : str, default "2024-01-01"
        Rows dated before this are used for training; rows dated on or after it
        are used for testing.

    Returns
    -------
    (pandas.DataFrame, float)
        A frame with "actual" and "predicted" columns indexed like the test
        rows, and the precision score of the predictions.
    """
    if model is None:
        model = rf  # fall back to the classifier defined earlier in the notebook
    train = data[data["date"] < split_date]
    # ">=" rather than ">": a match played exactly on the split date previously
    # ended up in neither set.
    test = data[data["date"] >= split_date]
    model.fit(train[predictors], train["target"])
    predictions = model.predict(test[predictors])
    combined = pd.DataFrame(
        {"actual": test["target"], "predicted": predictions},
        index=test.index,
    )
    precision = precision_score(test["target"], predictions)
    return combined, precision

# Re-evaluate the model with the rolling-average columns added to the predictors.
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

# Attach date, team, opponent and result to each prediction, matching rows on
# the shared index.
combined = combined.merge(
    matches_rolling[["date", "team", "opponent", "result"]],
    left_index=True,
    right_index=True,
)


Unnamed: 0,actual,predicted,date,team,opponent,result
131,1,0,2024-01-20,Arsenal,Crystal Palace,W
132,1,1,2024-01-30,Arsenal,Nott'ham Forest,W
133,1,0,2024-02-04,Arsenal,Liverpool,W
134,1,1,2024-02-11,Arsenal,West Ham,W
135,1,1,2024-02-17,Arsenal,Burnley,W
...,...,...,...,...,...,...
3711,1,0,2025-04-26,Wolverhampton Wanderers,Leicester City,W
3712,0,0,2025-05-02,Wolverhampton Wanderers,Manchester City,L
3713,0,0,2025-05-10,Wolverhampton Wanderers,Brighton,L
3714,0,0,2025-05-20,Wolverhampton Wanderers,Crystal Palace,L
