In [125]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

In [126]:
matches = pd.read_csv("matches_copy.csv", index_col=0)

In [127]:
matches["Date"] = pd.to_datetime(matches["Date"])
matches["Venue_Code"] = matches["Venue"].astype("category").cat.codes
matches["Opp_Code"] = matches["Opponent"].astype("category").cat.codes
matches["Hour"] = matches["Time"].str.replace(":.+", "", regex=True).astype("int")
matches["Day_Code"] = matches["Date"].dt.dayofweek

In [128]:
matches["Target"] = (matches["Result"] == "W").astype("int")

In [129]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [130]:
train = matches[matches["Date"] < '2024-01-01']
test = matches[matches["Date"] > '2024-01-01']
predictors = ["Venue_Code", "Opp_Code", "Hour", "Day_Code"]
rf.fit(train[predictors], train["Target"])

In [131]:
precision_score(test["Target"], preds)

0.39473684210526316

In [132]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [15]:
cols = ["GF", "GA", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
new_cols = [f"{c}_rolling" for c in cols]

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Day_Code,Target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,5,1,3.0,1.0,17.666667,6.0,17.466667,0.666667,0.333333,0.333333
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,2,1,3.666667,1.666667,19.333333,7.333333,15.933333,0.333333,0.0,0.0
6,2022-09-03,17:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,Aston Villa,...,5,0,4.333333,1.666667,18.666667,8.0,15.033333,0.333333,0.0,0.0
9,2022-09-17,12:30,Premier League,Matchweek 8,Sat,Away,W,3.0,0.0,Wolves,...,5,1,3.666667,1.0,16.0,6.0,15.233333,0.333333,0.0,0.0
10,2022-10-02,14:00,Premier League,Matchweek 9,Sun,Home,W,6.0,3.0,Manchester Utd,...,6,1,3.333333,0.333333,15.333333,6.666667,17.0,0.333333,0.0,0.0
12,2022-10-08,15:00,Premier League,Matchweek 10,Sat,Home,W,4.0,0.0,Southampton,...,5,1,3.333333,1.333333,17.0,6.666667,16.8,0.666667,0.0,0.0
14,2022-10-16,16:30,Premier League,Matchweek 11,Sun,Away,L,0.0,1.0,Liverpool,...,6,0,4.333333,1.0,19.666667,8.0,15.833333,0.333333,0.0,0.0
15,2022-10-22,15:00,Premier League,Matchweek 13,Sat,Home,W,3.0,1.0,Brighton,...,5,1,3.333333,1.333333,19.666667,7.666667,15.5,0.333333,0.0,0.0
17,2022-10-29,12:30,Premier League,Matchweek 14,Sat,Away,W,1.0,0.0,Leicester City,...,5,1,2.333333,0.666667,15.333333,5.666667,15.766667,0.0,0.333333,0.333333
19,2022-11-05,15:00,Premier League,Matchweek 15,Sat,Home,W,2.0,1.0,Fulham,...,5,1,1.333333,0.666667,13.333333,4.666667,18.066667,0.333333,0.333333,0.333333


In [140]:
matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols), include_groups=False)
matches_rolling = matches_rolling.reset_index()
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling = matches_rolling.drop("level_1", axis=1)

In [134]:
def make_predictions(data, predictors):
    train = data[data["Date"] < '2024-01-01']
    test = data[data["Date"] > '2024-01-01']
    rf.fit(train[predictors], train["Target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["Target"], predicted=preds), index=test.index)
    error = precision_score(test["Target"], preds)
    return combined, error

In [135]:
combined, error = make_predictions(matches_rolling, predictors + new_cols)
error

0.6

In [141]:
matches_rolling.dtypes
combined = combined.merge(matches_rolling[["Date", "Team", "Opponent", "Result"]], left_index=True, right_index=True)

In [142]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", 
              "Manchester United": "Manchester Utd", 
              "Newcastle United": "Newcastle Utd", 
              "Tottenham Hotspur": "Tottenham", 
              "West Ham United": "West Ham", 
              "Wolverhampton Wanderers": "Wolves",
              "Nottingham Forest": "Nott'ham Forest",
              "Sheffield United": "Sheffield Utd"
             } 

mapping = MissingDict(**map_values)
combined["New_Team"] = combined["Team"].map(mapping)
merged = combined.merge(combined, left_on=["Date", "New_Team"], right_on=["Date", "Opponent"])
merged

Unnamed: 0,actual_x,predicted_x,Date_x_x,Team_x_x,Opponent_x_x,Result_x_x,Date_y_x,Team_y_x,Opponent_y_x,Result_y_x,...,Opponent_x_y,Result_x_y,Date_y_y,Team_y_y,Opponent_y_y,Result_y_y,Team_y,Opponent_y,Result_y,New_Team_y
0,1,1,2024-01-20,Arsenal,Crystal Palace,W,2024-01-20,Arsenal,Crystal Palace,W,...,Arsenal,L,2024-01-20,Crystal Palace,Arsenal,L,Crystal Palace,Arsenal,L,Crystal Palace
1,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,2024-01-30,Arsenal,Nott'ham Forest,W,...,Arsenal,L,2024-01-30,Nottingham Forest,Arsenal,L,Nottingham Forest,Arsenal,L,Nott'ham Forest
2,1,1,2024-02-04,Arsenal,Liverpool,W,2024-02-04,Arsenal,Liverpool,W,...,Arsenal,L,2024-02-04,Liverpool,Arsenal,L,Liverpool,Arsenal,L,Liverpool
3,1,0,2024-02-11,Arsenal,West Ham,W,2024-02-11,Arsenal,West Ham,W,...,Arsenal,L,2024-02-11,West Ham United,Arsenal,L,West Ham United,Arsenal,L,West Ham
4,1,0,2024-02-17,Arsenal,Burnley,W,2024-02-17,Arsenal,Burnley,W,...,Arsenal,L,2024-02-17,Burnley,Arsenal,L,Burnley,Arsenal,L,Burnley
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0,1,2024-01-22,Wolverhampton Wanderers,Brighton,D,2024-01-22,Wolverhampton Wanderers,Brighton,D,...,Wolves,D,2024-01-22,Brighton and Hove Albion,Wolves,D,Brighton and Hove Albion,Wolves,D,Brighton
102,0,0,2024-02-01,Wolverhampton Wanderers,Manchester Utd,L,2024-02-01,Wolverhampton Wanderers,Manchester Utd,L,...,Wolves,W,2024-02-01,Manchester United,Wolves,W,Manchester United,Wolves,W,Manchester Utd
103,1,0,2024-02-04,Wolverhampton Wanderers,Chelsea,W,2024-02-04,Wolverhampton Wanderers,Chelsea,W,...,Wolves,L,2024-02-04,Chelsea,Wolves,L,Chelsea,Wolves,L,Chelsea
104,0,0,2024-02-10,Wolverhampton Wanderers,Brentford,L,2024-02-10,Wolverhampton Wanderers,Brentford,L,...,Wolves,W,2024-02-10,Brentford,Wolves,W,Brentford,Wolves,W,Brentford


In [143]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()

actual_x
1    14
0     7
Name: count, dtype: int64