In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
matches = pd.read_csv("matchesPL.csv", index_col=0)
display(matches)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,np:g-xg.1,crdy,crdr,fls,fld,fk,ti,ck,season,team
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2.0,1.0,Nott'ham Forest,...,1.2,2,0,12.0,12.0,13.0,18.0,8.0,2024,Arsenal
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1.0,0.0,Crystal Palace,...,-1.2,3,1,10.0,13.0,12.0,10.0,8.0,2024,Arsenal
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2.0,2.0,Fulham,...,-1.4,0,0,6.0,5.0,5.0,19.0,8.0,2024,Arsenal
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,0.7,2,0,8.0,7.0,8.0,17.0,12.0,2024,Arsenal
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,0.0,1,0,10.0,12.0,13.0,24.0,11.0,2024,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,2024-02-03,15:00,Premier League,Matchweek 23,Sat,Home,D,2.0,2.0,Fulham,...,0.2,2,0,11.0,7.0,8.0,26.0,2.0,2024,Burnley
27,2024-02-10,15:00,Premier League,Matchweek 24,Sat,Away,L,1.0,3.0,Liverpool,...,-0.3,2,0,13.0,11.0,13.0,17.0,3.0,2024,Burnley
28,2024-02-17,15:00,Premier League,Matchweek 25,Sat,Home,L,0.0,5.0,Arsenal,...,-0.3,2,0,11.0,8.0,8.0,14.0,4.0,2024,Burnley
29,2024-02-24,15:00,Premier League,Matchweek 26,Sat,Away,L,0.0,3.0,Crystal Palace,...,-0.1,1,1,14.0,8.0,14.0,21.0,3.0,2024,Burnley


In [3]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 546 entries, 1 to 30
Data columns (total 37 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          546 non-null    object 
 1   time          546 non-null    object 
 2   comp          546 non-null    object 
 3   round         546 non-null    object 
 4   day           546 non-null    object 
 5   venue         546 non-null    object 
 6   result        546 non-null    object 
 7   gf            546 non-null    float64
 8   ga            546 non-null    float64
 9   opponent      546 non-null    object 
 10  xg_x          546 non-null    float64
 11  xga           546 non-null    float64
 12  poss          546 non-null    float64
 13  attendance    546 non-null    float64
 14  captain       546 non-null    object 
 15  formation     546 non-null    object 
 16  referee       546 non-null    object 
 17  match report  546 non-null    object 
 18  notes         0 non-null      f

In [4]:
matches["date"] = pd.to_datetime(matches["date"])
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek

In [5]:
matches["target"] = (matches["result"] == "W").astype("int")

In [6]:
rf = RandomForestClassifier(n_estimators = 50, min_samples_split=10, random_state=48)
train = matches[matches["date"] < "2024-01-01"]
test = matches[matches["date"] > "2024-01-01"]
predictors = ["venue_code", "opp_code", "hour", "day_code"]
rf.fit(train[predictors], train["target"])
preds = rf.predict(test[predictors])

In [7]:
acc = accuracy_score(test["target"], preds)

In [8]:
combined = pd.DataFrame(dict(actual=test["target"], prediction = preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,67,28
1,38,19


In [9]:
precision_score(test["target"], preds)

0.40425531914893614

In [10]:
grouped_matches = matches.groupby("team")
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed="left").mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [11]:
cols = ["gf","ga", "sh", "sot", "dist", "pk", "pkatt", "xg_y", "npxg", "np:g-xg", "np:g-xg.1", "crdy",
        "crdr", "fls", "fld", "fk", "ti", "ck"]
new_cols = [f"{c}_rolling" for c in cols]
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel("team")
matches_rolling.index = range(matches_rolling.shape[0])

In [12]:
def make_predictions(data, predictiors):
    train = data[data["date"] < "2024-01-01"]
    test = data[data["date"] > "2024-01-01"]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], prediction = preds))
    error = precision_score(test["target"], preds)
    return combined, error

In [13]:
combined, error = make_predictions(matches_rolling, predictors + new_cols)
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], 
                          left_index = True, right_index = True)
combined.head(10)

Unnamed: 0,actual,prediction,date,team,opponent,result
17,1,0,2024-01-20,Arsenal,Crystal Palace,W
18,1,1,2024-01-30,Arsenal,Nott'ham Forest,W
19,1,0,2024-02-04,Arsenal,Liverpool,W
20,1,0,2024-02-11,Arsenal,West Ham,W
21,1,1,2024-02-17,Arsenal,Burnley,W
22,1,0,2024-02-24,Arsenal,Newcastle Utd,W
23,1,1,2024-03-04,Arsenal,Sheffield Utd,W
24,1,1,2024-03-09,Arsenal,Brentford,W
42,0,0,2024-01-14,Aston Villa,Everton,D
43,0,0,2024-01-30,Aston Villa,Newcastle Utd,L


In [14]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [15]:
combined["new_team"] = combined["team"].map(mapping)
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [16]:
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,1,0,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
1,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal,0,0,Nottingham Forest,Arsenal,L,Nottingham Forest
2,1,0,2024-02-04,Arsenal,Liverpool,W,Arsenal,0,0,Liverpool,Arsenal,L,Liverpool
3,1,0,2024-02-11,Arsenal,West Ham,W,Arsenal,0,0,West Ham United,Arsenal,L,West Ham
4,1,1,2024-02-17,Arsenal,Burnley,W,Arsenal,0,0,Burnley,Arsenal,L,Burnley
...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,0,1,2024-02-10,Wolverhampton Wanderers,Brentford,L,Wolves,1,0,Brentford,Wolves,W,Brentford
133,1,0,2024-02-17,Wolverhampton Wanderers,Tottenham,W,Wolves,0,1,Tottenham Hotspur,Wolves,L,Tottenham
134,1,0,2024-02-25,Wolverhampton Wanderers,Sheffield Utd,W,Wolves,0,0,Sheffield United,Wolves,L,Sheffield United
135,0,0,2024-03-02,Wolverhampton Wanderers,Newcastle Utd,L,Wolves,1,1,Newcastle United,Wolves,W,Newcastle Utd


In [18]:
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] ==0)]["actual_x"].value_counts()

0    21
1    16
Name: actual_x, dtype: int64