In [1]:
import pandas as pd

In [2]:
matches = pd.read_csv("../data/matches.csv", index_col=0)

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,Match Report,,13.0,1.0,18.7,1.0,1,1,2024,ManchesterCity
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,Match Report,,19.0,7.0,17.5,0.0,0,0,2024,ManchesterCity
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,Match Report,,21.0,10.0,16.2,1.0,0,0,2024,ManchesterCity
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,Match Report,,18.0,5.0,14.1,0.0,0,0,2024,ManchesterCity
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,Match Report,,17.0,9.0,14.8,0.0,0,0,2024,ManchesterCity


In [4]:
matches.shape

(3040, 27)

In [5]:
38 * 20 * 4

3040

In [6]:
matches["team"].value_counts()

ManchesterCity            152
CrystalPalace             152
Southampton               152
LeicesterCity             152
Everton                   152
WestHamUnited             152
Arsenal                   152
Chelsea                   152
WolverhamptonWanderers    152
TottenhamHotspur          152
AstonVilla                152
BrightonandHoveAlbion     152
Liverpool                 152
NewcastleUnited           152
ManchesterUnited          152
LeedsUnited               114
Burnley                   114
Fulham                     76
Brentford                  76
Bournemouth                76
Watford                    76
NorwichCity                76
SheffieldUnited            76
NottinghamForest           38
WestBromwichAlbion         38
Name: team, dtype: int64

In [7]:
matches["round"].value_counts()

Matchweek 1     80
Matchweek 30    80
Matchweek 23    80
Matchweek 12    80
Matchweek 24    80
Matchweek 25    80
Matchweek 26    80
Matchweek 27    80
Matchweek 29    80
Matchweek 31    80
Matchweek 2     80
Matchweek 33    80
Matchweek 34    80
Matchweek 28    80
Matchweek 35    80
Matchweek 36    80
Matchweek 37    80
Matchweek 32    80
Matchweek 22    80
Matchweek 21    80
Matchweek 7     80
Matchweek 20    80
Matchweek 3     80
Matchweek 4     80
Matchweek 5     80
Matchweek 6     80
Matchweek 8     80
Matchweek 9     80
Matchweek 10    80
Matchweek 11    80
Matchweek 13    80
Matchweek 14    80
Matchweek 15    80
Matchweek 16    80
Matchweek 17    80
Matchweek 18    80
Matchweek 19    80
Matchweek 38    80
Name: round, dtype: int64

In [8]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

In [9]:
matches["date"] = pd.to_datetime(matches["date"])

In [10]:
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                       int64
ga                       int64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                       int64
pkatt                    int64
season                   int64
team                    object
dtype: object

In [11]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [13]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [14]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [16]:
matches["day_code"] = matches["date"].dt.day_of_week

In [18]:
matches["target"] = (matches["result"] == "W").astype("int")

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [23]:
train = matches[matches["date"] < '2022-12-01']

In [27]:
test = matches[matches["date"] > '2022-12-01']

In [24]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [25]:
rf.fit(train[predictors], train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [28]:
preds = rf.predict(test[predictors])

In [29]:
from sklearn.metrics import accuracy_score

In [30]:
acc = accuracy_score(test["target"], preds)

In [31]:
acc

0.5982905982905983

In [32]:
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))

In [33]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,224,63
1,125,56


In [34]:
from sklearn.metrics import precision_score

In [35]:
precision_score(test["target"], preds)

0.47058823529411764

In [37]:
grouped_matches = matches.groupby("team")

In [40]:
group = grouped_matches.get_group("ManchesterCity")

In [42]:
def rolling_average(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed="left").mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [43]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [44]:
rolling_average(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2019-08-31,15:00,Premier League,Matchweek 4,Sat,Home,W,4,0,Brighton,...,5,1,3.333333,1.000000,20.333333,7.666667,15.633333,0.000000,0.333333,0.333333
5,2019-09-14,17:30,Premier League,Matchweek 5,Sat,Away,L,2,3,Norwich City,...,5,0,3.000000,1.000000,21.000000,7.000000,15.200000,0.333333,0.000000,0.000000
7,2019-09-21,15:00,Premier League,Matchweek 6,Sat,Home,W,8,0,Watford,...,5,1,3.000000,1.333333,19.333333,6.333333,15.100000,0.333333,0.000000,0.000000
9,2019-09-28,17:30,Premier League,Matchweek 7,Sat,Away,W,3,1,Everton,...,5,1,4.666667,1.000000,22.333333,8.000000,15.133333,1.000000,0.333333,0.333333
11,2019-10-06,14:00,Premier League,Matchweek 8,Sun,Home,L,0,2,Wolves,...,6,0,4.333333,1.333333,23.666667,8.666667,14.900000,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2023-05-06,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Leeds United,...,5,1,3.000000,0.666667,13.666667,8.000000,15.433333,0.000000,0.333333,0.333333
54,2023-05-14,14:00,Premier League,Matchweek 36,Sun,Away,W,3,0,Everton,...,6,1,2.333333,0.666667,14.666667,7.000000,16.366667,0.666667,0.333333,0.666667
56,2023-05-21,16:00,Premier League,Matchweek 37,Sun,Home,W,1,0,Chelsea,...,6,1,2.666667,0.333333,14.000000,5.666667,18.100000,1.333333,0.000000,0.333333
57,2023-05-24,20:00,Premier League,Matchweek 32,Wed,Away,D,1,1,Brighton,...,2,0,2.000000,0.333333,13.666667,4.000000,18.933333,1.333333,0.000000,0.333333


In [45]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_average(x, cols,new_cols))

In [47]:
matches_rolling = matches_rolling.droplevel("team")

In [49]:
matches_rolling.index = range(matches_rolling.shape[0])

In [55]:
def make_predictions(data, predictors):
    train = data[data["date"] < "2022-12-01"]
    test = data[data["date"] > "2022-12-01"]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test['target'], prediction=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [56]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [57]:
precision

0.6105263157894737

In [58]:
combined

Unnamed: 0,actual,prediction
125,1,1
126,1,1
127,0,1
128,1,1
129,1,0
...,...,...
2957,0,0
2958,1,0
2959,0,0
2960,0,0


In [59]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [60]:
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
125,1,1,2022-12-26,Arsenal,West Ham,W
126,1,1,2022-12-31,Arsenal,Brighton,W
127,0,1,2023-01-03,Arsenal,Newcastle Utd,D
128,1,1,2023-01-15,Arsenal,Tottenham,W
129,1,0,2023-01-22,Arsenal,Manchester Utd,W
...,...,...,...,...,...,...
2957,0,0,2023-04-29,WolverhamptonWanderers,Brighton,L
2958,1,0,2023-05-06,WolverhamptonWanderers,Aston Villa,W
2959,0,0,2023-05-13,WolverhamptonWanderers,Manchester Utd,L
2960,0,0,2023-05-20,WolverhamptonWanderers,Everton,D


In [63]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "WolverhamptonWanderers": "Wolves",
    "Manchester United": "Manchester Utd",
    "Tottenham Hotspur": "Tottenham",
    "Newcastle United": "Newcastle Utd",
    "West Ham United": "West Ham Utd"
}

mapping = MissingDict(**map_values)

In [64]:
combined["new_team"] = combined["team"].map(mapping)

In [66]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [67]:
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,1,1,2022-12-26,Arsenal,West Ham,W,Arsenal,0,0,WestHamUnited,Arsenal,L,WestHamUnited
1,1,1,2022-12-31,Arsenal,Brighton,W,Arsenal,0,0,BrightonandHoveAlbion,Arsenal,L,BrightonandHoveAlbion
2,0,1,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal,0,0,NewcastleUnited,Arsenal,D,NewcastleUnited
3,1,1,2023-01-15,Arsenal,Tottenham,W,Arsenal,0,0,TottenhamHotspur,Arsenal,L,TottenhamHotspur
4,1,0,2023-01-22,Arsenal,Manchester Utd,W,Arsenal,0,1,ManchesterUnited,Arsenal,L,ManchesterUnited
...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,0,0,2023-04-29,WolverhamptonWanderers,Brighton,L,Wolves,1,0,BrightonandHoveAlbion,Wolves,W,BrightonandHoveAlbion
206,1,0,2023-05-06,WolverhamptonWanderers,Aston Villa,W,Wolves,0,0,AstonVilla,Wolves,L,AstonVilla
207,0,0,2023-05-13,WolverhamptonWanderers,Manchester Utd,L,Wolves,1,1,ManchesterUnited,Wolves,W,ManchesterUnited
208,0,0,2023-05-20,WolverhamptonWanderers,Everton,D,Wolves,0,0,Everton,Wolves,D,Everton


In [68]:
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] ==0)]["actual_x"].value_counts()

1    21
0    14
Name: actual_x, dtype: int64