SETUP

In [37]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

In [5]:
matches = pd.read_csv('matches.csv', index_col=0)
matches.shape

(1432, 27)

In [7]:
matches["team"].value_counts()

Wolverhampton Wanderers     72
Southampton                 72
Everton                     72
Liverpool                   72
Tottenham Hotspur           72
Aston Villa                 72
Leeds United                72
Brentford                   72
Leicester City              72
Crystal Palace              72
Arsenal                     72
West Ham United             72
Manchester City             71
Chelsea                     71
Newcastle United            71
Manchester United           70
Brighton and Hove Albion    69
Burnley                     38
Watford                     38
Norwich City                38
Bournemouth                 34
Fulham                      34
Nottingham Forest           34
Name: team, dtype: int64

In [10]:
matches["round"].value_counts()

Matchweek 1     40
Matchweek 19    40
Matchweek 34    40
Matchweek 33    40
Matchweek 31    40
Matchweek 30    40
Matchweek 29    40
Matchweek 27    40
Matchweek 26    40
Matchweek 24    40
Matchweek 12    40
Matchweek 23    40
Matchweek 22    40
Matchweek 2     40
Matchweek 7     40
Matchweek 20    40
Matchweek 21    40
Matchweek 18    40
Matchweek 10    40
Matchweek 3     40
Matchweek 4     40
Matchweek 5     40
Matchweek 6     40
Matchweek 17    40
Matchweek 9     40
Matchweek 8     40
Matchweek 11    40
Matchweek 13    40
Matchweek 14    40
Matchweek 15    40
Matchweek 16    40
Matchweek 25    38
Matchweek 28    38
Matchweek 32    36
Matchweek 35    20
Matchweek 36    20
Matchweek 37    20
Matchweek 38    20
Name: round, dtype: int64

In [None]:
matches["date"] = pd.to_datetime(matches["date"])
matches.dtypes

In [20]:
# Convert column data to numeric
matches["venue_code"] = matches["venue"].astype('category').cat.codes
matches["opp_code"] = matches["opponent"].astype('category').cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek


In [21]:
matches["target"] = (matches["result"] == "W").astype("int")

START OF ML ALGORITHM 

In [25]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [26]:
train = matches[matches["date"] < '2023-01-02']
test = matches[matches["date"] > '2023-01-02']
predictors = ["venue_code", "opp_code", "hour", "day_code"]
rf.fit(train[predictors], train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [27]:
preds = rf.predict(test[predictors])

In [29]:
acc = accuracy_score(test["target"], preds)

In [33]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,160,48
1,86,44


In [38]:
precision_score(test["target"], preds)

0.4782608695652174

In [39]:
grouped_matches = matches.groupby("team")

In [40]:
group = grouped_matches.get_group("Manchester City")
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,1.0,1.0,1.0,2022,Manchester City,0,21,16,6,1
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,...,0.0,0.0,0.0,2022,Manchester City,1,2,15,5,1
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3.0,3.0,Newcastle Utd,...,1.0,0.0,0.0,2022,Manchester City,0,15,16,6,0
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,0.0,0.0,0.0,2022,Manchester City,1,7,15,5,1
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,0.0,0.0,0.0,2022,Manchester City,1,17,19,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2022-04-30,17:30,Premier League,Matchweek 35,Sat,Away,W,4.0,0.0,Leeds United,...,0.0,0.0,0.0,2021,Manchester City,0,10,17,5,1
54,2022-05-08,16:30,Premier League,Matchweek 36,Sun,Home,W,5.0,0.0,Newcastle Utd,...,1.0,0.0,0.0,2021,Manchester City,1,15,16,6,1
55,2022-05-11,20:15,Premier League,Matchweek 33,Wed,Away,W,5.0,1.0,Wolves,...,0.0,0.0,0.0,2021,Manchester City,0,22,20,2,1
56,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,2.0,2.0,West Ham,...,2.0,0.0,1.0,2021,Manchester City,0,21,14,6,0


In [60]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [61]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [62]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,5,1,3.333333,0.333333,19.666667,6.000000,16.866667,0.666667,0.000000,0.000000
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,5,0,3.666667,0.000000,22.000000,7.333333,15.866667,0.333333,0.000000,0.000000
8,2021-09-25,12:30,Premier League,Matchweek 6,Sat,Away,W,1.0,0.0,Chelsea,...,5,1,2.000000,0.000000,22.000000,6.333333,15.166667,0.333333,0.000000,0.000000
10,2021-10-03,16:30,Premier League,Matchweek 7,Sun,Away,D,2.0,2.0,Liverpool,...,6,0,0.666667,0.000000,18.666667,4.000000,15.933333,0.333333,0.000000,0.000000
11,2021-10-16,15:00,Premier League,Matchweek 8,Sat,Home,W,2.0,0.0,Burnley,...,5,1,1.000000,0.666667,14.333333,2.333333,16.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2023-04-08,17:30,Premier League,Matchweek 30,Sat,Away,W,4.0,1.0,Southampton,...,5,1,2.333333,0.333333,13.666667,4.666667,16.433333,1.333333,0.333333,0.333333
46,2023-04-15,17:30,Premier League,Matchweek 31,Sat,Home,W,3.0,1.0,Leicester City,...,5,1,3.000000,0.666667,14.000000,6.000000,15.566667,0.666667,0.666667,0.666667
49,2023-04-26,20:00,Premier League,Matchweek 33,Wed,Home,W,4.0,1.0,Arsenal,...,2,1,3.666667,1.000000,13.333333,6.000000,15.166667,0.333333,0.666667,0.666667
50,2023-04-30,14:00,Premier League,Matchweek 34,Sun,Away,W,2.0,1.0,Fulham,...,6,1,3.666667,1.000000,12.333333,6.333333,15.633333,0.000000,0.666667,0.666667


In [63]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [64]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
Arsenal,5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
Arsenal,7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
Arsenal,8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
Arsenal,9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,35,2023-04-08,15:00,Premier League,Matchweek 30,Sat,Home,W,1.0,0.0,Chelsea,...,5,1,1.333333,2.333333,12.666667,4.333333,17.600000,0.666667,0.000000,0.000000
Wolverhampton Wanderers,36,2023-04-15,15:00,Premier League,Matchweek 31,Sat,Home,W,2.0,0.0,Brentford,...,5,1,1.333333,1.666667,13.333333,4.333333,19.166667,0.333333,0.000000,0.000000
Wolverhampton Wanderers,37,2023-04-22,15:00,Premier League,Matchweek 32,Sat,Away,L,1.0,2.0,Leicester City,...,5,0,1.333333,0.333333,9.333333,4.666667,19.933333,0.000000,0.000000,0.000000
Wolverhampton Wanderers,38,2023-04-25,19:30,Premier League,Matchweek 33,Tue,Home,W,2.0,0.0,Crystal Palace,...,1,1,1.333333,0.666667,12.000000,5.333333,21.100000,0.666667,0.000000,0.000000


In [65]:
matches_rolling = matches_rolling.droplevel('team')

In [67]:
matches_rolling.index = range(matches_rolling.shape[0])

In [69]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
1,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
2,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
3,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
4,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1358,2023-04-08,15:00,Premier League,Matchweek 30,Sat,Home,W,1.0,0.0,Chelsea,...,5,1,1.333333,2.333333,12.666667,4.333333,17.600000,0.666667,0.000000,0.000000
1359,2023-04-15,15:00,Premier League,Matchweek 31,Sat,Home,W,2.0,0.0,Brentford,...,5,1,1.333333,1.666667,13.333333,4.333333,19.166667,0.333333,0.000000,0.000000
1360,2023-04-22,15:00,Premier League,Matchweek 32,Sat,Away,L,1.0,2.0,Leicester City,...,5,0,1.333333,0.333333,9.333333,4.666667,19.933333,0.000000,0.000000,0.000000
1361,2023-04-25,19:30,Premier League,Matchweek 33,Tue,Home,W,2.0,0.0,Crystal Palace,...,1,1,1.333333,0.666667,12.000000,5.333333,21.100000,0.666667,0.000000,0.000000


In [72]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2023-01-02']
    test = data[data["date"] > '2023-01-02']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [73]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [76]:
precision

0.5542168674698795

In [77]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [78]:
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
51,0,1,2023-01-03,Arsenal,Newcastle Utd,D
52,1,0,2023-01-15,Arsenal,Tottenham,W
53,1,1,2023-01-22,Arsenal,Manchester Utd,W
54,0,1,2023-02-04,Arsenal,Everton,L
55,0,1,2023-02-11,Arsenal,Brentford,D
...,...,...,...,...,...,...
1358,1,0,2023-04-08,Wolverhampton Wanderers,Chelsea,W
1359,1,0,2023-04-15,Wolverhampton Wanderers,Brentford,W
1360,0,0,2023-04-22,Wolverhampton Wanderers,Leicester City,L
1361,1,1,2023-04-25,Wolverhampton Wanderers,Crystal Palace,W


In [79]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton & Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd", 
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

mapping = MissingDict(**map_values)

In [81]:
mapping["Arsenal"]

'Arsenal'

In [82]:
combined["new_team"] = combined["team"].map(mapping)

In [83]:
combined

Unnamed: 0,actual,prediction,date,team,opponent,result,new_team
51,0,1,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal
52,1,0,2023-01-15,Arsenal,Tottenham,W,Arsenal
53,1,1,2023-01-22,Arsenal,Manchester Utd,W,Arsenal
54,0,1,2023-02-04,Arsenal,Everton,L,Arsenal
55,0,1,2023-02-11,Arsenal,Brentford,D,Arsenal
...,...,...,...,...,...,...,...
1358,1,0,2023-04-08,Wolverhampton Wanderers,Chelsea,W,Wolves
1359,1,0,2023-04-15,Wolverhampton Wanderers,Brentford,W,Wolves
1360,0,0,2023-04-22,Wolverhampton Wanderers,Leicester City,L,Wolves
1361,1,1,2023-04-25,Wolverhampton Wanderers,Crystal Palace,W,Wolves


In [84]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [85]:
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,0,1,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal,0,0,Newcastle United,Arsenal,D,Newcastle Utd
1,1,0,2023-01-15,Arsenal,Tottenham,W,Arsenal,0,0,Tottenham Hotspur,Arsenal,L,Tottenham
2,1,1,2023-01-22,Arsenal,Manchester Utd,W,Arsenal,0,0,Manchester United,Arsenal,L,Manchester Utd
3,0,1,2023-02-04,Arsenal,Everton,L,Arsenal,1,0,Everton,Arsenal,W,Everton
4,0,1,2023-02-11,Arsenal,Brentford,D,Arsenal,0,0,Brentford,Arsenal,D,Brentford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,1,0,2023-04-08,Wolverhampton Wanderers,Chelsea,W,Wolves,0,1,Chelsea,Wolves,L,Chelsea
302,1,0,2023-04-15,Wolverhampton Wanderers,Brentford,W,Wolves,0,0,Brentford,Wolves,L,Brentford
303,0,0,2023-04-22,Wolverhampton Wanderers,Leicester City,L,Wolves,1,0,Leicester City,Wolves,W,Leicester City
304,1,1,2023-04-25,Wolverhampton Wanderers,Crystal Palace,W,Wolves,0,0,Crystal Palace,Wolves,L,Crystal Palace


In [87]:
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] == 0)]["actual_x"].value_counts()

1    39
0    24
Name: actual_x, dtype: int64