SETUP

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

In [3]:
matches = pd.read_csv('matches.csv', index_col=0)
matches.shape

(760, 153)

In [8]:
matches["team"].value_counts()

Southampton                 36
West Ham United             36
Leeds United                36
Everton                     36
Nottingham Forest           36
Tottenham Hotspur           36
Aston Villa                 36
Brentford                   36
Fulham                      36
Arsenal                     36
Crystal Palace              36
Wolverhampton Wanderers     36
Bournemouth                 36
Leicester City              35
Manchester City             35
Liverpool                   35
Manchester United           35
Newcastle United            35
Chelsea                     35
Brighton and Hove Albion    34
Name: team, dtype: int64

In [None]:
matches["round"].value_counts()

In [None]:
matches["date"] = pd.to_datetime(matches["date"])
matches.dtypes

In [7]:
# Convert column data to numeric
matches["venue_code"] = matches["venue"].astype('category').cat.codes
matches["opp_code"] = matches["opponent"].astype('category').cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek
matches["formation_code"] = matches["formation"].astype('category').cat.codes
matches = matches[matches["date"] < '2023-05-15']



CREATE ROLLING TABLE

In [12]:
grouped_matches = matches.groupby("team")

In [13]:
grouped_matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,npxg,npxg/sh,g-xg,np:g-xg,team,venue_code,opp_code,hour,day_code,formation_code
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,1.4,0.11,-0.2,-0.4,Manchester City,0,18,16,6,13
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,...,1.7,0.09,1.3,1.3,Manchester City,1,2,15,5,10
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3.0,3.0,Newcastle Utd,...,2.1,0.10,0.9,0.9,Manchester City,0,14,16,6,13
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,2.2,0.13,1.8,1.8,Manchester City,1,6,15,5,10
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,3.3,0.20,2.7,2.7,Manchester City,1,15,19,2,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2022-08-06,15:00,Premier League,Matchweek 1,Sat,Away,L,1.0,4.0,Tottenham,...,0.5,0.05,0.5,0.5,Southampton,0,17,15,5,17
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,D,2.0,2.0,Leeds United,...,1.2,0.09,0.8,0.8,Southampton,1,9,15,5,4
2,2022-08-20,15:00,Premier League,Matchweek 3,Sat,Away,W,2.0,1.0,Leicester City,...,0.9,0.10,1.1,1.1,Southampton,0,10,15,5,10
4,2022-08-27,12:30,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Manchester Utd,...,1.4,0.09,-1.4,-1.4,Southampton,1,13,12,5,10


In [21]:
matches["target"] = (matches["result"] == "W").astype("int")

START OF ML ALGORITHM 

In [10]:
rf = RandomForestClassifier(min_samples_split=10, random_state=1)

In [26]:
train = matches[matches["date"] < '2023-01-02']
test = matches[matches["date"] > '2023-01-02']
predictors = ["venue_code", "opp_code", "hour", "day_code"]
rf.fit(train[predictors], train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [27]:
preds = rf.predict(test[predictors])

In [29]:
acc = accuracy_score(test["target"], preds)

In [33]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,160,48
1,86,44


In [38]:
precision_score(test["target"], preds)

0.4782608695652174

In [39]:
grouped_matches = matches.groupby("team")

In [40]:
group = grouped_matches.get_group("Manchester City")
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,1.0,1.0,1.0,2022,Manchester City,0,21,16,6,1
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,...,0.0,0.0,0.0,2022,Manchester City,1,2,15,5,1
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3.0,3.0,Newcastle Utd,...,1.0,0.0,0.0,2022,Manchester City,0,15,16,6,0
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,0.0,0.0,0.0,2022,Manchester City,1,7,15,5,1
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,0.0,0.0,0.0,2022,Manchester City,1,17,19,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2022-04-30,17:30,Premier League,Matchweek 35,Sat,Away,W,4.0,0.0,Leeds United,...,0.0,0.0,0.0,2021,Manchester City,0,10,17,5,1
54,2022-05-08,16:30,Premier League,Matchweek 36,Sun,Home,W,5.0,0.0,Newcastle Utd,...,1.0,0.0,0.0,2021,Manchester City,1,15,16,6,1
55,2022-05-11,20:15,Premier League,Matchweek 33,Wed,Away,W,5.0,1.0,Wolves,...,0.0,0.0,0.0,2021,Manchester City,0,22,20,2,1
56,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,2.0,2.0,West Ham,...,2.0,0.0,1.0,2021,Manchester City,0,21,14,6,0


In [60]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [None]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]
new_cols

In [None]:
rolling_averages(  , cols, new_cols)

In [63]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [None]:
matches_rolling

In [65]:
matches_rolling = matches_rolling.droplevel('team')

In [67]:
matches_rolling.index = range(matches_rolling.shape[0])

In [None]:
matches_rolling

In [72]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2023-01-02']
    test = data[data["date"] > '2023-01-02']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [73]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [76]:
precision

0.5542168674698795

In [77]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [None]:
combined

In [79]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton & Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd", 
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

mapping = MissingDict(**map_values)

In [81]:
mapping["Arsenal"]

'Arsenal'

In [82]:
combined["new_team"] = combined["team"].map(mapping)

In [None]:
combined

In [84]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [None]:
merged

In [None]:
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] == 0)]["actual_x"].value_counts()