In [1402]:
import pandas as pd

In [1403]:
matches = pd.read_csv("matches.csv", index_col = 0)
pd.set_option('display.max_columns', None)  

extracing data and cleaning

In [1404]:
matches.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation',
       'Referee', 'Match Report', 'Notes', 'Sh', 'SoT', 'Dist', 'FK', 'PK',
       'PKatt', 'Season', 'Team'],
      dtype='object')

In [1405]:
matches.columns = matches.columns.str.lower()


In [1406]:
matches.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team'],
      dtype='object')

In [1407]:
matches.shape

(760, 27)

In [1408]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

In [1409]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,1.9,0.3,65.0,21572.0,Kevin De Bruyne,4-2-3-1,Craig Pawson,Match Report,,17.0,8.0,13.9,0.0,0,0,2024,ManchesterCity
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,1.0,0.3,59.0,53419.0,Kyle Walker,4-2-3-1,Robert Jones,Match Report,,14.0,4.0,17.9,0.0,0,0,2024,ManchesterCity
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,3.5,0.7,79.0,31336.0,Kyle Walker,4-2-3-1,Jarred Gillett,Match Report,,29.0,9.0,17.3,2.0,0,1,2024,ManchesterCity
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,2.2,1.4,68.0,52899.0,Kyle Walker,4-2-3-1,Michael Oliver,Match Report,,6.0,4.0,14.8,0.0,1,1,2024,ManchesterCity
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,3.6,0.9,68.0,62475.0,Kyle Walker,4-2-3-1,Andy Madley,Match Report,,29.0,13.0,16.4,1.0,0,0,2024,ManchesterCity


In [1410]:
cat_cols = ['day', 'venue', 'result', 'opponent', 'captain', 'formation', 'referee', 'team']

num_cols = ['date', 'time', 'round', 'gf', 'ga', 'xg', 'xga', 'poss', 'attendance', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt', 'season']

drop_cols = ['comp', 'match report', 'notes']

In [1411]:
matches = matches.drop(columns=drop_cols)


In [1412]:
matches.shape

(760, 24)

transforming categorical columns into numerical formating to prepare for machine learning


In [1413]:
matches["date"] = pd.to_datetime(matches["date"])
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.day_of_week
matches["target"] = (matches["result"] == "W").astype("int")

I have encoded a win as 1 and losses/draws as 0


In [1414]:
from sklearn.ensemble import RandomForestClassifier

In [1415]:
rf = RandomForestClassifier(n_estimators=50,
                            min_samples_split=10,
                            random_state=1
                            )

In [1416]:
matches.head(5)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,sh,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2023-08-11,20:00,Matchweek 1,Fri,Away,W,3,0,Burnley,1.9,0.3,65.0,21572.0,Kevin De Bruyne,4-2-3-1,Craig Pawson,17.0,8.0,13.9,0.0,0,0,2024,ManchesterCity,0,5,20,4,1
3,2023-08-19,20:00,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,1.0,0.3,59.0,53419.0,Kyle Walker,4-2-3-1,Robert Jones,14.0,4.0,17.9,0.0,0,0,2024,ManchesterCity,1,14,20,5,1
4,2023-08-27,14:00,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,3.5,0.7,79.0,31336.0,Kyle Walker,4-2-3-1,Jarred Gillett,29.0,9.0,17.3,2.0,0,1,2024,ManchesterCity,0,16,14,6,1
5,2023-09-02,15:00,Matchweek 4,Sat,Home,W,5,1,Fulham,2.2,1.4,68.0,52899.0,Kyle Walker,4-2-3-1,Michael Oliver,6.0,4.0,14.8,0.0,1,1,2024,ManchesterCity,1,9,15,5,1
6,2023-09-16,15:00,Matchweek 5,Sat,Away,W,3,1,West Ham,3.6,0.9,68.0,62475.0,Kyle Walker,4-2-3-1,Andy Madley,29.0,13.0,16.4,1.0,0,0,2024,ManchesterCity,0,18,15,5,1


In [1417]:
matches.sort_values(by='date').iloc[len(matches)//2:] 

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,sh,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
20,2023-12-30,15:00,Matchweek 20,Sat,Away,L,0,2,Manchester City,0.3,2.2,20.0,53108.0,Jack Robinson,3-4-3,David Coote,4.0,2.0,14.9,0.0,0,0,2024,SheffieldUnited,0,12,15,5,0
23,2023-12-30,15:00,Matchweek 20,Sat,Away,L,0,3,Wolves,0.4,1.9,47.0,31639.0,James Tarkowski,5-4-1,Thomas Bramall,10.0,0.0,20.7,0.0,0,0,2024,Everton,0,19,15,5,0
20,2023-12-30,17:30,Matchweek 20,Sat,Home,W,2,1,Manchester Utd,0.7,0.8,44.0,29529.0,Ryan Yates,4-2-3-1,Tim Robinson,8.0,2.0,18.3,0.0,0,0,2024,NottinghamForest,1,13,17,5,1
27,2023-12-30,17:30,Matchweek 20,Sat,Away,L,1,2,Nott'ham Forest,0.8,0.7,56.0,29529.0,Bruno Fernandes,4-2-3-1,Tim Robinson,10.0,5.0,17.5,0.0,0,0,2024,ManchesterUnited,0,15,17,5,0
21,2023-12-30,15:00,Matchweek 20,Sat,Home,W,3,1,Brentford,1.0,1.4,46.0,25472.0,Marc Guéhi,4-3-3,Robert Jones,13.0,6.0,16.2,0.0,0,0,2024,CrystalPalace,1,3,15,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2024-05-19,16:00,Matchweek 38,Sun,Away,L,1,2,Arsenal,0.6,2.9,32.0,60312.0,Séamus Coleman,4-4-1-1,Michael Oliver,5.0,2.0,20.3,2.0,0,0,2024,Everton,0,0,16,6,0
41,2024-05-19,16:00,Matchweek 38,Sun,Home,L,2,4,Newcastle Utd,1.1,3.4,54.0,17124.0,Christian Nørgaard,4-3-3,Simon Hooper,10.0,5.0,15.9,1.0,0,0,2024,Brentford,1,14,16,6,0
43,2024-05-19,16:00,Matchweek 38,Sun,Away,W,2,1,Burnley,1.7,1.2,28.0,21109.0,Ryan Yates,4-2-3-1,Graham Scott,12.0,6.0,17.2,0.0,0,0,2024,NottinghamForest,0,5,16,6,1
52,2024-05-19,16:00,Matchweek 38,Sun,Away,L,1,3,Manchester City,0.4,1.9,29.0,55097.0,Kurt Zouma,3-4-3,John Brooks,3.0,2.0,10.3,0.0,0,0,2024,WestHamUnited,0,12,16,6,0


In [1418]:
train = matches[matches["date"] <= "2023-12-30"]
test = matches[matches["date"] > "2023-12-30"]


In [1419]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [1420]:
rf.fit(train[predictors], train["target"])

In [1421]:
preds = rf.predict(test[predictors])

In [1422]:
from sklearn.metrics import accuracy_score

In [1423]:
acc = accuracy_score(test["target"], preds)

In [1424]:
acc

0.6075268817204301

In [1425]:
combined = pd.DataFrame(dict(actual=test["target"], predictions=preds))

In [1426]:
pd.crosstab(index=combined["actual"], columns=combined["predictions"])

predictions,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,166,66
1,80,60


In [1427]:
from sklearn.metrics import precision_score

In [1428]:
precision_score(test["target"], preds)

0.47619047619047616

47.6% precision is quite low. I will now improve my feature selection.



In [1429]:
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("ManchesterCity")
group.shape

(38, 29)

Implementing rolling_averages to use as input data for machine learning to improve model

In [1430]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(5, closed="left").mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

identifying which columns to calculate rolling averages for

In [1431]:
cols = ['gf', 'ga', 'xg', 'xga', 'poss', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']
new_cols = [f"{c}_rolling" for c in cols]

In [1432]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,sh,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target,gf_rolling,ga_rolling,xg_rolling,xga_rolling,poss_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
8,2023-09-23,15:00,Matchweek 6,Sat,Home,W,2,0,Nott'ham Forest,1.3,1.0,57.0,53413.0,Kyle Walker,4-2-3-1,Anthony Taylor,7.0,4.0,17.2,2.0,0,0,2024,ManchesterCity,1,15,15,5,1,2.8,0.6,2.44,0.72,67.8,19.0,7.6,16.06,0.6,0.2,0.4
10,2023-09-30,15:00,Matchweek 7,Sat,Away,L,1,2,Wolves,0.9,0.4,68.0,31415.0,Kyle Walker,4-2-3-1,Craig Pawson,23.0,8.0,19.3,1.0,0,0,2024,ManchesterCity,0,19,15,5,0,2.6,0.6,2.32,0.86,66.2,17.0,6.8,16.72,1.0,0.2,0.4
12,2023-10-08,16:30,Matchweek 8,Sun,Away,L,0,1,Arsenal,0.5,0.4,51.0,60233.0,Kyle Walker,4-3-3,Michael Oliver,4.0,0.0,14.9,0.0,0,0,2024,ManchesterCity,0,0,16,6,0,2.6,1.0,2.3,0.88,68.0,18.8,7.6,17.0,1.2,0.2,0.4
13,2023-10-21,15:00,Matchweek 9,Sat,Home,W,2,1,Brighton,0.8,0.8,55.0,53466.0,Kyle Walker,3-2-4-1,Robert Jones,10.0,5.0,18.8,1.0,0,0,2024,ManchesterCity,1,4,15,5,1,2.2,1.0,1.7,0.82,62.4,13.8,5.8,16.52,0.8,0.2,0.2
15,2023-10-29,15:30,Matchweek 10,Sun,Away,W,3,0,Manchester Utd,4.0,0.9,60.0,73502.0,Kyle Walker,4-2-3-1,Paul Tierney,20.0,9.0,17.2,2.0,1,1,2024,ManchesterCity,0,13,15,6,1,1.6,1.0,1.42,0.7,59.8,14.6,6.0,17.32,1.0,0.0,0.0
16,2023-11-04,15:00,Matchweek 11,Sat,Home,W,6,1,Bournemouth,1.9,0.9,65.0,53358.0,Kyle Walker,3-4-3,Craig Pawson,21.0,8.0,16.4,0.0,0,0,2024,ManchesterCity,1,2,15,5,1,1.6,0.8,1.5,0.7,58.2,12.8,5.2,17.48,1.2,0.2,0.2
18,2023-11-12,16:30,Matchweek 12,Sun,Away,D,4,4,Chelsea,2.9,2.9,55.0,39532.0,Kyle Walker,3-2-4-1,Anthony Taylor,14.0,9.0,16.6,1.0,1,1,2024,ManchesterCity,0,6,16,6,0,2.4,1.0,1.62,0.68,59.8,15.6,6.0,17.32,0.8,0.2,0.2
19,2023-11-25,12:30,Matchweek 13,Sat,Home,D,1,1,Liverpool,1.3,0.6,59.0,53289.0,Kyle Walker,3-2-4-1,Chris Kavanagh,16.0,5.0,16.3,1.0,0,0,2024,ManchesterCity,1,10,12,5,0,3.0,1.4,2.02,1.18,57.2,13.8,6.2,16.78,0.8,0.4,0.4
21,2023-12-03,16:30,Matchweek 14,Sun,Home,D,3,3,Tottenham,2.6,0.5,55.0,53473.0,Kyle Walker,3-2-4-1,Simon Hooper,17.0,4.0,13.4,0.0,0,0,2024,ManchesterCity,1,17,16,6,0,3.2,1.4,2.18,1.22,58.8,16.2,7.2,17.06,1.0,0.4,0.4
22,2023-12-06,20:15,Matchweek 15,Wed,Away,L,0,1,Aston Villa,0.6,2.3,54.0,41421.0,Kyle Walker,3-2-4-1,John Brooks,2.0,2.0,7.4,0.0,0,0,2024,ManchesterCity,0,1,20,2,0,3.4,1.8,2.54,1.16,58.8,17.6,7.0,15.98,0.8,0.4,0.4


In [1433]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel("team")
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling.head(5)

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,sh,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target,gf_rolling,ga_rolling,xg_rolling,xga_rolling,poss_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2023-09-24,14:00,Matchweek 6,Sun,Home,D,2,2,Tottenham,1.8,1.4,47.0,60156.0,Martin Ødegaard,4-3-3,Robert Jones,12.0,4.0,16.9,0.0,1,1,2024,Arsenal,1,17,14,6,0,1.8,0.8,1.86,0.8,66.2,15.2,5.4,16.34,0.0,0.4,0.4
1,2023-09-30,15:00,Matchweek 7,Sat,Away,W,4,0,Bournemouth,3.4,0.6,57.0,11193.0,Martin Ødegaard,4-3-3,Michael Salisbury,13.0,6.0,15.5,0.0,2,2,2024,Arsenal,0,2,15,5,1,1.8,1.0,2.06,0.84,60.0,14.6,4.8,15.9,0.0,0.6,0.6
2,2023-10-08,16:30,Matchweek 8,Sun,Home,W,1,0,Manchester City,0.4,0.5,49.0,60233.0,Martin Ødegaard,4-3-3,Michael Oliver,12.0,2.0,19.2,0.0,0,0,2024,Arsenal,1,12,16,6,1,2.4,1.0,2.34,0.76,60.8,14.6,5.6,15.72,0.0,0.8,0.8
3,2023-10-21,17:30,Matchweek 9,Sat,Away,D,2,2,Chelsea,1.0,1.3,56.0,39723.0,Martin Ødegaard,4-3-3,Chris Kavanagh,13.0,3.0,14.3,0.0,0,0,2024,Arsenal,0,6,17,5,0,2.2,0.6,1.78,0.74,56.4,13.4,4.2,16.8,0.0,0.6,0.6
4,2023-10-28,15:00,Matchweek 10,Sat,Home,W,5,0,Sheffield Utd,2.8,0.0,67.0,60153.0,Bukayo Saka,4-3-3,Tim Robinson,12.0,6.0,17.0,0.0,1,1,2024,Arsenal,1,16,15,5,1,2.0,0.8,1.52,0.82,56.6,12.6,3.8,16.66,0.0,0.6,0.6


defining function to perform machine learning model

In [1434]:
def make_prediction(data, predictors):
    date = "2023-12-30"
    # date = "2023-11-30"
    train = data[data["date"] <= date]
    test = data[data["date"] > date]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predictions=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [1435]:
combined, precision = make_prediction(matches, predictors)
precision

0.47619047619047616

Team names in "opponent" and "team" columns are slightly different. Defining a dictionary to handle this when merging

In [1436]:
class MissingDict(dict):
    __missing__ = lambda self, key: key
map_values = {
    "ManchesterCity": "Manchester City",
    "ManchesterUnited": "Manchester Utd",
    "WestHamUnited": "West Ham",
    "NewcastleUnited": "Newcastle Utd",
    "TottenhamHotspur": "Tottenham",
    "AstonVilla": "Aston Villa",
    "BrightonandHoveAlbion": "Brighton",
    "LutonTown": "Luton Town",
    "WolverhamptonWanderers": "Wolves",
    "CrystalPalace": "Crystal Palace",
    "NottinghamForest": "Nott'ham Forest",
    "SheffieldUnited": "Sheffield Utd"

 
}
mapping = MissingDict(**map_values)

In [1437]:
mapping["NottinghamForest"]

"Nott'ham Forest"

retrying model with new rolling average columns

In [1438]:
combined, precision = make_prediction(matches_rolling, predictors+new_cols )
print(precision)

0.48


In [1439]:
(0.48 - 0.47619047619047616) * 100

0.3809523809523818

taking rolling averages into account on the same dataset has improved the model by 0.38%

In [1440]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined["new_team"] = combined["team"].map(mapping)
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])
merged[(merged["predictions_x"] == 1) & (merged["predictions_y"] == 0)]["actual_x"].value_counts()


actual_x
1    53
0    44
Name: count, dtype: int64

next I'll try importing more historic match data to train my model with


In [1452]:
new_df = pd.read_csv("2020-23_matches.csv", index_col=0)
new_df.head(5)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,2.2,0.5,75.0,62443.0,İlkay Gündoğan,4-3-3,Michael Oliver,Match Report,,13.0,1.0,18.7,1.0,1.0,1.0,2023,Manchester City
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,1.7,0.1,67.0,53453.0,İlkay Gündoğan,4-2-3-1,David Coote,Match Report,,19.0,7.0,17.5,0.0,0.0,0.0,2023,Manchester City
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,2.1,1.8,69.0,52258.0,İlkay Gündoğan,4-3-3,Jarred Gillett,Match Report,,21.0,10.0,16.2,1.0,0.0,0.0,2023,Manchester City
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,0.1,74.0,53112.0,Kevin De Bruyne,4-2-3-1,Darren England,Match Report,,18.0,5.0,14.1,0.0,0.0,0.0,2023,Manchester City
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,0.7,74.0,53409.0,İlkay Gündoğan,4-2-3-1,Paul Tierney,Match Report,,17.0,9.0,14.8,0.0,0.0,0.0,2023,Manchester City


exploring the data


In [1442]:
len(new_df)/380/2

3.0

In [1443]:
new_df["season"].unique()

array([2023, 2022, 2021])

In [1444]:
new_df.groupby("team").size().reset_index(name="count").sort_values("count", ascending=0)

Unnamed: 0,team,count
0,Arsenal,114
1,Aston Villa,114
4,Brighton and Hove Albion,114
7,Crystal Palace,114
6,Chelsea,114
8,Everton,114
12,Liverpool,114
11,Leicester City,114
10,Leeds United,114
23,West Ham United,114


cleaning the dataframe

In [1445]:
new_df.drop(columns=drop_cols)
new_df["date"] = pd.to_datetime(new_df["date"])
new_df["venue_code"] = new_df["venue"].astype("category").cat.codes
new_df["opp_code"] = new_df["opponent"].astype("category").cat.codes
new_df["hour"] = new_df["time"].str.replace(":.+", "", regex=True).astype("int")
new_df["day_code"] = new_df["date"].dt.day_of_week
new_df["target"] = (new_df["result"] == "W").astype("int")

In [1446]:
new_df.sort_values("date")

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2020-09-12,20:00,Premier League,Matchweek 1,Sat,Home,L,0,2,Newcastle Utd,1.0,1.6,58.0,,Mark Noble,4-2-3-1,Stuart Attwell,Match Report,,15.0,3.0,16.5,0.0,0.0,0.0,2021,West Ham United,1,15,20,5,0
1,2020-09-12,12:30,Premier League,Matchweek 1,Sat,Away,W,3,0,Fulham,1.9,0.1,54.0,,Pierre-Emerick Aubameyang,3-4-3,Chris Kavanagh,Match Report,,13.0,5.0,14.1,2.0,0.0,0.0,2021,Arsenal,0,9,12,5,1
0,2020-09-12,17:30,Premier League,Matchweek 1,Sat,Away,L,3,4,Liverpool,0.3,2.7,51.0,,Luke Ayling,4-1-4-1,Michael Oliver,Match Report,,6.0,3.0,19.9,1.0,0.0,0.0,2021,Leeds United,0,12,17,5,0
0,2020-09-12,15:00,Premier League,Matchweek 1,Sat,Home,W,1,0,Southampton,1.1,0.9,31.0,,Scott Dann,4-4-2,Jonathan Moss,Match Report,,5.0,3.0,10.6,0.0,0.0,0.0,2021,Crystal Palace,1,19,15,5,1
1,2020-09-12,17:30,Premier League,Matchweek 1,Sat,Home,W,4,3,Leeds United,2.7,0.3,49.0,,Jordan Henderson,4-3-3,Michael Oliver,Match Report,,20.0,4.0,18.4,0.0,2.0,2.0,2021,Liverpool,1,10,17,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,2023-05-28,16:30,Premier League,Matchweek 38,Sun,Home,W,5,0,Wolves,2.8,0.5,51.0,60095.0,Martin Ødegaard,4-3-3,Andre Marriner,Match Report,,14.0,8.0,13.7,1.0,0.0,0.0,2023,Arsenal,1,24,16,6,1
40,2023-05-28,16:30,Premier League,Matchweek 38,Sun,Home,W,1,0,Manchester City,1.3,1.6,35.0,17120.0,David Raya,5-3-2,John Brooks,Match Report,,11.0,4.0,14.0,0.0,0.0,0.0,2023,Brentford,1,13,16,6,1
43,2023-05-28,16:30,Premier League,Matchweek 38,Sun,Away,L,0,5,Arsenal,0.5,2.8,49.0,60095.0,Max Kilman,4-3-3,Andre Marriner,Match Report,,6.0,0.0,14.7,1.0,0.0,0.0,2023,Wolverhampton Wanderers,0,0,16,6,0
55,2023-05-28,16:30,Premier League,Matchweek 38,Sun,Away,L,1,2,Leicester City,1.4,1.4,52.0,32183.0,Declan Rice,4-2-3-1,Simon Hooper,Match Report,,16.0,3.0,16.7,0.0,0.0,0.0,2023,West Ham United,0,11,16,6,0


here I split the 2020-2023 data set into roughly 80-20 split using the date "2022-12-26" to split the data.

In [1447]:
def make_prediction(data, predictors):
    date = "2022-12-26"
    # date = "2023-11-30"
    train = data[data["date"] <= date]
    test = data[data["date"] > date]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predictions=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

prediction using 21-23 match data without rolling averages

In [1448]:
combined, precision = make_prediction(new_df, predictors)
precision

0.46226415094339623

In [1449]:
matches_rolling_21_23 = new_df.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling_21_23 = matches_rolling_21_23.droplevel("team")
matches_rolling_21_23.index = range(matches_rolling_21_23.shape[0])


  matches_rolling_21_23 = new_df.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


prediction using 21-23 match data with rolling averages

In [1450]:
combined, precision = make_prediction(matches_rolling_21_23, predictors + new_cols)
precision

0.5045045045045045

now I'll try using the entirety of 20-23 (3 seasons) to predict the outcomes of the 2023-2024 season

In [1451]:
train = matches_rolling_21_23
test = matches_rolling
rf.fit(train[predictors + new_cols], train["target"])
preds = rf.predict(test[predictors + new_cols])
combined = pd.DataFrame(dict(actual=test["target"], predictions=preds), index=test.index)
precision = precision_score(test["target"], preds)
precision


0.6293706293706294

This model has netted my highest precision score of 63%. As expected a larger training set generally improves your model in combination with a high quality features

In [1401]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined["new_team"] = combined["team"].map(mapping)
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])
merged[(merged["predictions_x"] == 1) & (merged["predictions_y"] == 0)]["actual_x"].value_counts()


actual_x
1    79
0    36
Name: count, dtype: int64