In [1]:
#Reading match data into pandas dataframe

In [2]:
import pandas as pd

In [3]:
matches = pd.read_csv("matches.csv", index_col=0)

In [4]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3.0,0.0,Burnley,...,Match Report,,17.0,8.0,13.9,0.0,0,0,2024,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1.0,0.0,Newcastle Utd,...,Match Report,,14.0,4.0,17.9,0.0,0,0,2024,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Sheffield Utd,...,Match Report,,29.0,9.0,17.3,2.0,0,1,2024,Manchester City
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5.0,1.0,Fulham,...,Match Report,,6.0,4.0,14.8,0.0,1,1,2024,Manchester City
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,West Ham,...,Match Report,,29.0,13.0,16.4,1.0,0,0,2024,Manchester City


In [5]:
matches.shape

(2304, 28)

In [6]:
matches["team"].value_counts()

team
Arsenal                     198
Liverpool                   176
Manchester City             156
Nottingham Forest           139
Aston Villa                 118
Newcastle United            118
Chelsea                     118
Bournemouth                 118
Tottenham Hotspur            97
Crystal Palace               97
Manchester United            97
Brighton and Hove Albion     97
Fulham                       97
Wolverhampton Wanderers      97
West Ham United              97
Brentford                    97
Everton                      96
Luton Town                   76
Burnley                      76
Sheffield United             76
Ipswich Town                 21
Leicester City               21
Southampton                  21
Name: count, dtype: int64

In [7]:
matches["round"].value_counts()

round
Matchweek 1     76
Matchweek 2     76
Matchweek 3     76
Matchweek 4     76
Matchweek 5     76
Matchweek 6     76
Matchweek 7     76
Matchweek 8     76
Matchweek 9     76
Matchweek 10    76
Matchweek 11    76
Matchweek 12    76
Matchweek 13    76
Matchweek 14    76
Matchweek 16    76
Matchweek 17    76
Matchweek 19    76
Matchweek 20    76
Matchweek 21    76
Matchweek 18    76
Matchweek 15    70
Matchweek 22    42
Matchweek 23    42
Matchweek 24    42
Matchweek 25    42
Matchweek 26    42
Matchweek 27    42
Matchweek 28    42
Matchweek 30    42
Matchweek 31    42
Matchweek 32    42
Matchweek 33    42
Matchweek 29    42
Matchweek 35    42
Matchweek 36    42
Matchweek 37    42
Matchweek 34    42
Matchweek 38    42
Name: count, dtype: int64

In [8]:
#Cleaning data for machine learning

In [9]:
matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf               float64
ga               float64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [10]:
matches["date"] = pd.to_datetime(matches["date"])

In [11]:
matches.dtypes

date             datetime64[ns]
time                     object
comp                     object
round                    object
day                      object
venue                    object
result                   object
gf                      float64
ga                      float64
opponent                 object
xg                      float64
xga                     float64
poss                    float64
attendance              float64
captain                  object
formation                object
opp formation            object
referee                  object
match report             object
notes                   float64
sh                      float64
sot                     float64
dist                    float64
fk                      float64
pk                        int64
pkatt                     int64
season                    int64
team                     object
dtype: object

In [12]:
#Creating predictors for machine learning

In [13]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes #creates column "venue_code" in table that is either 0(Away) or 1(Home)

In [14]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes #gives each team their own opp_code to differentiate eachother

In [15]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int") #turns hour into integer from time column

In [16]:
matches["day_code"] = matches["date"].dt.dayofweek #creates column "day_code", which provides a code for each day of the week b/c can only take in numbers when training algorithm

In [17]:
matches["target"] = (matches["result"] == "W").astype("int") # creates column "target" that is either 0(loss) or 1(win), which is based on "result" column

In [18]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3.0,0.0,Burnley,...,0.0,0,0,2024,Manchester City,0,5,20,4,1
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1.0,0.0,Newcastle Utd,...,0.0,0,0,2024,Manchester City,1,17,20,5,1
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Sheffield Utd,...,2.0,0,1,2024,Manchester City,0,19,14,6,1
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5.0,1.0,Fulham,...,0.0,1,1,2024,Manchester City,1,9,15,5,1
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,West Ham,...,1.0,0,0,2024,Manchester City,0,22,15,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2024-04-24,20:00,Premier League,Matchweek 29,Wed,Away,L,2.0,4.0,Manchester Utd,...,1.0,0,0,2023,Sheffield United,0,16,20,2,0
37,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Away,L,1.0,5.0,Newcastle Utd,...,0.0,0,0,2023,Sheffield United,0,17,15,5,0
38,2024-05-04,15:00,Premier League,Matchweek 36,Sat,Home,L,1.0,3.0,Nott'ham Forest,...,0.0,1,1,2023,Sheffield United,1,18,15,5,0
39,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Away,L,0.0,1.0,Everton,...,0.0,0,0,2023,Sheffield United,0,8,15,5,0


In [19]:
#Creating the initial machine learning model

In [25]:
from sklearn.ensemble import RandomForestClassifier 

# Importing RandomForestClassifier from scikit-learn
# Random forest captures non-linear patterns in data
# Example: opponent codes (e.g., 18 vs 15) are not linearly related to difficulty
# Random forest handles this, unlike linear models

In [27]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1) #initializing RandomForestClassifier with specific hyperparameters

In [30]:
train = matches[matches["date"] < '2024-01-01'] #take all matches before 2024 to use for training set

In [31]:
test = matches[matches["date"] > '2024-01-01']  #take all matches in 2024 to use for test set

In [33]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [34]:
rf.fit(train[predictors], train["target"])

In [35]:
preds = rf.predict(test[predictors])

In [37]:
from sklearn.metrics import accuracy_score # Accuracy score measures overall prediction correctness as the percentage of correct win and loss predictions combined

In [40]:
error = accuracy_score(test["target"], preds)

In [41]:
error

0.5652777777777778

In [42]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))

In [43]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,532,318
1,308,282


In [44]:
from sklearn.metrics import precision_score

In [45]:
precision_score(test["target"], preds)

0.47

In [46]:
grouped_matches = matches.groupby("team")

In [47]:
group = grouped_matches.get_group("Manchester City").sort_values("date")

In [57]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [58]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,5,1,3.000000,1.000000,17.666667,6.000000,17.466667,0.666667,0.333333,0.333333
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,2,1,3.666667,1.666667,19.333333,7.333333,15.933333,0.333333,0.000000,0.000000
6,2022-09-03,17:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,Aston Villa,...,5,0,4.333333,1.666667,18.666667,8.000000,15.033333,0.333333,0.000000,0.000000
9,2022-09-17,12:30,Premier League,Matchweek 8,Sat,Away,W,3.0,0.0,Wolves,...,5,1,3.666667,1.000000,16.000000,6.000000,15.233333,0.333333,0.000000,0.000000
10,2022-10-02,14:00,Premier League,Matchweek 9,Sun,Home,W,6.0,3.0,Manchester Utd,...,6,1,3.333333,0.333333,15.333333,6.666667,17.000000,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,2024-12-29,14:30,Premier League,Matchweek 19,Sun,Away,W,2.0,0.0,Leicester City,...,6,1,1.333333,0.666667,20.000000,4.333333,17.566667,1.000000,0.000000,0.666667
28,2025-01-04,15:00,Premier League,Matchweek 20,Sat,Home,W,4.0,1.0,West Ham,...,5,1,1.666667,0.333333,17.000000,4.666667,17.933333,1.000000,0.000000,0.333333
28,2025-01-04,15:00,Premier League,Matchweek 20,Sat,Home,W,4.0,1.0,West Ham,...,5,1,2.666667,0.333333,12.666667,5.666667,17.733333,0.666667,0.000000,0.000000
30,2025-01-14,19:30,Premier League,Matchweek 21,Tue,Away,D,2.0,2.0,Brentford,...,1,0,3.333333,0.666667,11.333333,6.333333,17.166667,0.333333,0.000000,0.000000


In [60]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [61]:
matches_rolling


Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,5,1,3.000000,0.666667,14.333333,5.000000,14.133333,0.333333,0.0,0.0
Arsenal,4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,2,1,3.000000,1.000000,18.333333,7.000000,14.433333,0.333333,0.0,0.0
Arsenal,5,2022-09-04,16:30,Premier League,Matchweek 6,Sun,Away,L,1.0,3.0,Manchester Utd,...,6,0,2.333333,0.666667,19.333333,7.333333,15.533333,0.666667,0.0,0.0
Arsenal,7,2022-09-18,12:00,Premier League,Matchweek 8,Sun,Away,W,3.0,0.0,Brentford,...,6,1,1.666667,1.666667,20.000000,6.333333,16.800000,1.000000,0.0,0.0
Arsenal,8,2022-10-01,12:30,Premier League,Matchweek 9,Sat,Home,W,3.0,1.0,Tottenham,...,5,1,2.000000,1.333333,17.000000,6.000000,17.700000,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,18,2024-12-22,14:00,Premier League,Matchweek 17,Sun,Away,W,3.0,0.0,Leicester City,...,6,1,0.666667,2.666667,13.666667,4.333333,19.000000,0.000000,0.0,0.0
Wolverhampton Wanderers,19,2024-12-26,17:30,Premier League,Matchweek 18,Thu,Home,W,2.0,0.0,Manchester Utd,...,3,1,1.666667,1.333333,14.333333,5.000000,16.966667,0.333333,0.0,0.0
Wolverhampton Wanderers,20,2024-12-29,15:00,Premier League,Matchweek 19,Sun,Away,D,2.0,2.0,Tottenham,...,6,0,2.000000,0.666667,10.333333,4.666667,19.033333,0.666667,0.0,0.0
Wolverhampton Wanderers,21,2025-01-06,20:00,Premier League,Matchweek 20,Mon,Home,L,0.0,3.0,Nott'ham Forest,...,0,0,2.333333,0.666667,8.666667,3.666667,19.566667,1.000000,0.0,0.0


In [62]:
matches_rolling = matches_rolling.droplevel('team')

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,5,1,3.000000,0.666667,14.333333,5.000000,14.133333,0.333333,0.0,0.0
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,2,1,3.000000,1.000000,18.333333,7.000000,14.433333,0.333333,0.0,0.0
5,2022-09-04,16:30,Premier League,Matchweek 6,Sun,Away,L,1.0,3.0,Manchester Utd,...,6,0,2.333333,0.666667,19.333333,7.333333,15.533333,0.666667,0.0,0.0
7,2022-09-18,12:00,Premier League,Matchweek 8,Sun,Away,W,3.0,0.0,Brentford,...,6,1,1.666667,1.666667,20.000000,6.333333,16.800000,1.000000,0.0,0.0
8,2022-10-01,12:30,Premier League,Matchweek 9,Sat,Home,W,3.0,1.0,Tottenham,...,5,1,2.000000,1.333333,17.000000,6.000000,17.700000,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,2024-12-22,14:00,Premier League,Matchweek 17,Sun,Away,W,3.0,0.0,Leicester City,...,6,1,0.666667,2.666667,13.666667,4.333333,19.000000,0.000000,0.0,0.0
19,2024-12-26,17:30,Premier League,Matchweek 18,Thu,Home,W,2.0,0.0,Manchester Utd,...,3,1,1.666667,1.333333,14.333333,5.000000,16.966667,0.333333,0.0,0.0
20,2024-12-29,15:00,Premier League,Matchweek 19,Sun,Away,D,2.0,2.0,Tottenham,...,6,0,2.000000,0.666667,10.333333,4.666667,19.033333,0.666667,0.0,0.0
21,2025-01-06,20:00,Premier League,Matchweek 20,Mon,Home,L,0.0,3.0,Nott'ham Forest,...,0,0,2.333333,0.666667,8.666667,3.666667,19.566667,1.000000,0.0,0.0


In [64]:
matches_rolling.index = range(matches_rolling.shape[0])

In [67]:
#Retraining machine learning model

In [76]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2024-01-01']
    test = data[data["date"] > '2024-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

In [77]:
combined, error = make_predictions(matches_rolling, predictors + new_cols)

In [78]:
error

0.5441696113074205

In [79]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [80]:
combined.head(10)

Unnamed: 0,actual,predicted,date,team,opponent,result
75,1,0,2024-01-20,Arsenal,Crystal Palace,W
76,1,1,2024-01-20,Arsenal,Crystal Palace,W
77,1,1,2024-01-30,Arsenal,Nott'ham Forest,W
78,1,1,2024-01-30,Arsenal,Nott'ham Forest,W
79,1,1,2024-02-04,Arsenal,Liverpool,W
80,1,0,2024-02-04,Arsenal,Liverpool,W
81,1,1,2024-02-11,Arsenal,West Ham,W
82,1,0,2024-02-11,Arsenal,West Ham,W
83,1,1,2024-02-17,Arsenal,Burnley,W
84,1,1,2024-02-17,Arsenal,Burnley,W


In [81]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [82]:
combined["new_team"] = combined["team"].map(mapping)

In [83]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [84]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,0,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
1,1,0,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
2,1,1,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
3,1,1,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
4,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal,0,0,Nottingham Forest,Arsenal,L,Nottingham Forest
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2349,0,0,2025-01-06,Wolverhampton Wanderers,Nott'ham Forest,L,Wolves,1,1,Nottingham Forest,Wolves,W,Nottingham Forest
2350,0,0,2025-01-06,Wolverhampton Wanderers,Nott'ham Forest,L,Wolves,1,1,Nottingham Forest,Wolves,W,Nottingham Forest
2351,0,0,2025-01-06,Wolverhampton Wanderers,Nott'ham Forest,L,Wolves,1,1,Nottingham Forest,Wolves,W,Nottingham Forest
2352,0,0,2025-01-15,Wolverhampton Wanderers,Newcastle Utd,L,Wolves,1,1,Newcastle United,Wolves,W,Newcastle Utd


In [85]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()

actual_x
1    359
0    281
Name: count, dtype: int64