In [1]:
import pandas as pd

In [2]:
matches = pd.read_csv("matches.csv", index_col=0)

In [3]:
matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3.0,0.0,Burnley,...,Match Report,,17.0,8.0,13.9,0.0,0.0,0.0,2023,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1.0,0.0,Newcastle Utd,...,Match Report,,14.0,4.0,17.9,0.0,0.0,0.0,2023,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Sheffield Utd,...,Match Report,,29.0,9.0,17.3,2.0,0.0,1.0,2023,Manchester City
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5.0,1.0,Fulham,...,Match Report,,6.0,4.0,14.8,0.0,1.0,1.0,2023,Manchester City
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,West Ham,...,Match Report,,29.0,13.0,16.4,1.0,0.0,0.0,2023,Manchester City


In [4]:
matches.shape

(2072, 27)

In [10]:
matches.dtypes

Date             object
Time             object
Comp             object
Round            object
Day              object
Venue            object
Result           object
GF              float64
GA              float64
Opponent         object
xG              float64
xGA             float64
Poss            float64
Attendance      float64
Captain          object
Formation        object
Referee          object
Match Report     object
Notes           float64
Sh              float64
SoT             float64
Dist            float64
FK              float64
PK              float64
PKatt           float64
Season            int64
Team             object
dtype: object

In [11]:
matches["Date"] = pd.to_datetime(matches["Date"])

In [17]:
matches["venue_code"] = matches["Venue"].astype("category").cat.codes

In [18]:
matches["opp_code"] = matches["Opponent"].astype("category").cat.codes

In [19]:
matches["hour"] = matches["Time"].str.replace(":.+", "", regex=True).astype("int")

In [21]:
matches["day_code"] = matches["Date"].dt.dayofweek

In [24]:
matches["target"] = (matches["Result"] == "W").astype("int")

In [25]:
matches

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,FK,PK,PKatt,Season,Team,venue_code,opp_code,hour,day_code,target
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3.0,0.0,Burnley,...,0.0,0.0,0.0,2023,Manchester City,0,5,20,4,1
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1.0,0.0,Newcastle Utd,...,0.0,0.0,0.0,2023,Manchester City,1,16,20,5,1
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Sheffield Utd,...,2.0,0.0,1.0,2023,Manchester City,0,19,14,6,1
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5.0,1.0,Fulham,...,0.0,1.0,1.0,2023,Manchester City,1,9,15,5,1
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,West Ham,...,1.0,0.0,0.0,2023,Manchester City,0,24,15,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,0.0,0.0,0.0,2022,Sheffield United,0,21,19,6,0
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,1.0,0.0,0.0,2022,Sheffield United,1,7,15,5,0
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,0.0,0.0,0.0,2022,Sheffield United,0,8,19,6,1
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,1.0,0.0,0.0,2022,Sheffield United,0,16,18,2,0


In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [28]:
train = matches[matches["Date"] < '2023-01-01']

In [29]:
test = matches[matches["Date"] > '2023-01-01']

In [30]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [31]:
rf.fit(train[predictors], train["target"])

In [32]:
preds = rf.predict(test[predictors])

In [33]:
from sklearn.metrics import accuracy_score

In [34]:
acc = accuracy_score(test["target"], preds)

In [35]:
acc

0.5876010781671159

In [41]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))

In [42]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,157,41
1,112,61


In [43]:
from sklearn.metrics import precision_score

precision_score(test["target"], preds)

0.5980392156862745

In [45]:
grouped_matches = matches.groupby("Team")

In [46]:
group = grouped_matches.get_group("Manchester City").sort_values("Date")

In [47]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [52]:
cols = ["GF", "GA", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [53]:
new_cols

['GF_rolling',
 'GA_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'Dist_rolling',
 'FK_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [54]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1.0,0.0,Arsenal,...,5,1,2.000000,2.333333,17.333333,4.666667,19.700000,1.333333,0.333333,0.333333
7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,West Ham,...,5,0,1.333333,2.000000,17.333333,3.666667,18.566667,0.666667,0.000000,0.000000
9,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1.0,0.0,Sheffield Utd,...,5,1,1.000000,0.666667,16.666667,4.333333,18.933333,0.666667,0.000000,0.000000
11,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1.0,1.0,Liverpool,...,6,0,1.000000,0.333333,14.333333,6.666667,19.033333,1.000000,0.000000,0.000000
12,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0.0,2.0,Tottenham,...,5,0,1.000000,0.666667,12.000000,5.666667,20.000000,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Sheffield Utd,...,6,1,1.333333,0.333333,16.000000,4.666667,16.200000,0.000000,0.000000,0.000000
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5.0,1.0,Fulham,...,5,1,2.000000,0.333333,20.000000,7.000000,16.366667,0.666667,0.000000,0.333333
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,West Ham,...,5,1,2.666667,0.666667,16.333333,5.666667,16.666667,0.666667,0.333333,0.666667
8,2023-09-23,15:00,Premier League,Matchweek 6,Sat,Home,W,2.0,0.0,Nott'ham Forest,...,5,1,3.333333,1.000000,21.333333,8.666667,16.166667,1.000000,0.333333,0.666667


In [56]:
del matches["Comp"]

In [57]:
del matches["Notes"]

In [58]:
matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [59]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,6,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,0.6,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
Arsenal,7,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,0.9,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
Arsenal,9,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,0.7,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
Arsenal,11,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,1.0,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
Arsenal,13,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,1.5,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,2,2023-08-26,15:00,Matchweek 3,Sat,Away,W,1.0,0.0,Everton,1.1,...,5,1,0.666667,2.666667,15.333333,5.333333,15.366667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,4,2023-09-03,14:00,Matchweek 4,Sun,Away,L,2.0,3.0,Crystal Palace,1.2,...,6,0,0.666667,1.666667,16.666667,4.333333,16.400000,0.333333,0.000000,0.000000
Wolverhampton Wanderers,5,2023-09-16,12:30,Matchweek 5,Sat,Home,L,1.0,3.0,Liverpool,0.6,...,5,0,1.333333,2.333333,13.000000,3.666667,16.700000,0.333333,0.000000,0.000000
Wolverhampton Wanderers,6,2023-09-23,15:00,Matchweek 6,Sat,Away,D,1.0,1.0,Luton Town,0.6,...,5,0,1.333333,2.000000,11.333333,2.666667,17.566667,0.333333,0.000000,0.000000


In [60]:
matches_rolling = matches_rolling.droplevel('Team')

In [61]:
matches_rolling

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
6,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,0.6,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
7,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,0.9,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
9,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,0.7,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
11,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,1.0,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
13,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,1.5,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,2023-08-26,15:00,Matchweek 3,Sat,Away,W,1.0,0.0,Everton,1.1,...,5,1,0.666667,2.666667,15.333333,5.333333,15.366667,0.000000,0.000000,0.000000
4,2023-09-03,14:00,Matchweek 4,Sun,Away,L,2.0,3.0,Crystal Palace,1.2,...,6,0,0.666667,1.666667,16.666667,4.333333,16.400000,0.333333,0.000000,0.000000
5,2023-09-16,12:30,Matchweek 5,Sat,Home,L,1.0,3.0,Liverpool,0.6,...,5,0,1.333333,2.333333,13.000000,3.666667,16.700000,0.333333,0.000000,0.000000
6,2023-09-23,15:00,Matchweek 6,Sat,Away,D,1.0,1.0,Luton Town,0.6,...,5,0,1.333333,2.000000,11.333333,2.666667,17.566667,0.333333,0.000000,0.000000


In [62]:
matches_rolling.index = range(matches_rolling.shape[0])

In [66]:
def make_predictions(data, predictors):
    train = data[data["Date"] < '2023-01-01']
    test = data[data["Date"] > '2023-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [67]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [68]:
precision

0.5648854961832062

In [69]:
combined = combined.merge(matches_rolling[["Date", "Team", "Opponent", "Result"]], left_index=True, right_index=True)

In [70]:
combined.head()

Unnamed: 0,actual,predicted,Date,Team,Opponent,Result
89,0,1,2023-01-03,Arsenal,Newcastle Utd,D
90,1,1,2023-01-15,Arsenal,Tottenham,W
91,1,0,2023-01-22,Arsenal,Manchester Utd,W
92,0,1,2023-02-04,Arsenal,Everton,L
93,0,1,2023-02-11,Arsenal,Brentford,D


In [71]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [73]:
combined["new_team"] = combined["Team"].map(mapping)

In [74]:
merged = combined.merge(combined, left_on=["Date", "new_team"], right_on=["Date", "Opponent"])

In [75]:
merged

Unnamed: 0,actual_x,predicted_x,Date,Team_x,Opponent_x,Result_x,new_team_x,actual_y,predicted_y,Team_y,Opponent_y,Result_y,new_team_y
0,0,1,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal,0,0,Newcastle United,Arsenal,D,Newcastle Utd
1,1,1,2023-01-15,Arsenal,Tottenham,W,Arsenal,0,0,Tottenham Hotspur,Arsenal,L,Tottenham
2,1,0,2023-01-22,Arsenal,Manchester Utd,W,Arsenal,0,1,Manchester United,Arsenal,L,Manchester Utd
3,0,1,2023-02-11,Arsenal,Brentford,D,Arsenal,0,1,Brentford,Arsenal,D,Brentford
4,0,0,2023-02-15,Arsenal,Manchester City,L,Arsenal,1,1,Manchester City,Arsenal,W,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,1,0,2023-08-26,Wolverhampton Wanderers,Everton,W,Wolves,0,0,Everton,Wolves,L,Everton
227,0,0,2023-09-03,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,1,Crystal Palace,Wolves,W,Crystal Palace
228,0,0,2023-09-16,Wolverhampton Wanderers,Liverpool,L,Wolves,1,1,Liverpool,Wolves,W,Liverpool
229,0,0,2023-09-23,Wolverhampton Wanderers,Luton Town,D,Wolves,0,0,Luton Town,Wolves,D,Luton Town


In [76]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()

1    31
0    27
Name: actual_x, dtype: int64

In [79]:
31 / 58

0.5344827586206896

In [80]:
# add more data(season)