SETUP

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score
import numpy as np

In [2]:
matches = pd.read_csv('matches.csv', index_col=0)
matches.shape

(760, 152)

In [3]:
matches["team"].value_counts()

Manchester City             38
Arsenal                     38
Leicester City              38
Leeds United                38
Everton                     38
Nottingham Forest           38
West Ham United             38
Bournemouth                 38
Wolverhampton Wanderers     38
Crystal Palace              38
Chelsea                     38
Fulham                      38
Brentford                   38
Aston Villa                 38
Tottenham Hotspur           38
Brighton and Hove Albion    38
Liverpool                   38
Manchester United           38
Newcastle United            38
Southampton                 38
Name: team, dtype: int64

In [4]:
matches["date"] = pd.to_datetime(matches["date"])
matches.dtypes

date       datetime64[ns]
time               object
comp               object
round              object
day                object
                ...      
npxg              float64
npxg/sh           float64
g-xg              float64
np:g-xg           float64
team               object
Length: 152, dtype: object

In [5]:
# Convert column data to numeric
matches["venue_code"] = matches["venue"].astype('category').cat.codes
matches["opp_code"] = matches["opponent"].astype('category').cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek
matches["formation_code"] = matches["formation"].astype('category').cat.codes
matches = matches[matches["date"] < '2023-05-15']
matches["pts"] = matches["result"].map({'W': 3, 'D': 1, 'L': 0})



CREATE ROLLING TABLE

In [6]:
grouped_matches = matches.groupby("team")

In [7]:
grouped_matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,npxg/sh,g-xg,np:g-xg,team,venue_code,opp_code,hour,day_code,formation_code,pts
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,0.11,-0.2,-0.4,Manchester City,0,18,16,6,13,3
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,...,0.09,1.3,1.3,Manchester City,1,2,15,5,10,3
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3.0,3.0,Newcastle Utd,...,0.10,0.9,0.9,Manchester City,0,14,16,6,13,1
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,0.13,1.8,1.8,Manchester City,1,6,15,5,10,3
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,0.20,2.7,2.7,Manchester City,1,15,19,2,10,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2022-08-06,15:00,Premier League,Matchweek 1,Sat,Away,L,1.0,4.0,Tottenham,...,0.05,0.5,0.5,Southampton,0,17,15,5,17,0
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,D,2.0,2.0,Leeds United,...,0.09,0.8,0.8,Southampton,1,9,15,5,4,1
2,2022-08-20,15:00,Premier League,Matchweek 3,Sat,Away,W,2.0,1.0,Leicester City,...,0.10,1.1,1.1,Southampton,0,10,15,5,10,3
4,2022-08-27,12:30,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Manchester Utd,...,0.09,-1.4,-1.4,Southampton,1,13,12,5,10,0


In [8]:
all_cols = matches.columns.tolist()

In [9]:
matches.dtypes

date              datetime64[ns]
time                      object
comp                      object
round                     object
day                       object
                       ...      
opp_code                    int8
hour                       int32
day_code                   int64
formation_code              int8
pts                        int64
Length: 158, dtype: object

In [10]:
matches.index = range(matches.shape[0])

In [21]:
valid_cols = matches.select_dtypes(include=['int8', 'int64', 'float64', 'int32']).columns.tolist()
tot = 0
for col in valid_cols:
    for i in range(matches.shape[0]):
        val = matches.loc[i, col]
        if pd.isnull(val):
            tot+=1
            average = sum(matches[col][:i])/(i+1)
            matches.at[i, col] = average
            # matches[col][index] = average
print(tot)

107


In [22]:
total = 0
for col in matches.columns:
    nan_locations = matches.index[matches[col].isna()].tolist()
    if nan_locations:
        total += len(nan_locations)
        print(col, ":", nan_locations,";", len(nan_locations)) 
print(total)

0


In [23]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    rolling = rolling_stats[cols].fillna(rolling_stats)
    rolling_stats_renamed = rolling_stats.rename(columns=dict(zip(cols, new_cols)))
    merged = pd.concat([group, rolling_stats_renamed], axis=1)
    merged = merged.dropna(subset=new_cols)
    return merged

In [24]:
cols = all_cols[7:9] + all_cols[10:11] + all_cols[12:151] + all_cols[-1:]
rolling_cols = [f"{c}_rolling" for c in cols]

In [25]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, rolling_cols))

In [26]:
matches_rolling = matches_rolling.droplevel('team')

In [28]:
matches_rolling.index = range(matches_rolling.shape[0])

In [31]:
matches_rolling.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,avgdistofsh_rolling,fk_rolling,pk_rolling,pkattfor_rolling,xg_rolling,npxg_rolling,npxg/sh_rolling,g-xg_rolling,np:g-xg_rolling,pts_rolling
0,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,14.133333,0.333333,0.0,0.0,1.666667,1.666667,0.12,1.0,1.0,3.0
1,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,14.433333,0.333333,0.0,0.0,2.2,2.2,0.126667,0.8,0.8,3.0
2,2022-09-04,16:30,Premier League,Matchweek 6,Sun,Away,L,1.0,3.0,Manchester Utd,...,15.533333,0.666667,0.0,0.0,2.1,2.1,0.113333,0.233333,0.233333,3.0
3,2022-09-18,12:00,Premier League,Matchweek 8,Sun,Away,W,3.0,0.0,Brentford,...,16.8,1.0,0.0,0.0,2.1,2.1,0.106667,-0.433333,-0.433333,2.0
4,2022-10-01,12:30,Premier League,Matchweek 9,Sat,Home,W,3.0,1.0,Tottenham,...,17.7,0.666667,0.0,0.0,1.733333,1.733333,0.106667,0.266667,0.266667,2.0


In [32]:
all_cols = matches_rolling.columns.tolist()
predictors = all_cols[7:9] + all_cols[10:11] + all_cols[12:151] + all_cols[152:]
len(predictors)

291

START OF ML ALGORITHM (RANDOM FOREST)

In [35]:
rf = RandomForestClassifier(min_samples_split=10, random_state=1)

In [70]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2023-03-01']
    test = data[data["date"] > '2023-03-01']
    X_train = train[predictors]
    y_train = train[["gf", "ga"]]
    X_test = test[predictors]
    y_test = test[["gf", "ga"]]
    
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)
    combined = pd.DataFrame({
        "Date": test["date"],
        "Team": test["team"],
        "Opponent": test["opponent"],
        "Predicted_GF": preds[:, 0],  # Assuming "gf" is the first column in predictions
        "Predicted_GA": preds[:, 1],  # Assuming "ga" is the second column in predictions
        "Actual_GF": y_test["gf"],
        "Actual_GA": y_test["ga"]
    })
    # precision = precision_score(test[["gf", "ga"]], preds)
    # return combined, precision
    return combined

In [79]:
# combined, precision = make_predictions(matches_rolling, predictors)
combined = make_predictions(matches_rolling, predictors)

In [68]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton & Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd", 
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

mapping = MissingDict(**map_values)

In [80]:
combined["Team"] = combined["Team"].map(mapping)

In [81]:
combined

Unnamed: 0,Date,Team,Opponent,Predicted_GF,Predicted_GA,Actual_GF,Actual_GA
22,2023-03-04,Arsenal,Bournemouth,3.0,2.0,3.0,2.0
23,2023-03-12,Arsenal,Fulham,3.0,0.0,3.0,0.0
24,2023-03-19,Arsenal,Crystal Palace,3.0,1.0,4.0,1.0
25,2023-04-01,Arsenal,Leeds United,3.0,1.0,4.0,1.0
26,2023-04-09,Arsenal,Liverpool,2.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...
647,2023-04-22,Wolves,Leicester City,1.0,2.0,1.0,2.0
648,2023-04-25,Wolves,Crystal Palace,1.0,0.0,2.0,0.0
649,2023-04-29,Wolves,Brighton,0.0,3.0,0.0,6.0
650,2023-05-06,Wolves,Aston Villa,1.0,0.0,1.0,0.0


In [82]:
merged = combined.merge(combined, left_on=["Date", "Team"], right_on=["Date", "Opponent"])

In [83]:
merged

Unnamed: 0,Date,Team_x,Opponent_x,Predicted_GF_x,Predicted_GA_x,Actual_GF_x,Actual_GA_x,Team_y,Opponent_y,Predicted_GF_y,Predicted_GA_y,Actual_GF_y,Actual_GA_y
0,2023-03-04,Arsenal,Bournemouth,3.0,2.0,3.0,2.0,Bournemouth,Arsenal,2.0,2.0,2.0,3.0
1,2023-03-12,Arsenal,Fulham,3.0,0.0,3.0,0.0,Fulham,Arsenal,0.0,3.0,0.0,3.0
2,2023-03-19,Arsenal,Crystal Palace,3.0,1.0,4.0,1.0,Crystal Palace,Arsenal,1.0,2.0,1.0,4.0
3,2023-04-01,Arsenal,Leeds United,3.0,1.0,4.0,1.0,Leeds United,Arsenal,1.0,2.0,1.0,4.0
4,2023-04-09,Arsenal,Liverpool,2.0,1.0,2.0,2.0,Liverpool,Arsenal,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,2023-04-22,Wolves,Leicester City,1.0,2.0,1.0,2.0,Leicester City,Wolves,2.0,1.0,2.0,1.0
202,2023-04-25,Wolves,Crystal Palace,1.0,0.0,2.0,0.0,Crystal Palace,Wolves,0.0,1.0,0.0,2.0
203,2023-04-29,Wolves,Brighton,0.0,3.0,0.0,6.0,Brighton and Hove Albion,Wolves,3.0,0.0,6.0,0.0
204,2023-05-06,Wolves,Aston Villa,1.0,0.0,1.0,0.0,Aston Villa,Wolves,0.0,1.0,0.0,1.0
