SETUP

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score
import numpy as np

In [2]:
matches = pd.read_csv('matches.csv', index_col=0)
matches.shape

(760, 152)

In [3]:
matches["team"].value_counts()

Manchester City             38
Arsenal                     38
Leicester City              38
Leeds United                38
Everton                     38
Nottingham Forest           38
West Ham United             38
Bournemouth                 38
Wolverhampton Wanderers     38
Crystal Palace              38
Chelsea                     38
Fulham                      38
Brentford                   38
Aston Villa                 38
Tottenham Hotspur           38
Brighton and Hove Albion    38
Liverpool                   38
Manchester United           38
Newcastle United            38
Southampton                 38
Name: team, dtype: int64

In [4]:
matches["date"] = pd.to_datetime(matches["date"])
matches.dtypes

date       datetime64[ns]
time               object
comp               object
round              object
day                object
                ...      
npxg              float64
npxg/sh           float64
g-xg              float64
np:g-xg           float64
team               object
Length: 152, dtype: object

In [5]:
# Convert column data to numeric
matches["venue_code"] = matches["venue"].astype('category').cat.codes
matches["opp_code"] = matches["opponent"].astype('category').cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek
matches["formation_code"] = matches["formation"].astype('category').cat.codes
matches = matches[matches["date"] < '2023-05-15']
matches["pts"] = matches["result"].map({'W': 3, 'D': 1, 'L': 0})



CREATE ROLLING TABLE

In [6]:
grouped_matches = matches.groupby("team")

In [7]:
all_cols = matches.columns.tolist()

In [8]:
matches.index = range(matches.shape[0])

In [9]:
valid_cols = matches.select_dtypes(include=['int8', 'int64', 'float64', 'int32']).columns.tolist()
tot = 0
for col in valid_cols:
    for i in range(matches.shape[0]):
        val = matches.loc[i, col]
        if pd.isnull(val):
            tot+=1
            average = sum(matches[col][:i])/(i+1)
            matches.at[i, col] = average
            # matches[col][index] = average
print(tot)

107


In [10]:
total = 0
for col in matches.columns:
    nan_locations = matches.index[matches[col].isna()].tolist()
    if nan_locations:
        total += len(nan_locations)
        print(col, ":", nan_locations,";", len(nan_locations)) 
print(total)

0


In [11]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(5, closed='left').mean()
    rolling = rolling_stats[cols].fillna(rolling_stats)
    rolling_stats_renamed = rolling_stats.rename(columns=dict(zip(cols, new_cols)))
    merged = pd.concat([group, rolling_stats_renamed], axis=1)
    merged = merged.dropna(subset=new_cols)
    return merged

In [12]:
cols = all_cols[7:9] + all_cols[10:11] + all_cols[12:151] + all_cols[-1:]
rolling_cols = [f"{c}_rolling" for c in cols]

In [13]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, rolling_cols))

In [14]:
matches_rolling = matches_rolling.droplevel('team')

In [15]:
matches_rolling.index = range(matches_rolling.shape[0])

In [16]:
matches_rolling.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,avgdistofsh_rolling,fk_rolling,pk_rolling,pkattfor_rolling,xg_rolling,npxg_rolling,npxg/sh_rolling,g-xg_rolling,np:g-xg_rolling,pts_rolling
0,2022-09-04,16:30,Premier League,Matchweek 6,Sun,Away,L,1.0,3.0,Manchester Utd,...,14.84,0.6,0.0,0.0,2.0,2.0,0.12,0.4,0.4,3.0
1,2022-09-18,12:00,Premier League,Matchweek 8,Sun,Away,W,3.0,0.0,Brentford,...,15.64,0.6,0.0,0.0,2.06,2.06,0.116,0.34,0.34,2.4
2,2022-10-01,12:30,Premier League,Matchweek 9,Sat,Home,W,3.0,1.0,Tottenham,...,16.68,0.6,0.0,0.0,1.82,1.82,0.108,0.38,0.38,2.4
3,2022-10-09,16:30,Premier League,Matchweek 10,Sun,Home,W,3.0,2.0,Liverpool,...,17.44,0.8,0.0,0.0,2.04,2.04,0.11,0.16,0.16,2.4
4,2022-10-16,14:00,Premier League,Matchweek 11,Sun,Away,W,1.0,0.0,Leeds United,...,17.2,0.6,0.2,0.2,2.06,1.98,0.134,0.34,0.22,2.4


In [17]:
predictors = ["venue_code", "opp_code", "hour", "day_code", "formation_code"] + rolling_cols
len(predictors)

148

In [18]:
matches_rolling.to_csv('matches_rolling.csv')

In [19]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton & Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd", 
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

mapping = MissingDict(**map_values)

START OF ML ALGORITHM

RANDOM FOREST, SINGLE TREE

In [20]:
def make_predictions(data, predictors, rf):
    train = data[data["date"] < '2023-04-01']
    test = data[data["date"] > '2023-04-01']
    X_train = train[predictors]
    y_train = train[["gf", "ga"]]
    X_test = test[predictors]
    y_test = test[["gf", "ga"]]
    
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)
    combined = pd.DataFrame({
        "Date": test["date"],
        "Team": test["team"],
        "Opponent": test["opponent"],
        "Predicted_GF": preds[:, 0],  # Assuming "gf" is the first column in predictions
        "Predicted_GA": preds[:, 1],  # Assuming "ga" is the second column in predictions
        "Actual_GF": y_test["gf"],
        "Actual_GA": y_test["ga"]
    })
    # precision = precision_score(test[["gf", "ga"]], preds)
    # return combined, precision
    return combined

In [21]:
rf1 = RandomForestClassifier(min_samples_split=10, random_state=1)

In [22]:
# combined, precision = make_predictions(matches_rolling, predictors)
combined = make_predictions(matches_rolling, predictors, rf1)

In [23]:
combined["Team"] = combined["Team"].map(mapping)

In [24]:
merged = combined.merge(combined, left_on=["Date", "Team"], right_on=["Date", "Opponent"])

In [25]:
merged[merged["Team_x"] == "Manchester City"]

Unnamed: 0,Date,Team_x,Opponent_x,Predicted_GF_x,Predicted_GA_x,Actual_GF_x,Actual_GA_x,Team_y,Opponent_y,Predicted_GF_y,Predicted_GA_y,Actual_GF_y,Actual_GA_y
84,2023-04-08,Manchester City,Southampton,3.0,0.0,4.0,1.0,Southampton,Manchester City,0.0,1.0,1.0,4.0
85,2023-04-15,Manchester City,Leicester City,1.0,0.0,3.0,1.0,Leicester City,Manchester City,0.0,1.0,1.0,3.0
86,2023-04-26,Manchester City,Arsenal,3.0,0.0,4.0,1.0,Arsenal,Manchester City,0.0,0.0,1.0,4.0
87,2023-04-30,Manchester City,Fulham,3.0,0.0,2.0,1.0,Fulham,Manchester City,0.0,0.0,1.0,2.0
88,2023-05-03,Manchester City,West Ham,1.0,0.0,3.0,0.0,West Ham,Manchester City,0.0,0.0,0.0,3.0
89,2023-05-06,Manchester City,Leeds United,3.0,1.0,2.0,1.0,Leeds United,Manchester City,0.0,1.0,1.0,2.0
90,2023-05-14,Manchester City,Everton,3.0,1.0,3.0,0.0,Everton,Manchester City,0.0,1.0,0.0,3.0


In [26]:
merged[merged["Team_x"] == "Arsenal"]

Unnamed: 0,Date,Team_x,Opponent_x,Predicted_GF_x,Predicted_GA_x,Actual_GF_x,Actual_GA_x,Team_y,Opponent_y,Predicted_GF_y,Predicted_GA_y,Actual_GF_y,Actual_GA_y
0,2023-04-09,Arsenal,Liverpool,3.0,0.0,2.0,2.0,Liverpool,Arsenal,0.0,0.0,2.0,2.0
1,2023-04-16,Arsenal,West Ham,1.0,0.0,2.0,2.0,West Ham,Arsenal,1.0,1.0,2.0,2.0
2,2023-04-21,Arsenal,Southampton,0.0,0.0,3.0,3.0,Southampton,Arsenal,0.0,1.0,3.0,3.0
3,2023-04-26,Arsenal,Manchester City,0.0,0.0,1.0,4.0,Manchester City,Arsenal,3.0,0.0,4.0,1.0
4,2023-05-02,Arsenal,Chelsea,1.0,0.0,3.0,1.0,Chelsea,Arsenal,2.0,0.0,1.0,3.0
5,2023-05-07,Arsenal,Newcastle Utd,1.0,0.0,2.0,0.0,Newcastle Utd,Arsenal,0.0,0.0,0.0,2.0
6,2023-05-14,Arsenal,Brighton,1.0,0.0,0.0,3.0,Brighton and Hove Albion,Arsenal,1.0,1.0,3.0,0.0


In [27]:
merged.to_csv('predictions.csv')

In [28]:
correct_preds = merged[(merged["Predicted_GF_x"] == merged["Actual_GF_x"]) & (merged["Predicted_GA_x"] == merged["Actual_GA_x"])].shape[0]
correct_wins = merged[(merged["Predicted_GF_x"] > merged["Predicted_GA_x"]) & (merged["Actual_GF_x"] > merged["Actual_GA_x"])].shape[0]
correct_loss = merged[(merged["Predicted_GF_x"] < merged["Predicted_GA_x"]) & (merged["Actual_GF_x"] < merged["Actual_GA_x"])].shape[0]
correct_draws = merged[(merged["Predicted_GF_x"] == merged["Predicted_GA_x"]) & (merged["Actual_GF_x"] == merged["Actual_GA_x"])].shape[0]

In [29]:
print("correct results:", (correct_wins + correct_loss + correct_draws)/merged.shape[0])
print("correct scorelines:", correct_preds/merged.shape[0])

correct results: 0.3380281690140845
correct scorelines: 0.07042253521126761


RANDOM FOREST (GROUP MATCHES TO MAKE 1 PREDICTION)

In [30]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,npxg/sh,g-xg,np:g-xg,team,venue_code,opp_code,hour,day_code,formation_code,pts
0,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,0.11,-0.2,-0.4,Manchester City,0,18,16,6,13,3
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,...,0.09,1.3,1.3,Manchester City,1,2,15,5,10,3
2,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3.0,3.0,Newcastle Utd,...,0.1,0.9,0.9,Manchester City,0,14,16,6,13,1
3,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,0.13,1.8,1.8,Manchester City,1,6,15,5,10,3
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,0.2,2.7,2.7,Manchester City,1,15,19,2,10,3


RANDOM FOREST, 1 TREE/TEAM

In [31]:
matches_rolling["team"]

0                      Arsenal
1                      Arsenal
2                      Arsenal
3                      Arsenal
4                      Arsenal
                ...           
607    Wolverhampton Wanderers
608    Wolverhampton Wanderers
609    Wolverhampton Wanderers
610    Wolverhampton Wanderers
611    Wolverhampton Wanderers
Name: team, Length: 612, dtype: object