In [2]:
#Define Project Objectives:
 #Predict the winner of football matches in the English Premier League (EPL) using machine learning.


In [6]:
#Data Collection
# Step 1: Data Collection (Web Scraping Football Matches From The EPL)

In [4]:
#Data cleaning

In [8]:
import pandas as pd

In [9]:
matches = pd.read_csv("matches_17-23.csv", index_col=0)
matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2.0,1.0,Nott'ham Forest,...,Match Report,,15.0,7.0,19.1,0.0,0,0,2022,Arsenal
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1.0,0.0,Crystal Palace,...,Match Report,,13.0,2.0,16.4,0.0,1,1,2022,Arsenal
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2.0,2.0,Fulham,...,Match Report,,18.0,9.0,13.8,0.0,1,1,2022,Arsenal
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,Match Report,,17.0,5.0,15.0,0.0,0,0,2022,Arsenal
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,Match Report,,13.0,4.0,17.4,0.0,0,0,2022,Arsenal


In [79]:
matches.shape

(3382, 27)

In [80]:
# how many matches we have for each team
matches["Team"].value_counts()

Brighton and Hove Albion    170
Aston Villa                 170
Crystal Palace              170
Arsenal                     169
Everton                     169
Liverpool                   169
Manchester City             169
Tottenham Hotspur           169
Newcastle United            169
Manchester United           169
West Ham United             169
Chelsea                     169
Wolverhampton Wanderers     169
Leicester City              152
Southampton                 152
Burnley                     131
Leeds United                114
Sheffield United             94
Brentford                    93
Fulham                       93
Bournemouth                  92
Watford                      76
Norwich City                 76
Nottingham Forest            55
West Bromwich Albion         38
Luton Town                   16
Name: Team, dtype: int64

In [82]:
#Checking the data types of each column in the 'matches' DataFrame
matches.dtypes

Date             object
Time             object
Comp             object
Round            object
Day              object
Venue            object
Result           object
GF              float64
GA              float64
Opponent         object
xG              float64
xGA             float64
Poss            float64
Attendance      float64
Captain          object
Formation        object
Referee          object
Match Report     object
Notes           float64
Sh              float64
SoT             float64
Dist            float64
FK              float64
PK                int64
PKatt             int64
Season            int64
Team             object
dtype: object

In [10]:
#those are not useful for me 
del matches["Comp"]# we only work for premier league matches
del matches["Referee"]#not useful data
del matches["Match Report"]# not useful data
del matches["Notes"]#NA values

In [12]:
# Converting the 'Date' column to datetime format because it is an Object, that will help us to extract the day, months,
#year.... to use them in our machine learning algorithm 
matches["Date"] = pd.to_datetime(matches["Date"])

In [14]:
# Define a mapping for team name abbreviations
map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd",
              "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham",
              "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
# Replace team names in the DataFrame using the defined mapping
matches['Team'] = matches['Team'].replace(map_values)
matches['Opponent'] = matches['Opponent'].replace(map_values)
    

# Explain random forest model 

The Random Forest model is an ensemble learning method that constructs a multitude of decision trees during training and outputs the mode of the classes for classification tasks or the average prediction for regression tasks. By combining multiple trees, it improves accuracy, generalization, and mitigates overfitting compared to individual decision trees.

# Decision Trees

In [None]:
#create our target, what we want. we want wining only

In [85]:
matches["Target"] = (matches["Result"] == "W").astype("int")

In [13]:
#create our predictors, we need to know what affect the result most 

In [87]:
#This code line generates a binary feature, "venue_code," 
#indicating if a match occurred at the home venue (1 for True, 0 for False).
matches["venue_code"] = (matches["Venue"] == "Home").astype("int")

In [88]:
#the opponent predictor
matches["opp_code"] = matches["Opponent"].astype("category").cat.codes

In [96]:
matches["team_code"] = matches["Team"].astype("category").cat.codes

In [89]:
#hour predictor
matches["Hour"] = matches["Time"].str.replace(":.+", "", regex=True).astype("int")

In [214]:
#Day Predictor
matches["day_code"] = matches["Date"].dt.dayofweek

In [15]:
# Importing the RandomForestClassifier from scikit-learn
from sklearn.ensemble import RandomForestClassifier

In [200]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [201]:
train = matches[matches["Date"] < '2022-01-01']
test = matches[matches["Date"] > '2022-01-01']

In [202]:
predictors = ["venue_code", "opp_code", "Hour", "day_code"]

In [203]:
rf.fit(train[predictors], train["Target"])

In [204]:
preds = rf.predict(test[predictors])

In [205]:
from sklearn.metrics import accuracy_score
error = accuracy_score(test["Target"], preds)
error

0.5845637583892618

In [206]:
combined = pd.DataFrame(dict(actual=test["Target"], predicted=preds))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,703,203
1,416,168


In [207]:
from sklearn.metrics import precision_score

precision_score(test["Target"], preds)

0.4528301886792453

In [208]:
grouped_matches = matches.groupby("Team")
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

cols = ["GF", "GA", "Sh", "SoT", "Dist", "FK", "PK", "PKatt","xG"]
new_cols = [f"{c}_rolling" for c in cols]


In [219]:
matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,...,team_code,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling,xG_rolling
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,3,2019-09-01,16:30,Matchweek 4,Sun,Home,D,2.0,2.0,Tottenham,2.4,...,0,1.333333,1.333333,10.666667,4.666667,17.200000,0.333333,0.000000,0.000000,0.966667
Arsenal,4,2019-09-15,16:30,Matchweek 5,Sun,Away,D,2.0,2.0,Watford,0.8,...,0,1.666667,2.000000,16.666667,6.666667,18.600000,1.000000,0.000000,0.000000,1.400000
Arsenal,6,2019-09-22,16:30,Matchweek 6,Sun,Home,W,3.0,2.0,Aston Villa,2.4,...,0,1.666667,2.333333,14.000000,5.000000,19.300000,1.000000,0.000000,0.000000,1.400000
Arsenal,8,2019-09-30,20:00,Matchweek 7,Mon,Away,D,1.0,1.0,Manchester Utd,1.9,...,0,2.333333,2.000000,17.666667,5.666667,18.600000,1.333333,0.333333,0.333333,1.866667
Arsenal,10,2019-10-06,14:00,Matchweek 8,Sun,Home,W,1.0,0.0,Bournemouth,1.2,...,0,2.000000,1.666667,12.333333,4.666667,18.100000,1.000000,0.333333,0.333333,1.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolves,14,2023-11-27,20:00,Matchweek 13,Mon,Away,L,2.0,3.0,Fulham,1.4,...,25,1.666667,1.666667,12.666667,4.333333,17.733333,0.666667,0.000000,0.000000,1.400000
Wolves,15,2023-12-02,15:00,Matchweek 14,Sat,Away,L,1.0,2.0,Arsenal,0.7,...,25,1.666667,2.000000,12.000000,4.000000,17.533333,0.333333,0.333333,0.333333,1.566667
Wolves,16,2023-12-05,19:30,Matchweek 15,Tue,Home,W,1.0,0.0,Burnley,0.8,...,25,1.666667,2.000000,10.666667,4.000000,15.300000,0.000000,0.333333,0.333333,1.400000
Wolves,17,2023-12-09,15:00,Matchweek 16,Sat,Home,D,1.0,1.0,Nott'ham Forest,1.2,...,25,1.333333,1.666667,7.333333,4.000000,14.400000,0.333333,0.333333,0.333333,0.966667


In [220]:
matches_rolling = matches_rolling.droplevel('Team')
matches_rolling

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,...,team_code,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling,xG_rolling
3,2019-09-01,16:30,Matchweek 4,Sun,Home,D,2.0,2.0,Tottenham,2.4,...,0,1.333333,1.333333,10.666667,4.666667,17.200000,0.333333,0.000000,0.000000,0.966667
4,2019-09-15,16:30,Matchweek 5,Sun,Away,D,2.0,2.0,Watford,0.8,...,0,1.666667,2.000000,16.666667,6.666667,18.600000,1.000000,0.000000,0.000000,1.400000
6,2019-09-22,16:30,Matchweek 6,Sun,Home,W,3.0,2.0,Aston Villa,2.4,...,0,1.666667,2.333333,14.000000,5.000000,19.300000,1.000000,0.000000,0.000000,1.400000
8,2019-09-30,20:00,Matchweek 7,Mon,Away,D,1.0,1.0,Manchester Utd,1.9,...,0,2.333333,2.000000,17.666667,5.666667,18.600000,1.333333,0.333333,0.333333,1.866667
10,2019-10-06,14:00,Matchweek 8,Sun,Home,W,1.0,0.0,Bournemouth,1.2,...,0,2.000000,1.666667,12.333333,4.666667,18.100000,1.000000,0.333333,0.333333,1.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,2023-11-27,20:00,Matchweek 13,Mon,Away,L,2.0,3.0,Fulham,1.4,...,25,1.666667,1.666667,12.666667,4.333333,17.733333,0.666667,0.000000,0.000000,1.400000
15,2023-12-02,15:00,Matchweek 14,Sat,Away,L,1.0,2.0,Arsenal,0.7,...,25,1.666667,2.000000,12.000000,4.000000,17.533333,0.333333,0.333333,0.333333,1.566667
16,2023-12-05,19:30,Matchweek 15,Tue,Home,W,1.0,0.0,Burnley,0.8,...,25,1.666667,2.000000,10.666667,4.000000,15.300000,0.000000,0.333333,0.333333,1.400000
17,2023-12-09,15:00,Matchweek 16,Sat,Home,D,1.0,1.0,Nott'ham Forest,1.2,...,25,1.333333,1.666667,7.333333,4.000000,14.400000,0.333333,0.333333,0.333333,0.966667


In [221]:
matches_rolling.index = range(matches_rolling.shape[0])

In [222]:
def make_predictions(data, predictors):
    train = data[data["Date"] < '2022-01-01']
    test = data[data["Date"] > '2022-01-01']
    rf.fit(train[predictors], train["Target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["Target"], predicted=preds), index=test.index)
    error = precision_score(test["Target"], preds)
    return combined, error

In [223]:
combined, error = make_predictions(matches_rolling, predictors + new_cols)
error

0.534375

In [245]:
combined = combined.merge(matches_rolling, left_index=True, right_index=True)

  combined = combined.merge(matches_rolling, left_index=True, right_index=True)


In [246]:
combined.to_csv("COMBINEDTO.csv")