In [1]:
#importing data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df=pd.read_csv('Premier_League_Results.csv')
df.head()

Unnamed: 0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Unnamed: 10
0,01/01/24,20,Home,W,4,2,Newcastle,7.2,0.67,Liverpool,
1,12/26/23,19,Away,W,2,0,Burnley,1.75,0.72,Liverpool,
2,12/23/23,18,Home,D,1,1,Arsenal,1.01,0.87,Liverpool,
3,12/17/23,17,Home,D,0,0,Manchester Utd,2.66,0.52,Liverpool,
4,09/12/23,16,Away,W,2,1,Crystal Palace,1.26,1.99,Liverpool,


In [2]:
#converting date to correct format for reading in ml algorithm

df["Date"] = pd.to_datetime(df["Date"])

  df["Date"] = pd.to_datetime(df["Date"])


In [3]:
df.dtypes

Date             datetime64[ns]
Round                     int64
Venue                    object
Result                   object
Goals_For                 int64
Goals_Against             int64
Opponent                 object
XGF                     float64
XGA                     float64
Team                     object
Unnamed: 10              object
dtype: object

In [4]:
#converting appropriate variables for ml algorithm to correctly read and predict

df["Venue_Code"] = df["Venue"].astype("category").cat.codes

In [5]:
df["Opp_Code"] = df["Opponent"].astype("category").cat.codes

In [6]:
df["Target"] = (df["Result"] == "W").astype("int")

In [7]:
df

Unnamed: 0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Unnamed: 10,Venue_Code,Opp_Code,Target
0,2024-01-01,20,Home,W,4,2,Newcastle,7.20,0.67,Liverpool,,1,16,1
1,2023-12-26,19,Away,W,2,0,Burnley,1.75,0.72,Liverpool,,0,5,1
2,2023-12-23,18,Home,D,1,1,Arsenal,1.01,0.87,Liverpool,,1,0,0
3,2023-12-17,17,Home,D,0,0,Manchester Utd,2.66,0.52,Liverpool,,1,15,0
4,2023-09-12,16,Away,W,2,1,Crystal Palace,1.26,1.99,Liverpool,,0,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,2023-03-15,27,Home,L,0,2,Brentford,0.55,2.15,Southampton,,1,3,0
712,2023-03-12,26,Away,D,0,0,Manchester Utd,1.25,0.92,Southampton,,0,15,0
713,2023-03-04,25,Home,W,1,0,Leicester,1.52,2.09,Southampton,,1,11,1
714,2023-02-25,24,Away,L,0,1,Leeds,0.39,0.94,Southampton,,0,10,0


In [8]:
df['Venue']=df['Venue'].astype('category').cat.codes

In [9]:
#strange column added into dataset which is now removed

df = df.drop('Unnamed: 10', axis = 1)

In [10]:
df.dropna()

Unnamed: 0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Venue_Code,Opp_Code,Target
0,2024-01-01,20,1,W,4,2,Newcastle,7.20,0.67,Liverpool,1,16,1
1,2023-12-26,19,0,W,2,0,Burnley,1.75,0.72,Liverpool,0,5,1
2,2023-12-23,18,1,D,1,1,Arsenal,1.01,0.87,Liverpool,1,0,0
3,2023-12-17,17,1,D,0,0,Manchester Utd,2.66,0.52,Liverpool,1,15,0
4,2023-09-12,16,0,W,2,1,Crystal Palace,1.26,1.99,Liverpool,0,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,2023-03-15,27,1,L,0,2,Brentford,0.55,2.15,Southampton,1,3,0
712,2023-03-12,26,0,D,0,0,Manchester Utd,1.25,0.92,Southampton,0,15,0
713,2023-03-04,25,1,W,1,0,Leicester,1.52,2.09,Southampton,1,11,1
714,2023-02-25,24,0,L,0,1,Leeds,0.39,0.94,Southampton,0,10,0


In [11]:
#importing random forest model

from sklearn.ensemble import RandomForestClassifier

In [12]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [13]:
#setting up train and test, data is from Feb 2023

train = df[df["Date"] < '2023-10-29']

In [14]:
test = df[df["Date"] > '2023-10-29']

In [15]:
#These two variables used as the predictors for the model, can add more predictors

predictors = ["Venue_Code", "Opp_Code","XGF","XGA"]

In [16]:
rf.fit(train[predictors], train["Target"])

In [17]:
preds = rf.predict(test[predictors])

In [18]:
#testing accuracy of model

from sklearn.metrics import accuracy_score

In [19]:
acc = accuracy_score(test["Target"], preds)

In [20]:
acc

0.6538461538461539

In [21]:
combined = pd.DataFrame(dict(actual=test["Target"], prediction=preds))

In [22]:
#comparing how many times we predicted a result and how many times we were right against how many times we were wrong

pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,60,15
1,30,25


In [23]:
#importing precision score

from sklearn.metrics import precision_score

In [24]:
precision_score(test["Target"], preds)

0.625

In [25]:
grouped_df = df.groupby("Team")

In [26]:
#running a rolling average to help improve model accuracy

def rolling_average(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [27]:
cols = ["Goals_For", "Goals_Against"]
new_cols = [f"{c}_rolling" for c in cols]

In [28]:
new_cols

['Goals_For_rolling', 'Goals_Against_rolling']

In [29]:
df_rolling = df.groupby("Team").apply(lambda x: rolling_average(x, cols, new_cols))

In [30]:
df_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Venue_Code,Opp_Code,Target,Goals_For_rolling,Goals_Against_rolling
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Arsenal,425,2023-02-25,25,0,W,1,0,Leicester,0.69,0.02,Arsenal,0,11,1,2.333333,1.333333
Arsenal,424,2023-03-04,26,1,W,3,2,Bournemouth,2.77,1.45,Arsenal,1,2,1,2.333333,1.000000
Arsenal,75,2023-03-09,4,1,W,3,1,Manchester Utd,1.41,0.94,Arsenal,1,15,1,2.666667,1.333333
Arsenal,423,2023-03-12,27,0,W,0,3,Fulham,2.52,0.84,Arsenal,0,9,1,2.333333,1.000000
Arsenal,422,2023-03-19,28,1,W,4,1,Crystal Palace,1.57,0.64,Arsenal,1,7,1,2.000000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolves,206,2023-11-27,13,0,L,2,3,Fulham,1.46,2.97,Wolves,0,9,0,2.000000,1.333333
Wolves,202,2023-12-17,17,0,L,0,3,West Ham,0.61,0.99,Wolves,0,21,0,2.000000,2.000000
Wolves,201,2023-12-24,18,1,W,2,1,Chelsea,1.39,1.94,Wolves,1,6,1,1.333333,2.333333
Wolves,200,2023-12-27,19,0,W,4,1,Brentford,1.80,2.32,Wolves,0,3,1,1.333333,2.333333


In [31]:
df_rolling.index = range(df_rolling.shape[0])

In [32]:
df_rolling

Unnamed: 0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Venue_Code,Opp_Code,Target,Goals_For_rolling,Goals_Against_rolling
0,2023-02-25,25,0,W,1,0,Leicester,0.69,0.02,Arsenal,0,11,1,2.333333,1.333333
1,2023-03-04,26,1,W,3,2,Bournemouth,2.77,1.45,Arsenal,1,2,1,2.333333,1.000000
2,2023-03-09,4,1,W,3,1,Manchester Utd,1.41,0.94,Arsenal,1,15,1,2.666667,1.333333
3,2023-03-12,27,0,W,0,3,Fulham,2.52,0.84,Arsenal,0,9,1,2.333333,1.000000
4,2023-03-19,28,1,W,4,1,Crystal Palace,1.57,0.64,Arsenal,1,7,1,2.000000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
642,2023-11-27,13,0,L,2,3,Fulham,1.46,2.97,Wolves,0,9,0,2.000000,1.333333
643,2023-12-17,17,0,L,0,3,West Ham,0.61,0.99,Wolves,0,21,0,2.000000,2.000000
644,2023-12-24,18,1,W,2,1,Chelsea,1.39,1.94,Wolves,1,6,1,1.333333,2.333333
645,2023-12-27,19,0,W,4,1,Brentford,1.80,2.32,Wolves,0,3,1,1.333333,2.333333


In [33]:
#making predictions based off new combined preds

def make_predictions(data, predictors):
    train = data[data["Date"] < '2023-11-25']
    test = data[data["Date"] > '2023-11-25']
    rf.fit(train[predictors], train["Target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(atual=test["Target"], predicted=preds), index=test.index)
    precision = precision_score(test["Target"], preds)
    return combined, precision



In [34]:
combined, precision = make_predictions(df_rolling, predictors + new_cols)

In [35]:
precision

0.7096774193548387

In [36]:
combined

Unnamed: 0,atual,predicted
28,1,1
29,1,1
30,0,0
31,0,1
32,0,0
...,...,...
642,0,0
643,0,0
644,1,0
645,1,0


In [37]:
combined = combined.merge(df_rolling[["Date", "Team", "Result", "Opponent"]], left_index=True, right_index=True)

In [38]:
combined

Unnamed: 0,atual,predicted,Date,Team,Result,Opponent
28,1,1,2023-12-08,Arsenal,W,Nottingham Forest
29,1,1,2023-12-17,Arsenal,W,Brighton
30,0,0,2023-12-23,Arsenal,D,Liverpool
31,0,1,2023-12-28,Arsenal,L,West Ham
32,0,0,2023-12-31,Arsenal,L,Fulham
...,...,...,...,...,...,...
642,0,0,2023-11-27,Wolves,L,Fulham
643,0,0,2023-12-17,Wolves,L,West Ham
644,1,0,2023-12-24,Wolves,W,Chelsea
645,1,0,2023-12-27,Wolves,W,Brentford


In [39]:
acc

0.6538461538461539

In [40]:
combined["new_team"] = combined["Team"]

In [41]:
merged = combined.merge(combined, left_on=["Date", "new_team"], right_on=["Date", "Opponent"])

In [42]:
merged

Unnamed: 0,atual_x,predicted_x,Date,Team_x,Result_x,Opponent_x,new_team_x,atual_y,predicted_y,Team_y,Result_y,Opponent_y,new_team_y
0,1,1,2023-12-08,Arsenal,W,Nottingham Forest,Arsenal,0,0,Nottingham Forest,L,Arsenal,Nottingham Forest
1,1,1,2023-12-17,Arsenal,W,Brighton,Arsenal,0,0,Brighton,L,Arsenal,Brighton
2,0,0,2023-12-23,Arsenal,D,Liverpool,Arsenal,0,0,Liverpool,D,Arsenal,Liverpool
3,0,1,2023-12-28,Arsenal,L,West Ham,Arsenal,1,0,West Ham,W,Arsenal,West Ham
4,0,0,2023-12-31,Arsenal,L,Fulham,Arsenal,1,0,Fulham,W,Arsenal,Fulham
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,0,0,2023-11-27,Wolves,L,Fulham,Wolves,1,1,Fulham,W,Wolves,Fulham
100,0,0,2023-12-17,Wolves,L,West Ham,Wolves,1,1,West Ham,W,Wolves,West Ham
101,1,0,2023-12-24,Wolves,W,Chelsea,Wolves,0,0,Chelsea,L,Wolves,Chelsea
102,1,0,2023-12-27,Wolves,W,Brentford,Wolves,0,0,Brentford,L,Wolves,Brentford


In [43]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)] ["atual_x"].value_counts()

atual_x
1    22
0     9
Name: count, dtype: int64

In [48]:
9 / 22

0.4090909090909091

In [45]:
acc

0.6538461538461539

In [46]:
precision_score(test["Target"], preds)

0.625