In [71]:
#importing data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df=pd.read_csv('Premier_League_Results.csv')
df.head() 

Unnamed: 0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,05/19/2024,38,Home,W,2,0,Wolves,4.46,0.62,Liverpool,,,
1,05/13/2024,37,Away,D,3,3,Aston Villa,1.72,3.31,Liverpool,,,
2,05/05/2024,36,Home,W,4,2,Tottenham,3.19,1.16,Liverpool,,,
3,04/27/2024,35,Away,D,2,2,West Ham,1.74,0.82,Liverpool,,,
4,04/24/2024,34,Away,W,3,1,Fulham,1.12,0.69,Liverpool,,,


In [73]:
df.dtypes

Date              object
Round              int64
Venue             object
Result            object
Goals_For          int64
Goals_Against      int64
Opponent          object
XGF              float64
XGA              float64
Team              object
Unnamed: 10       object
Unnamed: 11      float64
Unnamed: 12       object
dtype: object

In [75]:
#converting date to correct format for reading in ml algorithm

df["Date"] = pd.to_datetime(df["Date"])

In [77]:
#converting appropriate variables for ml algorithm to correctly read and predict

df["Venue_Code"] = df["Venue"].astype("category").cat.codes

In [79]:
df["Opp_Code"] = df["Opponent"].astype("category").cat.codes

In [81]:
df["Target"] = (df["Result"] == "W").astype("int")

In [83]:
df

Unnamed: 0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Unnamed: 10,Unnamed: 11,Unnamed: 12,Venue_Code,Opp_Code,Target
0,2024-05-19,38,Home,W,2,0,Wolves,4.46,0.62,Liverpool,,,,1,22,1
1,2024-05-13,37,Away,D,3,3,Aston Villa,1.72,3.31,Liverpool,,,,0,1,0
2,2024-05-05,36,Home,W,4,2,Tottenham,3.19,1.16,Liverpool,,,,1,20,1
3,2024-04-27,35,Away,D,2,2,West Ham,1.74,0.82,Liverpool,,,,0,21,0
4,2024-04-24,34,Away,W,3,1,Fulham,1.12,0.69,Liverpool,,,,0,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,2023-03-15,27,Home,L,0,2,Brentford,0.55,2.15,Southampton,,,,1,3,0
1076,2023-03-12,26,Away,D,0,0,Manchester Utd,1.25,0.92,Southampton,,,,0,15,0
1077,2023-03-04,25,Home,W,1,0,Leicester,1.52,2.09,Southampton,,,,1,11,1
1078,2023-02-25,24,Away,L,0,1,Leeds,0.39,0.94,Southampton,,,,0,10,0


In [85]:
#strange column added into dataset which is now removed

df = df.drop('Unnamed: 10', axis = 1)

In [87]:
#strange column added into dataset which is now removed

df = df.drop('Unnamed: 11', axis = 1)

In [89]:
#strange column added into dataset which is now removed

df = df.drop('Unnamed: 12', axis = 1)

In [91]:
df

Unnamed: 0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Venue_Code,Opp_Code,Target
0,2024-05-19,38,Home,W,2,0,Wolves,4.46,0.62,Liverpool,1,22,1
1,2024-05-13,37,Away,D,3,3,Aston Villa,1.72,3.31,Liverpool,0,1,0
2,2024-05-05,36,Home,W,4,2,Tottenham,3.19,1.16,Liverpool,1,20,1
3,2024-04-27,35,Away,D,2,2,West Ham,1.74,0.82,Liverpool,0,21,0
4,2024-04-24,34,Away,W,3,1,Fulham,1.12,0.69,Liverpool,0,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,2023-03-15,27,Home,L,0,2,Brentford,0.55,2.15,Southampton,1,3,0
1076,2023-03-12,26,Away,D,0,0,Manchester Utd,1.25,0.92,Southampton,0,15,0
1077,2023-03-04,25,Home,W,1,0,Leicester,1.52,2.09,Southampton,1,11,1
1078,2023-02-25,24,Away,L,0,1,Leeds,0.39,0.94,Southampton,0,10,0


In [93]:
#importing random forest model

from sklearn.ensemble import RandomForestClassifier

In [95]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [97]:
#setting up train and test, data is from Feb 2023

train = df[df["Date"] < '2023-10-29']

In [99]:
test = df[df["Date"] > '2023-10-29']

In [101]:
#These two variables used as the predictors for the model, can add more predictors

predictors = ["Venue_Code", "Opp_Code","XGF","XGA"]

In [103]:
rf.fit(train[predictors], train["Target"])

In [105]:
preds = rf.predict(test[predictors])

In [107]:
#testing accuracy of model

from sklearn.metrics import accuracy_score

In [109]:
acc = accuracy_score(test["Target"], preds)

In [111]:
acc

0.7424547283702213

In [113]:
combined = pd.DataFrame(dict(actual=test["Target"], prediction=preds))

In [115]:
#comparing how many times we predicted a result and how many times we were right against how many times we were wrong

pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,252,51
1,77,117


In [117]:
#importing precision score

from sklearn.metrics import precision_score

In [119]:
precision_score(test["Target"], preds)

0.6964285714285714

In [121]:
grouped_df = df.groupby("Team")

In [123]:
#running a rolling average to help improve model accuracy

def rolling_average(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [125]:
cols = ["Goals_For", "Goals_Against"]
new_cols = [f"{c}_rolling" for c in cols]

In [127]:
new_cols

['Goals_For_rolling', 'Goals_Against_rolling']

In [129]:
df_rolling = df.groupby("Team").apply(lambda x: rolling_average(x, cols, new_cols))

In [131]:
df_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Venue_Code,Opp_Code,Target,Goals_For_rolling,Goals_Against_rolling
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Arsenal,789,2023-02-25,25,Away,W,1,0,Leicester,0.69,0.02,Arsenal,0,11,1,2.333333,1.333333
Arsenal,788,2023-03-04,26,Home,W,3,2,Bournemouth,2.77,1.45,Arsenal,1,2,1,2.333333,1.000000
Arsenal,148,2023-03-09,4,Home,W,3,1,Manchester Utd,1.41,0.94,Arsenal,1,15,1,2.666667,1.333333
Arsenal,787,2023-03-12,27,Away,W,0,3,Fulham,2.52,0.84,Arsenal,0,9,1,2.333333,1.000000
Arsenal,786,2023-03-19,28,Home,W,4,1,Crystal Palace,1.57,0.64,Arsenal,1,7,1,2.000000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolves,389,2024-04-24,29,Home,L,0,1,Bournemouth,0.53,1.99,Wolves,1,2,0,1.000000,2.000000
Wolves,383,2024-04-27,35,Home,W,2,1,Luton,1.17,0.45,Wolves,1,13,1,0.666667,1.666667
Wolves,382,2024-05-04,36,Away,L,1,5,Manchester City,0.37,3.03,Wolves,0,14,0,0.666667,1.333333
Wolves,381,2024-05-11,37,Home,L,1,3,Crystal Palace,1.16,3.00,Wolves,1,7,0,1.000000,2.333333


In [133]:
df_rolling.index = range(df_rolling.shape[0])

In [135]:
df_rolling

Unnamed: 0,Date,Round,Venue,Result,Goals_For,Goals_Against,Opponent,XGF,XGA,Team,Venue_Code,Opp_Code,Target,Goals_For_rolling,Goals_Against_rolling
0,2023-02-25,25,Away,W,1,0,Leicester,0.69,0.02,Arsenal,0,11,1,2.333333,1.333333
1,2023-03-04,26,Home,W,3,2,Bournemouth,2.77,1.45,Arsenal,1,2,1,2.333333,1.000000
2,2023-03-09,4,Home,W,3,1,Manchester Utd,1.41,0.94,Arsenal,1,15,1,2.666667,1.333333
3,2023-03-12,27,Away,W,0,3,Fulham,2.52,0.84,Arsenal,0,9,1,2.333333,1.000000
4,2023-03-19,28,Home,W,4,1,Crystal Palace,1.57,0.64,Arsenal,1,7,1,2.000000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006,2024-04-24,29,Home,L,0,1,Bournemouth,0.53,1.99,Wolves,1,2,0,1.000000,2.000000
1007,2024-04-27,35,Home,W,2,1,Luton,1.17,0.45,Wolves,1,13,1,0.666667,1.666667
1008,2024-05-04,36,Away,L,1,5,Manchester City,0.37,3.03,Wolves,0,14,0,0.666667,1.333333
1009,2024-05-11,37,Home,L,1,3,Crystal Palace,1.16,3.00,Wolves,1,7,0,1.000000,2.333333


In [137]:
#making predictions based off new combined preds

def make_predictions(data, predictors):
    train = data[data["Date"] < '2023-11-25']
    test = data[data["Date"] > '2023-11-25']
    rf.fit(train[predictors], train["Target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(atual=test["Target"], predicted=preds), index=test.index)
    precision = precision_score(test["Target"], preds)
    return combined, precision


In [139]:
combined, precision = make_predictions(df_rolling, predictors + new_cols)

In [141]:
precision

0.6870748299319728

In [143]:
combined

Unnamed: 0,atual,predicted
28,1,0
29,1,1
30,0,0
31,0,1
32,0,0
...,...,...
1006,0,0
1007,1,1
1008,0,0
1009,0,0


In [145]:
combined = combined.merge(df_rolling[["Date", "Team", "Result", "Opponent"]], left_index=True, right_index=True)

In [147]:
combined

Unnamed: 0,atual,predicted,Date,Team,Result,Opponent
28,1,0,2023-12-08,Arsenal,W,Nottingham Forest
29,1,1,2023-12-17,Arsenal,W,Brighton
30,0,0,2023-12-23,Arsenal,D,Liverpool
31,0,1,2023-12-28,Arsenal,L,West Ham
32,0,0,2023-12-31,Arsenal,L,Fulham
...,...,...,...,...,...,...
1006,0,0,2024-04-24,Wolves,L,Bournemouth
1007,1,1,2024-04-27,Wolves,W,Luton
1008,0,0,2024-05-04,Wolves,L,Manchester City
1009,0,0,2024-05-11,Wolves,L,Crystal Palace


In [149]:
acc

0.7424547283702213

In [151]:
combined["new_team"] = combined["Team"]

In [153]:
merged = combined.merge(combined, left_on=["Date", "new_team"], right_on=["Date", "Opponent"])

In [155]:
merged

Unnamed: 0,atual_x,predicted_x,Date,Team_x,Result_x,Opponent_x,new_team_x,atual_y,predicted_y,Team_y,Result_y,Opponent_y,new_team_y
0,1,0,2023-12-08,Arsenal,W,Nottingham Forest,Arsenal,0,1,Nottingham Forest,L,Arsenal,Nottingham Forest
1,1,1,2023-12-17,Arsenal,W,Brighton,Arsenal,0,0,Brighton,L,Arsenal,Brighton
2,0,0,2023-12-23,Arsenal,D,Liverpool,Arsenal,0,0,Liverpool,D,Arsenal,Liverpool
3,0,1,2023-12-28,Arsenal,L,West Ham,Arsenal,1,0,West Ham,W,Arsenal,West Ham
4,0,0,2023-12-31,Arsenal,L,Fulham,Arsenal,1,0,Fulham,W,Arsenal,Fulham
...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,0,0,2024-04-24,Wolves,L,Bournemouth,Wolves,1,1,Bournemouth,W,Wolves,Bournemouth
458,1,1,2024-04-27,Wolves,W,Luton,Wolves,0,0,Luton,L,Wolves,Luton
459,0,0,2024-05-04,Wolves,L,Manchester City,Wolves,1,1,Manchester City,W,Wolves,Manchester City
460,0,0,2024-05-11,Wolves,L,Crystal Palace,Wolves,1,1,Crystal Palace,W,Wolves,Crystal Palace


In [157]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)] ["atual_x"].value_counts()

atual_x
1    98
0    45
Name: count, dtype: int64

In [161]:
45 / 98

0.45918367346938777

In [163]:
acc

0.7424547283702213

In [165]:
precision_score(test["Target"], preds)

0.6964285714285714