Premier League Games Prediction

In [1]:
import kagglehub
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the Kaggle EPL match dataset and report where it landed.
path = kagglehub.dataset_download(
    "marcohuiii/english-premier-league-epl-match-data-2000-2025"
)
print("Dataset downloaded to:", path)

# List the download directory so the CSV name used below can be checked.
files = os.listdir(path)
print("Files in dataset:", files)

# Read the match-level CSV into the frame that the later cells filter.
csv_path = os.path.join(path, "epl_final.csv")
df = pd.read_csv(csv_path)
print("DataFrame shape:", df.shape)

# Last expression: rich preview of the first rows.
df.head()

Dataset downloaded to: /Users/aashitpaliwal/.cache/kagglehub/datasets/marcohuiii/english-premier-league-epl-match-data-2000-2025/versions/3
Files in dataset: ['epl_final.csv']
DataFrame shape: (9380, 22)


Unnamed: 0,Season,MatchDate,HomeTeam,AwayTeam,FullTimeHomeGoals,FullTimeAwayGoals,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult,...,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeYellowCards,AwayYellowCards,HomeRedCards,AwayRedCards
0,2000/01,2000-08-19,Charlton,Man City,4,0,H,2,0,H,...,14,4,6,6,13,12,1,2,0,0
1,2000/01,2000-08-19,Chelsea,West Ham,4,2,H,1,0,H,...,10,5,7,7,19,14,1,2,0,0
2,2000/01,2000-08-19,Coventry,Middlesbrough,1,3,A,1,1,D,...,3,9,8,4,15,21,5,3,1,0
3,2000/01,2000-08-19,Derby,Southampton,2,2,D,1,2,A,...,4,6,5,8,11,13,1,1,0,0
4,2000/01,2000-08-19,Leeds,Everton,2,0,H,2,0,H,...,8,6,6,4,21,20,1,3,0,0


In [3]:
# The two clubs being compared; match results are later encoded from team1's point of view.
team1 = "West Ham"
team2 = "Chelsea"

# Keep only fixtures between the two clubs, whichever of them is the home side.
team1_hosts = (df['HomeTeam'] == team1) & (df['AwayTeam'] == team2)
team2_hosts = (df['HomeTeam'] == team2) & (df['AwayTeam'] == team1)

# .copy() so that columns added later do not write into a view of df.
h2h_df = df[team1_hosts | team2_hosts].copy()

In [4]:
def encode_result(row, team=None):
    """Encode a match outcome from one team's point of view.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'FullTimeResult' ('H', 'A' or anything else for a draw),
        'HomeTeam' and 'AwayTeam'.
    team : str, optional
        Team whose perspective is used. When omitted, the notebook-level
        ``team1`` is used, so ``df.apply(encode_result, axis=1)`` keeps working.

    Returns
    -------
    int
        1 if ``team`` won, -1 if the winning side was not ``team``,
        0 for any result other than 'H' or 'A'.
    """
    if team is None:
        # Backward-compatible default: fall back to the notebook global.
        team = team1

    outcome = row['FullTimeResult']
    if outcome == 'H':
        winner = row['HomeTeam']
    elif outcome == 'A':
        winner = row['AwayTeam']
    else:
        # Anything that is not a home or away win is treated as a draw.
        return 0

    return 1 if winner == team else -1

# Label every head-to-head fixture with its encoded outcome.
h2h_df['result'] = h2h_df.apply(encode_result, axis=1)

# Print the columns needed to check the labels against the scorelines.
summary_columns = [
    'Season', 'MatchDate', 'HomeTeam', 'AwayTeam',
    'FullTimeHomeGoals', 'FullTimeAwayGoals', 'result',
]
print(h2h_df[summary_columns])

       Season   MatchDate  HomeTeam  AwayTeam  FullTimeHomeGoals  \
1     2000/01  2000-08-19   Chelsea  West Ham                  4   
284   2000/01  2001-03-07  West Ham   Chelsea                  0   
472   2001/02  2001-10-24  West Ham   Chelsea                  2   
605   2001/02  2002-01-20   Chelsea  West Ham                  5   
831   2002/03  2002-09-28   Chelsea  West Ham                  2   
1127  2002/03  2003-05-03  West Ham   Chelsea                  1   
2011  2005/06  2006-01-02  West Ham   Chelsea                  1   
2134  2005/06  2006-04-09   Chelsea  West Ham                  4   
2310  2006/07  2006-11-18   Chelsea  West Ham                  1   
2527  2006/07  2007-04-18  West Ham   Chelsea                  1   
2710  2007/08  2007-12-01   Chelsea  West Ham                  1   
2845  2007/08  2008-03-01  West Ham   Chelsea                  0   
3117  2008/09  2008-12-14   Chelsea  West Ham                  1   
3285  2008/09  2009-04-25  West Ham   Chelsea   

In [5]:
def cummulative_stats(df, team):
    """Aggregate a team's record over every match in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals',
        'FullTimeAwayGoals' and 'FullTimeResult' ('H', 'D' or 'A') columns.
    team : str
        Team name as it appears in 'HomeTeam' / 'AwayTeam'.

    Returns
    -------
    dict
        Totals keyed by 'goals_scored', 'goals_conceded', 'wins', 'draws'
        and 'losses', summed over the team's home and away matches.
    """
    team_home = df[df['HomeTeam'] == team]
    team_away = df[df['AwayTeam'] == team]

    home_result = team_home['FullTimeResult']
    away_result = team_away['FullTimeResult']

    # Goals scored and conceded
    goals_scored = team_home['FullTimeHomeGoals'].sum() + team_away['FullTimeAwayGoals'].sum()
    goals_conceded = team_home['FullTimeAwayGoals'].sum() + team_away['FullTimeHomeGoals'].sum()

    # Wins, draws, losses: count the home and away subsets separately and add
    # the counts. Combining the two boolean Series with `|` would pair rows by
    # index label, which miscounts when labels repeat (e.g. after a concat).
    wins = (home_result == 'H').sum() + (away_result == 'A').sum()
    draws = (home_result == 'D').sum() + (away_result == 'D').sum()
    losses = (home_result == 'A').sum() + (away_result == 'H').sum()

    return {
        'goals_scored': goals_scored,
        'goals_conceded': goals_conceded,
        'wins': wins,
        'draws': draws,
        'losses': losses
    }

# Aggregate head-to-head totals for each club.
team1_stats, team2_stats = (cummulative_stats(h2h_df, side) for side in (team1, team2))

# Write every total into its own '<team>_<stat>' column.
# NOTE(review): each value is a scalar, so every row gets the same number in
# these columns - confirm that constant features are what is intended here.
for side, side_stats in ((team1, team1_stats), (team2, team2_stats)):
    for stat_name, stat_value in side_stats.items():
        h2h_df[f'{side}_{stat_name}'] = stat_value
    

In [6]:
# Feature matrix: everything except the label and the identifier columns.
# The dataset's date column is named 'MatchDate' (there is no 'Date' column),
# so that is the name that has to appear in the drop list.
# NOTE(review): the columns left in x (full-time / half-time goals, shots,
# corners, cards, ...) are statistics of the very match being labelled, so
# they would not be available before kick-off - TODO confirm whether they
# should be replaced by pre-match features.
x = h2h_df.drop(columns=['result', 'MatchDate', 'HomeTeam', 'AwayTeam'], errors='ignore')

# Target: outcome encoded from team1's point of view (1 win, 0 draw, -1 loss).
y = h2h_df['result']

In [7]:
# Force every feature column to numeric; values that cannot be parsed become NaN ...
x_coerced = x.apply(lambda column: pd.to_numeric(column, errors='coerce'))
# ... and every NaN is then replaced by 0 so the model receives a dense matrix.
x = x_coerced.fillna(0)

In [8]:
# Hold out 20% of the fixtures for evaluation; the fixed seed keeps the
# shuffled split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [9]:
# Logistic regression over the three outcome classes (-1, 0, 1), with a raised
# iteration cap passed to the solver.
max_iterations = 1000
model = LogisticRegression(max_iter=max_iterations)

# Last expression: fit() returns the estimator, so the cell displays it.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [10]:
# Predict the held-out fixtures and report the share that was classified correctly.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.6666666666666666


In [11]:
# Per-class precision / recall / F1 on the held-out fixtures.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

          -1       0.50      1.00      0.67         3
           0       1.00      0.50      0.67         2
           1       1.00      0.50      0.67         4

    accuracy                           0.67         9
   macro avg       0.83      0.67      0.67         9
weighted avg       0.83      0.67      0.67         9



In [12]:
# Take the last row of the feature matrix as the model input.
# NOTE(review): that row describes a fixture that has already been played, so
# this re-predicts that match rather than a future one - TODO confirm intent.
latest_features = x.iloc[-1:]

# predict() returns one label per input row; keep the single label.
predicted_labels = model.predict(latest_features)
next_pred = predicted_labels[0]

In [13]:
# Turn the encoded label back into a sentence: 1 means team1 wins,
# -1 means team2 wins, anything else is reported as a draw.
if next_pred == 1:
    verdict = f"{team1} is predicted to win against {team2}"
elif next_pred == -1:
    verdict = f"{team2} is predicted to win against {team1}"
else:
    verdict = "The match is predicted to be a draw"

print(verdict)

Chelsea is predicted to win against West Ham
