In [54]:
import pandas as pd

# Read the Premier League match workbook into a DataFrame
file_path = '/content/Premier League Data 2013-2023.xlsx'
premier_league_data = pd.read_excel(io=file_path)

# Preview the top five rows to get a feel for the available columns
premier_league_data.head(n=5)


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,2013-08-17,Arsenal,Aston Villa,1,3,A,1,1,D,A Taylor,...,18,4,3,4,5,1,0,1.44,4.75,8.0
1,2013-08-17,Liverpool,Stoke,1,0,H,1,0,H,M Atkinson,...,11,12,6,1,1,0,0,1.4,5.0,9.5
2,2013-08-17,Norwich,Everton,2,2,D,0,0,D,M Oliver,...,10,6,8,2,0,0,0,3.2,3.4,2.4
3,2013-08-17,Sunderland,Fulham,0,1,A,0,0,D,N Swarbrick,...,14,6,1,0,3,0,0,2.3,3.4,3.4
4,2013-08-17,Swansea,Man United,1,4,A,0,2,A,P Dowd,...,10,7,4,1,3,0,0,4.2,3.5,2.0


In [55]:
# Count the null entries in every column of the raw match data
missing_values = premier_league_data.isna().sum()

# Keep only the columns that have at least one null (an empty Series means none)
missing_values.loc[missing_values.gt(0)]


Series([], dtype: int64)

In [56]:
from sklearn.preprocessing import OneHotEncoder

# Team-name columns that are expanded into one indicator column per team
categorical_columns = ['HomeTeam', 'AwayTeam']

# Request a dense array from the encoder. The keyword was renamed from `sparse`
# to `sparse_output` in newer scikit-learn releases and the old spelling was
# later removed, so try the new name first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(premier_league_data[categorical_columns])

# Wrap the encoded array in a DataFrame that shares the source frame's index,
# so the column-wise concat below lines rows up by construction.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=premier_league_data.index,
)

# Swap the original team-name columns for their one-hot indicator columns
preprocessed_data = pd.concat([premier_league_data.drop(categorical_columns, axis=1), encoded_df], axis=1)
preprocessed_data.head()




Unnamed: 0,Date,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,...,AwayTeam_Sheffield United,AwayTeam_Southampton,AwayTeam_Stoke,AwayTeam_Sunderland,AwayTeam_Swansea,AwayTeam_Tottenham,AwayTeam_Watford,AwayTeam_West Brom,AwayTeam_West Ham,AwayTeam_Wolves
0,2013-08-17,1,3,A,1,1,D,A Taylor,16,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2013-08-17,1,0,H,1,0,H,M Atkinson,26,10,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013-08-17,2,2,D,0,0,D,M Oliver,8,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2013-08-17,0,1,A,0,0,D,N Swarbrick,20,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013-08-17,1,4,A,0,2,A,P Dowd,17,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Work on a separate copy so the raw match frame is left untouched
recent_form_data = premier_league_data.copy(deep=True)

# Size of the look-back window: how many of a team's latest matches feed the form metrics
recent_matches_count = 5

# Function to calculate recent form metrics for a team
def calculate_recent_form(team, match_date, is_home_team, matches=None, n_matches=None):
    """Summarise a team's recent venue-specific form before a given date.

    Only matches played strictly before ``match_date`` are considered, and only
    those at the requested venue: home fixtures when ``is_home_team`` is true,
    away fixtures otherwise.

    Parameters
    ----------
    team : value compared against the 'HomeTeam' / 'AwayTeam' column.
    match_date : value compared against the 'Date' column with ``<``.
    is_home_team : bool
        True to measure form in home fixtures, False for away fixtures.
    matches : DataFrame, optional
        Match table with 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG' and
        'FTR' columns. Defaults to the module-level ``recent_form_data``.
    n_matches : int, optional
        How many of the most recent matches to aggregate. Defaults to the
        module-level ``recent_matches_count``.

    Returns
    -------
    tuple
        ``(points, goals_scored, goals_conceded)`` summed over the selected
        matches, with 3 points for a win and 1 for a draw. All three are 0 when
        the team has no earlier match at that venue.
    """
    # Fall back to the notebook-level table and window so existing three-argument
    # calls behave exactly as before.
    if matches is None:
        matches = recent_form_data
    if n_matches is None:
        n_matches = recent_matches_count

    # Pick the column names and the winning result code once, instead of
    # branching separately for the filter and for the metrics.
    if is_home_team:
        team_col, scored_col, conceded_col, win_code = 'HomeTeam', 'FTHG', 'FTAG', 'H'
    else:
        team_col, scored_col, conceded_col, win_code = 'AwayTeam', 'FTAG', 'FTHG', 'A'

    # Earlier fixtures for this team at this venue, newest first, limited to the window
    earlier = matches[(matches[team_col] == team) & (matches['Date'] < match_date)]
    recent = earlier.sort_values(by='Date', ascending=False).head(n_matches)

    # Vectorised scoring: 3 per win, 1 per draw ('D'), 0 otherwise
    results = recent['FTR']
    points = 3 * (results == win_code).sum() + (results == 'D').sum()
    goals_scored = recent[scored_col].sum()
    goals_conceded = recent[conceded_col].sum()

    return points, goals_scored, goals_conceded

# Names of the six form columns: the home side's metrics first, then the away side's
form_columns = ['HomePoints', 'HomeGoalsScored', 'HomeGoalsConceded',
                'AwayPoints', 'AwayGoalsScored', 'AwayGoalsConceded']

# For every fixture, chain the home side's form tuple with the away side's
recent_form_features = recent_form_data.apply(
    lambda fixture: (
        *calculate_recent_form(fixture['HomeTeam'], fixture['Date'], True),
        *calculate_recent_form(fixture['AwayTeam'], fixture['Date'], False),
    ),
    axis=1,
)

# Turn the per-fixture tuples into a six-column frame
recent_form_df = pd.DataFrame(recent_form_features.tolist(), columns=form_columns)

# Attach the form metrics column-wise to the match data
enhanced_data = pd.concat([recent_form_data, recent_form_df], axis=1)

# Preview the identifying columns next to the new form metrics
enhanced_data[['Date', 'HomeTeam', 'AwayTeam'] + form_columns].head(30)


Unnamed: 0,Date,HomeTeam,AwayTeam,HomePoints,HomeGoalsScored,HomeGoalsConceded,AwayPoints,AwayGoalsScored,AwayGoalsConceded
0,2013-08-17,Arsenal,Aston Villa,0,0,0,0,0,0
1,2013-08-17,Liverpool,Stoke,0,0,0,0,0,0
2,2013-08-17,Norwich,Everton,0,0,0,0,0,0
3,2013-08-17,Sunderland,Fulham,0,0,0,0,0,0
4,2013-08-17,Swansea,Man United,0,0,0,0,0,0
5,2013-08-17,West Brom,Southampton,0,0,0,0,0,0
6,2013-08-17,West Ham,Cardiff,0,0,0,0,0,0
7,2013-08-18,Chelsea,Hull,0,0,0,0,0,0
8,2013-08-18,Crystal Palace,Tottenham,0,0,0,0,0,0
9,2013-08-19,Man City,Newcastle,0,0,0,0,0,0


# Logistic Regression

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Binary target: 1 when the full-time result is a home win ('H'), 0 otherwise
home_win_flags = premier_league_data['FTR'].map(lambda result: int(result == 'H'))
label_encoder = LabelEncoder()
binary_target = label_encoder.fit_transform(home_win_flags)

# Feature matrix: enhanced_data minus the date, the team and referee names,
# the full-time / half-time scores and results, and the betting odds.
# NOTE(review): the columns that remain appear to include in-match statistics
# (e.g. HS, AS, HC, AC) that would not be known before kick-off — confirm that
# using them as predictors is intended.
excluded_columns = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee',
                    'B365H', 'B365D', 'B365A']
features = enhanced_data.drop(columns=excluded_columns)

# Hold out 30% of the fixtures for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    features, binary_target, test_size=0.3, random_state=42
)

# Fit the classifier; fit() returns the estimator itself
log_reg_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out fixtures
y_pred = log_reg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy, report, sep='\n')



0.7280701754385965
              precision    recall  f1-score   support

           0       0.73      0.81      0.76       623
           1       0.73      0.63      0.68       517

    accuracy                           0.73      1140
   macro avg       0.73      0.72      0.72      1140
weighted avg       0.73      0.73      0.73      1140



In [59]:
# Integer-encode the full-time result codes for multiclass modelling
# (LabelEncoder numbers the classes in sorted label order)
label_encoder = LabelEncoder().fit(premier_league_data['FTR'])
multiclass_target = label_encoder.transform(premier_league_data['FTR'])

# Same 70/30 split settings as the binary model, now with the three-way target
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    features,
    multiclass_target,
    test_size=0.3,
    random_state=42,
)

# One-vs-Rest logistic regression.
# NOTE(review): `multi_class` is flagged as deprecated in recent scikit-learn
# releases — confirm against the installed version.
log_reg_model_multi = LogisticRegression(max_iter=1000, multi_class='ovr').fit(X_train_multi, y_train_multi)

# Evaluate on the held-out fixtures
y_pred_multi = log_reg_model_multi.predict(X_test_multi)
accuracy_multi = accuracy_score(y_test_multi, y_pred_multi)
report_multi = classification_report(y_test_multi, y_pred_multi)

print(accuracy_multi, report_multi, sep='\n')


0.6026315789473684
              precision    recall  f1-score   support

           0       0.57      0.71      0.63       353
           1       0.41      0.08      0.13       270
           2       0.64      0.80      0.71       517

    accuracy                           0.60      1140
   macro avg       0.54      0.53      0.49      1140
weighted avg       0.56      0.60      0.55      1140



In [60]:
# Put the three betting-odds columns in front of the existing feature matrix
odds_columns = premier_league_data[['B365H', 'B365D', 'B365A']]
features_with_odds = odds_columns.join(features)

# Split the odds-augmented features against the multiclass target (same 70/30 settings, same seed)
X_train_odds, X_test_odds, y_train_odds, y_test_odds = train_test_split(
    features_with_odds,
    multiclass_target,
    test_size=0.3,
    random_state=42,
)

# One-vs-Rest logistic regression on the odds-augmented features
log_reg_model_odds = LogisticRegression(max_iter=1000, multi_class='ovr').fit(X_train_odds, y_train_odds)

# Evaluate on the held-out fixtures: overall accuracy plus the per-class report
y_pred_odds = log_reg_model_odds.predict(X_test_odds)
accuracy_odds = accuracy_score(y_test_odds, y_pred_odds)
report_odds = classification_report(y_test_odds, y_pred_odds)

print(accuracy_odds, report_odds, sep='\n')




0.6263157894736842
              precision    recall  f1-score   support

           0       0.60      0.76      0.67       353
           1       0.40      0.08      0.13       270
           2       0.66      0.82      0.73       517

    accuracy                           0.63      1140
   macro avg       0.56      0.55      0.51      1140
weighted avg       0.58      0.63      0.57      1140



# Random Forest

In [61]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 1000-tree random forest on the odds-augmented training split
rf_model = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_model.fit(X_train_odds, y_train_odds)

# Predict the held-out fixtures
y_pred_rf = rf_model.predict(X_test_odds)

# Overall accuracy plus the per-class precision/recall report
accuracy_rf = accuracy_score(y_test_odds, y_pred_rf)
report_rf = classification_report(y_test_odds, y_pred_rf)

print(accuracy_rf)
print(report_rf)


0.5973684210526315
              precision    recall  f1-score   support

           0       0.61      0.70      0.65       353
           1       0.32      0.09      0.14       270
           2       0.62      0.79      0.70       517

    accuracy                           0.60      1140
   macro avg       0.52      0.53      0.50      1140
weighted avg       0.55      0.60      0.55      1140



In [63]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest.
# 'auto' used to mean the same thing as 'sqrt' for classifiers, so listing both
# doubled the number of fits without exploring anything new, and current
# scikit-learn no longer accepts 'auto'. 'log2' keeps two distinct options.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator whose parameters are tuned (fixed seed for repeatability)
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid with 3-fold cross-validation, using all cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Run the search on the odds-augmented training split
grid_search.fit(X_train_odds, y_train_odds)

# Best parameter combination found by the search
grid_search.best_params_

Fitting 3 folds for each of 162 candidates, totalling 486 fits


KeyboardInterrupt: 

In [None]:
# Random forest with hand-entered tuned hyper-parameters.
# NOTE(review): the grid-search cell in this notebook ended in KeyboardInterrupt,
# so these values were presumably taken from another run — confirm their origin.
rf_model_optimized = RandomForestClassifier(
    n_estimators=150,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no longer
    # accepted by current scikit-learn.
    max_features='sqrt',
    max_depth=30,
    min_samples_leaf=4,
    min_samples_split=10,
    random_state=42
)
rf_model_optimized.fit(X_train_odds, y_train_odds)

# Predict the held-out fixtures
y_pred_rf_optimized = rf_model_optimized.predict(X_test_odds)

# Overall accuracy plus the per-class precision/recall report
accuracy_rf_optimized = accuracy_score(y_test_odds, y_pred_rf_optimized)
report_rf_optimized = classification_report(y_test_odds, y_pred_rf_optimized)

print(accuracy_rf_optimized)
print(report_rf_optimized)


# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

# --- scikit-learn gradient boosting on the odds-augmented split ---
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train_odds, y_train_odds)
y_pred_gb = gb_model.predict(X_test_odds)
accuracy_gb = accuracy_score(y_test_odds, y_pred_gb)
report_gb = classification_report(y_test_odds, y_pred_gb)

# --- XGBoost on the same split ---
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb_model.fit(X_train_odds, y_train_odds)
y_pred_xgb = xgb_model.predict(X_test_odds)
accuracy_xgb = accuracy_score(y_test_odds, y_pred_xgb)
report_xgb = classification_report(y_test_odds, y_pred_xgb)

# Report both models once all fitting is done: accuracy first, then the per-class table
for score, summary in ((accuracy_gb, report_gb), (accuracy_xgb, report_xgb)):
    print(score)
    print(summary)


# Support Vector Machines

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Standardise the features: fit the scaler on the training split only and
# reuse its statistics for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_odds)
X_test_scaled = scaler.transform(X_test_odds)

# RBF-kernel SVM. The labels are taken from the same train_test_split call that
# produced X_train_odds / X_test_odds, so rows and labels stay paired even if
# the split settings of the other cells are changed later.
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_scaled, y_train_odds)

# Predict the held-out fixtures
y_pred_svm = svm_model.predict(X_test_scaled)

# Overall accuracy plus the per-class precision/recall report
accuracy_svm = accuracy_score(y_test_odds, y_pred_svm)
report_svm = classification_report(y_test_odds, y_pred_svm)

print(accuracy_svm)
print(report_svm)

# K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Number of neighbours that vote on each prediction
k = 74
knn_model = KNeighborsClassifier(n_neighbors=k)

# X_train_scaled was built from X_train_odds, so fit against the labels from
# that same split rather than the ones produced by a separate split call.
knn_model.fit(X_train_scaled, y_train_odds)

# Predict the held-out fixtures
y_pred_knn = knn_model.predict(X_test_scaled)

# Overall accuracy plus the per-class precision/recall report
accuracy_knn = accuracy_score(y_test_odds, y_pred_knn)
report_knn = classification_report(y_test_odds, y_pred_knn)

print(accuracy_knn)
print(report_knn)


## Finding the best value for k

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts: 1 through 99
k_range = range(1, 100)

# Mean 10-fold cross-validated accuracy for each candidate, in k_range order
k_scores = []

# The loop variable is deliberately not called `k`, so it does not overwrite the
# notebook-level `k` used by the KNN cell.
for candidate_k in k_range:
    knn = KNeighborsClassifier(n_neighbors=candidate_k)
    # X_train_scaled was built from X_train_odds, so use the labels from that same split
    scores = cross_val_score(knn, X_train_scaled, y_train_odds, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

# On ties, index() returns the first match, so the smallest such k is chosen
best_k = k_range[k_scores.index(max(k_scores))]
print("Best k:", best_k)
