In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!ls gdrive/MyDrive/soccer_modelling

Soccer-Modelling


In [4]:
# import required libraries

import numpy as np
import pandas as pd
import os

import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('/content/gdrive/MyDrive/soccer_modelling/Soccer-Modelling/soccerData.csv')
df['date_GMT']= pd.to_datetime(df['date_GMT'])

In [6]:
raw_match_stats = df[[
 'date_GMT',
 'home_team_name',
 'away_team_name',
 'home_team_goal_count',
 'away_team_goal_count',
 'home_team_corner_count',
 'away_team_corner_count',
 'home_team_shots',
 'away_team_shots',
 'home_team_shots_on_target',
 'away_team_shots_on_target',
 'home_team_fouls',
 'away_team_fouls',
 'home_team_possession',
 'away_team_possession',]]

In [7]:
raw_match_stats = raw_match_stats.sort_values(by=['date_GMT'], ascending=False)
raw_match_stats = raw_match_stats.dropna()

In [8]:
raw_match_stats

Unnamed: 0,date_GMT,home_team_name,away_team_name,home_team_goal_count,away_team_goal_count,home_team_corner_count,away_team_corner_count,home_team_shots,away_team_shots,home_team_shots_on_target,away_team_shots_on_target,home_team_fouls,away_team_fouls,home_team_possession,away_team_possession
572,2021-03-31 18:45:00,Spain,Kosovo,3,1,9.0,2.0,25.0,4.0,9.0,2.0,9.0,10.0,80.0,20.0
565,2021-03-31 18:45:00,Scotland,Faroe Islands,4,0,1.0,5.0,14.0,8.0,8.0,3.0,9.0,13.0,65.0,35.0
559,2021-03-31 18:45:00,Lithuania,Italy,0,2,4.0,5.0,8.0,29.0,3.0,11.0,14.0,13.0,34.0,66.0
560,2021-03-31 18:45:00,Northern Ireland,Bulgaria,0,0,12.0,2.0,16.0,4.0,5.0,2.0,17.0,17.0,70.0,30.0
561,2021-03-31 18:45:00,Bosnia-Herzegovina,France,0,1,4.0,8.0,5.0,14.0,3.0,3.0,12.0,10.0,43.0,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,2020-09-03 16:00:00,Latvia,Andorra,0,0,4.0,1.0,15.0,5.0,5.0,0.0,16.0,11.0,62.0,38.0
10,2020-09-03 14:00:00,Uzbekistan,Tajikistan,2,1,9.0,2.0,8.0,2.0,4.0,0.0,19.0,13.0,50.0,50.0
7,2020-02-26 15:00:00,Bulgaria,Belarus,0,1,3.0,4.0,15.0,7.0,9.0,3.0,8.0,16.0,50.0,50.0
5,2020-02-01 20:55:00,USA,Costa Rica,1,0,10.0,3.0,15.0,11.0,6.0,4.0,22.0,12.0,50.0,50.0


In [9]:
# create results columns for both home and away teams (W - win, D = Draw, L = Loss).

raw_match_stats.loc[raw_match_stats['home_team_goal_count'] == raw_match_stats['away_team_goal_count'], 'home_team_result'] = 'D'
raw_match_stats.loc[raw_match_stats['home_team_goal_count'] > raw_match_stats['away_team_goal_count'], 'home_team_result'] = 'W'
raw_match_stats.loc[raw_match_stats['home_team_goal_count'] < raw_match_stats['away_team_goal_count'], 'home_team_result'] = 'L'

raw_match_stats.loc[raw_match_stats['home_team_goal_count'] == raw_match_stats['away_team_goal_count'], 'away_team_result'] = 'D'
raw_match_stats.loc[raw_match_stats['home_team_goal_count'] > raw_match_stats['away_team_goal_count'], 'away_team_result'] = 'L'
raw_match_stats.loc[raw_match_stats['home_team_goal_count'] < raw_match_stats['away_team_goal_count'], 'away_team_result'] = 'W'

In [10]:
# Split the raw_match_stats to two datasets (home_team_stats and away_team_stats)

home_team_stats = raw_match_stats[[
 'date_GMT',
 'home_team_name',
 'home_team_goal_count',
 'home_team_corner_count',
 'home_team_shots',
 'home_team_shots_on_target',
 'home_team_fouls',
 'home_team_possession',
 'home_team_result',]]

away_team_stats = raw_match_stats[[
 'date_GMT',
 'away_team_name',
 'away_team_goal_count',
 'away_team_corner_count',
 'away_team_shots',
 'away_team_shots_on_target',
 'away_team_fouls',
 'away_team_possession',
 'away_team_result',]]

# rename "home_team" and "away_team" columns
home_team_stats.columns = [col.replace('home_team_','') for col in home_team_stats.columns]
away_team_stats.columns = [col.replace('away_team_','') for col in away_team_stats.columns]

# stack these two datasets so that each row is the stats for a team for one match (team_stats_per_match)
team_stats_per_match = home_team_stats.append(away_team_stats)

In [23]:
team_stats_per_match[team_stats_per_match['name']=="Spain"]

Unnamed: 0,date_GMT,name,goal_count,corner_count,shots,shots_on_target,fouls,possession,result,goals_per_match,corners_per_match,shots_per_match,shotsOnTarget_per_match,fouls_per_match,possession_per_match
0,2021-03-31 18:45:00,Spain,3,9.0,25.0,9.0,9.0,80.0,W,2.2,6.4,11.8,4.8,10.6,69.6
71,2021-03-25 19:45:00,Spain,1,3.0,10.0,3.0,15.0,80.0,D,1.8,8.0,13.0,6.2,9.4,64.6
159,2020-11-17 19:45:00,Spain,6,6.0,12.0,5.0,6.0,68.0,W,0.6,7.4,14.0,6.6,10.6,63.8
318,2020-10-10 18:45:00,Spain,1,8.0,10.0,4.0,9.0,62.0,W,1.666667,5.333333,15.666667,6.666667,12.333333,64.0
380,2020-09-06 18:45:00,Spain,4,9.0,18.0,7.0,15.0,72.0,W,1.0,4.0,12.0,6.0,10.0,56.0
462,2021-03-28 16:00:00,Spain,2,6.0,11.0,4.0,8.0,79.0,W,1.8,7.0,13.0,6.0,10.6,68.2
616,2020-11-14 19:45:00,Spain,1,10.0,14.0,7.0,11.0,70.0,D,1.2,7.2,14.8,6.6,11.4,64.2
659,2020-11-11 19:45:00,Spain,1,7.0,12.0,5.0,13.0,51.0,D,1.2,6.6,14.8,6.8,10.8,65.2
701,2020-10-13 18:45:00,Spain,0,9.0,17.0,10.0,8.0,72.0,L,1.5,6.0,14.25,6.0,11.5,63.5
767,2020-10-07 18:45:00,Spain,0,3.0,17.0,7.0,12.0,64.0,D,2.5,6.5,15.0,6.5,12.5,64.0


In [11]:
# At each row of this dataset, get the team name, find the stats for that team during the last 5 matches, and average these stats (avg_stats_per_team).

avg_stat_columns = ['goals_per_match','corners_per_match','shots_per_match','shotsOnTarget_per_match','fouls_per_match', 'possession_per_match']
stats_list = []
for index, row in team_stats_per_match.iterrows():
    team_stats_last_five_matches = team_stats_per_match.loc[(team_stats_per_match['name']==row['name']) & (team_stats_per_match['date_GMT']<row['date_GMT'])].sort_values(by=['date_GMT'], ascending=False)
    stats_list.append(team_stats_last_five_matches.iloc[0:5,:].mean(axis=0).values[0:6])

avg_stats_per_team = pd.DataFrame(stats_list, columns=avg_stat_columns)

In [24]:
avg_stats_per_team

Unnamed: 0,goals_per_match,corners_per_match,shots_per_match,shotsOnTarget_per_match,fouls_per_match,possession_per_match
0,2.2,6.4,11.8,4.8,10.6,69.6
1,0.8,6.0,13.6,6.2,13.8,48.8
2,0.8,2.6,8.4,2.4,11.6,47.2
3,0.8,4.0,8.8,3.0,9.2,45.8
4,0.6,3.6,10.2,3.6,11.2,51.2
...,...,...,...,...,...,...
827,,,,,,
828,,,,,,
829,,,,,,
830,,,,,,


In [12]:
# Add these stats to the team_stats_per_match dataset.

team_stats_per_match = pd.concat([team_stats_per_match.reset_index(drop=True), avg_stats_per_team], axis=1, ignore_index=False)


In [25]:
team_stats_per_match

Unnamed: 0,date_GMT,name,goal_count,corner_count,shots,shots_on_target,fouls,possession,result,goals_per_match,corners_per_match,shots_per_match,shotsOnTarget_per_match,fouls_per_match,possession_per_match
0,2021-03-31 18:45:00,Spain,3,9.0,25.0,9.0,9.0,80.0,W,2.2,6.4,11.8,4.8,10.6,69.6
1,2021-03-31 18:45:00,Scotland,4,1.0,14.0,8.0,9.0,65.0,W,0.8,6.0,13.6,6.2,13.8,48.8
2,2021-03-31 18:45:00,Lithuania,0,4.0,8.0,3.0,14.0,34.0,L,0.8,2.6,8.4,2.4,11.6,47.2
3,2021-03-31 18:45:00,Northern Ireland,0,12.0,16.0,5.0,17.0,70.0,D,0.8,4.0,8.8,3.0,9.2,45.8
4,2021-03-31 18:45:00,Bosnia-Herzegovina,0,4.0,5.0,3.0,12.0,43.0,L,0.6,3.6,10.2,3.6,11.2,51.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827,2020-09-03 16:00:00,Andorra,0,1.0,5.0,0.0,11.0,38.0,D,,,,,,
828,2020-09-03 14:00:00,Tajikistan,1,2.0,2.0,0.0,13.0,50.0,L,,,,,,
829,2020-02-26 15:00:00,Belarus,1,4.0,7.0,3.0,16.0,50.0,W,,,,,,
830,2020-02-01 20:55:00,Costa Rica,0,3.0,11.0,4.0,12.0,50.0,L,,,,,,


In [13]:
# Re-segment the home and away teams.

home_team_stats = team_stats_per_match.iloc[:int(team_stats_per_match.shape[0]/2),:]
away_team_stats = team_stats_per_match.iloc[int(team_stats_per_match.shape[0]/2):,:]

home_team_stats.columns = ['team_1_'+str(col) for col in home_team_stats.columns]
away_team_stats.columns = ['team_2_'+str(col) for col in away_team_stats.columns]

In [14]:
# Combine at each match to get a dataset with a row representing each match.
# drop the NA rows (earliest match for each team, i.e no previous stats)

match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)
match_stats = match_stats.dropna().reset_index(drop=True)

In [15]:
# create columns with average stat differences between the two teams

match_stats['goals_per_match_diff'] = (match_stats['team_1_goals_per_match'] - match_stats['team_2_goals_per_match'])
match_stats['corners_per_match_diff'] = (match_stats['team_1_corners_per_match'] - match_stats['team_2_corners_per_match'])
match_stats['shots_per_match_diff'] = (match_stats['team_1_shots_per_match'] - match_stats['team_2_shots_per_match'])
match_stats['shotsOnTarget_per_match_diff'] = (match_stats['team_1_shotsOnTarget_per_match'] - match_stats['team_2_shotsOnTarget_per_match'])
match_stats['fouls_per_match_diff'] = (match_stats['team_1_fouls_per_match'] - match_stats['team_2_fouls_per_match'])
match_stats['possession_per_match_diff'] = (match_stats['team_1_possession_per_match'] - match_stats['team_2_possession_per_match'])

In [16]:
match_stats

Unnamed: 0,team_1_date_GMT,team_1_name,team_1_goal_count,team_1_corner_count,team_1_shots,team_1_shots_on_target,team_1_fouls,team_1_possession,team_1_result,team_1_goals_per_match,...,team_2_shots_per_match,team_2_shotsOnTarget_per_match,team_2_fouls_per_match,team_2_possession_per_match,goals_per_match_diff,corners_per_match_diff,shots_per_match_diff,shotsOnTarget_per_match_diff,fouls_per_match_diff,possession_per_match_diff
0,2021-03-31 18:45:00,Spain,3,9.0,25.0,9.0,9.0,80.0,W,2.2,...,6.6,2.4,12.2,45.0,1.8,3.4,5.2,2.4,-1.6,24.6
1,2021-03-31 18:45:00,Scotland,4,1.0,14.0,8.0,9.0,65.0,W,0.8,...,9.0,3.8,14.0,46.8,-0.2,2.2,4.6,2.4,-0.2,2.0
2,2021-03-31 18:45:00,Lithuania,0,4.0,8.0,3.0,14.0,34.0,L,0.8,...,15.6,5.4,11.4,64.4,-1.6,-4.6,-7.2,-3.0,0.2,-17.2
3,2021-03-31 18:45:00,Northern Ireland,0,12.0,16.0,5.0,17.0,70.0,D,0.8,...,8.8,3.2,16.2,48.6,-0.2,0.4,0.0,-0.2,-7.0,-2.8
4,2021-03-31 18:45:00,Bosnia-Herzegovina,0,4.0,5.0,3.0,12.0,43.0,L,0.6,...,14.2,5.8,15.8,60.2,-1.0,-4.0,-4.0,-2.2,-4.6,-9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,2020-09-06 18:45:00,Spain,4,9.0,18.0,7.0,15.0,72.0,W,1.0,...,10.0,2.0,7.0,36.0,-1.0,2.0,2.0,4.0,3.0,20.0
324,2020-09-06 16:00:00,Republic of Ireland,0,3.0,14.0,8.0,2.0,53.0,L,1.0,...,2.0,0.0,14.0,50.0,1.0,2.0,9.0,3.0,2.0,17.0
325,2020-09-06 16:00:00,Hungary,2,7.0,10.0,7.0,13.0,59.0,L,1.0,...,3.0,2.0,18.0,49.0,-2.0,2.0,5.0,3.0,1.0,-7.0
326,2020-09-06 13:00:00,Andorra,0,2.0,4.0,0.0,16.0,39.0,L,0.0,...,9.0,5.0,13.0,57.0,-3.0,-3.0,-4.0,-5.0,-2.0,-19.0


In [17]:
# import required libraries

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score, confusion_matrix, roc_auc_score, classification_report, log_loss

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

In [18]:
target = match_stats[['team_1_result']].replace(['W','L','D'],[0,1,2])

features = match_stats[['goals_per_match_diff', 'corners_per_match_diff',
       'shots_per_match_diff', 'shotsOnTarget_per_match_diff',
       'fouls_per_match_diff', 'possession_per_match_diff']]

In [26]:
X_train,X_test,y_train,y_test = train_test_split(features, target, test_size=0.2, stratify = target)

In [27]:
names = ["Nearest Neighbors", "Logistic Regression","Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    LogisticRegression(),
    SVC(kernel="linear", C=0.025, probability=True),
    SVC(gamma=2, C=1, probability=True),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [45]:
for name, clf in zip(names, classifiers):
  if name == "Nearest Neighbors":
        clf.fit(X_train, y_train)
        accuracy = clf.score(X_test, y_test)

        prediction_proba = clf.predict_proba(X_test)
        prediction = clf.predict(X_test)

        logloss = log_loss(y_test,prediction_proba)
        precision, recall, fscore, support = score(y_test, prediction)
        conf_martrix = confusion_matrix(y_test, prediction)
        clas_report = classification_report(y_test, prediction)

        print(name, accuracy)
        print(conf_martrix)

        print(X_test.iloc[0])
        print(prediction[0], y_test['team_1_result'].iloc[0])


Nearest Neighbors 0.5606060606060606
[[21  3  3]
 [10  8  3]
 [ 3  7  8]]
goals_per_match_diff            -1.0
corners_per_match_diff           0.0
shots_per_match_diff             2.0
shotsOnTarget_per_match_diff    -3.0
fouls_per_match_diff             6.0
possession_per_match_diff      -17.0
Name: 307, dtype: float64
1 2
