In [93]:
%pylab inline
import pandas as pd
import seaborn as sns
import numpy as np
import scipy 
import matplotlib.pyplot as plt
import matplotlib.style as style

import sys
from sklearn.model_selection import train_test_split
import pydot
#imports the own created package
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz

import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [94]:
#imports pickle file created and saved in 'Feature_extraction.ipynb'
df_final = pd.read_pickle('df_both_seasons_with_features')
df_final.head()

Unnamed: 0,Day,Month,Year,HomeTeam,AwayTeam,FTHG,FTAG,HST,AST,FTR,HTGDIFF,ATGDIFF,AVGHTGDIFF,AVGP
0,9,12,2020,Fulham,Arsenal,0,3,2,6,0,-3,3,0.1,1.0
1,9,12,2020,Crystal Palace,Southampton,1,0,3,5,2,1,-1,0.2,0.9
2,9,12,2020,Liverpool,Leeds,4,3,6,3,2,1,-1,2.0,1.9
3,9,12,2020,West Ham,Newcastle,0,2,3,2,0,-2,2,0.0,1.0
347,7,12,2020,Tottenham,Arsenal,2,1,9,4,2,1,-1,0.317771,1.3


In [95]:
df_result=df_final.copy()
## On définie la variable qui nous intéresse ( Home Team Result) et on supprime les variables du résultats ( FTHG FTAG HTGDIFF HTR ATGDIFF)
target=df_result["FTR"]

df_result= df_result.drop([
    'FTHG','FTAG', 'HTGDIFF', 'ATGDIFF','FTR','HomeTeam','AwayTeam'],axis=1) 

In [96]:
# splitting arrays into random train and test subsets
X_train, X_test, y_train, y_test = train_test_split(
    df_result, target, test_size = 0.25,random_state = 42
)      

In [97]:
print('X_train Shape:', X_train.shape)
print('y_train Shape:', y_train.shape)
print('X_test Shape:', X_test.shape)
print('y_test Shape:', y_test.shape)


X_train Shape: (434, 7)
y_train Shape: (434,)
X_test Shape: (145, 7)
y_test Shape: (145,)


In [98]:
from sklearn.metrics import accuracy_score
#from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt  # doctest: +SKIP
def performance_accuracy(y_test,X_test, clf):
    errors=0
    i=0
    y_pred =clf.predict(X_test)
    return accuracy_score(y_test, y_pred)
    
    #plot_confusion_matrix(clf, X_test, y_test)
    #plt.show()

In [116]:
## First attempt of prediction
clf = RandomForestClassifier(n_estimators=1000,max_features="sqrt",min_samples_split=35,n_jobs=-1,random_state=42)
clf.fit(X_train,y_train)

accuracy=performance_accuracy(y_test,X_test,clf)
print('Accuracy of the model is : ',accuracy)

Accuracy of the model is :  0.6137931034482759


In [101]:
features=np.array(df_result)
predictions_HTR = clf.predict(features)

df_final["prediction"]=predictions_HTR
df_show=df_final.copy()
df_show= df_show.drop([
    'HST','AST', 'HTGDIFF', 'ATGDIFF','AVGHTGDIFF','AVGP'],axis=1) 
df_show.head(20)

Unnamed: 0,Day,Month,Year,HomeTeam,AwayTeam,FTHG,FTAG,FTR,prediction
0,9,12,2020,Fulham,Arsenal,0,3,0,0
1,9,12,2020,Crystal Palace,Southampton,1,0,2,0
2,9,12,2020,Liverpool,Leeds,4,3,2,2
3,9,12,2020,West Ham,Newcastle,0,2,0,0
347,7,12,2020,Tottenham,Arsenal,2,1,2,2
346,7,12,2020,Aston Villa,Crystal Palace,2,0,2,2
345,7,12,2020,Wolves,Everton,3,0,2,2
348,7,12,2020,Bournemouth,Leicester,4,1,2,0
382,2,12,2020,Reading,West Brom,1,2,0,0
381,2,12,2020,Millwall,Fulham,1,1,1,1


In [102]:
importances=clf.feature_importances_
columns=df_result.columns
d= {"columns":columns,"importances":importances}
df=pd.DataFrame(d)
df

Unnamed: 0,columns,importances
0,Day,0.08407
1,Month,0.067484
2,Year,0.009824
3,HST,0.303329
4,AST,0.204642
5,AVGHTGDIFF,0.140791
6,AVGP,0.189861


In [122]:

from sklearn.model_selection import RandomizedSearchCV
def random_search(X_train,y_train, n_estimators=1000, n_iter=100, cv=3):
    # creating the parameter grid with variables
    param_grid = {
        'n_estimators': np.linspace(10, n_estimators).astype(int),
        'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
        'max_features': ['auto', 'sqrt', None] + list(np.arange(1, 1, 5)),
        'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
        'min_samples_split': [2, 5, 10,25,30,35,40],
        'bootstrap': [True, False]
    }

    # RandomForestClassifier selected as estimator
    clf = RandomForestClassifier(random_state = 42)

    # create randomized search 
    # as described here: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    rscv = RandomizedSearchCV(clf, param_grid, n_jobs = -1, cv = cv, 
                            n_iter = n_iter, verbose = 1, random_state=42)

    # refit 
    rscv.fit(X_train,y_train)
    return rscv

In [123]:
rs=random_search(X_train,y_train,cv=5)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.6min finished


In [124]:
best_params = rs.best_params_
print(best_params)

{'n_estimators': 111, 'min_samples_split': 40, 'max_leaf_nodes': 26, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}


In [125]:
clf_optimized = RandomForestClassifier(n_estimators=111,min_samples_split=40,max_features='sqrt',max_leaf_nodes=26,max_depth=5,n_jobs=-1,random_state=42)
clf_optimized.fit(X_train,y_train)

accuracy=performance_accuracy(y_test,X_test,clf_optimized)
print('Accuracy of the optimized model is : ',accuracy)

Accuracy of the optimized model is :  0.593103448275862
