In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier


from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,plot_confusion_matrix,roc_auc_score
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from scipy.stats import loguniform


import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# warnings raised by the libraries used in the cells below.
warnings.filterwarnings('ignore')

In [101]:
# Load the season spreadsheet export into a DataFrame
# (presumably one row per match -- confirm against the CSV).
csv_path = 'Arsenal season 2021_2022 - Sheet1 (2).csv'
df = pd.read_csv(csv_path)

In [102]:
# Columns removed before modelling: the day/date/round identifiers, the
# prediction-related columns, and the raw categorical columns that are
# one-hot encoded just below.
drop=['pred_poss','Day','Date','Round','Prediction','GF_prediction', 'GA_prediction','Predicted_formation','pred_gk_Ratings','pred_def_ratings', 'pred_mid_ratings', 'pred_forw_ratings','Opponent','Venue','Result']


# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so that level is represented by all-zero dummies.
opponent=pd.get_dummies(df['Opponent'],drop_first=True)
venue=pd.get_dummies(df['Venue'],drop_first=True)
result=pd.get_dummies(df['Result'],drop_first=True)

df=pd.concat([df,opponent,venue,result],axis=1)
# Keep only the first 18 rows (presumably the matches played so far -- confirm).
df=df.iloc[:18]

df=df.drop(drop,axis=1)

# Repair the mangled formation value '3-4-2003' (looks like a spreadsheet date
# auto-format of '3-4-3' -- confirm). The result is assigned back to the column:
# calling replace(..., inplace=True) on df['Formation'] acts on a temporary
# selection and does not update df when pandas Copy-on-Write is active.
df['Formation']=df['Formation'].replace('3-4-2003','3-4-3')
# Encode the formation string as an integer, e.g. '3-4-3' -> 343.
df['Formation']=df['Formation'].apply(lambda x:int(x.replace('-','')))

    - K-fold cross-validation cannot be used here because it would cause data leakage.
    - The usual random train/test split cannot be used either, since it would also cause data leakage.

    The full feature matrix and targets defined below are for the grid search CV.

In [104]:
# Full-sample features and targets (every row of df); these feed the grid
# search later in the notebook. The score columns GF/GA and the L/W outcome
# indicators are excluded from the features.
target_cols = ['L', 'W']
x = df.drop(columns=['GF', 'GA'] + target_cols)
y = df[target_cols]

In [4]:
# Positional hold-out split: the first 14 rows form the training window and
# every remaining row is held out for testing.
n_train = 14
train, test = df.iloc[:n_train], df.iloc[n_train:]

    We will use only the first 14 matches (as of 21/12/2021) to train our model.

## Making predictions for match results

In [5]:
# Training features: everything except the score columns and outcome indicators.
x_train = train.drop(columns=['GF', 'GA', 'L', 'W'])
# Training targets: the two outcome indicator columns.
y_train = train.loc[:, ['L', 'W']]

In [6]:
# Hold-out counterpart of the training split: same feature/target column layout.
non_feature_cols = ['GF', 'GA', 'L', 'W']
x_test = test.drop(labels=non_feature_cols, axis='columns')
y_test = test[non_feature_cols[2:]]

### Testing several models 

In [7]:
# Candidate classifiers as (short name, unfitted estimator) pairs.
# The randomised estimators (random forest, gradient boosting, AdaBoost,
# XGBoost) get a fixed seed so that re-running the notebook reproduces the
# same comparison instead of a different ranking on each run.
SEED = 42

classification_model=[
    ('lr',LogisticRegression()),
    ('knn',KNeighborsClassifier()),
    ('svm',SVC()),
    ('rf',RandomForestClassifier(random_state=SEED)),
    ('naive',MultinomialNB()),
    ('gbm',GradientBoostingClassifier(random_state=SEED)),
    ('ada',AdaBoostClassifier(random_state=SEED)),
    ('xgb',XGBClassifier(eval_metric='mlogloss',random_state=SEED))
]

In [9]:
# Fit every candidate on the training window and report its hold-out metrics.
for name, model in classification_model:
    # MultiOutputClassifier lets a single-target estimator handle both the
    # L and W target columns.
    mod = MultiOutputClassifier(model)
    mod.fit(x_train, y_train)
    prediction = mod.predict(x_test)

    # Compute both metrics first, then print them in the usual order.
    result = classification_report(y_test, prediction)
    avg_score = accuracy_score(y_test, prediction)

    print(result)

    print(name, 'MODEL ACCURACY : {}'.format(avg_score))
    print('***********************************************************************************')
 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      1.00      1.00         3

   micro avg       1.00      0.75      0.86         4
   macro avg       0.50      0.50      0.50         4
weighted avg       0.75      0.75      0.75         4
 samples avg       0.75      0.75      0.75         4

lr MODEL ACCURACY : 0.75
***********************************************************************************
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.75      1.00      0.86         3

   micro avg       0.75      0.75      0.75         4
   macro avg       0.38      0.50      0.43         4
weighted avg       0.56      0.75      0.64         4
 samples avg       0.75      0.75      0.75         4

knn MODEL ACCURACY : 0.75
***********************************************************************************
              preci

    Four models tie for the highest accuracy. There is no clear way to pick a single best model among them, so we will use all four to predict.

In [10]:
# Shortlist of the four models carried forward; this rebinds the
# classification_model name used by the earlier comparison.
classification_model = [
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
    ('svm', SVC()),
    ('naive', MultinomialNB()),
]

# Show the true hold-out labels first so each prediction matrix printed
# below can be compared against them row by row.
print('ACTUAL RESULTS : ')
print(y_test)
print('***********************************************************************************')
print('***********************************************************************************')

for name, model in classification_model:
    mod = MultiOutputClassifier(model)
    mod.fit(x_train, y_train)
    prediction = mod.predict(x_test)
    print(prediction)

    result = classification_report(y_test, prediction)
    print(result)

    # Exact-match accuracy over both target columns, expressed as a percentage.
    avg_score = 100 * accuracy_score(y_test, prediction)
    print(name, 'MODEL SCORE IS :  {}'.format(avg_score), '%')
    print('***********************************************************************************')

ACTUAL RESULTS : 
    L  W
14  1  0
15  0  1
16  0  1
17  0  1
***********************************************************************************
***********************************************************************************
[[0 0]
 [0 1]
 [0 1]
 [0 1]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      1.00      1.00         3

   micro avg       1.00      0.75      0.86         4
   macro avg       0.50      0.50      0.50         4
weighted avg       0.75      0.75      0.75         4
 samples avg       0.75      0.75      0.75         4

lr MODEL SCORE IS :  75.0 %
***********************************************************************************
[[0 1]
 [0 1]
 [0 1]
 [0 1]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.75      1.00      0.86         3

   micro avg       0.75      0.75      0.75         4


    Here we can see that the most accurate model is Logistic Regression, since it predicts the first match as a draw, which is the nearest to the actual result (a loss).

# Logistic regression hyperparameter tuning

In [44]:
def model_score(model,X_train,Y_train,X_test,Y_test,feature_importance=True):
    """Fit ``model`` as a multi-output classifier and print its hold-out accuracy.

    Parameters
    ----------
    model : estimator
        Unfitted single-target classifier; it is wrapped in
        ``MultiOutputClassifier`` before fitting.
    X_train, Y_train :
        Training features and (multi-column) training targets.
    X_test, Y_test :
        Hold-out features and the matching true targets.
    feature_importance : bool, default True
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    None
        The accuracy is printed as a percentage, not returned.
    """
    #fit the model
    mod=MultiOutputClassifier(model).fit(X_train,Y_train)
    prediction=mod.predict(X_test)
    #see accuracy with normal train test split
    print ("\nModel classification report :")
    # Score against the Y_test argument rather than a notebook-level y_test,
    # so the reported accuracy always matches the data passed in.
    print ("Accuracy : {}".format(accuracy_score(Y_test,prediction)*100), '%')


In [45]:
# Baseline: default logistic regression scored on the hold-out matches.
model_score(
    model=LogisticRegression(),
    X_train=x_train,
    Y_train=y_train,
    X_test=x_test,
    Y_test=y_test,
)


Model classification report :
Accuracy : 75.0 %


In [None]:
 # Candidate inverse-regularisation strengths (C) for logistic regression.
 # Written as a real assignment: a bare "'c_values': [...]" entry is not a
 # valid statement and would stop the notebook on Run All.
 c_values = [100, 10, 1.0, 0.1, 0.01]

# We need to write our own loop over the parameter combinations, since grid search mixes the train and test splits

In [97]:
# Search space for the wrapped logistic regression. The 'estimator__' prefix
# routes each parameter through MultiOutputClassifier to the inner estimator.
param_test1 = {
    'estimator__solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'estimator__C': [100, 10, 1.0, 0.1, 0.01],
}

model = LogisticRegression()
lr = MultiOutputClassifier(model)

# List the tunable parameter names to confirm the keys used in the grid above.
lr.get_params().keys()

dict_keys(['estimator__C', 'estimator__class_weight', 'estimator__dual', 'estimator__fit_intercept', 'estimator__intercept_scaling', 'estimator__l1_ratio', 'estimator__max_iter', 'estimator__multi_class', 'estimator__n_jobs', 'estimator__penalty', 'estimator__random_state', 'estimator__solver', 'estimator__tol', 'estimator__verbose', 'estimator__warm_start', 'estimator', 'n_jobs'])

In [107]:
# Exhaustive search over param_test1 with GridSearchCV's default
# cross-validation and scoring.
# NOTE(review): this fits on the full x / y rather than on the training
# window only -- confirm that is intended given the leakage concerns above.
gsearch1 = GridSearchCV(lr, param_test1)
gsearch1.fit(x, y)

GridSearchCV(estimator=MultiOutputClassifier(estimator=LogisticRegression()),
             param_grid={'estimator__C': [100, 10, 1.0, 0.1, 0.01],
                         'estimator__penalty': ['none', 'l1', 'l2',
                                                'elasticnet'],
                         'estimator__solver': ['newton-cg', 'lbfgs',
                                               'liblinear']})

In [108]:
# Mean cross-validated score of the best parameter combination found by the search.
gsearch1.best_score_

0.6833333333333333

In [109]:
# Parameter combination that achieved the best mean cross-validated score.
gsearch1.best_params_

{'estimator__C': 10,
 'estimator__penalty': 'l1',
 'estimator__solver': 'liblinear'}