In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier


from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,plot_confusion_matrix,roc_auc_score
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Arsenal 2021/22 season sheet. The path is relative, so the CSV must
# sit in the notebook's working directory.
df=pd.read_csv('Arsenal season 2021_2022 - Sheet1 (2).csv')

In [3]:
# Columns removed before modelling: the prediction-related fields, the calendar
# fields, and the raw categorical columns that are one-hot encoded just below.
drop=['pred_poss','Day','Date','Round','Prediction','GF_prediction', 'GA_prediction','Predicted_formation','pred_gk_Ratings','pred_def_ratings', 'pred_mid_ratings', 'pred_forw_ratings','Opponent','Venue','Result']

# One-hot encode the categorical columns. drop_first=True removes the first
# level of each; for Result the remaining 'L' and 'W' columns are the
# classification targets used later (the dropped level is presumably 'D').
opponent=pd.get_dummies(df['Opponent'],drop_first=True)
venue=pd.get_dummies(df['Venue'],drop_first=True)
result=pd.get_dummies(df['Result'],drop_first=True)

df=pd.concat([df,opponent,venue,result],axis=1)
# Keep only the first 18 rows -- presumably the matches already played; TODO confirm.
df=df.iloc[:18]

df=df.drop(drop,axis=1)

# '3-4-2003' looks like a spreadsheet date-mangled '3-4-3' (assumption).
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate Series and no
# longer updates `df` under pandas Copy-on-Write.
df['Formation']=df['Formation'].replace('3-4-2003','3-4-3')
# Encode the formation string as an integer, e.g. '3-4-3' -> 343.
df['Formation']=df['Formation'].apply(lambda x:int(x.replace('-','')))

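    - K-fold cross-validation cannot be used here: the matches are in chronological order, so some folds would train on later matches to predict earlier ones (data leakage).
    - The usual random train/test split cannot be used either, for the same reason.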

In [4]:
# Chronological hold-out: rows 0-13 are used for fitting, the rows from
# position 14 onwards are kept for testing.
train,test=df.iloc[:14],df.iloc[14:]

    We use only the first 14 matches (played up to 21/12/2021) to train our model; the remaining 4 matches are held out for testing.

## Making predictions for match results

In [5]:
# Features: everything except the goal counts and the result indicators.
x_train=train.drop(columns=['GF','GA','L','W'])
# Targets: the two result indicator columns (loss, win).
y_train=train.loc[:,['L','W']]

In [6]:
# Same feature/target separation as for the training rows, applied to the hold-out rows.
x_test=test.drop(columns=['GF','GA','L','W'])
y_test=test.loc[:,['L','W']]

For predicting score results

In [7]:
# Score-line targets: goals for (GF) and goals against (GA), split the same way as the result targets.
y_train_scoring=train.loc[:,['GF','GA']]
y_test_scoring=test.loc[:,['GF','GA']]

# Testing several models 

In [199]:
# Candidate classifiers as (short name, unfitted estimator) pairs; the
# evaluation loop wraps each one in MultiOutputClassifier.
# random_state is pinned on the learners that use randomness so that repeated
# runs of the comparison give the same ranking.
classification_model=[
    ('lr',LogisticRegression()),
    ('knn',KNeighborsClassifier()),
    ('svm',SVC()),
    ('rf',RandomForestClassifier(random_state=42)),
    ('naive',MultinomialNB()),
    ('gbm',GradientBoostingClassifier(random_state=42)),
    ('ada',AdaBoostClassifier(random_state=42)),
    ('xgb',XGBClassifier(eval_metric='mlogloss',random_state=42))
]

In [200]:
# Fit every candidate on the training matches and score it on the hold-out matches.
# Notes for reading the output:
#   * report rows "0" and "1" are the target columns in order (L, W), not class values;
#   * accuracy_score on a two-column indicator target is subset accuracy, i.e. a
#     match counts as correct only when both L and W are predicted correctly.
for name,model in classification_model:
    mod=MultiOutputClassifier(model)
    mod.fit(x_train,y_train)
    prediction=mod.predict(x_test)

    result=classification_report(y_test,prediction)
    avg_score=accuracy_score(y_test,prediction)

    print(result)
    print(name,f'MO {avg_score}')
    print('***********************************************************************************')
 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      1.00      1.00         3

   micro avg       1.00      0.75      0.86         4
   macro avg       0.50      0.50      0.50         4
weighted avg       0.75      0.75      0.75         4
 samples avg       0.75      0.75      0.75         4

lr model accuracy score is : 0.75
***********************************************************************************
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.75      1.00      0.86         3

   micro avg       0.75      0.75      0.75         4
   macro avg       0.38      0.50      0.43         4
weighted avg       0.56      0.75      0.64         4
 samples avg       0.75      0.75      0.75         4

knn model accuracy score is : 0.75
***********************************************************************************
 

In [None]:
# Models with the highest accuracy in the comparison above: lr, knn, svm, naive

    Four models (lr, knn, svm and naive) produced the highest accuracy. There is no single right answer as to which of them is best, so we will probably use all four models to predict.

In [212]:
# Shortlist: the four models that tied for the best score in the full comparison.
classification_model=[
    ('lr',LogisticRegression()),
    ('knn',KNeighborsClassifier()),
    ('svm',SVC()),
    ('naive',MultinomialNB()),
]

# Show the ground truth once, then each model's raw predictions for comparison.
# Prediction columns follow y_test's column order (L, W); a row of [0 0] means
# neither a loss nor a win was predicted.
print('ACTUAL RESULTS : ',y_test,sep='\n')
print('***********************************************************************************')
print('***********************************************************************************')

for name,model in classification_model:
    mod=MultiOutputClassifier(model)
    mod.fit(x_train,y_train)
    prediction=mod.predict(x_test)
    print(prediction)

    result=classification_report(y_test,prediction)
    print(result)

    # Subset accuracy (both columns must match), expressed as a percentage.
    avg_score=100*accuracy_score(y_test,prediction)
    print(name,f'MODEL SCORE IS :  {avg_score}','%')
    print('***********************************************************************************')

ACTUAL RESULTS : 
    L  W
14  1  0
15  0  1
16  0  1
17  0  1
***********************************************************************************
***********************************************************************************
[[0 0]
 [0 1]
 [0 1]
 [0 1]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      1.00      1.00         3

   micro avg       1.00      0.75      0.86         4
   macro avg       0.50      0.50      0.50         4
weighted avg       0.75      0.75      0.75         4
 samples avg       0.75      0.75      0.75         4

lr MODEL SCORE IS :  75.0 %
***********************************************************************************
[[0 1]
 [0 1]
 [0 1]
 [0 1]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.75      1.00      0.86         3

   micro avg       0.75      0.75      0.75         4


    Here we can see that the most accurate model is Logistic Regression, since it predicts the first match as a draw, which is the nearest to the actual result (a loss).

# Logistic regression hyperparameter tuning

In [10]:
# Build the search object in this cell so it runs on a fresh kernel (the
# notebook never defined `grid_search` before fitting it). The settings are
# the ones shown in the recorded GridSearchCV repr; anything not listed there
# is left at its default, including the cross-validation splitter.
param_grid={
    'estimator__C':[100,10,1.0,0.1,0.01],
    'estimator__penalty':['l2'],
    'estimator__solver':['newton-cg','lbfgs','liblinear'],
}
grid_search=GridSearchCV(
    estimator=MultiOutputClassifier(estimator=LogisticRegression()),
    param_grid=param_grid,
    scoring='accuracy',
)

# Last expression of the cell: fit returns the fitted search object, which is displayed.
grid_search.fit(x_train,y_train)

GridSearchCV(estimator=MultiOutputClassifier(estimator=LogisticRegression()),
             param_grid={'estimator__C': [100, 10, 1.0, 0.1, 0.01],
                         'estimator__penalty': ['l2'],
                         'estimator__solver': ['newton-cg', 'lbfgs',
                                               'liblinear']},
             scoring='accuracy')

In [11]:
# Mean cross-validated accuracy of the best parameter combination found by the search.
grid_search.best_score_

0.6

In [12]:
# Parameter combination that achieved best_score_; keys carry the 'estimator__'
# prefix because they address the LogisticRegression inside MultiOutputClassifier.
grid_search.best_params_

{'estimator__C': 100, 'estimator__penalty': 'l2', 'estimator__solver': 'lbfgs'}

    However, grid search uses K-fold cross-validation internally, so it causes data leakage. We will still try these hyperparameters for our model.

# Predicting match results

In [13]:
# Logistic regression with the C chosen by the grid search; the selected
# penalty ('l2') and solver ('lbfgs') are already the estimator defaults.
model=LogisticRegression(C=100)
# One independent copy of the estimator is fitted per target column.
lr=MultiOutputClassifier(estimator=model)

In [14]:
# Inspect the full parameter set of the wrapper and its inner estimator to confirm the configuration.
lr.get_params()

{'estimator__C': 100,
 'estimator__class_weight': None,
 'estimator__dual': False,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__l1_ratio': None,
 'estimator__max_iter': 100,
 'estimator__multi_class': 'auto',
 'estimator__n_jobs': None,
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__solver': 'lbfgs',
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': LogisticRegression(C=100),
 'n_jobs': None}

In [15]:
# Fit on the training matches and predict the L/W indicators for the hold-out
# matches (fit returns the estimator itself, so the calls can be chained).
prediction=lr.fit(x_train,y_train).predict(x_test)

In [16]:
# Display the predicted indicators; columns follow y_train's column order (L, W).
print('PREDICTION MADE BY THE MODEL :')
prediction

PREDICTION MADE BY THE MODEL :


array([[1, 0],
       [0, 1],
       [0, 1],
       [0, 1]], dtype=uint8)

In [17]:
# Ground truth next to the per-column report; report rows "0" and "1" are the
# target columns in order (L, W), not class values.
print('ACTUAL RESULTS :',y_test,sep='\n')

print('----CLASSIFICATION REPORT-----',classification_report(y_test,prediction),sep='\n')

ACTUAL RESULTS :
    L  W
14  1  0
15  0  1
16  0  1
17  0  1
----CLASSIFICATION REPORT-----
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         3

   micro avg       1.00      1.00      1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4
 samples avg       1.00      1.00      1.00         4



    Somehow the model predicted really well with a higher C value. C is the inverse of the regularization strength, so a larger C weakens the L2 penalty on the coefficients. Note that the test set contains only four matches.

# Predicting score results

In [18]:
# Same estimator configuration, now fitted on the goal counts (GF, GA); each
# distinct goal count is treated as a class label.
# NOTE: this rebinds `model` and `lr`, replacing the match-result classifier.
model=LogisticRegression(C=100)
lr=MultiOutputClassifier(estimator=model).fit(x_train,y_train_scoring)

# Last expression of the cell: display the fitted wrapper.
lr

MultiOutputClassifier(estimator=LogisticRegression(C=100))

In [19]:
# Predicted goal counts for the hold-out matches; columns follow y_train_scoring's order (GF, GA).
scoring_prediction=lr.predict(x_test)

In [20]:
# Predicted versus actual score lines for the hold-out matches (columns: GF, GA).
print('----PREDICTED SCORE RESULTS------',scoring_prediction,sep='\n')

print('----ACTUAL SCORE RESULTS-----',y_test_scoring,sep='\n')

----PREDICTED SCORE RESULTS------
[[2. 2.]
 [2. 0.]
 [1. 0.]
 [3. 1.]]
----ACTUAL SCORE RESULTS-----
     GF   GA
14  1.0  2.0
15  3.0  0.0
16  2.0  0.0
17  4.0  1.0


In [21]:
# Rich display of the actual hold-out scores (same data as printed in the previous cell).
y_test_scoring

Unnamed: 0,GF,GA
14,1.0,2.0
15,3.0,0.0
16,2.0,0.0
17,4.0,1.0
