In [1]:
#import libraries
#Basics
import pandas as pd
import numpy as np

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#SkLearn ML General
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, roc_curve, auc, precision_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics

#SkLearn ML Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier

In [2]:
#load data
# NOTE(review): this is a machine-specific absolute path; the notebook only runs
# where this exact file exists. Kept in one named constant so it is easy to repoint.
GAMES_CSV_PATH = "C:/Users/Jude/Downloads/games_data1.csv"

# Read the match/odds table straight into games_df (no placeholder frame needed:
# read_csv always returns a fresh DataFrame).
games_df = pd.read_csv(GAMES_CSV_PATH)

In [3]:
#Clean data. drop columns that link to scores
#NOTE(review): the drop below is disabled, and the names it lists
#('Hscore', 'Ascore', 'Match Status', 'Sum_Odds') are not among the columns
#reported by games_df.info(); confirm whether it should be deleted or rewritten.
#games_df = games_df.drop(['Hscore','Ascore','Match Status','Sum_Odds'], axis = 1)
#Preview the first five rows of the loaded frame.
games_df.head()

Unnamed: 0,H_Team,A_Team,H_Score,A_Score,Home,Draw,Away,Results
0,MCI,LEE,0,3,1.33,4.79,11.0,0
1,MCI,SOU,0,2,1.34,5.03,9.3,0
2,MCI,LIV,0,1,2.02,3.67,3.52,0
3,MCI,MUN,0,1,1.79,3.84,4.3,0
4,MCI,MUN,0,1,1.79,3.84,4.3,0


In [4]:
# Print the index range, column names, non-null counts, dtypes and memory usage.
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1426 entries, 0 to 1425
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   H_Team   1426 non-null   object 
 1   A_Team   1426 non-null   object 
 2   H_Score  1426 non-null   int64  
 3   A_Score  1426 non-null   int64  
 4   Home     1426 non-null   float64
 5   Draw     1426 non-null   float64
 6   Away     1426 non-null   float64
 7   Results  1426 non-null   int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 89.2+ KB


In [5]:
# Restrict the data to the rows whose H_Team value is "MCI", then show the result.
is_mci_row = games_df['H_Team'] == "MCI"
team_df = games_df.loc[is_mci_row]
team_df

Unnamed: 0,H_Team,A_Team,H_Score,A_Score,Home,Draw,Away,Results
0,MCI,LEE,0,3,1.33,4.79,11.00,0
1,MCI,SOU,0,2,1.34,5.03,9.30,0
2,MCI,LIV,0,1,2.02,3.67,3.52,0
3,MCI,MUN,0,1,1.79,3.84,4.30,0
4,MCI,MUN,0,1,1.79,3.84,4.30,0
...,...,...,...,...,...,...,...,...
1421,MCI,FUL,4,1,1.32,5.16,9.99,1
1422,MCI,BHA,1,0,1.43,4.33,8.18,1
1423,MCI,LEI,3,1,1.47,4.24,7.25,1
1424,MCI,BOU,2,1,1.30,5.21,10.60,1


In [6]:
# Build the model inputs and the label.
# Inputs: every column left after removing the label, the team names and the scores.
# Target: the 'Results' column.
non_feature_columns = ['Results', 'H_Team', 'A_Team', 'H_Score', 'A_Score']
x_inputs = team_df.drop(columns=non_feature_columns)
y_target = team_df.loc[:, 'Results']



In [21]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
split_parts = train_test_split(
    x_inputs,
    y_target,
    test_size=0.2,
    random_state=13,
)
x_train, x_test, y_train, y_test = split_parts

In [22]:
# Define the classifiers
# Seed handed to every estimator that draws random numbers (tree, forest, MLP),
# so that Restart Kernel -> Run All reproduces the same scores.
MODEL_SEED = 13
classifiers = {
    "classifier_NB":    GaussianNB(),
    "Classifier_LR":    LogisticRegression(max_iter=300),
    "classifier_SVM":   SVC(gamma = 2, C = 1),
    "classifier_KNN":   KNeighborsClassifier(n_neighbors = 3),
    "classifier_DT":    DecisionTreeClassifier(max_depth=5, random_state=MODEL_SEED),
    "classifier_RF":    RandomForestClassifier(max_depth = 5, n_estimators =10, max_features=1,
                                               random_state=MODEL_SEED),
    "Classifier_ANN":   MLPClassifier(alpha = 0.01, max_iter=1000, hidden_layer_sizes=[3],
                                      activation = 'tanh', random_state=MODEL_SEED)
}

# results maps classifier name -> array of predicted labels for x_test
# (predictions, not fitted models); the metrics cell reads it.
results = {}
print("Number of mislabeled points out of a total %d points:" % x_test.shape[0])
# Train the classifiers on the training set and test classifiers on unseen test data
for clf_name, clf_model in classifiers.items():
    clf_model.fit(x_train, y_train)
    # Predict once and reuse the array for both the error count and the stored result.
    y_pred = clf_model.predict(x_test)
    print(clf_name + ": %d" % (y_test != y_pred).sum())
    results[clf_name] = y_pred

Number of mislabeled points out of a total 286 points:
classifier_NB: 109
Classifier_LR: 107
classifier_SVM: 111
classifier_KNN: 116
classifier_DT: 111
classifier_RF: 109
Classifier_ANN: 106


In [23]:
# Report every metric for every classifier from the stored test-set predictions.
#
# All class-averaged metrics share the same settings so the numbers are comparable:
#   * average='weighted'  - per-class scores weighted by class support in y_test
#   * zero_division=0     - a class that is never predicted contributes 0 rather
#                           than being counted as perfect (zero_division=1) or
#                           dropped from the average (labels=np.unique(pred)).
# Note: support-weighted recall is mathematically equal to accuracy, so those two
# sections always print the same values.
weighted_kwargs = {"average": "weighted", "zero_division": 0}

# (section title, scorer(y_true, y_pred) -> fraction in [0, 1])
metric_table = [
    ("Accuracy",  metrics.accuracy_score),
    ("Precision", lambda y_true, y_pred: metrics.precision_score(y_true, y_pred, **weighted_kwargs)),
    ("Recall",    lambda y_true, y_pred: metrics.recall_score(y_true, y_pred, **weighted_kwargs)),
    ("F₁",        lambda y_true, y_pred: metrics.f1_score(y_true, y_pred, **weighted_kwargs)),
    ("F₂",        lambda y_true, y_pred: metrics.fbeta_score(y_true, y_pred, beta = 2, **weighted_kwargs)),
]

for section_index, (metric_title, scorer) in enumerate(metric_table):
    # Blank line between sections, but not before the first one.
    print(("\n" if section_index else "") + metric_title + " Scores:")
    # results holds predicted label arrays keyed by classifier name.
    for clf_name, y_pred in results.items():
        print(clf_name + ": ", round(scorer(y_test, y_pred) * 100,4))

Accuracy Scores:
classifier_NB:  61.8881
Classifier_LR:  62.5874
classifier_SVM:  61.1888
classifier_KNN:  59.4406
classifier_DT:  61.1888
classifier_RF:  61.8881
Classifier_ANN:  62.9371

Precision Scores:
classifier_NB:  62.1878
Classifier_LR:  62.6259
classifier_SVM:  61.5426
classifier_KNN:  59.6502
classifier_DT:  61.5426
classifier_RF:  61.8379
Classifier_ANN:  63.757

Recall Scores:
classifier_NB:  61.8881
Classifier_LR:  62.5874
classifier_SVM:  61.1888
classifier_KNN:  59.4406
classifier_DT:  61.1888
classifier_RF:  61.8881
Classifier_ANN:  62.9371

F₁ Scores:
classifier_NB:  61.8671
Classifier_LR:  62.5998
classifier_SVM:  61.1485
classifier_KNN:  59.4406
classifier_DT:  61.1485
classifier_RF:  61.8259
Classifier_ANN:  62.7334

F₂ Scores:
classifier_NB:  61.8388
Classifier_LR:  62.5907
classifier_SVM:  61.1208
classifier_KNN:  59.4154
classifier_DT:  61.1208
classifier_RF:  61.8543
Classifier_ANN:  62.7111


In [27]:
#TODO: Display Confusion Matrix For Each Classifier (not implemented yet)




In [28]:
#TODO: Plot ROC Curve (not implemented yet)
