In [78]:
#import data
#Basics
import pandas as pd
import numpy as np

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#SkLearn ML General
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, roc_curve, auc, precision_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics

#SkLearn ML Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier

In [128]:
#load data
games_df = pd.DataFrame()

games_df = pd.read_csv("C:/Users/Jude/Downloads/games_data.csv")

In [129]:
#Clean data. drop columns that link to scores
#games_df = games_df.drop(['Hscore','Ascore','Match Status','Sum_Odds'], axis = 1)
games_df.head()

Unnamed: 0,H_Team,A_Team,H_Score,A_Score,Home,Draw,Away,Results
0,CRY,FOR,5,0,1.87,3.55,4.22,1
1,EVE,NEW,1,2,2.7,3.29,2.64,1
2,NEW,FOR,3,2,1.57,4.08,5.86,1
3,LIV,WHU,1,0,1.67,3.77,5.33,1
4,BOU,CHE,2,3,6.12,4.13,1.55,1


In [130]:
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164170 entries, 0 to 164169
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   H_Team   164170 non-null  object 
 1   A_Team   164170 non-null  object 
 2   H_Score  164170 non-null  int64  
 3   A_Score  164170 non-null  int64  
 4   Home     164170 non-null  float64
 5   Draw     164170 non-null  float64
 6   Away     164170 non-null  float64
 7   Results  164170 non-null  int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 10.0+ MB


In [131]:
#self team data
team_df = games_df.loc[games_df['H_Team']== "MCI"]
team_df

Unnamed: 0,H_Team,A_Team,H_Score,A_Score,Home,Draw,Away,Results
11,MCI,BHA,1,0,1.43,4.33,8.18,1
21,MCI,FUL,5,0,1.32,5.16,9.99,1
118,MCI,BRN,1,0,1.34,4.73,10.30,1
123,MCI,BHA,5,0,1.43,4.33,8.18,1
128,MCI,WOL,3,1,1.41,4.56,7.98,1
...,...,...,...,...,...,...,...,...
164101,MCI,EVE,2,0,1.41,4.45,8.32,1
164118,MCI,SOU,4,0,1.34,5.03,9.30,1
164122,MCI,CRY,2,0,1.38,4.65,8.90,1
164143,MCI,BHA,1,0,1.43,4.33,8.18,1


In [132]:
#Separate x_inputs from y_target variables
x_inputs = team_df.drop(['Results','H_Team','A_Team','H_Score','A_Score'], axis=1)
y_target = team_df['Results']



In [133]:
#split the dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_inputs, y_target, test_size=0.3, random_state=0)

In [136]:
# Define the classifiers
classifiers = {
    "classifier_NB":    GaussianNB(),
    "Classifier_LR":    LogisticRegression(max_iter=300),
    "classifier_SVM":   SVC(gamma = 2, C = 1),
    "classifier_KNN":   KNeighborsClassifier(n_neighbors = 3),
    "classifier_DT":    DecisionTreeClassifier(max_depth=5),
    "classifier_RF":    RandomForestClassifier(max_depth = 5, n_estimators =10, max_features=1),
    "Classifier_ANN":   MLPClassifier(alpha = 0.01, max_iter=1000, hidden_layer_sizes=[3], activation = 'tanh')
}
results = {}
print("Number of mislabeled points out of a total %d points:" % x_test.shape[0])
# Train the classifiers on the training set and test classifiers on unseen test data
for clf_name, clf_model in classifiers.items():
    clf_model.fit(x_train, y_train)
    clf_model.predict(x_test)
    print(clf_name + ": %d" % (y_test != clf_model.predict(x_test)).sum())
    results[clf_name] = clf_model.predict(x_test)

Number of mislabeled points out of a total 2546 points:
classifier_NB: 293
Classifier_LR: 208
classifier_SVM: 208
classifier_KNN: 208
classifier_DT: 208
classifier_RF: 208
Classifier_ANN: 208


In [137]:
print("Accuracy Scores:")
for clf_name, clf_model in results.items():
    print(clf_name + ": ", round(metrics.accuracy_score(y_test, clf_model) * 100,4))

print("\nPrecision Scores:")
for clf_name, clf_model in results.items():
    print(clf_name + ": ", round(metrics.precision_score(y_test, clf_model, average = 'weighted',zero_division=1) * 100,4))
        
print("\nRecall Scores:")
for clf_name, clf_model in results.items():
    print(clf_name + ": ", round(metrics.recall_score(y_test, clf_model, average = 'weighted') * 100,4))

print("\nF₁ Scores:")
for clf_name, clf_model in results.items():
    print(clf_name + ": ", round(metrics.f1_score(y_test, clf_model, average = 'weighted', labels = np.unique(clf_model)) * 100,4))
        
print("\nF₂ Scores:")
for clf_name, clf_model in results.items():
    print(clf_name + ": ", round(metrics.fbeta_score(y_test, clf_model, average = 'weighted', beta = 2) * 100,4))

Accuracy Scores:
classifier_NB:  88.4918
Classifier_LR:  91.8303
classifier_SVM:  91.8303
classifier_KNN:  91.8303
classifier_DT:  91.8303
classifier_RF:  91.8303
Classifier_ANN:  91.8303

Precision Scores:
classifier_NB:  85.5202
Classifier_LR:  92.4978
classifier_SVM:  92.4978
classifier_KNN:  92.4978
classifier_DT:  92.4978
classifier_RF:  92.4978
Classifier_ANN:  92.4978

Recall Scores:
classifier_NB:  88.4918
Classifier_LR:  91.8303
classifier_SVM:  91.8303
classifier_KNN:  91.8303
classifier_DT:  91.8303
classifier_RF:  91.8303
Classifier_ANN:  91.8303

F₁ Scores:
classifier_NB:  86.9034
Classifier_LR:  95.7412
classifier_SVM:  95.7412
classifier_KNN:  95.7412
classifier_DT:  95.7412
classifier_RF:  95.7412
Classifier_ANN:  95.7412

F₂ Scores:
classifier_NB:  87.8381
Classifier_LR:  90.225
classifier_SVM:  90.225
classifier_KNN:  90.225
classifier_DT:  90.225
classifier_RF:  90.225
Classifier_ANN:  90.225


In [27]:
#Display Confusion Matrix For Each Classifier




In [28]:
#Plot ROC Curve
