In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline


import seaborn as sns
from sklearn.model_selection import train_test_split


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

## Variables

In [2]:
# Path of the CSV file to model. Exactly one assignment is active; the other
# candidate datasets are kept as comments so that switching is a one-line change
# and earlier values are no longer silently overwritten further down the cell.
#dataset = 'data/df_stanford_real_features_leagues_[21518, 1729]_seasons_[\'15_16\', \'14_15\', \'13_14\', \'12_13\', \'11_12\', \'10_11\', \'09_10\'].csv'
#dataset = 'data/df_stanford_fifa_features_leagues_[21518, 1729]_seasons_[\'15_16\', \'14_15\', \'13_14\', \'12_13\', \'11_12\', \'10_11\', \'09_10\'].csv'
#dataset = 'data/df_match_with_clusters.csv'
#dataset = 'data/df_merge_dataset_stanford_real_fifa_leagues_[21518, 1729]_seasons_[\'15_16\', \'14_15\', \'13_14\', \'12_13\', \'11_12\', \'10_11\', \'09_10\'].csv'
dataset = 'data/df_stanford_real_features_leagues_[1729]_seasons_[\'15_16\', \'14_15\', \'13_14\', \'12_13\', \'11_12\', \'10_11\', \'09_10\'].csv'

print(dataset)

# Seed handed to train_test_split so the train/test partition is reproducible.
random_seed = 1



data/df_stanford_real_features_leagues_[1729]_seasons_['15_16', '14_15', '13_14', '12_13', '11_12', '10_11', '09_10'].csv


## Reading and preparing datasets

In [3]:
# Load the CSV named by `dataset` into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=dataset)
df.head(n=5)

FileNotFoundError: File b"data/df_stanford_real_features_leagues_[1729]_seasons_['15_16', '14_15', '13_14', '12_13', '11_12', '10_11', '09_10'].csv" does not exist

In [None]:
# Display the column index of the loaded frame to see which fields are available.
df.columns

## Labels

In [None]:
# Split the loaded frame into predictors and target: every column except
# 'labels' becomes a feature, and the 'labels' column is the target series.
features = df.drop(columns='labels')
labels = df['labels']

# Balance of classes

In [None]:
# Bar chart of how many rows carry each label value, to check class balance.
# The series is passed explicitly as `x=`: newer seaborn releases interpret the
# first positional argument as `data`, so the positional form is version dependent.
sns.countplot(x=labels, label="Count")

plt.show()

## Remove Rows With Missing Values

In [None]:
# Keep only the rows whose features are all present. The mask is computed
# before `features` is filtered so that `labels` stays aligned with it.
complete_rows = features.notnull().all(axis=1)
labels = labels[complete_rows]
features = features.dropna()


## Function to convert column types

In [None]:
def convert_to(df, type_to_convert, all_categorical=False, columns='None'):
    """Encode categorical columns of ``df`` as integer codes or one-hot dummies.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. The selected columns are cast to ``category`` dtype
        (and, for ``'labelling'``, replaced by their codes) directly on this
        frame; for ``'onehot'`` a new frame is returned.
    type_to_convert : str
        ``'labelling'`` replaces each selected column with its category codes.
        ``'onehot'`` replaces each selected column with dummy columns via
        ``pd.get_dummies``. Any other value only performs the category cast.
    all_categorical : bool, default False
        When true, every ``object``-dtype column except one named ``'labels'``
        is selected and ``columns`` is ignored.
    columns : str or list-like, default 'None'
        Column name(s) to encode when ``all_categorical`` is false. A single
        name may be given as a plain string.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. If ``all_categorical`` is true and there is nothing
        to encode, a message is printed and ``df`` is returned unchanged.
    """
    if all_categorical:
        object_columns = df.select_dtypes(include=['object']).columns
        # The target column is never encoded here. The selection must work
        # whether or not 'labels' is present: previously, a frame without it
        # left `columns` as the string 'None', which was then iterated
        # character by character and raised a KeyError.
        columns = [name for name in object_columns if name != 'labels']
        if not columns:
            print('No object types in the dataframe to be converted')
            return df
    elif isinstance(columns, str):
        # A bare string is one column name, not an iterable of characters.
        columns = [columns]
    else:
        columns = list(columns)

    for column in columns:
        df[column] = df[column].astype('category')

    if type_to_convert == 'labelling':
        for column in columns:
            df[column] = df[column].cat.codes
    elif type_to_convert == 'onehot':
        # One call yields the same column order as encoding one column at a
        # time: untouched columns first, then each column's dummies in order.
        df = pd.get_dummies(df, columns=columns)

    return df

In [None]:

# The 'date' column is removed before encoding.
features = features.drop(columns='date')

# One-hot encode every object-typed feature column.
features = convert_to(
    features,
    'onehot',
    all_categorical=True,
    columns='None',
)

# The cluster columns are encoded only when the dataset provides them.
cluster_columns = ['cluster_home', 'cluster_away']
if 'cluster_home' in features.columns:
    features = convert_to(
        features,
        'onehot',
        all_categorical=False,
        columns=cluster_columns,
    )



#Change types


#features['stage'] = features['stage'].astype(int)  #uncomment when fifa players data has stage




# Create Training and Test Sets and Apply Scaling

In [None]:
# Conventional scikit-learn names for the design matrix and the target.
X, y = features, labels

# Hold out a test partition; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=random_seed,
)

In [None]:


# Display (rows, columns) of the held-out test partition.
X_test.shape




In [None]:
# Standardize the numeric features. The scaler is fitted on the training
# partition only and then applied unchanged to the test partition, so no
# test-set statistics leak into training.
# (A MinMaxScaler was previously created here and immediately overwritten;
# only StandardScaler was ever used.)
scaler = StandardScaler()

# Only float64 / int columns are scaled; columns of any other dtype are left as is.
numeric_types = X_train.select_dtypes(include=['float64', 'int'])
#print(numeric_types.columns)

# train_test_split returns frames derived from `features`; take explicit copies
# so the column assignments below do not raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

X_train[numeric_types.columns] = scaler.fit_transform(X_train[numeric_types.columns])
X_test[numeric_types.columns] = scaler.transform(X_test[numeric_types.columns])

X_test.head()

## Logistic regression

In [None]:
# Summarize the training frame before fitting.
X_train.info()

# Multiclass logistic regression on the labels, with default hyper-parameters.
logreg = LogisticRegression().fit(X_train, y_train)

train_accuracy = logreg.score(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(train_accuracy))

test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(test_accuracy))

# Weighted F1 averages the per-class F1 scores weighted by class support.
y_predict = logreg.predict(X_test)
weighted_f1 = f1_score(y_test, y_predict, average='weighted')
print('F1 Score of Logistic regression classifier on test set: {:.2f}'
     .format(weighted_f1))


 

## SVC

In [None]:
# Support vector classifier on the same split, with default hyper-parameters.
svm = SVC()
svm.fit(X_train, y_train)

print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

# Weighted F1 averages the per-class F1 scores weighted by class support.
y_predict = svm.predict(X_test)
# The message previously named "Logistic regression" although the score is the SVM's.
print('F1 Score of SVM classifier on test set: {:.2f}'
     .format(f1_score(y_test, y_predict, average='weighted')))

## One-vs-rest: convert the labels to binary dummy variables and fit one classifier per class

In [None]:
# --- Binary logistic regressions, one label value at a time -------------------
# Each stanza turns the labels into a single indicator column (that class vs.
# everything else), fits a fresh LogisticRegression and reports its accuracy.

# Class 'A'
y_train_A, y_test_A = (pd.get_dummies(split)['A'] for split in (y_train, y_test))
logreg = LogisticRegression().fit(X_train, y_train_A)

acc_train = logreg.score(X_train, y_train_A)
print('Accuracy of Logistic regression classifier A on training set: {:.2f}'
     .format(acc_train))
acc_test = logreg.score(X_test, y_test_A)
print('Accuracy of Logistic regression classifier A on test set: {:.2f}'
     .format(acc_test))

# Class 'H'
y_train_H, y_test_H = (pd.get_dummies(split)['H'] for split in (y_train, y_test))
logreg = LogisticRegression().fit(X_train, y_train_H)

acc_train = logreg.score(X_train, y_train_H)
print('Accuracy of Logistic regression classifier H on training set: {:.2f}'
     .format(acc_train))
acc_test = logreg.score(X_test, y_test_H)
print('Accuracy of Logistic regression classifier H on test set: {:.2f}'
     .format(acc_test))

# Class 'D'
y_train_D, y_test_D = (pd.get_dummies(split)['D'] for split in (y_train, y_test))
logreg = LogisticRegression().fit(X_train, y_train_D)

acc_train = logreg.score(X_train, y_train_D)
print('Accuracy of Logistic regression classifier D on training set: {:.2f}'
     .format(acc_train))
acc_test = logreg.score(X_test, y_test_D)
print('Accuracy of Logistic regression classifier D on test set: {:.2f}'
     .format(acc_test))

# --- One-vs-rest ROC analysis -------------------------------------------------
# Column i of the binarized label matrices corresponds to dict_classes[i].
dict_classes = {0:'H',1:'D',2:'A'}
n_classes = 3
y_train_bin = label_binarize(y_train, classes=['H','D','A'])
y_test_bin = label_binarize(y_test, classes=['H','D','A'])

# classifier
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train_bin)
y_score = clf.decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot of a ROC curve for a specific class (one figure per class)
for i in range(n_classes):
    fig, ax = plt.subplots()
    ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example class %s' % dict_classes[i])
    ax.legend(loc="lower right")
    plt.show()


In [None]:
# --- Binary SVCs, one label value at a time -----------------------------------
# Indicator targets (that class vs. everything else) for each label value.
train_dummies = pd.get_dummies(y_train)
test_dummies = pd.get_dummies(y_test)

y_train_A, y_test_A = train_dummies['A'], test_dummies['A']
y_train_H, y_test_H = train_dummies['H'], test_dummies['H']
y_train_D, y_test_D = train_dummies['D'], test_dummies['D']

# Fit a fresh SVC per class and report its accuracy. The messages previously
# said "Logistic regression" (copied from the cell above) although the scores
# come from an SVC.
for class_name, target_train, target_test in (
    ('A', y_train_A, y_test_A),
    ('H', y_train_H, y_test_H),
    ('D', y_train_D, y_test_D),
):
    svc = SVC()
    svc.fit(X_train, target_train)

    print('Accuracy of SVM classifier {} on training set: {:.2f}'
         .format(class_name, svc.score(X_train, target_train)))

    print('Accuracy of SVM classifier {} on test set: {:.2f}'
         .format(class_name, svc.score(X_test, target_test)))


# --- One-vs-rest ROC analysis -------------------------------------------------
# Reuses y_train_bin / y_test_bin / n_classes / dict_classes defined in the
# logistic-regression one-vs-rest cell.
# classifier
clf = OneVsRestClassifier(SVC())
y_score = clf.fit(X_train, y_train_bin).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    # The thresholds are not needed; a named throwaway avoids rebinding `_`,
    # which IPython uses for the last cell output.
    fpr[i], tpr[i], _thresholds = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot of a ROC curve for a specific class (one figure per class)
for i in range(n_classes):
    plt.figure()
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example class %s' % dict_classes[i])
    plt.legend(loc="lower right")
    plt.show()