In [376]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as pyplot
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from matplotlib import style
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [377]:
# Load the raw dataset from the CSV file.
data = pd.read_csv("high_diamond_ranked_10min.csv", sep=",")

# Drop the game id: it is an identifier rather than a game statistic, so
# keeping it as a feature could only hurt the model.
data = data.drop(columns='gameId')

# Name of the target column we are going to predict.
predict = "blueWins"

# Separate the feature matrix (X) from the target column (Y).
# The axis is selected by keyword (`columns=`): passing it positionally, as in
# `drop([predict], 1)`, raises a TypeError on pandas >= 2.0.
X = data.drop(columns=[predict])
Y = data[predict]

# Keep references to the unprocessed features/target so X and Y can be reset
# after experimenting with different preprocessing steps.
xt = X
yt = Y

In [407]:
# Reset the working feature matrix and target to the stored originals (xt, yt).
# A hold-out split is currently disabled:
# x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split( X,Y, test_size=0.2 )
X, Y = xt, yt

In [380]:
# Normalization: rescale every feature column with a MinMaxScaler, then wrap
# the result back into a DataFrame that keeps the original column labels and
# row index.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [408]:
# Standardization: centre each feature and scale it to unit variance.
# Only the feature matrix is standardized. The former extra call
# `preprocessing.scale(data)` was removed: its result was discarded, and it
# would also have scaled the target column that `data` still contains.
X = preprocessing.scale(X)

In [None]:
# Fit a PCA with all components and plot the running total of the explained
# variance ratio, to see how many components are worth keeping.
pca = PCA()
pca.fit(X)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

pyplot.plot(cumulative_variance)
pyplot.xlabel('number of components')
pyplot.ylabel('cumulative explained variance')

In [None]:
# Perform PCA. `tempp` is passed as n_components; a float in (0, 1) asks PCA to
# keep enough components to explain that fraction of the variance.
tempp = 0.99
pca = PCA(n_components=tempp)

# Project the features and replace X with the resulting components.
principalComponents = pca.fit_transform(X)
X = pd.DataFrame(principalComponents)

print(X)

In [None]:
# Score every feature with the ANOVA F-test (f_classif) against Y and keep
# the 4 best-scoring ones.
fs = SelectKBest(score_func=f_classif, k=4)

# Fit the selector on the current X and Y, then reduce X to the selected features.
# NOTE(review): the selector sees the full X and Y here, so the cross-validation
# scores computed later on this X are not independent of the selection step.
X = fs.fit(X, Y).transform(X)

# Report the F-score of each candidate feature.
for i, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (i, score))

# Number of features that remain after selection.
print(X.shape[1])

In [None]:
# Fixed seed so that the estimators with random components (decision tree,
# random forest, SGD) produce the same scores on every run.
RANDOM_STATE = 0


def _mean_cv_auc(estimator, features, target, folds=10):
    """Return the mean ROC-AUC of `estimator` over `folds`-fold cross-validation.

    Parameters:
        estimator: unfitted scikit-learn classifier.
        features: feature matrix passed to cross_val_score.
        target: target labels passed to cross_val_score.
        folds: number of cross-validation folds (default 10).

    Returns:
        The mean of the per-fold 'roc_auc' scores.
    """
    return cross_val_score(estimator, features, target, scoring='roc_auc', cv=folds).mean()


logreg = LogisticRegression(max_iter=1000)

print("Logistic regression mean of cross validation AUC score: ", _mean_cv_auc(logreg, X, Y))
print()


bnb = BernoulliNB(binarize=0.0)

print("Bernoulli naive bayes mean of cross validation AUC score: ", _mean_cv_auc(bnb, X, Y))
print()


clf = DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE)

print("Decision tree mean of cross validation AUC score: ", _mean_cv_auc(clf, X, Y))
print()


rforest = RandomForestClassifier(n_estimators=20, random_state=RANDOM_STATE)

print("Random Forest mean of cross validation AUC score: ", _mean_cv_auc(rforest, X, Y))
print()


neighbours = 125
knn = KNeighborsClassifier(n_neighbors=neighbours)

print("KNN with %d as k, mean of cross validation AUC score: %f" % (neighbours, _mean_cv_auc(knn, X, Y)))
print()


sgdc = SGDClassifier(max_iter=1000, tol=0.01, random_state=RANDOM_STATE)

print("SGD mean of cross validation AUC score: ", _mean_cv_auc(sgdc, X, Y))
print()

psvm = svm.SVC(kernel='poly')

print("Polynomial SVM mean of cross validation AUC score: ", _mean_cv_auc(psvm, X, Y))
print()


# Linear-kernel SVM is currently disabled:
#lsvm = svm.SVC(kernel='linear')

#print( "Linear SVM mean of cross validation AUC score: ", cross_val_score(lsvm, X, Y, scoring='roc_auc', cv=10).mean())
#print()


In [None]:
# Sweep the number of neighbours from 1 to 299 and report, for each value,
# the mean ROC-AUC over 10-fold cross-validation.
for neighbours in range(1, 300):
    knn = KNeighborsClassifier(n_neighbors=neighbours)
    fold_scores = cross_val_score(knn, X, Y, scoring='roc_auc', cv=10)
    print("KNN with %d as k, mean of cross validation AUC score: %f" % (neighbours, fold_scores.mean()))

In [None]:
# Sweep the random forest's max_depth from 2 to 99 and report, for each value,
# the mean ROC-AUC over 10-fold cross-validation.
for dep in range(2, 100):
    # Fixed seed: otherwise score differences between depths are mixed with
    # run-to-run randomness of the forest itself.
    rforest = RandomForestClassifier(max_depth=dep, random_state=0)
    mean_auc = cross_val_score(rforest, X, Y, scoring='roc_auc', cv=10).mean()
    # Include the depth in the message so each score can be tied to its setting.
    print("Random Forest with max_depth %d, mean of cross validation AUC score: " % dep, mean_auc)