In [2]:
import pandas as pd

# Read the Torikumi_New.xlsx workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — it must be adjusted
# before the notebook can be re-run on another machine.
torikumi_path = "C:/Users/anush/Desktop/Torikumi_New.xlsx"
filtered_df = pd.read_excel(torikumi_path)

In [3]:
# Row mask that is True only where BOTH modified-rank columns hold a value.
rank_columns = ['Rikishi1_modified_rank', 'Rikishi2_modified_rank']
logical = filtered_df[rank_columns].notna().all(axis=1)

# Drop every row in which at least one of the two ranks is missing.
filtered_df = filtered_df.loc[logical]

In [4]:
import numpy as np
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Feature matrix: the two modified-rank columns of the filtered frame.
feature_columns = ['Rikishi1_modified_rank', 'Rikishi2_modified_rank']
X = filtered_df[feature_columns]

# Scale each feature to unit variance without centring it (with_mean=False).
# NOTE(review): the scaler is fitted on every row before any cross-validation
# split is made, so the held-out folds used further down contribute to the
# scaling statistics.
scaler = StandardScaler(with_mean=False, with_std=True, copy=True)
scaler.fit(X)
X_scaled = scaler.transform(X)

# Target vector taken from the 'Label' column.
y = filtered_df['Label']  # labels, outcomes for first sumo (1 for wins, 0 for losses)

# Candidate estimators, one per algorithm family compared below.
clf1 = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    random_state=1,
)
clf2 = KNeighborsClassifier(
    algorithm='ball_tree',
    leaf_size=20,
)
clf3 = DecisionTreeClassifier(random_state=1)
clf4 = GaussianNB()
clf5 = SVC()

# Hyper-parameter grids; param_gridN is the search space for clfN.
param_grid1 = [{
    'penalty': ['l2'],
    'C': 10.0 ** np.arange(-4, 4),  # 1e-4 ... 1e3, one value per decade
}]
param_grid2 = [{
    'n_neighbors': [*range(1, 10)],
    'p': [1, 2],
}]
param_grid3 = [{
    'max_depth': [*range(1, 10), None],  # None = no depth limit
    'criterion': ['gini', 'entropy'],
}]
param_grid4 = [{
    'var_smoothing': np.logspace(start=0, stop=-9, num=100),
}]
param_grid5 = [{
    'kernel': ['linear'],
    'C': [1, 10],
}]

# Inner cross-validation splitter: 5 shuffled folds with a fixed seed, shared by
# every grid search below.
cv_inner = KFold(n_splits=5, shuffle=True, random_state=1)

# (short name, estimator, parameter grid) for each algorithm under comparison.
candidates = (
    ('LR', clf1, param_grid1),
    ('KNN', clf2, param_grid2),
    ('DTree', clf3, param_grid3),
    ('GNB', clf4, param_grid4),
    ('SVM', clf5, param_grid5),
)

# One accuracy-scored GridSearchCV per algorithm, keyed by its short name.
# refit=True makes each search retrain its best configuration on all the data
# passed to fit().
gridcvs = {}
for name, est, pgrid in candidates:
    search = GridSearchCV(
        estimator=est,
        param_grid=pgrid,
        scoring='accuracy',
        cv=cv_inner,
        refit=True,
        n_jobs=-1,
        verbose=0,
    )
    gridcvs[name] = search
        
# Outer cross-validation splitter: 10 shuffled folds with a fixed seed. Each fold
# is held out once while the grid search is tuned on the remaining nine.
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)

# Flat list of every outer-fold accuracy from every algorithm, in evaluation
# order. Kept so that any code relying on this variable keeps working; it is NOT
# used for the per-algorithm summary.
outer_results = list()
# Outer-fold accuracies grouped by algorithm name.
outer_results_by_algo = {}

for name, gs_est in sorted(gridcvs.items()):
    print(50 * '-', '\n')
    print('Algorithm:', name)
    print('    Inner loop:')
    # Start a fresh list for each algorithm. Previously a single list was shared
    # by all algorithms, so every summary after the first one also averaged in
    # the folds of the algorithms evaluated before it.
    algo_results = []
    for train_ix, test_ix in cv_outer.split(X_scaled):
        # split data
        X_train, X_test = X_scaled[train_ix, :], X_scaled[test_ix, :]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
        # run inner-loop hyper-parameter tuning on the outer training part only
        result = gs_est.fit(X_train, y_train)

        # best configuration, refitted on the whole outer training part
        best_model = result.best_estimator_
        # evaluate it on the held-out outer fold
        yhat = best_model.predict(X_test)
        acc = accuracy_score(y_test, yhat)
        # store the result
        algo_results.append(acc)
        outer_results.append(acc)
        # report progress: outer-fold accuracy, inner-CV best score, chosen params
        print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
    outer_results_by_algo[name] = algo_results
    # summarize the estimated performance of THIS algorithm only
    print('\n    Outer Loop:')
    print('ACC : %.3f (%.3f)' % (mean(algo_results), std(algo_results)))

-------------------------------------------------- 

Algorithm: DTree
    Inner loop:
>acc=0.576, est=0.586, cfg={'criterion': 'gini', 'max_depth': 8}
>acc=0.574, est=0.585, cfg={'criterion': 'gini', 'max_depth': 9}
>acc=0.576, est=0.585, cfg={'criterion': 'entropy', 'max_depth': 9}
>acc=0.587, est=0.584, cfg={'criterion': 'gini', 'max_depth': 6}
>acc=0.586, est=0.585, cfg={'criterion': 'gini', 'max_depth': 6}
>acc=0.590, est=0.583, cfg={'criterion': 'gini', 'max_depth': 9}
>acc=0.583, est=0.584, cfg={'criterion': 'entropy', 'max_depth': 5}
>acc=0.589, est=0.583, cfg={'criterion': 'gini', 'max_depth': 5}
>acc=0.584, est=0.585, cfg={'criterion': 'entropy', 'max_depth': 9}
>acc=0.587, est=0.584, cfg={'criterion': 'entropy', 'max_depth': 7}

    Outer Loop:
ACC : 0.583 (0.006)
-------------------------------------------------- 

Algorithm: GNB
    Inner loop:
>acc=0.574, est=0.584, cfg={'var_smoothing': 1.0}
>acc=0.572, est=0.584, cfg={'var_smoothing': 1.0}
>acc=0.578, est=0.583, cfg={'va