In [1]:
#meta 1/25/2021 Poker my Model Knn 
#Previously in example 1, the author compared Keras NN with LogR, CART and SVM. Here, I try kNN modeling.
#setup like Example 1, but with the transformed dataset from Kaggle (800K records x 54 features)
#src https://medium.com/@virgoady7/poker-hand-prediction-7a801e254acd
#based on tryPoker_example1.ipynb

#history
#based on tryPoker_example1.ipynb
#      Trainset 25,010 records, testset 1mil records

#here 1/27/2021 TRY KNN MODELING - SWITCH TO TRANSFORMED KAGGLE DS
#      Kaggle ds, transformed into numpy matrix 800K x 54
#      Try diff values of k = [3,7]


In [2]:
#!pip freeze

In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import os
from os import path
import time #to track performance time
import warnings
warnings.filterwarnings('ignore')

#modeling
#from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
#from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

#modeling help
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from joblib import load, dump

# Poker Example with kNN
Previously in example 1, the author compared Keras NN with LogR, CART and SVM.  Here, I try kNN modeling.

## 0. Load Data

In [4]:
# Load the tidied poker dataset with joblib's `load` and bind it to `poker`.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary code -
# only load files from a trusted source.
poker = load('data/my_poker_tidy.pkl')
# Display (rows, columns) as the cell output.
poker.shape

(800000, 53)

In [5]:
# List the category labels of the 'class' column. The integer target built from
# `.cat.codes` in the next section refers to positions in this index.
poker['class'].cat.categories

Index(['Nothing', 'One pair', 'Two pairs', 'Three of a kind', 'Straight',
       'Flush', 'Full house', 'Four of a kind', 'Straight flush',
       'Royal flush'],
      dtype='object')

## 1. Prep Data
X & y

In [6]:
# Features: every column except the target. Selecting by name rather than by position
# keeps the label out of X even if the column order of `poker` ever changes.
X = poker.drop(columns='class')
# Target: integer category codes of the hand class
# (each code is a position in `poker['class'].cat.categories`).
y = poker['class'].cat.codes

In [7]:
# Randomly split data into two groups: a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .10, random_state=24)

print ("\nTotal records in learning set: {}".format(poker.shape[0]))
print ("Total records in training set: {}".format(len(X_train)))
print ("Total records in test set: {}".format(len(X_test)))
    
#notice index after reshuffling
X_train.tail()


Total records in learning set: 800000
Total records in training set: 720000
Total records in test set: 80000


Unnamed: 0,ace_clubs,ace_diamonds,ace_hearts,ace_spades,eight_clubs,eight_diamonds,eight_hearts,eight_spades,five_clubs,five_diamonds,...,ten_hearts,ten_spades,three_clubs,three_diamonds,three_hearts,three_spades,two_clubs,two_diamonds,two_hearts,two_spades
478609,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
516439,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
211136,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
899,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
242082,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [8]:
# Shapes of the training and test label vectors, shown together as one tuple.
(y_train.shape, y_test.shape)

((720000,), (80000,))


## 2. Model  

### kNN
Try different values of k in the range [3, 7].

In [9]:
# Start the stopwatch; both timings printed below are measured from this point,
# so the second figure is cumulative (fit + predict).
t0 = time.time()

# kNN with k=3. fit() returns the estimator itself, so construction and fitting are chained.
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print("Training time (in min)", (time.time() - t0) / 60)

# Classify the held-out records.
y_hat = clf.predict(X_test)
print("+ Testing time (in min)", (time.time() - t0) / 60)

# Fraction of test records classified correctly (displayed as the cell output).
accuracy_score(y_true=y_test, y_pred=y_hat)

Training time (in min) 0.008503878116607666
+ Testing time (in min) 23.600722944736482


0.7499375

In [10]:
# Count how many test records were assigned to each predicted class code.
unique, counts = np.unique(y_hat, return_counts=True)
# Stack codes and counts as two rows, then transpose so each printed row is [class_code, count].
print(np.asarray([unique, counts]).transpose())

[[    0 44757]
 [    1 32765]
 [    2  1551]
 [    3   802]
 [    4    48]
 [    5    52]
 [    6    20]
 [    7     5]]


In [11]:
# Rows are true classes, columns are predicted classes. Without an explicit `labels`
# argument only the class codes that occur in y_test or y_hat get a row/column.
cm = confusion_matrix(y_true=y_test, y_pred=y_hat)
print("Confusion matrix:\n{}".format(cm))

Confusion matrix:
[[34746  5479     0     0    14    14     0     0     0]
 [ 9634 23499   453    43    12     6     0     0     0]
 [    0  2786   997    13     0     0     3     0     0]
 [    0   939    31   685     0     0     8     0     0]
 [  260    46     0     0    22     0     0     0     0]
 [  116    16     0     0     0    32     0     0     0]
 [    0     0    70    37     0     0     9     0     0]
 [    0     0     0    24     0     0     0     5     0]
 [    1     0     0     0     0     0     0     0     0]]


In [12]:
# Start the stopwatch; both timings printed below are measured from this point,
# so the second figure is cumulative (fit + predict).
t0 = time.time()

# kNN with k=5. fit() returns the estimator itself, so construction and fitting are chained.
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
print("Training time (in min)", (time.time() - t0) / 60)

# Classify the held-out records.
y_hat = clf.predict(X_test)
print("+ Testing time (in min)", (time.time() - t0) / 60)

# Fraction of test records classified correctly (displayed as the cell output).
accuracy_score(y_true=y_test, y_pred=y_hat)

Training time (in min) 0.007603196303049723
+ Testing time (in min) 27.85140189329783


0.7847625

In [13]:
# Count how many test records were assigned to each predicted class code.
unique, counts = np.unique(y_hat, return_counts=True)
# Stack codes and counts as two rows, then transpose so each printed row is [class_code, count].
print(np.asarray([unique, counts]).transpose())

[[    0 45327]
 [    1 32982]
 [    2   970]
 [    3   689]
 [    4    13]
 [    5    15]
 [    6     3]
 [    7     1]]


In [14]:
# Rows are true classes, columns are predicted classes. Without an explicit `labels`
# argument only the class codes that occur in y_test or y_hat get a row/column.
cm = confusion_matrix(y_true=y_test, y_pred=y_hat)
print("Confusion matrix:\n{}".format(cm))

Confusion matrix:
[[36443  3806     0     0     2     2     0     0     0]
 [ 8468 25000   152    20     5     2     0     0     0]
 [    0  3087   710     2     0     0     0     0     0]
 [    0  1029    23   609     0     0     2     0     0]
 [  276    46     0     0     6     0     0     0     0]
 [  139    14     0     0     0    11     0     0     0]
 [    0     0    85    30     0     0     1     0     0]
 [    0     0     0    28     0     0     0     1     0]
 [    1     0     0     0     0     0     0     0     0]]


kNN models trained on the transformed Kaggle dataset (800K records in the learning set) reach roughly 75–78% accuracy (k=3: 75.0%, k=5: 78.5%), which is a great improvement over models trained on non-transformed data.

##### Summary
Same data represented differently made a simple algorithm way more accurate than a DL model with non-transformed data.  So it really matters how you feed the data to ML. 

In [15]:
# NOTE(review): `mystop` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError. Presumably a deliberate tripwire that halts "Run All" before the
# cells below - confirm with the author.
mystop

NameError: name 'mystop' is not defined

In [None]:
# Start the stopwatch; both timings printed below are measured from this point,
# so the second figure is cumulative (fit + predict).
t0 = time.time()

# kNN with k=7, fitted on the training split. The split cell defines `X_train` / `X_test`
# with a capital X; Python names are case-sensitive, so the lowercase spelling raised NameError.
clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(X_train, y_train)

print("Training time (in min)", (time.time() - t0) / 60)

# Classify the held-out records.
y_hat = clf.predict(X_test)
print("+ Testing time (in min)", (time.time() - t0) / 60)

# Fraction of test records classified correctly (displayed as the cell output).
accuracy_score(y_true=y_test, y_pred=y_hat)

In [None]:
# Count how many test records were assigned to each predicted class code.
unique, counts = np.unique(y_hat, return_counts=True)
# Stack codes and counts as two rows, then transpose so each printed row is [class_code, count].
print(np.asarray([unique, counts]).transpose())

In [None]:
# Rows are true classes, columns are predicted classes. Without an explicit `labels`
# argument only the class codes that occur in y_test or y_hat get a row/column.
cm = confusion_matrix(y_true=y_test, y_pred=y_hat)
print("Confusion matrix:\n{}".format(cm))

In [None]:
# NOTE(review): `mystop` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError. Presumably a deliberate tripwire that halts "Run All" at this point -
# confirm with the author.
mystop

Accuracy improves slightly as k increases, so k is worth tuning as a hyperparameter. Test time also increases slightly with k.

In [None]:
# NOTE(review): `mystop` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError. Presumably a deliberate tripwire that keeps "Run All" from reaching
# the scratch section below - confirm with the author.
mystop

## Xtra

In [None]:
import scipy.sparse

# Scratch example: build a 3x3 sparse identity matrix ...
mat = scipy.sparse.eye(3)
# ... and display it as a DataFrame constructed from the sparse matrix.
pd.DataFrame.sparse.from_spmatrix(data=mat)
