In [1]:
#meta 1/25/2021 Poker my Model Knn 
#setup and data from Example 1
#src https://medium.com/@virgoady7/poker-hand-prediction-7a801e254acd
#based on tryPoker_example1.ipynb

#history
#here 1/26/2021 TRY KNN MODELING (DATA FROM EXAMPLE 1)
#      Reusing Example 1 setup and datasets
#      Try diff values of k = [3,5,7]

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import os
from os import path
from urllib.request import urlretrieve #to download the dataset files
import time #to track performance time
import warnings
warnings.filterwarnings('ignore')

#modeling
#from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
#from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

#from joblib import load, dump

# Poker Example with kNN (setup reused from the Keras example)

## 0. Load Data

In [3]:
#$mycodedelta #was
#!wget http://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-testing.data
#!wget http://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-training-true.data
#!wget http://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand.names

In [4]:
#$mycodedelta
# Download the three UCI Poker Hand files into data/ unless they are already
# there. Each file is checked on its own, so a partial download (for example
# only poker-hand.names present) is completed instead of being mistaken for
# the full dataset, and the data/ folder is created when it does not exist.
POKER_BASE_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/poker/'
POKER_FILES = ['poker-hand-testing.data',
               'poker-hand-training-true.data',
               'poker-hand.names']

missing_files = [name for name in POKER_FILES
                 if not path.exists(path.join('data', name))]

if not missing_files:
    print('Poker data already exists')
else:
    os.makedirs('data', exist_ok=True)
    for name in missing_files:
        target = path.join('data', name)
        print('Downloading', name)
        # Fetch to a temporary name first so an interrupted transfer is not
        # mistaken for a complete file by the existence check on the next run.
        urlretrieve(POKER_BASE_URL + name, target + '.part')
        os.replace(target + '.part', target)

Poker data already exists


## 1. Prep Data
Note: the train and test files are swapped below, so the 1,000,000-row file is used for training. With this swap SVM runs much longer and the NN results are slightly better.  
In reality we need separate `train`, `valid` and `test` datasets.

In [5]:
# The two UCI files are deliberately swapped relative to their file names:
# the file named "testing" is loaded as data_train and the file named
# "training-true" as data_test.
# Un-swapped original, kept for reference:
#data_train=pd.read_csv("data/poker-hand-training-true.data",header=None)
#data_test = pd.read_csv("data/poker-hand-testing.data",header=None)

# Column labels for the header-less files: a (suit, rank) pair for each of the
# five cards, then the hand label. 'Rank of card 5' has no '#' in the original
# list; it is a column key, so it is kept exactly as is.
col = [
    'Suit of card #1', 'Rank of card #1',
    'Suit of card #2', 'Rank of card #2',
    'Suit of card #3', 'Rank of card #3',
    'Suit of card #4', 'Rank of card #4',
    'Suit of card #5', 'Rank of card 5',
    'Poker Hand',
]

data_train = pd.read_csv("data/poker-hand-testing.data", header=None)
data_test = pd.read_csv("data/poker-hand-training-true.data", header=None)

# Echo the labels as the cell output.
col

['Suit of card #1',
 'Rank of card #1',
 'Suit of card #2',
 'Rank of card #2',
 'Suit of card #3',
 'Rank of card #3',
 'Suit of card #4',
 'Rank of card #4',
 'Suit of card #5',
 'Rank of card 5',
 'Poker Hand']

In [6]:
# Attach the same list of column labels to both frames (train first, then test).
data_train.columns = data_test.columns = col

In [7]:
# One-hot encode the 'Poker Hand' column of each split, independently per split.
# NOTE(review): cell In [11] further down reassigns y_train / y_test to
# LabelEncoder output before any model is fitted, so these one-hot frames look
# like a leftover from the Keras example -- confirm and remove.
y_train = pd.get_dummies(data_train['Poker Hand'])
y_test = pd.get_dummies(data_test['Poker Hand'])

In [8]:
# Feature matrices: every column except the 'Poker Hand' target.
# drop() returns a new frame, so data_train / data_test are left untouched.
x_train = data_train.drop(columns='Poker Hand')
x_test = data_test.drop(columns='Poker Hand')

In [9]:
# Report (rows, columns) of each feature matrix, training split first.
for _label, _frame in [('Shape of Training Set:', x_train),
                       ('Shape of Testing Set:', x_test)]:
    print(_label, _frame.shape)

Shape of Training Set: (1000000, 10)
Shape of Testing Set: (25010, 10)


In [10]:
# Preview the first five rows of the training features.
x_train.head(n=5)

Unnamed: 0,Suit of card #1,Rank of card #1,Suit of card #2,Rank of card #2,Suit of card #3,Rank of card #3,Suit of card #4,Rank of card #4,Suit of card #5,Rank of card 5
0,1,1,1,13,2,4,2,3,1,12
1,3,12,3,2,3,11,4,5,2,5
2,1,9,4,6,1,4,3,2,3,9
3,1,4,3,13,2,13,2,1,3,6
4,3,10,2,7,1,2,2,11,4,9


In [11]:
from sklearn import preprocessing

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits. This replaces whatever y_train / y_test
# held before with 1-D integer label arrays.
le = preprocessing.LabelEncoder().fit(data_train['Poker Hand'])
y_train = le.transform(data_train['Poker Hand'])
y_test = le.transform(data_test['Poker Hand'])

In [12]:
# Show the shape of each label array (train, test) as the cell output.
tuple(labels.shape for labels in (y_train, y_test))

((1000000,), (25010,))


## 2. Model 
Previously in example 1, the author compared Keras NN with LogR, CART and SVM.  Here, I try kNN modeling. 

### kNN
Try different values of k (3, 5 and 7).

In [13]:
# Fit a k-nearest-neighbours classifier on the training split, predict the
# test split, and report elapsed time and accuracy.
N_NEIGHBORS = 3

#track time
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()

clf = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
clf.fit(x_train, y_train)

print ("Training time (in min)", (time.perf_counter() - t0)/60)

#predict
y_hat=clf.predict(x_test)
# The leading "+" marks a cumulative figure: fit time plus prediction time,
# both measured from the same t0.
print ("+ Testing time (in min)", (time.perf_counter() - t0)/60)

# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, y_hat)

Training time (in min) 0.48160584767659503
+ Testing time (in min) 0.68642072280248


0.6141543382646941

In [14]:
# Tally how many test rows were assigned to each predicted class and print the
# result as a two-column table: [predicted class, number of rows].
unique, counts = np.unique(y_hat, return_counts=True)
print (np.column_stack((unique, counts)))

[[    0 14196]
 [    1 10192]
 [    2   437]
 [    3   155]
 [    4    22]
 [    5     3]
 [    6     5]]


In [15]:
# Confusion matrix of the current predictions: per scikit-learn's convention,
# rows are the true classes and columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_hat)
print("Confusion matrix:\n{}".format(cm))

Confusion matrix:
[[9290 3156   44    2    0    1    0    0    0    0]
 [4441 5872  193   76   14    2    1    0    0    0]
 [ 353  682  143   23    4    0    1    0    0    0]
 [  44  381   37   50    0    0    1    0    0    0]
 [  24   61    5    0    3    0    0    0    0    0]
 [  41   13    0    0    0    0    0    0    0    0]
 [   1   18   14    0    1    0    2    0    0    0]
 [   0    1    1    4    0    0    0    0    0    0]
 [   0    5    0    0    0    0    0    0    0    0]
 [   2    3    0    0    0    0    0    0    0    0]]


In [16]:
# Fit a k-nearest-neighbours classifier on the training split, predict the
# test split, and report elapsed time and accuracy.
N_NEIGHBORS = 5

#track time
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()

clf = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
clf.fit(x_train, y_train)

print ("Training time (in min)", (time.perf_counter() - t0)/60)

#predict
y_hat=clf.predict(x_test)
# The leading "+" marks a cumulative figure: fit time plus prediction time,
# both measured from the same t0.
print ("+ Testing time (in min)", (time.perf_counter() - t0)/60)

# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, y_hat)

Training time (in min) 0.4776416540145874
+ Testing time (in min) 0.7241820176442464


0.6227908836465413

In [17]:
# Tally how many test rows were assigned to each predicted class and print the
# result as a two-column table: [predicted class, number of rows].
unique, counts = np.unique(y_hat, return_counts=True)
print (np.column_stack((unique, counts)))

[[    0 14445]
 [    1 10222]
 [    2   251]
 [    3    74]
 [    4    15]
 [    6     3]]


In [18]:
# Confusion matrix of the current predictions: per scikit-learn's convention,
# rows are the true classes and columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_hat)
print("Confusion matrix:\n{}".format(cm))

Confusion matrix:
[[9515 2960   17    0    1    0    0    0    0    0]
 [4525 5938   91   37    8    0    0    0    0    0]
 [ 324  773   95   10    3    0    1    0    0    0]
 [  26  431   30   25    0    0    1    0    0    0]
 [  13   70    7    1    2    0    0    0    0    0]
 [  41   13    0    0    0    0    0    0    0    0]
 [   1   24    9    0    1    0    1    0    0    0]
 [   0    3    2    1    0    0    0    0    0    0]
 [   0    5    0    0    0    0    0    0    0    0]
 [   0    5    0    0    0    0    0    0    0    0]]


In [19]:
# Fit a k-nearest-neighbours classifier on the training split, predict the
# test split, and report elapsed time and accuracy.
N_NEIGHBORS = 7

#track time
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()

clf = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
clf.fit(x_train, y_train)

print ("Training time (in min)", (time.perf_counter() - t0)/60)

#predict
y_hat=clf.predict(x_test)
# The leading "+" marks a cumulative figure: fit time plus prediction time,
# both measured from the same t0.
print ("+ Testing time (in min)", (time.perf_counter() - t0)/60)

# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, y_hat)

Training time (in min) 0.4773810545603434
+ Testing time (in min) 0.7532381931940715


0.6248300679728108

In [20]:
# Tally how many test rows were assigned to each predicted class and print the
# result as a two-column table: [predicted class, number of rows].
unique, counts = np.unique(y_hat, return_counts=True)
print (np.column_stack((unique, counts)))

[[    0 14617]
 [    1 10096]
 [    2   227]
 [    3    59]
 [    4     7]
 [    5     3]
 [    6     1]]


In [21]:
# Confusion matrix of the current predictions: per scikit-learn's convention,
# rows are the true classes and columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_hat)
print("Confusion matrix:\n{}".format(cm))

Confusion matrix:
[[9630 2844   16    1    0    2    0    0    0    0]
 [4594 5889   87   26    3    0    0    0    0    0]
 [ 318  790   86   11    1    0    0    0    0    0]
 [  23  446   25   19    0    0    0    0    0    0]
 [   9   80    2    1    1    0    0    0    0    0]
 [  42   11    0    0    0    1    0    0    0    0]
 [   1   24    9    0    1    0    1    0    0    0]
 [   0    3    2    1    0    0    0    0    0    0]
 [   0    5    0    0    0    0    0    0    0    0]
 [   0    4    0    0    1    0    0    0    0    0]]


In [22]:
# Intentional halt: `mystop` is not defined anywhere in this notebook, so
# evaluating it raises NameError and stops a "Run All" at this cell.
# TODO(review): consider an explicit opt-in flag instead of a failing cell.
mystop

NameError: name 'mystop' is not defined

Slightly better results with a higher value of k (accuracy 0.614, 0.623 and 0.625 for k = 3, 5 and 7) -> fine-tune the hyperparameter k.  Test time increases slightly with k.

##### Summary
kNN models showed accuracy similar to the Keras NN models, ~60%. kNN models are the bravest so far: they tried to predict more classes (6-7 of the 10) than LogR (1), CART (1), SVM (2) or NN (4ish).

Still an issue: in ML, using the same `test` dataset for both validation and testing is not a valid technique.  The next step should be to have true `train`, `validation` and `test` sets and see how all the models fare with a holdout dataset.

Src: https://keras.io/guides/training_with_built_in_methods/
Here's what the typical end-to-end workflow looks like, consisting of:

- Training
- Validation on a holdout set generated from the original training data
- Evaluation on the test data

In [None]:
# Intentional halt: `mystop` is not defined anywhere in this notebook, so
# evaluating it raises NameError and keeps a "Run All" from reaching the
# optional Xtra export cell below; run that cell by hand when it is needed.
mystop

## Xtra

In [None]:
#$xtra my export data for reuse
# Persist the labelled train/test frames as pickle files; reload them with
# pd.read_pickle(...).
# DataFrame.to_pickle is used because `dump` is not defined in this notebook
# (its joblib import in the imports cell is commented out), so the previous
# dump(...) calls raised NameError.
data_train.to_pickle('data/poker_ex1_data_train.pkl')
data_test.to_pickle('data/poker_ex1_data_test.pkl')