## Modules used

In [1]:
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB,BernoulliNB


## The problem

#### Given a large set of known poker hands, train a machine learning model to identify a poker hand (rules guessing, is pretty hard for machine learning, but an interesting excercise).

##### NOTE:  I did this project after the kaggle competition finished.  So I don't officially have anything to submit/compare to.  Therefore I create a rules based model to compare to.  I then give what my kaggle score would have been based on these rules.

<p>The values of the hands are:</p>
<p>
0: Nothing in hand; not a recognized poker hand <br>
1: One pair; one pair of equal ranks within five cards<br>
2: Two pairs; two pairs of equal ranks within five cards<br>
3: Three of a kind; three equal ranks within five cards<br>
4: Straight; five cards, sequentially ranked with no gaps<br>
5: Flush; five cards with the same suit<br>
6: Full house; pair + different rank three of a kind<br>
7: Four of a kind; four equal ranks within five cards<br>
8: Straight flush; straight + flush<br>
9: Royal flush; {Ace, King, Queen, Jack, Ten} + flush<br>
</p>

### Importing data and setting up df with train/test split of 20%

In [2]:
trainfile="train.csv"
fulldf=pd.read_csv(trainfile)

fulldf.head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,hand
0,4,9,2,1,2,2,4,7,2,8,0
1,1,4,3,6,1,12,3,11,2,7,0
2,1,11,4,1,3,7,4,11,2,1,2
3,2,9,2,4,3,6,1,9,4,9,3
4,1,8,2,4,2,11,2,2,2,1,0


In [3]:
df,dftest=train_test_split(fulldf,test_size=0.2, random_state=42)

### Setting up a function that determines the hand from pre-defined rules

In [4]:
#also used in feature construction
def f(a):
    return np.max(np.bincount(np.array(a)))


In [5]:
def rules(h):
    s=h[5:] #suits
    c=np.sort(h[:5]) # card values
    fl=f(s)==5 # is flush t/f
    diff=np.diff(c) #taking the difference between sorted card values
    sets=np.bincount(np.bincount(c),minlength=5)[1:] # counting the number of pairs and sets
    rst=strait=False  # pre conditioning the strait and royal strait booleans

    #print c
    #determining if a strait or royal strait exisits
    if np.bincount(diff)[1]==4:
        strait=True
    elif all([a==b for a,b in zip(diff,[9,1,1,1])]):
        rst=True
    
    #assigning strait, flush, strait flush, and royal strait flush
    if (rst and not fl) or (strait and not fl):
        return 4
    elif fl and not rst and not strait:
        return 5
    elif strait and fl:
        return 8
    elif rst and fl:
        return 9
    
    if sets[1]==1 and sets[2]==1:
        return 6
    elif sets[1]==1:
        return 1
    elif sets[1]==2:
        return 2
    elif sets[2]==1:
        return 3
    elif sets[4-1]==1:
        return 7
    
    #elif (not fl) and sets[0]==5 and not rst and not strait:
    return 0


<p>I am double checking that the rules correctly label all the training data.  -- It does.</p>

In [6]:
fulldf['ruleout']=map(lambda x1,x2,x3,x4,x5,y1,y2,y3,y4,y5:rules([x1,x2,x3,x4,x5,y1,y2,y3,y4,y5]),
    fulldf['C1'],fulldf['C2'],fulldf['C3'],fulldf['C4'],fulldf['C5'],
    fulldf['S1'],fulldf['S2'],fulldf['S3'],fulldf['S4'],fulldf['S5']
    )
fulldf['outdiff']=map(lambda a,b:np.fabs(a-b),fulldf['ruleout'],fulldf['hand'])

sumh= sum([a==b for a,b in zip(fulldf['ruleout'],fulldf['hand'])])
lh= len(fulldf['hand'])
print lh,sumh,1.*(lh-sumh)/lh

25010 25010 0.0


### Setting up features and prediction analysis:

In [7]:
def features_gen(df):
    '''The input data frame should not have "hand" '''
    dfout=df.copy(deep=True)

    #Sort the cards into order
    dfout['cards'] = map(lambda x1,x2,x3,x4,x5:np.sort([x1,x2,x3,x4,x5]),df['C1'],df['C2'],df['C3'],df['C4'],df['C5'])
    for i in xrange(0,5):
        s="C{:1d}".format(i+1)
        dfout[s]=map(lambda x1:x1[i],dfout['cards'])

    #Sort the suits into order
    dfout['suits'] = map(lambda x1,x2,x3,x4,x5:np.sort([x1,x2,x3,x4,x5]),df['S1'],df['S2'],df['S3'],df['S4'],df['S5'])
    for i in xrange(0,5):
        s="S{:1d}".format(i+1)
        dfout[s]=map(lambda x1:x1[i],dfout['suits'])
    
    #get the difference between the sorted card values
    dfout['diff'] = map(lambda x1,x2,x3,x4,x5:np.diff(np.sort([x1,x2,x3,x4,x5,x1+13])),df['C1'],df['C2'],df['C3'],df['C4'],df['C5'])
    for i in xrange(0,5):
        s="D{:1d}".format(i+1)
        dfout[s]=map(lambda x1:x1[i],dfout['diff'])
    
    #the number of cards of the suit with the most cards (5 is a flush)
    dfout['flush'] = map(lambda x1,x2,x3,x4,x5:f([x1,x2,x3,x4,x5]),df['S1'],df['S2'],df['S3'],df['S4'],df['S5'])
    
    #Drop temporary fields
    dfout=dfout.drop('suits',1)
    dfout=dfout.drop('cards',1)
    dfout=dfout.drop('diff',1)
    
    return dfout[['S1','S2','S3','S4','S5','C1','C2','C3','C4','C5','D1','D2','D3','D4','D5','flush']]

In [8]:
def analyzing_preditions(Xtest):
    s=svc.predict(Xtest)
    r=rfc.predict(Xtest)
    l=lrc.predict(Xtest)
    n=nb.predict(Xtest)

    print "Preditctions without feature Engineering (sorted card numbers)"
    string="{:14s} {:7.6f}%".format("SVM acc.:",svc.score(Xtest,ytest))
    print string
    string="{:14s} {:7.6f}%".format("RFC acc.:",rfc.score(Xtest,ytest))
    print string
    string="{:14s} {:7.6f}%".format("Logit R. acc.:", lrc.score(Xtest,ytest))
    print string
    string="{:14s} {:7.6f}%".format("Naive B. acc.:", nb.score(Xtest,ytest))
    print string

    out=[max([a,b]) for a,b,c in zip(s,r,n)]
    su=sum([a==b for a,b in zip(out,ytest)])
    la=len(out)
    string="{:14s} {:7.6f}%  (max hand of rfc,svm,nb)".format("Ensemble acc.:", 1.*su/la)
    print string


    out=[np.argmax(np.bincount(np.array([a,b,c,d]))) for a,b,c,d in zip(s,r,n,l)]
    su=sum([a==b for a,b in zip(out,ytest)])
    la=len(out)
    string="{:14s} {:7.6f}%  (voting from rfc,svm,nb)".format("Ensemble acc.:", 1.*su/la)
    print string
    
    print "----------------------"

### Building models with different features as input

<p>I use the same models (Random Forest (rfc/rf), SVM, logistic regression (lgc), and Bernoulli Naise Baise with 2 ensembles) to get results.  I then vary the features used from directly as inputted to heavily processed features.  The final features are the number of cards of suit with the most cards, and the difference between the sorted card values.</p>

#### Using the data as given to fit the models - no sorting or anything


In [9]:
X=df[['S1','S2','S3','S4','S5','C1','C2','C3','C4','C5']]
y=df.hand

svc = svm.SVC()#decision_function_shape='ovo')
svc.fit(X,y)

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X,y)

lrc=LogisticRegression(C=1.)
lrc.fit(X,y)

nb = BernoulliNB() 
nb.fit(X,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [10]:
#pro=features_gen(dftest.drop('hand',1))
Xtest=dftest[['S1','S2','S3','S4','S5','C1','C2','C3','C4','C5']]
ytest=dftest.hand

analyzing_preditions(Xtest)

Preditctions without feature Engineering (sorted card numbers)
SVM acc.:      0.582767%
RFC acc.:      0.600560%
Logit R. acc.: 0.506597%
Naive B. acc.: 0.506597%
Ensemble acc.: 0.589764%  (max hand of rfc,svm,nb)
Ensemble acc.: 0.506597%  (voting from rfc,svm,nb)
----------------------


#### Just using the sorted Card numbers and sorted Suits to train the models

In [11]:
processed=features_gen(df.drop('hand',1))
X=processed[['S1','S2','S3','S4','S5','C1','C2','C3','C4','C5']]
y=df.hand

svc = svm.SVC()#decision_function_shape='ovo')
svc.fit(X,y)

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X,y)

lrc=LogisticRegression(C=1.)
lrc.fit(X,y)

nb = BernoulliNB() 
nb.fit(X,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [12]:
pro=features_gen(dftest.drop('hand',1))
Xtest=pro[['S1','S2','S3','S4','S5','C1','C2','C3','C4','C5']]
ytest=dftest.hand

analyzing_preditions(Xtest)

Preditctions without feature Engineering (sorted card numbers)
SVM acc.:      0.951020%
RFC acc.:      0.944622%
Logit R. acc.: 0.549380%
Naive B. acc.: 0.506597%
Ensemble acc.: 0.963015%  (max hand of rfc,svm,nb)
Ensemble acc.: 0.692923%  (voting from rfc,svm,nb)
----------------------


#### With Feature Engineering (Features are: the differences between sorted card values, and the number of cards of the most numerous suit)

In [13]:
processed=features_gen(df.drop('hand',1))
#X=processed[['S1','S2','S3','S4','S5','D1','D2','D3','D4','D5','C1','C2','C3','C4','C5','flush']]
X=processed[['D1','D2','D3','D4','D5','flush']]
y=df.hand

svc = svm.SVC()#decision_function_shape='ovo')
svc.fit(X,y)

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X,y)

lrc=LogisticRegression(C=1.)
lrc.fit(X,y)

nb = BernoulliNB() 
nb.fit(X,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [14]:
pro=features_gen(dftest.drop('hand',1))
#Xtest=pro[['S1','S2','S3','S4','S5','D1','D2','D3','D4','D5','C1','C2','C3','C4','C5','flush']]
Xtest=pro[['D1','D2','D3','D4','D5','flush']]
ytest=dftest.hand

analyzing_preditions(Xtest)

final_model=rfc

Preditctions without feature Engineering (sorted card numbers)
SVM acc.:      0.989204%
RFC acc.:      1.000000%
Logit R. acc.: 0.552779%
Naive B. acc.: 0.918233%
Ensemble acc.: 0.999400%  (max hand of rfc,svm,nb)
Ensemble acc.: 0.943423%  (voting from rfc,svm,nb)
----------------------


#### Discussion before I "submit" my answers to kaggle

<p>
<b>Result:</b> So with the proper features selected we see that the Random Forest gives a perfect result.  As such lets select this as our final model to be "submitted to kaggle" (really compared to a set of rules I wrote).  However, as a data scientist I find it interesting to consider the various models and features:<br>

<b>Progression of Features:</b> So we see that the more I mess with the features the better the accuracy gets particularly with the SVM and random forest models since I am effectively providing more and more information to the model with each improvement of the features.   <br>

<b> Models:</b><br>
<u>Random Forest:</u> Makes sense it does the best of the non-emsemble methods because a decision tree can almost be consided auto generated rules.  However, it is worth noting that for some runs the random forest is beaten by an ensemble including the Random Forest and the SVM.<br>
<u>SVM:</u>  Does pretty well because support vector machines are a non linear model that draws hyper-planes around the various catagories<br>
<u>LogisticRegression:</u> Generally does not do well regardless of the features because it is a linear method the catagories are deffinately non-linear.<br>
<u>Bernoulli Naive Baise:</u> Does fairly wellwith compilcated features but generally the known problems of Naie Baise make it not as good as the SVM/RF.<br>
<u> Ensemble of RF and SVM:</u>  Here the highest hand is selected from the two.  Generally, this is the highest or second highest model.  With less processed features this model tends to do best, but is beaten by RF with higher processed features.<br>
<u> Ensemble of all models voting: </u>  This model does fairly well but is poisoned by the logistic regression model and NB models.<br>
</p>

##### Note:  I am including the rules output for the test.csv File because the answers aren't included now that the kaggle challenge is closed.

### Generating an output file for the test data and the rules data - "Submitting" my answers

In [15]:
fi="test.csv"
testdf=pd.read_csv(fi)
featdf=features_gen(testdf.drop('id',1))
X=featdf[['D1','D2','D3','D4','D5','flush']]
#featdf['hand']=final_model.predict(X)
#out=pd.DataFrame(featdf['hand'],index=testdf['id'])
out=pd.DataFrame()
out['id']=testdf['id']
out['hand']=final_model.predict(X)
out['ruleout']=map(lambda x1,x2,x3,x4,x5,y1,y2,y3,y4,y5:rules([x1,x2,x3,x4,x5,y1,y2,y3,y4,y5]),
    testdf['C1'],testdf['C2'],testdf['C3'],testdf['C4'],testdf['C5'],
    testdf['S1'],testdf['S2'],testdf['S3'],testdf['S4'],testdf['S5']
    )

fileout="/Users/abramvandergeest/fun+progs/poker2/test_predict.csv"
out[['id','hand']].to_csv(fileout,index=False)
fileout="/Users/abramvandergeest/fun+progs/poker2/test_rules.csv"
out[['id','ruleout']].to_csv(fileout,index=False)

<p>So below is the comparison between my predicted results and the rules (or what kaggle would have had).  We see that I would have gotten a kaggle score of (1-2.e-5)=0.99998.  So pretty good without fiddeling to get the best test data result (WHICH WOULD BE BAD! since you are FITTING TO THE TEST DATA).  The exact comparison depends on the "randomized fitting" so the comparison varies a bit with each run.  We see that the biggest error is that the 4 of a kind is most often considered a three of a kind.  Occasionally, a Royal strait flush is only considered a flush.  Both of these errors makes sense since they are the rarest examples in the training set.</p>

In [16]:
out['ruleout']=map(lambda x1,x2,x3,x4,x5,y1,y2,y3,y4,y5:rules([x1,x2,x3,x4,x5,y1,y2,y3,y4,y5]),
    testdf['C1'],testdf['C2'],testdf['C3'],testdf['C4'],testdf['C5'],
    testdf['S1'],testdf['S2'],testdf['S3'],testdf['S4'],testdf['S5']
    )

testdf['ruleout']=out['ruleout']
testdf['hand']=out['hand']
testdf['diff']=map(lambda a,b:np.fabs(a-b),testdf['ruleout'],testdf['hand'])

print testdf[testdf['diff']!=0]

            id  S1  C1  S2  C2  S3  C3  S4  C4  S5  C5  ruleout  hand  diff
54180    54181   3   1   1   5   3   5   4   5   2   5        7     3     4
94010    94011   2   2   1   4   3   4   2   4   4   4        7     3     4
100795  100796   3   2   2  11   4  11   3  11   1  11        7     3     4
178405  178406   3   3   2  10   4  10   3  10   1  10        7     3     4
193692  193693   4   3   4   6   2   6   3   6   1   6        7     3     4
198727  198728   4   4   1   8   3   8   4   8   2   8        7     3     4
325202  325203   2   8   2   9   3   9   4   9   1   9        7     3     4
440176  440177   1   3   3  12   4  12   2  12   1  12        7     3     4
467572  467573   4   1   3   3   4   3   1   3   2   3        7     3     4
531760  531761   2   2   4   6   1   6   3   6   2   6        7     3     4
587698  587699   2   6   1   9   2   9   3   9   4   9        7     3     4
611320  611321   2   2   3   9   4   9   2   9   1   9        7     3     4
635356  6353

In [17]:
sumh= sum([a==b for a,b in zip(out['ruleout'],out['hand'])])
lh= len(out['hand'])
print lh,sumh,1.*(lh-sumh)/lh,1.-1.*(lh-sumh)/lh

1000000 999980 2e-05 0.99998
