## Section 02: Data Validation ##
**Abstract**: In this notebook, we validate the frequencies of certain features of the poker hand dataset generated previously. Specifically, we verify that each seven-card hand class (straight flush, four of a kind, full house, and so on) appears with a frequency close to its theoretical probability. Any discrepancies or anomalies found during this validation process will be documented and addressed to maintain the quality of the dataset for subsequent analysis.

In [1]:
import numpy as np
# Imports
import pandas as pd
from deuces import Card, Evaluator

In [2]:
# Load the pickled long-format hands table and preview its first rows.
# NOTE(review): pd.read_pickle executes arbitrary code from the file; only use
# it on pickles produced by this project.
hands_path = '../data/hands_long.pkl'
hands_df = pd.read_pickle(hands_path)
hands_df.head()

Unnamed: 0,hand_id,player_id,flop,river,turn,hole_,flop_eval_,turn_eval_,river_eval_,showdown_order_
0,0,0,"[16795671, 33573149, 16783383]",67119647,1082379,"[533255, 67144223]",4310,4309,2734,2
1,1,0,"[268454953, 1082379, 134253349]",268442665,139523,"[4212241, 164099]",6322,5750,2578,4
2,2,0,"[67119647, 81922, 1065995]",73730,16812055,"[268454953, 8394515]",6428,6388,5977,5
3,3,0,"[268446761, 4204049, 4212241]",268454953,1065995,"[8406803, 8398611]",3018,3018,2516,4
4,4,0,"[268454953, 4212241, 73730]",134228773,268471337,"[16787479, 279045]",6588,3472,3346,8


In [3]:
evaluator = Evaluator()

# deuces rank classes run from 1 (Straight Flush) to 9 (High Card).
RANK_CLASSES = list(range(1, 10))

# Translate every river evaluation score into its rank class.
hand_classes = hands_df['river_eval_'].apply(evaluator.get_rank_class)

# Normalized frequency of each observed class.
observed_freq = hand_classes.value_counts(normalize=True)
assert observed_freq.index.isin(RANK_CLASSES).all(), 'unexpected rank class in data'

# Reindex over *all* rank classes so that a class which never occurs still gets
# a row (freq 0.0) instead of silently disappearing and shifting later
# row-wise comparisons against the expected distribution.
class_freq = observed_freq.reindex(RANK_CLASSES, fill_value=0.0)

# One row per rank class, ordered from strongest to weakest class.
hand_dist = pd.DataFrame({
    'freq': class_freq.to_numpy(),
    'class_name': [evaluator.class_to_string(rank_class) for rank_class in RANK_CLASSES],
    'class_int': RANK_CLASSES,
})
hand_dist

Unnamed: 0,freq,class_name,class_int
0,0.000302,Straight Flush,1
1,0.001716,Four of a Kind,2
2,0.026248,Full House,3
3,0.030383,Flush,4
4,0.046916,Straight,5
5,0.048131,Three of a Kind,6
6,0.23407,Two Pair,7
7,0.438299,Pair,8
8,0.173936,High Card,9


In [4]:
# Theoretical probability of each rank class for the best hand out of seven
# cards, keyed by rank class (class_int) rather than by row position, so the
# comparison stays correct even if hand_dist is missing a class or is reordered.
EXPECTED_FREQ = {
    1: 0.000032 + 0.000279,  # Straight Flush (sum of two terms; presumably royal + other straight flushes)
    2: 0.00168,              # Four of a Kind
    3: 0.0260,               # Full House
    4: 0.0303,               # Flush
    5: 0.0462,               # Straight
    6: 0.0483,               # Three of a Kind
    7: 0.235,                # Two Pair
    8: 0.438,                # Pair
    9: 0.174,                # High Card
}
hand_dist['expected_freq'] = hand_dist['class_int'].map(EXPECTED_FREQ)

# Per-class deviation of the observed frequency from the expected one.
deviation = hand_dist['freq'] - hand_dist['expected_freq']
hand_dist['error_abs'] = np.abs(deviation)
hand_dist['error_sqr'] = np.square(deviation)

In [5]:
# Summarize the fit as the mean of the per-class squared errors.
squared_errors = hand_dist['error_sqr']
mse = squared_errors.mean()
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 1.7429004252400407e-07


In [6]:
# Write the per-class validation table to disk without the positional index.
output_path = '../data/hand_dist.csv'
hand_dist.to_csv(output_path, index=False)