In [2]:
import pandas as pd
import numpy as np

from scipy.stats import chi2_contingency

In [16]:
# Load the training sample, then parse `hour` with the format string below
# (two-digit year, month, day, hour) into datetimes.
df = pd.read_csv('data/train_small.csv')
df['hour'] = pd.to_datetime(df['hour'], format="%y%m%d%H")

In [19]:
def chi2_score(col, other_col):
    """Chi-squared test of independence between two categorical columns.

    Parameters
    ----------
    col, other_col : array-like
        Values to cross-tabulate; ``col`` supplies the rows and ``other_col``
        the columns of the contingency table.

    Returns
    -------
    tuple
        ``(chi2, p, obs)``: the test statistic, its p-value, and the
        contingency table produced by ``pd.crosstab``.
    """
    contingency = pd.crosstab(col, other_col)
    # The degrees of freedom and expected frequencies are not needed by callers.
    statistic, p_value, _dof, _expected = chi2_contingency(contingency)
    return statistic, p_value, contingency

In [15]:
# List the column labels of the loaded frame.
df.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

## banner_pos and click
They are the features with the smallest cardinalities.

In [45]:
# Keep only the rows whose banner_pos is non-zero (the variable is called
# "not_null", but the actual filter is `!= 0`), then test click against
# banner_pos on that subset and show the p-value.
nonzero_banner_pos = df['banner_pos'] != 0
df_banner_pos_not_null = df[nonzero_banner_pos]
_, p, obs = chi2_score(
    df_banner_pos_not_null['click'],
    df_banner_pos_not_null['banner_pos'],
)
p

3.9523722820364723e-44

In [46]:
# Contingency table returned by chi2_score (rows: click, columns: banner_pos).
obs

banner_pos,1,2,3,4,5,7
click,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,227266,296,36,146,131,664
1,51091,39,12,31,19,364


## Low cardinality features

In [33]:
# Columns compared pairwise in the chi-squared tests of this section.
# An alternative, shorter selection kept for reference:
# click, C1, device_type, device_conn_type, C15, C16.
low_cardinality_cols = [
    'click',
    'C1',
    'banner_pos',
    'site_category',
    'app_category',
    'device_type',
    'device_conn_type',
    'C15',
    'C16',
    'C18',
    'C19',
    'C21',
]

In [25]:
import itertools

In [34]:
# Matrix of chi-squared p-values for every unordered pair of low-cardinality
# columns. Only the upper triangle is filled (row = first column of the pair,
# column = second); every other cell stays NaN.
# dtype=float keeps the frame numeric; without it the empty frame is
# object-typed and the stored p-values are not usable in numeric operations.
pvals = pd.DataFrame(
    index=low_cardinality_cols,
    columns=low_cardinality_cols,
    dtype=float,
)
for c1, c2 in itertools.combinations(low_cardinality_cols, 2):
    _, p, _ = chi2_score(df[c1], df[c2])
    pvals.loc[c1, c2] = p
pvals

Unnamed: 0,click,C1,banner_pos,site_category,app_category,device_type,device_conn_type,C15,C16,C18,C19,C21
click,,0.0,1.1979e-167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C1,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
banner_pos,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
site_category,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
app_category,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
device_type,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0
device_conn_type,,,,,,,,0.0,0.0,0.0,0.0,0.0
C15,,,,,,,,,0.0,0.0,0.0,0.0
C16,,,,,,,,,,0.0,0.0,0.0
C18,,,,,,,,,,,0.0,0.0


## app_category and site_category

In [20]:
# Chi-squared test of app_category (table rows) against site_category (table columns).
chi2, p, obs = chi2_score(df['app_category'], df['site_category'])

In [21]:
# Show the test statistic and the p-value.
chi2, p

(786112.1963098047, 0.0)

In [22]:
# Contingency table behind the test (rows: app_category, columns: site_category).
obs

site_category,0569f928,110ab22d,28905ebd,335d28a8,3e814130,42a36e14,50e219e0,5378d028,70fb0e29,72722551,...,8fd0aea4,9ccfa2ea,a818d37a,bcf865d9,c0dd3be3,c706e647,dedf689d,e787de0e,f028772b,f66779e6
app_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
07d7df22,427,1,181991,3326,75290,61,56665,18,598,686,...,186,5,79,24,1087,1,571,34,313740,6278
09481d60,0,0,0,0,0,0,1301,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0bfbc358,0,0,0,0,0,0,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0d82db25,0,0,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0f2161f8,0,0,0,0,0,0,235972,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0f9a328c,0,0,0,0,0,0,139,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18b1e0be,0,0,0,0,0,0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2281a340,0,0,0,0,0,0,55,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2fc4f2aa,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4681bb9d,0,0,0,0,0,0,145,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## site_id and site_domain
They should be correlated, but the chi2 statistic is suspiciously huge, probably because the contingency table has too many zero entries.

In [13]:
# Number of distinct values in site_id and in site_domain.
df['site_id'].nunique(), df['site_domain'].nunique()

(2667, 2875)

In [14]:
# Cross-tabulate site_id (rows) against site_domain (columns), run the
# chi-squared test on that table, and show the statistic with its p-value.
obs = pd.crosstab(index=df['site_id'], columns=df['site_domain'])
chi2, p, dof, ex = chi2_contingency(obs)
chi2, p

(1691595879.2559881, 0.0)

## Example scipy.stats.chi2_contingency

In [125]:
# independent case: x and y are drawn separately from the same generator,
# so neither carries information about the other.
k = 3  # the number of categories
n = int(1e5)  # the number of samples
rng = np.random.default_rng(0)  # fixed seed so the table is reproducible on re-run
x = rng.integers(k, size=n)
y = rng.integers(k, size=n)
obs = pd.crosstab(x, y)
obs

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,11032,11016,11273
1,11186,11170,11051
2,11007,11016,11249


In [126]:
# Unpack the test statistic, the p-value, the degrees of freedom and the
# table of expected frequencies.
chi2, p, dof, ex = chi2_contingency(obs)

In [127]:
# p is large because x and y are independent.
# p is the upper-tail probability of the chi2 distribution, i.e. its density
# integrated from the reported chi2 value to infinity.
chi2, p, dof, ex

(5.492058682497643,
 0.24042841016763603,
 4,
 array([[11070.90225, 11063.23842, 11186.85933],
        [11099.47575, 11091.79214, 11215.73211],
        [11054.622  , 11046.96944, 11170.40856]]))

In [3]:
# dependent case: y is a deterministic function of x.
k = 3  # the number of categories
n = int(1e5)  # the number of samples
rng = np.random.default_rng(0)  # fixed seed so the table is reproducible on re-run
x = rng.integers(k, size=n)
# Shift every category cyclically by one; the modulus is k (not a literal 3)
# so the example stays correct when the number of categories is changed.
y = np.mod(x + 1, k)
obs = pd.crosstab(x, y)
obs

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,33393,0
1,0,0,33361
2,33246,0,0


In [5]:
chi2, p, dof, ex = chi2_contingency(obs)
# chi2 is large and p is small because y is completely determined by x.
chi2, p, dof, ex

(200000.0, 0.0, 4, array([[11101.83678, 11150.92449, 11140.23873],
        [11091.19806, 11140.23873, 11129.56321],
        [11052.96516, 11101.83678, 11091.19806]]))