1. Benchmark Datasets
* ref: https://thinkingneuron.com/how-to-measure-the-correlation-between-two-categorical-variables-in-python/

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2_contingency


1.1 Benchmark Datasets: Adult

In [2]:
###### age: continuous.
###### workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
###### fnlwgt: continuous.
###### education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
###### education-num: continuous.
###### marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
###### occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
###### relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
###### race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
###### sex: Female, Male.
###### capital-gain: continuous.
###### capital-loss: continuous.
###### hours-per-week: continuous.
###### native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
###### class: >50K, <=50K


df_adult = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data') 
colnames_adult = ['age', 'workclass','fnlwgt','education','education-num',
                  'marital-status','occupation','relationship','race',
                  'sex','capital-gain','capital-loss','hours-per-week',
                  'native-country','class']
df_adult.columns = colnames_adult
df_adult_cross=pd.crosstab(index=df_adult['sex'],columns=df_adult['race'])
# print("GENDER:", df_adult.sex.unique())
# print("RACE:", df_adult.race.unique())
print(df_adult_cross)


race      Amer-Indian-Eskimo   Asian-Pac-Islander   Black   Other   White
sex                                                                      
 Female                  119                  346    1555     109    8642
 Male                    192                  693    1569     162   19173


1.2 Benchmark: COMPAS

In [3]:
df_compas = pd.read_csv("data/compas_raw.csv", skipinitialspace = True)
df_compas.columns = df_compas.columns.str.replace(' ', '')
df_compas.sex = df_compas.sex.str.replace(' ', '')
df_compas.race = df_compas.race.str.replace(' ', '')
colnames_compas = list(df_compas.columns)
# print(df_compas.sex.unique())
# print(df_compas.race.unique())
# Contingency Tables: cross tabulation between GENDER and RACE
df_compas_cross=pd.crosstab(index=df_compas['sex'],columns=df_compas['race'])
print(df_compas_cross)

race    African-American  Asian  Caucasian  Hispanic  NativeAmerican  Other
sex                                                                        
Female               652      2        567       103               4     67
Male                3044     30       1887       534              14    310


In [4]:
# (3) Benchmark: LACS
df_lacs = pd.read_csv("data/LSAC_raw.csv").iloc[:,2:]
# print(df_lacs.isna().sum())
df_lacs = df_lacs.dropna(axis='columns', thresh=900) # int(df_lacs.shape[0]*0.05)
df_lacs = df_lacs.dropna(axis='rows') # int(df_lacs.shape[0]*0.05)
# print(df_lacs.isna().sum())
assert df_lacs.isna().sum().sum() == 0
colnames_lacs = list(df_lacs.columns)
# Contingency Tables: cross tabulation between GENDER and RACE
df_lacs_cross=pd.crosstab(index=df_lacs['sex'],columns=df_lacs['race'])
print(df_lacs_cross)


race  1.0  2.0  3.0  4.0  5.0  6.0   7.0  8.0
sex                                          
1.0    38  394  747  160   42  217  7206  122
2.0    60  396  451  209   59  239  9933  154


##### Correlation
1. Chi-square test finds the probability of a Null hypothesis(H0).
2. Assumption(H0): The two columns are NOT related to each other
3. Result of Chi-Sq Test: The Probability of H0 being True




In [5]:
def cmr_v(df_cross):
    # df_cross = df_adult_cross.copy()
    res_chisq = chi2_contingency(df_cross, correction=False)
    # res_chisq = chi2_contingency(df_cross, correction=True)
    chi2 = res_chisq[0].round(2)
    p_val = res_chisq[1].round(2)
    N = df_cross.values.sum()
    min_dim = min(df_cross.shape)-1
    cramer_v = np.sqrt((chi2/N) / min_dim).round(2) # Calculate Cramer's V
    print('chi2:', res_chisq[0].round(2), 'p_val:', res_chisq[1].round(2))
    print('N:', N, 'min_dim:', min_dim)
    print("Cramer's V: ", cramer_v)
    return cramer_v

def stat_phi(df_cross):
    # df_cross = df_example_cross.copy()
    n00 = df_cross.iloc[0, 0]
    n01 = df_cross.iloc[0, 1]
    n10 = df_cross.iloc[1, 0]
    n11 = df_cross.iloc[1, 1]
    n1_ = n10+n11
    n_1 = n01+n11    
    n0_ = n00+n01
    n_0 = n00+n10
    phi = ((n11*n00)-(n01*n10))/np.sqrt(n0_*n_0*n1_*n_1)
    print('phi:', phi)
    return phi

def stat_chi(df_cross):
    # df_cross = df_adult_cross.copy()
    I = range(df_cross.shape[0])
    J = range(df_cross.shape[1])
    # N = np.sum(df_cross)
    N = df_cross.values.sum()
    chi = 0
    for i in I:
        for j in J:
            nij = df_cross.iloc[i, j]
            ni_ = df_cross.iloc[i, :].sum()
            n_j = df_cross.iloc[:, j].sum()
            chi += ((nij-((ni_*n_j)/N))**2) / ((ni_*n_j)/N)
    # print('chi:', chi)
    return chi

def cmr_v_(df_cross):
    chi = stat_chi(df_cross)
    # N = np.sum(df_cross)
    N = df_cross.values.sum()
    min_dim = min(df_cross.shape)-1
    cramer_v = np.sqrt((chi/N) / min_dim).round(2) # Calculate Cramer's V
    print('chi:', chi)
    print('N:', N, 'min_dim:', min_dim)
    print("Cramer's V: ", cramer_v)
    return cramer_v


In [6]:

# corr_cmr_adult = cmr_v(df_adult_cross) # K(=2)xJ(>2)
corr_cmr_adult_ = cmr_v_(df_adult_cross)

# corr_cmr_compas = cmr_v(df_compas_cross) # K(=2)xJ(>2)
corr_cmr_compas = cmr_v_(df_compas_cross) # K(=2)xJ(>2)

# corr_cmr_lacs = cmr_v(df_lacs_cross) # K(=2)xJ(>2)
corr_cmr_lacs_ = cmr_v_(df_lacs_cross) # K(=2)xJ(>2)

chi2: 454.2 p_val: 0.0
N: 32560 min_dim: 1
Cramer's V:  0.12
chi: 454.20178110733684
N: 32560 min_dim: 1
Cramer's V:  0.12
chi2: 37.46 p_val: 0.0
N: 7214 min_dim: 1
Cramer's V:  0.07
chi: 37.45588380488598
N: 7214 min_dim: 1
Cramer's V:  0.07
chi2: 204.77 p_val: 0.0
N: 20427 min_dim: 1
Cramer's V:  0.1
chi: 204.76681188080647
N: 20427 min_dim: 1
Cramer's V:  0.1
