In [16]:
import pandas as pd
from scipy.stats import chi2_contingency

# Load the provided data files
X_test5 = pd.read_csv('data/X_test_unique_5.csv')
y_test5 = pd.read_csv('data/y_test_unique_5.csv')
X_train5 = pd.read_csv('data/X_train_unique_5.csv')
y_train5 = pd.read_csv('data/y_train_unique_5.csv')

# Define the function to bin numerical features
def bin_numerical_feature(series, bins=4):
    try:
        return pd.qcut(series, bins, labels=False, duplicates='drop')
    except ValueError:
        # If binning fails, return the original series as categorical data
        return series.astype('category').cat.codes

# Apply the binning function to each feature in X_train5
binned_features = X_train5.apply(bin_numerical_feature, axis=0)

#Perform Chi-squared test for each binned feature
chi2_results = {}
for feature in binned_features.columns:
    contingency_table = pd.crosstab(binned_features[feature], y_train5.squeeze())
    if contingency_table.size == 0:
        print(f"Skipping feature {feature} due to empty contingency table.")
        continue
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    chi2_results[feature] = {
        "Chi2 Statistic": chi2,
        "P-value": p,
        "Degrees of Freedom": dof,
        "Expected Frequencies": expected
    }

# Display the Chi-squared test results
for feature, results in chi2_results.items():
    print(f"Feature: {feature}")
    print(f"  Chi2 Statistic: {results['Chi2 Statistic']}")
    print(f"  P-value: {results['P-value']}")
    print(f"  Degrees of Freedom: {results['Degrees of Freedom']}")
    print(f"  Expected Frequencies: \n{results['Expected Frequencies']}\n")

Skipping feature manufacturer_JANSSEN due to empty contingency table.
Feature: 0-9
  Chi2 Statistic: 0.0
  P-value: 1.0
  Degrees of Freedom: 0
  Expected Frequencies: 
[[874.  46.]]

Feature: 10-19
  Chi2 Statistic: 83.62379392226998
  P-value: 5.984837696951247e-20
  Degrees of Freedom: 1
  Expected Frequencies: 
[[678.3  35.7]
 [195.7  10.3]]

Feature: 20-29
  Chi2 Statistic: 78.00541307705444
  P-value: 1.1517010718633071e-17
  Degrees of Freedom: 2
  Expected Frequencies: 
[[512.05  26.95]
 [151.05   7.95]
 [210.9   11.1 ]]

Feature: 30-39
  Chi2 Statistic: 75.55440249455262
  P-value: 3.922563262580017e-17
  Degrees of Freedom: 2
  Expected Frequencies: 
[[457.9  24.1]
 [199.5  10.5]
 [216.6  11.4]]

Feature: 40-49
  Chi2 Statistic: 76.17142780514624
  P-value: 2.8812743482870434e-17
  Degrees of Freedom: 2
  Expected Frequencies: 
[[506.35  26.65]
 [153.9    8.1 ]
 [213.75  11.25]]

Feature: 50-59
  Chi2 Statistic: 79.7521705632488
  P-value: 4.808794323338089e-18
  Degrees of F

In [13]:
binned_features

Unnamed: 0,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90-99,...,symptom_embedding_40,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49
0,5,11,14,16,17,19,17,14,10,4,...,37,40,46,7,15,16,46,17,27,35
1,5,7,12,12,14,15,14,12,8,3,...,39,0,34,18,22,18,25,48,44,42
2,6,12,15,17,18,20,18,16,12,6,...,30,5,45,2,26,31,26,19,49,21
3,7,13,16,18,19,21,19,17,13,8,...,29,0,49,24,32,21,38,28,49,27
4,7,13,16,18,19,21,19,17,13,8,...,34,17,48,12,29,20,38,49,47,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0,1,1,0,1,3,5,0,0,0,...,31,6,31,8,8,7,37,39,27,32
916,0,0,0,0,1,0,0,0,0,0,...,48,47,35,49,47,46,43,0,0,2
917,0,0,2,2,1,1,0,0,0,0,...,6,20,0,16,26,35,26,12,2,36
918,0,0,0,0,0,0,0,0,0,0,...,45,27,0,44,40,3,21,39,35,30
