In [1]:
import pandas as pd
from scipy.stats import fisher_exact

# Toy data: a categorical feature `x` and a binary outcome `y`.
df = pd.DataFrame({
    'x': ['A', 'B', 'A', 'C', 'B', 'B', 'C', 'A'],
    'y': [1, 0, 0, 1, 1, 0, 1, 1]
})

results = []
categories = df['x'].unique()

for category in categories:
    # One-vs-rest contingency table: rows are (x != category, x == category),
    # columns are the observed values of y.
    table = pd.crosstab(df['x'] == category, df['y'])
    # fisher_exact returns (odds ratio, p-value); run the test once per
    # category and unpack both values instead of calling it twice.
    odds_ratio, p_value = fisher_exact(table, alternative='two-sided')
    results.append((category, odds_ratio, p_value))

# One row per category, in order of first appearance in df['x'].
results_df = pd.DataFrame(results, columns=['Category', 'Odds Ratio', 'P-value'])
print(results_df)

  Category  Odds Ratio   P-value
0        A    1.333333  1.000000
1        B    0.125000  0.464286
2        C         inf  0.464286


In [2]:
# Load the embedding table from disk (path is relative to the notebook's
# working directory).
dfEmbeddings = pd.read_csv(filepath_or_buffer='../data/embeddings.csv')
# Bare trailing expression so the notebook renders the frame; pandas already
# truncates the middle rows/columns. Append .head() for a shorter preview.
dfEmbeddings

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X91,X92,X93,X94,X95,X96,X97,X98,X99,X100
0,-0.004914,0.065536,0.000462,-0.002728,0.007441,0.034030,-0.065837,0.027918,0.009296,0.029449,...,-0.022181,0.049798,0.003935,0.039526,0.032888,-0.044452,0.028218,0.003186,-0.058388,-0.022827
1,0.019465,0.059981,-0.004884,0.010679,0.000434,0.057856,-0.021590,0.008793,0.009179,0.028753,...,-0.022216,0.000365,0.041841,0.014515,0.063999,-0.021314,0.027341,-0.011377,-0.060039,-0.014886
2,0.006310,0.052803,0.001488,-0.010779,0.011241,0.012157,-0.034958,0.025509,-0.003456,-0.012611,...,-0.017798,0.046048,-0.010954,0.029525,0.041459,-0.018786,0.014388,0.013798,-0.066921,-0.013504
3,-0.024605,0.027766,0.000341,0.025616,0.003682,0.025818,-0.011927,0.065287,0.015571,0.043525,...,-0.017621,0.022542,0.022470,0.042399,0.021228,-0.050567,0.058822,-0.009142,-0.051751,-0.005033
4,0.013725,0.064173,-0.001580,0.010255,0.013958,0.012550,-0.011009,0.009173,-0.015934,0.016448,...,-0.008481,0.048487,0.014635,0.027340,0.041392,-0.035852,0.039120,0.001916,-0.061403,-0.009010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
825,0.003187,0.061810,-0.001316,0.003906,0.016137,0.047285,-0.022230,0.012820,0.031504,0.013954,...,-0.034186,0.028708,0.024056,0.027452,0.051166,-0.012463,0.018349,-0.008675,-0.058728,-0.029849
826,-0.002295,0.020781,-0.008793,0.007581,-0.029041,0.044218,-0.026725,0.029875,0.027327,0.037177,...,-0.029921,0.022865,0.012220,0.016396,0.049220,-0.041222,0.078307,-0.003952,-0.036467,-0.029828
827,0.003885,0.054498,-0.007109,0.011779,0.007044,0.060663,-0.029159,0.025585,0.030881,0.007865,...,-0.056843,0.024514,0.015253,0.005434,0.058058,-0.010202,0.014160,-0.004179,-0.061415,-0.022068
828,0.006727,0.069821,-0.008365,0.011773,0.011865,0.040893,0.005162,0.019691,0.016622,0.027092,...,-0.019845,0.016468,0.032752,0.038210,0.041479,-0.017532,0.012667,-0.014132,-0.072288,-0.027833


In [3]:
# Load the raw table from disk (path is relative to the notebook's working
# directory). Its `aki_severity` column is used later as the outcome.
dfRaw = pd.read_csv(filepath_or_buffer='../data/raw.csv')
# Bare trailing expression so the notebook renders the frame; pandas already
# truncates the middle rows. Append .head() for a shorter preview.
dfRaw

Unnamed: 0,is_female,age,height,weight,optime,diagnoses,aki_severity
0,1,18.11,148,80.90,112,155500. Cardiac conduit complication;010125. P...,0
1,1,18.23,169,56.10,144,091591. Aortic regurgitation;091519. Congenita...,1
2,1,16.86,166,61.60,114,155516. Cardiac conduit failure;090101. Common...,0
3,1,16.88,162,44.30,109,010116. Partial anomalous pulmonary venous con...,0
4,0,18.12,175,70.50,119,155516. Cardiac conduit failure;010133. Left h...,0
...,...,...,...,...,...,...,...
825,0,0.21,55,4.70,184,091501. Aortic valvar stenosis - congenital; 0...,1
826,1,0.04,47,2.70,188,010501. Discordant VA connections (TGA); 11010...,0
827,0,0.03,55,3.27,82,091501. Aortic valvar stenosis - congenital; 0...,0
828,1,0.01,50,3.91,113,092911. Aortic arch hypoplasia; 071001. Perime...,1


In [4]:
from TextEmbeddingFE.main import cluster_embeddings
import numpy as np

# Names of the embedding columns: "X1" through "X100".
my_embedding_colnames = [f"X{dim}" for dim in range(1, 101)]

# Pull just those columns out of the embeddings frame as a NumPy array.
my_embedding_matrix = dfEmbeddings.loc[:, my_embedding_colnames].to_numpy()

# Cluster the rows of the matrix with the project helper.
# NOTE(review): n_clusters / n_init are passed straight through; their exact
# meaning (and whether the result is seeded) depends on cluster_embeddings --
# confirm against its implementation.
my_clusters = cluster_embeddings(
    X=my_embedding_matrix,
    n_clusters=10,
    n_init=10,
)

# Number of rows assigned to each cluster label.
np.bincount(my_clusters)

array([ 65, 126, 135,  37,  60,  66, 114,  27, 165,  35], dtype=int64)

In [5]:
# Two-column frame for the Fisher tests: `x` is the cluster label of each row,
# `y` is the `aki_severity` value from the raw table.
# NOTE(review): this relies on my_clusters being in the same row order as
# dfRaw -- confirm the embeddings and raw files are row-aligned.
fisher_columns = {'x': my_clusters, 'y': dfRaw['aki_severity']}
dfFisher = pd.DataFrame(fisher_columns)
dfFisher

Unnamed: 0,x,y
0,2,0
1,8,1
2,6,0
3,2,0
4,8,0
...,...,...
825,8,1
826,7,0
827,8,0
828,8,1


In [6]:
from TextEmbeddingFE.main import fisher_test_wrapper
# Run the package's Fisher-test helper on the cluster/outcome frame and show
# the returned table as this cell's output.
dfFisherResults = fisher_test_wrapper(dat=dfFisher)
dfFisherResults

Unnamed: 0,Category,Odds Ratio,P-value
0,2,1.017387,0.9154452
1,8,1.323465,0.1404068
2,6,0.670554,0.1104654
3,5,1.667399,0.05943214
4,1,0.988207,1.0
5,4,0.838812,0.6501177
6,3,5.661538,5.53938e-07
7,9,0.565323,0.2433607
8,0,0.122984,5.7134e-06
9,7,0.625531,0.5050092
