# Advanced Topics

## Benefit of Cross-Fitting in `compress` Module

In [1]:
import pandas as pd
from TabuLLM.embed import TextColumnTransformer

# Load the raw dataset; the path is relative to the notebook's directory.
df = pd.read_csv('../data/raw.csv')

# Embed the free-text 'diagnoses' column. A one-column frame (not a Series)
# is passed to the transformer.
# NOTE(review): model_type='st' presumably selects a sentence-transformers
# backend -- confirm against the TabuLLM documentation.
embeddings = TextColumnTransformer(
    model_type = 'st'
).fit_transform(df.loc[:, ['diagnoses']])
print(f'Shape of embeddings: {embeddings.shape}')

# Structured (non-text) predictors used alongside the embedding.
features_baseline = ['is_female', 'age', 'height', 'weight', 'optime']

# Read the embedding column names from the transformer output instead of
# re-deriving them from an assumed 'X_<i>' pattern, so the downstream
# ColumnTransformer selects exactly the columns that exist in X.
features_embedding = list(embeddings.columns)

X = pd.concat([embeddings, df[features_baseline]], axis = 1)

# pd.concat(axis=1) aligns on the index with an outer join; if the embeddings
# and df were not row-aligned, extra NaN-padded rows would appear silently.
assert len(X) == len(df), (
    'embeddings and df are not row-aligned: '
    f'{len(X)} rows after concat vs. {len(df)} rows in df'
)

# Outcome used as the classification target.
y = df['aki_severity']

  from tqdm.autonotebook import tqdm, trange


Shape of embeddings: (830, 384)


In [23]:
from scipy.stats import pearsonr, ttest_rel
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from TabuLLM.compress import CompressClassifier


def _build_column_transformer(ncv, baseline_cols, embedding_cols):
    """Return a ColumnTransformer that passes baseline columns through and
    compresses the embedding columns with ``CompressClassifier``.

    Parameters
    ----------
    ncv : int
        Forwarded to ``CompressClassifier(ncv=...)``. This notebook uses 5
        for the cross-fit variant and 0 for the naive variant.
        NOTE(review): 0 presumably disables cross-fitting -- confirm against
        the TabuLLM documentation.
    baseline_cols, embedding_cols : list of str
        Column names routed to the 'baseline' and 'embedding' transformers.
    """
    return ColumnTransformer([
        ('baseline', 'passthrough', baseline_cols)
        , ('embedding', CompressClassifier(n_neighbors = 50, ncv = ncv), embedding_cols)
    ], remainder = 'drop')


def _build_pipeline(column_transformer):
    """Chain ``column_transformer`` with an unpenalized logistic regression."""
    return Pipeline([
        ('coltrans', column_transformer)
        , ('logit', LogisticRegression(penalty = None, solver = 'newton-cholesky', max_iter = 1000))
    ])


def _cv_auc(pipeline, features, target, cv):
    """Return the per-fold ROC AUC scores of ``pipeline`` under splitter ``cv``."""
    return cross_val_score(
        pipeline
        , features, target, cv = cv
        , scoring = 'roc_auc'
        , n_jobs = 5
    )


# The two pipelines are identical except for the `ncv` argument.
ct_crossfit = _build_column_transformer(5, features_baseline, features_embedding)
pipeline_crossfit = _build_pipeline(ct_crossfit)

ct_naive = _build_column_transformer(0, features_baseline, features_embedding)
pipeline_naive = _build_pipeline(ct_naive)

# 10 folds x 50 repeats = 500 scores per pipeline. The fixed random_state
# gives both pipelines the same splits, so the scores are paired fold by fold.
kf = RepeatedKFold(n_splits = 10, n_repeats = 50, random_state = 4321)

auc_naive = _cv_auc(pipeline_naive, X, y, kf)
auc_crossfit = _cv_auc(pipeline_crossfit, X, y, kf)
print(f'Naive/Cross-fit AUC: {auc_naive.mean():.3f}/{auc_crossfit.mean():.3f}')

# test the correlation between the two sets of AUCs
corrtest = pearsonr(auc_naive, auc_crossfit)
print(f'Pearson correlation (p-value): {corrtest[0]:.3f} ({corrtest[1]:.3e})')

# paired t-test
# NOTE(review): fold scores from repeated K-fold share training data and are
# not independent, so this p-value is optimistic; treat it as indicative only
# (a variance-corrected resampled t-test would be more conservative).
ttest = ttest_rel(auc_naive, auc_crossfit)
print(f'P-value of paired t-test: {ttest[1]:.3e}')

Naive/Cross-fit AUC: 0.635/0.682
Pearson correlation (p-value): 0.665 (3.161e-65)
P-value of paired t-test: 9.603e-67


## KNN+Cross-fit vs. Clustering