# Advanced Topics

## Benefit of Cross-Fitting in `compress` Module

In [1]:
import pandas as pd
from TabuLLM.embed import TextColumnTransformer

import os
from openai import OpenAI
from dotenv import load_dotenv
# Pull provider credentials from environment variables (load_dotenv() runs
# first) so that no secret is written into the notebook itself.
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
google_project_id = os.getenv('VERTEXAI_PROJECT')
google_location = os.getenv('VERTEXAI_LOCATION')
client = OpenAI(api_key=openai_api_key)

# Per-provider argument bundles. `openai_args` is the one passed to
# TextColumnTransformer in this notebook; `google_args` is only needed by the
# commented-out Google variant.
openai_args = dict(client=client)
google_args = dict(project_id=google_project_id, location=google_location)

# Raw dataset: one row per record, holding the text columns, the baseline
# covariates and the outcome column used below.
df = pd.read_csv('../data/raw.csv')

# Text column to embed (toggle by commenting/uncommenting).
#textCol = 'diagnoses'
textCol = 'operations'

# Embed the selected text column. The alternative back-ends are kept as
# commented-out options inside the constructor call.
text_embedder = TextColumnTransformer(
    #model_type = 'st'
    model_type = 'openai', openai_args = openai_args
    #model_type = 'google', google_args = google_args
)
embeddings = text_embedder.fit_transform(df[[textCol]])
print(f'Shape of embeddings: {embeddings.shape}')

# Design matrix = embedding columns followed by the baseline covariates.
# The embedding column names are rebuilt here as X_0, X_1, ... and are later
# used to select columns from X, so this assumes the transformer labels its
# output columns the same way.
features_baseline = ['is_female', 'age', 'height', 'weight', 'optime']
n_embedding_dims = embeddings.shape[1]
features_embedding = ['X_' + str(i) for i in range(n_embedding_dims)]
X = pd.concat([embeddings, df.loc[:, features_baseline]], axis=1)
y = df.loc[:, 'aki_severity']

  from tqdm.autonotebook import tqdm, trange


Shape of embeddings: (830, 3072)


In [2]:
from TabuLLM.compress import CompressClassifier
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def _knn_compress_pipeline(ncv):
    """Build the column transformer and pipeline for one KNN-compression arm.

    The baseline covariates are passed through unchanged, the embedding
    columns go through ``CompressClassifier(n_neighbors=50, ncv=ncv)``, and the
    result feeds an unpenalized logistic regression.

    Parameters
    ----------
    ncv : int
        Forwarded to ``CompressClassifier``; presumably the number of
        cross-fitting folds, with 0 disabling cross-fitting (confirm against
        the TabuLLM documentation).

    Returns
    -------
    tuple
        ``(column_transformer, pipeline)``, where the pipeline's first step is
        that same column transformer instance.
    """
    coltrans = ColumnTransformer(
        transformers=[
            ('baseline', 'passthrough', features_baseline),
            ('embedding', CompressClassifier(n_neighbors=50, ncv=ncv), features_embedding),
        ],
        remainder='drop',
    )
    logit = LogisticRegression(penalty=None, solver='newton-cholesky', max_iter=1000)
    return coltrans, Pipeline([('coltrans', coltrans), ('logit', logit)])


# The two arms differ only in `ncv`: 10 for the cross-fit arm, 0 for the naive arm.
ct_crossfit, pipeline_crossfit = _knn_compress_pipeline(ncv=10)
ct_naive, pipeline_naive = _knn_compress_pipeline(ncv=0)

from sklearn.model_selection import KFold, cross_val_score, RepeatedKFold

# Resampling scheme shared by every comparison in this notebook: 5-fold CV
# repeated 50 times, i.e. 250 AUC values per model. The splitter is seeded, so
# each model is scored on the same sequence of splits.
#kf = KFold(n_splits = 50, shuffle = True, random_state = 4321)
kf = RepeatedKFold(n_repeats=50, n_splits=5, random_state=1111)

# Identical scoring options for both arms.
cv_options = dict(cv=kf, scoring='roc_auc', n_jobs=10)
auc_naive = cross_val_score(pipeline_naive, X, y, **cv_options)
auc_crossfit = cross_val_score(pipeline_crossfit, X, y, **cv_options)
print(f'Naive/Cross-fit AUC: {auc_naive.mean():.3f}/{auc_crossfit.mean():.3f}')

from scipy.stats import pearsonr, ttest_rel

# How strongly do the per-split AUCs of the two arms move together?
corrtest = pearsonr(auc_naive, auc_crossfit)
print(f'Pearson correlation (p-value): {corrtest[0]:.3f} ({corrtest[1]:.3e})')

# Paired comparison of the two arms on the same splits.
# NOTE(review): scores from repeated k-fold share training data across splits
# and are not independent, so a plain paired t-test likely overstates
# significance -- consider a corrected resampled t-test before quoting this p-value.
ttest = ttest_rel(auc_naive, auc_crossfit)
print(f'P-value of paired t-test: {ttest[1]:.3e}')

Naive/Cross-fit AUC: 0.661/0.690
Pearson correlation (p-value): 0.747 (7.885e-46)
P-value of paired t-test: 1.083e-41


## KNN+Cross-fit vs. Clustering

In [3]:
from TabuLLM.cluster import SphericalKMeans

# Cluster the text embeddings into 10 groups using 100 initialisations.
# NOTE(review): no seed is passed here, so cluster assignments may change
# between runs -- confirm whether SphericalKMeans accepts a random_state.
# NOTE(review): the clustering is fitted on all rows before cross-validation;
# it is unsupervised, but confirm this is intended rather than refitting per fold.
skmeans = SphericalKMeans(n_init=100, n_clusters=10)
skmeans = skmeans.fit(embeddings)

# `transform` yields the multi-column "soft" representation, `predict` the
# single "hard" cluster label per row (see the printed shapes).
clusters_soft = skmeans.transform(embeddings)
clusters_hard = skmeans.predict(embeddings)
print(f'Shape of clusters_soft: {clusters_soft.shape}')
print(f'Shape of clusters_hard: {clusters_hard.shape}')

Shape of clusters_soft: (830, 10)
Shape of clusters_hard: (830,)


In [4]:
# Baseline covariates (same list as `features_baseline`) and one generated
# column name per soft-cluster column: X0, X1, ...
varnames_baseline = ['is_female', 'age', 'height', 'weight', 'optime']
varnames_cluster_soft = [f'X{n}' for n in range(clusters_soft.shape[1])]

# Soft variant: baseline covariates + all soft-cluster columns.
dfCluster_soft = pd.DataFrame(data=clusters_soft, columns=varnames_cluster_soft)
dfCombined_soft = pd.concat([df, dfCluster_soft], axis=1)
X_soft = dfCombined_soft.loc[:, varnames_baseline + varnames_cluster_soft]

# Hard variant: baseline covariates + a single cluster-label column.
dfCluster_hard = pd.DataFrame(data=clusters_hard, columns=['cluster'])
dfCombined_hard = pd.concat([df, dfCluster_hard], axis=1)
X_hard = dfCombined_hard.loc[:, varnames_baseline + ['cluster']]

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Soft-cluster model: an unpenalized logistic regression that is later fitted
# directly on X_soft. Despite the name, it is a bare estimator, not a Pipeline.
# NOTE(review): max_iter=10 here versus max_iter=1000 in the KNN pipelines --
# confirm the solver converges within 10 iterations on every fold.
pipeline_soft = LogisticRegression(max_iter=10, solver = 'newton-cholesky', penalty=None)

# Hard-cluster model: baseline covariates are passed through and the cluster
# label is one-hot encoded with the first level dropped as the reference.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', varnames_baseline),
        # handle_unknown='ignore': if a cluster label is absent from a training
        # fold, the default ('error') makes that fold fail and the mean AUC
        # becomes NaN. With 'ignore', such rows are encoded as all zeros (the
        # same as the reference level) and scoring proceeds. Output is
        # unchanged whenever every label is present in the training fold.
        ('cat', OneHotEncoder(drop = 'first', handle_unknown = 'ignore'), ['cluster'])
    ]
    , remainder='drop'
)
pipeline_hard = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=10, solver = 'newton-cholesky', penalty=None))])

In [6]:
# Score both clustering variants with the shared splitter `kf`, so their
# per-split AUCs come from the same resampling scheme as the KNN arms.
scoring_options = {'cv': kf, 'scoring': 'roc_auc', 'n_jobs': 10}
auc_soft = cross_val_score(pipeline_soft, X_soft, y, **scoring_options)
auc_hard = cross_val_score(pipeline_hard, X_hard, y, **scoring_options)
print(f'Soft/Hard AUC: {auc_soft.mean():.3f}/{auc_hard.mean():.3f}')

Soft/Hard AUC: 0.709/0.714


In [7]:
#pearsonr(auc_crossfit, auc_hard)

In [12]:
from sklearn.decomposition import PCA

# Fit a 10-component PCA on the soft-cluster columns only (`clusters_soft`;
# the baseline covariates in X_soft are not part of this fit).
pca = PCA(n_components=10).fit(clusters_soft)
variance_ratio = pca.explained_variance_ratio_
# Share of variance carried by each component ...
print(variance_ratio)
# ... and the running total.
print(variance_ratio.cumsum())

[0.57678346 0.16431861 0.08219219 0.0469406  0.04502442 0.03451438
 0.02511895 0.01093988 0.00982741 0.0043401 ]
[0.57678346 0.74110207 0.82329426 0.87023485 0.91525927 0.94977366
 0.9748926  0.98583248 0.9956599  1.        ]


In [20]:
# Soft-cluster model with PCA applied to the soft-cluster columns only; the
# baseline covariates bypass the PCA.
# NOTE(review): n_components=10 matches the number of soft-cluster columns
# printed earlier, so no dimension is discarded here -- confirm whether a
# smaller value was intended.
preprocessor_soft_pca = ColumnTransformer(
    [
        ('pass', 'passthrough', varnames_baseline),
        ('pca', PCA(n_components=10), varnames_cluster_soft),
    ],
    remainder='drop',
)
soft_pca_classifier = LogisticRegression(penalty=None, solver='newton-cholesky', max_iter=10)
pipeline_soft_pca = Pipeline([
    ('preprocessor', preprocessor_soft_pca),
    ('classifier', soft_pca_classifier),
])

auc_soft_pca = cross_val_score(pipeline_soft_pca, X_soft, y, cv=kf, scoring='roc_auc', n_jobs=10)
print(f'Soft PCA AUC: {auc_soft_pca.mean():.3f}')

Soft PCA AUC: 0.709


In [31]:
# Hard-cluster model with a PCA bottleneck: the cluster label is one-hot
# encoded (first level dropped) and reduced to 6 principal components, while
# the baseline covariates are passed through untouched.
preprocessor_hard_pca = ColumnTransformer(
    transformers=[
        ('pass', 'passthrough', varnames_baseline),
        ('pca', Pipeline(steps = [
            # handle_unknown='ignore': if a cluster label is missing from a
            # training fold, the default ('error') makes that fold fail and the
            # mean AUC becomes NaN. With 'ignore', such rows are encoded as all
            # zeros (the same as the reference level). Output is unchanged
            # whenever every label is present in the training fold.
            ('onehot', OneHotEncoder(drop = 'first', handle_unknown = 'ignore')),
            # NOTE(review): svd_solver='arpack' was presumably chosen because
            # the encoder output is sparse -- confirm.
            ('pca', PCA(n_components = 6, svd_solver='arpack'))
        ]), ['cluster'])
    ]
    , remainder='drop'
)
# NOTE(review): max_iter=10 here versus max_iter=1000 in the KNN pipelines --
# confirm the solver converges within 10 iterations on every fold.
pipeline_hard_pca = Pipeline(steps=[('preprocessor', preprocessor_hard_pca),
                      ('classifier', LogisticRegression(max_iter=10, solver = 'newton-cholesky', penalty=None))])
auc_hard_pca = cross_val_score(
    pipeline_hard_pca
    , X_hard, y, cv = kf
    , scoring = 'roc_auc'
    , n_jobs=10
)
print(f'Hard PCA AUC: {auc_hard_pca.mean():.3f}')

Hard PCA AUC: 0.664


In [49]:
# Variant 2 of the soft-cluster PCA model: there is no ColumnTransformer, so
# the PCA sees every column of X_soft (baseline covariates and soft-cluster
# columns together) and keeps 13 components.
# NOTE(review): the columns are not rescaled before the PCA -- confirm that
# letting the larger-scale covariates dominate the components is intended.
soft_pca_v2_steps = [
    ('pca', PCA(n_components=13)),
    ('classifier', LogisticRegression(penalty=None, solver='newton-cholesky', max_iter=10)),
]
pipeline_soft_pca_v2 = Pipeline(steps=soft_pca_v2_steps)

auc_soft_pca_v2 = cross_val_score(pipeline_soft_pca_v2, X_soft, y, cv=kf, scoring='roc_auc', n_jobs=10)
print(f'Soft PCA AUC - v2: {auc_soft_pca_v2.mean():.3f}')

Soft PCA AUC - v2: 0.713


In [54]:
# Variant 2 of the hard-cluster PCA model: the baseline covariates and the
# one-hot encoded cluster label are combined first, and the PCA is then
# applied to the combined matrix (in the first variant only the one-hot
# columns went through the PCA).
preprocessor_hard_pca_v2 = ColumnTransformer(
    transformers=[
        ('pass', 'passthrough', varnames_baseline),
        # handle_unknown='ignore': if a cluster label is missing from a
        # training fold, the default ('error') makes that fold fail and the
        # mean AUC becomes NaN. With 'ignore', such rows are encoded as all
        # zeros (the same as the reference level). Output is unchanged
        # whenever every label is present in the training fold.
        ('cat', OneHotEncoder(drop = 'first', handle_unknown = 'ignore'), ['cluster'])
    ]
    , remainder='drop'
)
# NOTE(review): max_iter=10 here versus max_iter=1000 in the KNN pipelines --
# confirm the solver converges within 10 iterations on every fold.
pipeline_hard_pca_v2 = Pipeline(steps=[
    ('preprocessor', preprocessor_hard_pca_v2)
    , ('pca', PCA(n_components=10))
    , ('classifier', LogisticRegression(max_iter=10, solver = 'newton-cholesky', penalty=None))
    ])
auc_hard_pca_v2 = cross_val_score(
    pipeline_hard_pca_v2
    , X_hard, y, cv = kf
    , scoring = 'roc_auc'
    , n_jobs=10
)
print(f'Hard PCA AUC - v2: {auc_hard_pca_v2.mean():.3f}')

Hard PCA AUC - v2: 0.668


In [39]:
# Column count of X_soft (baseline covariates plus soft-cluster columns).
X_soft.shape[1]

15

In [50]:
# Preview the first rows of the hard-cluster design matrix
# (baseline covariates plus the integer `cluster` label).
X_hard.head()

Unnamed: 0,is_female,age,height,weight,optime,cluster
0,1,18.11,148,80.9,112,0
1,1,18.23,169,56.1,144,9
2,1,16.86,166,61.6,114,0
3,1,16.88,162,44.3,109,2
4,0,18.12,175,70.5,119,0
