In [1]:
import pandas as pd
from TabuLLM.embed import TextColumnTransformer
# Load the raw table; the path is resolved relative to the notebook's working directory.
df = pd.read_csv('../../data/raw.csv')

# Turn the free-text 'diagnoses' column into a numeric embedding table.
# NOTE(review): model_type='st' presumably selects a sentence-transformers
# backend -- confirm against the TabuLLM documentation.
embeddings = TextColumnTransformer(model_type='st').fit_transform(df[['diagnoses']])

# Sanity check: the recorded run reported (830, 384).
print(f'Shape of embeddings: {embeddings.shape}')

  from tqdm.autonotebook import tqdm, trange


Shape of embeddings: (830, 384)


In [2]:
# Structured (non-text) covariates taken directly from the raw table.
features_baseline = ['is_female', 'age', 'height', 'weight', 'optime']

# One column name per embedding dimension.
# NOTE(review): assumes the embedding columns are named X_0 .. X_{d-1} -- confirm
# against the columns actually produced by TextColumnTransformer.
features_embedding = [f'X_{i}' for i in range(embeddings.shape[1])]

# Response variable.
y = df.loc[:, 'aki_severity']

# Design matrix: embedding columns first, then the baseline covariates.
# pd.concat aligns rows on the index, so both frames are expected to share df's index.
X = pd.concat([embeddings, df.loc[:, features_baseline]], axis=1)

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from TabuLLM.compress import CompressClassifier, CompressClassifier_v2

# Column-wise preprocessing: baseline covariates pass through unchanged, while
# the embedding columns are routed through CompressClassifier. Any column not
# listed in either group is dropped.
# NOTE(review): the get_params() keys recorded for this pipeline list
# laplace / logit / ncv / nx for the 'embedding' step but no n_neighbors --
# confirm that CompressClassifier accepts n_neighbors in the installed version.
ct_embedding = ColumnTransformer(
    transformers=[
        ('baseline', 'passthrough', features_baseline),
        ('embedding', CompressClassifier(n_neighbors=50), features_embedding),
    ],
    remainder='drop',
)

# Full model: preprocessing followed by an unpenalized logistic regression.
pipeline_embedding = Pipeline(
    steps=[
        ('coltrans', ct_embedding),
        (
            'logit',
            LogisticRegression(penalty=None, solver='newton-cholesky', max_iter=1000),
        ),
    ],
)

In [4]:
# Show every parameter name the pipeline exposes, including nested
# `step__param` keys, so the valid names for tuning can be read off the output.
pipeline_embedding.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'coltrans', 'logit', 'coltrans__n_jobs', 'coltrans__remainder', 'coltrans__sparse_threshold', 'coltrans__transformer_weights', 'coltrans__transformers', 'coltrans__verbose', 'coltrans__verbose_feature_names_out', 'coltrans__baseline', 'coltrans__embedding', 'coltrans__embedding__laplace', 'coltrans__embedding__logit', 'coltrans__embedding__ncv', 'coltrans__embedding__nx', 'logit__C', 'logit__class_weight', 'logit__dual', 'logit__fit_intercept', 'logit__intercept_scaling', 'logit__l1_ratio', 'logit__max_iter', 'logit__multi_class', 'logit__n_jobs', 'logit__penalty', 'logit__random_state', 'logit__solver', 'logit__tol', 'logit__verbose', 'logit__warm_start'])

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# Same layout as the first pipeline, but the embedding columns go through
# CompressClassifier_v2, which is handed an explicit KNeighborsClassifier
# (50 neighbours) as its `estimator`. Unlisted columns are dropped.
ct_embedding_v2 = ColumnTransformer(
    transformers=[
        ('baseline', 'passthrough', features_baseline),
        (
            'embedding',
            CompressClassifier_v2(estimator=KNeighborsClassifier(n_neighbors=50)),
            features_embedding,
        ),
    ],
    remainder='drop',
)

# Full model: preprocessing followed by an unpenalized logistic regression.
pipeline_embedding_v2 = Pipeline(
    steps=[
        ('coltrans', ct_embedding_v2),
        (
            'logit',
            LogisticRegression(penalty=None, solver='newton-cholesky', max_iter=1000),
        ),
    ],
)

In [8]:
# Read the wrapped KNN's neighbour count back through the pipeline's parameter
# dict, confirming it is reachable under the nested
# `coltrans__embedding__estimator__n_neighbors` key.
pipeline_embedding_v2.get_params()['coltrans__embedding__estimator__n_neighbors']

50

In [None]:
def make_pipeline(
    features_baseline,
    features_text,
    features_embedding,
    features_cluster,
    learner_cluster=None,
    learner_initial=None,
    learner_final=None,
    pre_embed=True,
    do_cluster=True,
    pre_cluster=False,
    soft_cluster=True,
):
    """Placeholder for a pipeline factory; the body is not implemented yet.

    Only the signature exists so far: four feature-group arguments, three
    optional learner arguments (default ``None``) and four boolean switches
    (``pre_embed``, ``do_cluster``, ``pre_cluster``, ``soft_cluster``).
    None of the arguments is read, and the call always returns ``None``.

    NOTE(review): the intended meaning of each argument is not established by
    this stub -- document them once the implementation lands. The name also
    matches ``sklearn.pipeline.make_pipeline``; take care not to import both
    into the same namespace.

    TODO: implement.
    """
    return None
