In [1]:
import pandas as pd
import numpy as np

# Folder holding the two CSV exports. Kept in one place so the notebook can be
# pointed at another copy of the data by editing a single line.
DATA_DIR = 'C:/Users/alire/OneDrive/data/statman_bitbucket/aki/LLM/March2024'

# Merge keys shared by both files, and the outcome column to predict.
id_columns = ['project_id', 'operation_no']
target_column = 'kdigo_stage'

dfEmbeddings = pd.read_csv(f'{DATA_DIR}/openai_3large_operation.csv')
dfPatients = pd.read_csv(f'{DATA_DIR}/patients_for_python.csv')

my_features = ['age', 'is_female', 'height_residual', 'bmi']

# Keep only the columns used below and drop rows with any missing value in them.
dfPatients_subset = dfPatients.loc[:, id_columns + [target_column] + my_features].dropna()

# Inner join: only rows whose keys appear in both files survive.
dfCombined = pd.merge(
    dfPatients_subset
    , dfEmbeddings
    , on = id_columns
    , how = 'inner'
)

# Select predictors and target by column name rather than by position, so the
# split stays correct if the key or target lists above are edited.
# drop() keeps the remaining columns in their merged order (my_features first,
# then the columns contributed by dfEmbeddings); the positional slice used when
# building the ColumnTransformer depends on that order.
Xall = dfCombined.drop(columns = id_columns + [target_column])
y = dfCombined[target_column].to_numpy(dtype = 'int')

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from TextEmbeddingFE.main import TextToNumberClassifier

# Number of columns handed to TextToNumberClassifier.
# NOTE(review): 3072 presumably equals the number of embedding columns in Xall - confirm.
EMBEDDING_DIM = 3072

# The columns of Xall start with the tabular features in my_features, so the
# block for TextToNumberClassifier begins right after them. Deriving the offset
# from the list keeps the slice aligned if features are added or removed.
n_tabular = len(my_features)

# Integer slices select columns by position. Columns outside the slice (the
# tabular features) are passed through unchanged via remainder='passthrough'.
ct = ColumnTransformer(
    [("text2number", TextToNumberClassifier(), slice(n_tabular, n_tabular + EMBEDDING_DIM))]
    , remainder = 'passthrough'
)

# Final estimator: logistic regression without a penalty term, fitted on the
# output of the column transformer.
pipe = Pipeline([('preprocess', ct), ('logit', LogisticRegression(penalty = None))])

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sklearn.model_selection import GridSearchCV, RepeatedKFold
import time

# Hyper-parameters searched over, addressed through the pipeline as
# <pipeline step>__<transformer name>__<parameter>.
# NOTE(review): the meaning of `nx` and the type of `base_learner` are defined
# by TextToNumberClassifier and are not visible here - confirm against its docs.
param_grid = {
    'preprocess__text2number__nx': [50, 200, 1000, 3072],
    'preprocess__text2number__base_learner__n_neighbors': [5, 10, 50]
}

# 5-fold cross-validation repeated 10 times. The seed is fixed so that the fold
# assignment, and therefore the scores and the selected parameters, are the
# same on every Restart & Run All.
rkf = RepeatedKFold(
    n_splits = 5
    , n_repeats = 10
    , random_state = 12345
)
grid_search = GridSearchCV(pipe, param_grid, cv = rkf, n_jobs = 5)

# Time the search with a monotonic clock; time.time() can jump if the system
# clock is adjusted while the fit is running.
t0 = time.perf_counter()
grid_search.fit(Xall, y)
t = time.perf_counter() - t0  # elapsed seconds
print(f'time: {round(t, 2)}sec')

# Parameter combination with the highest mean cross-validated score
print("Best parameters found: ", grid_search.best_params_)

time: 66.42sec
Best parameters found:  {'preprocess__text2number__base_learner__n_neighbors': 10, 'preprocess__text2number__nx': 1000}


In [4]:
# One row per parameter combination tried by the grid search.
# Columns are renamed through an explicit mapping instead of by position, so a
# change in the order or number of searched parameters cannot silently swap
# the labels. The final selection pins the displayed column order.
df = (
    pd.DataFrame(grid_search.cv_results_['params'])
    .rename(columns = {
        'preprocess__text2number__base_learner__n_neighbors': 'n_neighbors',
        'preprocess__text2number__nx': 'nx',
    })
    .loc[:, ['n_neighbors', 'nx']]
)

# cv_results_ arrays are aligned with the 'params' list, so this attaches each
# combination's mean score across all CV splits.
df['mean_test_score'] = grid_search.cv_results_['mean_test_score']

# Best-scoring combinations first (last expression is the cell's display output).
df.sort_values('mean_test_score', ascending=False)

Unnamed: 0,n_neighbors,nx,mean_test_score
6,10,1000,0.737108
3,5,3072,0.736747
8,50,50,0.736747
2,5,1000,0.736024
7,10,3072,0.735904
11,50,3072,0.735663
10,50,1000,0.73494
0,5,50,0.734699
4,10,50,0.734458
9,50,200,0.733133
