In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Directory holding both CSV exports; relocate the data by editing this one line.
DATA_DIR = 'C:/Users/alire/OneDrive/data/statman_bitbucket/aki/LLM/March2024'

# Columns identifying one operation (the merge key) and the outcome column.
KEY_COLUMNS = ['project_id', 'operation_no']
TARGET_COLUMN = 'kdigo_stage'

dfEmbeddings = pd.read_csv(f'{DATA_DIR}/openai_3large_operation.csv')
dfPatients = pd.read_csv(f'{DATA_DIR}/patients_for_python.csv')

my_features = ['age', 'is_female', 'height_residual', 'bmi']

# Keep only the columns used downstream and drop rows with any missing value in them.
dfPatients_subset = dfPatients.loc[:, KEY_COLUMNS + [TARGET_COLUMN] + my_features].dropna()

# Inner join: only operations present in both files survive. The result holds the
# patient columns first, followed by the non-key columns of dfEmbeddings.
dfCombined = pd.merge(
    dfPatients_subset
    , dfEmbeddings
    , on = KEY_COLUMNS
    , how = 'inner'
)

# Select by column name rather than by position so that adding or reordering
# columns cannot silently shift the slices.
#   Xall : DataFrame of tabular features followed by every dfEmbeddings column
#   X    : the dfEmbeddings columns only, as an array
#   y    : outcome cast to int
#   Z    : tabular features only, as an array
# NOTE(review): assumes dfEmbeddings shares no column names with the patient
# subset apart from the keys (otherwise merge would add _x/_y suffixes) - confirm.
Xall = dfCombined.drop(columns = KEY_COLUMNS + [TARGET_COLUMN])
X = Xall.drop(columns = my_features).to_numpy()
y = dfCombined[TARGET_COLUMN].to_numpy(dtype = 'int')
Z = dfCombined[my_features].to_numpy()
#X_train, X_test, y_train, y_test, Z_train, Z_test = train_test_split(X, y, Z, test_size = 0.3)

In [129]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, RegressorMixin
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
import numpy as np
from sklearn.utils.validation import check_X_y
from sklearn.utils.multiclass import type_of_target
from sklearn.model_selection import KFold
import copy

class TextToNumberBase(BaseEstimator, TransformerMixin):
    """Pass-through transformer skeleton that stores the shared settings.

    Parameters
    ----------
    nx : int or None, default None
        Stored as-is on the instance; not used by this base class.
    ncv : int, default 5
        Stored as-is on the instance; not used by this base class.
    """

    def __init__(self, nx = None, ncv = 5):
        self.ncv = ncv
        self.nx = nx

    def fit(self, X, y=None):
        """Learn nothing and return the instance itself."""
        return self

    def transform(self, X):
        """Identity placeholder: hand back the very object that was passed in."""
        return X

class TextToNumberClassifier(TextToNumberBase, ClassifierMixin):
    """Collapse a block of embedding columns into one k-NN-derived numeric feature.

    The first ``nx`` columns of each row are rescaled to unit Euclidean length and
    a k-nearest-neighbour classifier is trained inside each of ``ncv`` shuffled
    folds. The emitted feature is the predicted probability of the larger of the
    two class labels, optionally Laplace-smoothed and mapped to the logit scale.

    ``fit_transform`` returns out-of-fold values (each row is scored by the one
    model that did not see it), whereas ``transform`` averages the values of all
    fold models. The two therefore differ on the training rows by design.

    Parameters
    ----------
    nx : int or None, default None
        Number of leading columns to use. ``None`` (or 0) means all columns.
    ncv : int, default 5
        Number of folds.
    logit : bool, default True
        Return log(p / (1 - p)) instead of p.
    laplace : bool, default True
        Shrink p to (p * k + 1) / (k + 2), with k the number of neighbours.
    random_state : int, RandomState instance or None, default None
        Seed handed to the shuffled ``KFold``; set it for reproducible folds.
    **kwargs
        Forwarded unchanged to ``KNeighborsClassifier``.

    Attributes
    ----------
    knn : KNeighborsClassifier
        Unfitted template that is copied for every fold.
    nx_ : int
        Number of columns actually used, resolved during ``fit``.
    classes_ : ndarray
        The two sorted class labels; the feature refers to ``classes_[1]``.
    kfolds : KFold
        Splitter used during ``fit``.
    trained_models : list of KNeighborsClassifier
        One fitted model per fold.
    insample_prediction_proba : ndarray of shape (n_samples, 1)
        Out-of-fold feature values for the training rows.
    """

    def __init__(self, nx = None, ncv = 5, logit = True, laplace = True, random_state = None, **kwargs):
        super().__init__(nx = nx, ncv = ncv)
        self.knn = KNeighborsClassifier(**kwargs)
        self.logit = logit
        self.laplace = laplace
        self.random_state = random_state

    def get_params(self, deep = True):
        """Return the constructor parameters plus those of the wrapped k-NN.

        The k-NN settings arrive through ``**kwargs`` and are therefore not in the
        ``__init__`` signature that ``BaseEstimator`` inspects. Without this
        override, ``sklearn.base.clone`` (used by ``ColumnTransformer``,
        ``Pipeline`` and the model-selection tools) would rebuild the estimator
        with a default k-NN and silently discard them.
        """
        params = super().get_params(deep = deep)
        params.update(self.knn.get_params(deep = False))
        return params

    def set_params(self, **params):
        """Set parameters, routing k-NN settings to the wrapped ``knn`` template."""
        knn_names = self.knn.get_params(deep = False)
        knn_updates = {key: value for key, value in params.items() if key in knn_names}
        own_updates = {key: value for key, value in params.items() if key not in knn_names}
        if knn_updates:
            self.knn.set_params(**knn_updates)
        if own_updates:
            super().set_params(**own_updates)
        return self

    @staticmethod
    def _normalize_rows(X):
        """Scale every row to unit Euclidean norm; all-zero rows are left as zeros."""
        norms = np.sqrt(np.sum(X * X, axis = 1, keepdims = True))
        # Dividing by 1 instead of 0 keeps zero rows finite instead of turning them into NaN.
        norms[norms == 0] = 1.0
        return X / norms

    def _knn_feature(self, model, X):
        """Turn one fitted fold model's positive-class probability into the feature."""
        proba = model.predict_proba(X)
        # A fold whose training part held a single class has only one probability
        # column, so look the positive class up by label rather than by position.
        column = np.flatnonzero(model.classes_ == self.classes_[1])
        pred = proba[:, column[0]] if column.size else np.zeros(X.shape[0], dtype = float)

        if self.laplace:
            # NOTE(review): p * k is a neighbour count only with uniform weights.
            k = model.n_neighbors
            pred = (pred * k + 1) / (k + 2)
        if self.logit:
            # Without smoothing p can be exactly 0 or 1; clip so the logit stays finite.
            eps = np.finfo(float).eps
            pred = np.clip(pred, eps, 1.0 - eps)
            pred = np.log(pred / (1.0 - pred))
        return pred

    def fit(self, X, y):
        """Train one k-NN per fold and store the out-of-fold feature values.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
        y : array-like of shape (n_samples,)
            Binary target.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is not binary with exactly two classes, or ``nx`` exceeds the
            number of columns in ``X``.
        """
        # Validate first so that lists and DataFrames are converted before .shape is read.
        X, y = check_X_y(X, y)
        if type_of_target(y) != 'binary':
            raise ValueError('Target type must be binary')
        classes = np.unique(y)
        if classes.size != 2:
            raise ValueError('y must contain exactly two classes')

        # Resolve the column count into a fitted attribute; the hyperparameter
        # self.nx itself must stay untouched so the estimator can be refitted/cloned.
        nx = self.nx if self.nx else X.shape[1]
        if nx > X.shape[1]:
            raise ValueError('X has fewer columns than nx')
        self.nx_ = nx
        self.classes_ = classes

        # select subset of columns and renormalize
        X = self._normalize_rows(X[:, :nx])

        # create folds
        kf = KFold(n_splits = self.ncv, shuffle = True, random_state = self.random_state)
        self.kfolds = kf

        # train model within each fold; every row is a test row in exactly one fold
        trained_models = []
        out_of_fold = np.empty(len(y), dtype = float)
        for train_index, test_index in kf.split(X):
            model = copy.deepcopy(self.knn).fit(X[train_index, :], y[train_index])
            out_of_fold[test_index] = self._knn_feature(model, X[test_index, :])
            trained_models.append(model)

        self.trained_models = trained_models
        self.insample_prediction_proba = out_of_fold.reshape(-1, 1)
        return self

    def fit_transform(self, X, y):
        """Fit and return the out-of-fold feature, shape (n_samples, 1).

        This is deliberately not ``fit(X, y).transform(X)``: the training rows get
        values from models that never saw them.
        """
        return self.fit(X, y).insample_prediction_proba

    def transform(self, X):
        """Return the feature for new rows, averaged over all fold models.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Needs at least ``nx_`` columns; extra trailing columns are ignored.

        Returns
        -------
        ndarray of shape (n_samples, 1)

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        ValueError
            If ``X`` is not 2-dimensional or has fewer than ``nx_`` columns.
        """
        if not hasattr(self, 'trained_models'):
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('This TextToNumberClassifier instance is not fitted yet')

        # A ColumnTransformer passes a DataFrame slice here; convert before positional indexing.
        X = np.asarray(X, dtype = float)
        if X.ndim != 2:
            raise ValueError('X must be 2-dimensional')
        if X.shape[1] < self.nx_:
            raise ValueError('X has fewer columns than nx')

        # select subset of columns and renormalize
        X = self._normalize_rows(X[:, :self.nx_])

        all_preds = np.column_stack([self._knn_feature(model, X) for model in self.trained_models])
        return np.mean(all_preds, axis = 1).reshape(-1, 1)

In [106]:
# Fit with 5 folds on the first 10 columns of X and display the first five
# values that fit_transform returns (the values stored for the training rows).
my_ttn = TextToNumberClassifier(ncv = 5, nx = 10)
my_ttn.fit_transform(X, y)[:5]

array([[-1.79175947],
       [-1.79175947],
       [-1.79175947],
       [ 0.28768207],
       [-1.79175947]])

In [107]:
# Score the first five rows again via transform(), which averages the per-fold
# models and so differs from the values fit_transform returned above.
my_ttn.transform(X[:5, :])

array([[-1.61666572],
       [-1.09138448],
       [-1.61666572],
       [-0.06818532],
       [-1.61666572]])

In [66]:
from sklearn.compose import ColumnTransformer

In [111]:
# In Xall the tabular features come first and the embedding columns follow, so
# derive the embedding block's position from the data instead of hard-coding the
# offset (4) and width (3072).
embedding_columns = slice(len(my_features), len(my_features) + X.shape[1])

# Replace the embedding block with the single text-to-number feature and pass
# the remaining (tabular) columns through unchanged.
ct = ColumnTransformer(
    [("text2number", TextToNumberClassifier(), embedding_columns)]
    , remainder = 'passthrough'
)
#ct.fit_transform(Xall, y)

In [130]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# Chain the column transformer with a logistic regression created with
# penalty = None, which is fitted on the transformer's output.
pipe = Pipeline([('preprocess', ct), ('logit', LogisticRegression(penalty = None))])

In [131]:
# 70/30 split; the fixed seed makes the partition (and every number derived
# from it) identical on each Restart & Run All.
Xall_train, Xall_test, y_train, y_test = train_test_split(Xall, y, test_size = 0.3, random_state = 42)

In [132]:
# Fit the whole pipeline on the training split; scoring on the test split is
# currently commented out.
pipe.fit(Xall_train, y_train)#.score(Xall_test, y_test)

In [121]:
# Display the shape of the training labels.
y_train.shape

(581,)