In [1]:
import pandas as pd
import numpy as np
import random
import yaml

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import SimpleImputer, KNNImputer

from imblearn.over_sampling import RandomOverSampler
from helper import get_nan_count, get_missing_val_percentage


In [2]:
df_new = pd.read_csv('../resources/data/HTx_ind_treat_res_new_data_update_without_dates_08032024.csv', sep = ';',decimal = ',', encoding = 'utf-8', engine ='python')
df_date = pd.read_csv('../resources/data/HTx_ind_treat_res_new_data_update_only_dates_08032024.csv', sep = ';',decimal = ',', encoding = 'utf-8', engine ='python')


In [3]:
# Read common variables from a YAML file
with open('../common_variables.yaml', 'r') as file:
    common_data = yaml.safe_load(file)

In [4]:
# rename postinumero with id
df = df_new.rename(columns={'potilasnumero': 'id'})
df_date = df_date.rename(columns={'potilasnumero': 'id'})

columns_to_add = common_data['columns_to_add']

# Add selected columns from dfdate to df
for col in columns_to_add:
    df[col] = df_date[col]

is_train_with_all = False

df.shape

(13882, 238)

In [5]:

# response_variable = 'hba1c_12m'
response_variable_list = common_data['response_variable_list']

correlated_variables = common_data['correlated_variables']

variables = df.columns
thresh = common_data['thresh']
keep = []
rem = []

def preprocess(df, test_size):
    print("original shape: ", np.shape(df))
    # find duplicates and keep first record only
#     df = df.sort_values("id").drop_duplicates(subset = ['id'], keep='first')
#     print('Shape of data after removing duplicate ids:', np.shape(df))
    
    # remove all the records with drug class is not 2,3,or 4 
#     2=GLP-1 analogues (A10BJ)
#     3=DPP-4 inhibitors (A10BH)
#     4=SGLT2 inhibitors (A10BK)
    if(not is_train_with_all):
        df = df[
                (df['drug_class'] == 3) |
                (df['drug_class'] == 4) ]

    # replace ' ' as NaN
    df = df.replace(' ', np.NaN)
    print('Shape of data after removing other drug types:', np.shape(df))
    
    # filter by bmi
    df['bmi'] = df['bmi'].astype(float)
    df['sp'] = df['sp'].astype(int)
    df['ika'] = df['ika'].astype(float)
    df['smoking'] = df['smoking'].astype(float)

    # remove rows with missing 'response variable'
    get_nan_count(df)
#     df = df.dropna(how='any', subset = response_variable_list)
    print('Shape of data after excluding missing response:', np.shape(df))
    
    #delete columns with more than threshold NaN
    # get missing values < threshold feature name list
    missing_per = get_missing_val_percentage(df)
    
    for i in range(df.columns.shape[0]):
        if missing_per[i] <= 42: #setting the threshold as 40%
            keep.append(variables[i])
        else :
            rem.append(variables[i])
    
    columns_to_remove = ['hba1c_prev_1y', 'date_hdl_12m', 'date_bmi_12m','date_ldl_12m',
                        'hba1c_12m', 'ldl_12m', 'hdl_12m', 'bmi_12m']
    
    for col in columns_to_remove:
        if col in rem:
            rem.remove(col)
    
    df = df.drop([x for x in rem if x in df.columns], axis=1)
    print('Shape of data after removing cols with less than %.2f percent values missing:' % (thresh), np.shape(df))
        
    #df = df.drop('id', axis=1)
    
    #     remove correlated features 
    df = df.drop([x for x in correlated_variables if x in df.columns], axis=1)
    print('Shape of data after removing correlated features:', np.shape(df))
    
    # convert categorical to numeric
    cat_cols = []
    for i in cat_cols:
        labelencoder = LabelEncoder()
        df[i] = labelencoder.fit_transform(df[i])
        
    ## insert days
    
    date_cols = ['date_hba_bl_6m','date_ldl_bl','date_bmi_bl','date_hdl_bl',
                 'date_12m', 'date_n1',
                 'date_ldl_12m',
                 'date_bmi_12m',
                 'date_hdl_12m']
    
    #convert dates into datetime format
    df[date_cols] = df[date_cols].apply(pd.to_datetime, errors='coerce', format='%m/%d/%Y')#, exact= False)
    days_to_response_hba1c = df['date_12m'] - df['date_hba_bl_6m']
    days_to_response_bmi = df['date_bmi_12m'] - df['date_bmi_bl']
    days_to_response_hdl = df['date_hdl_12m'] - df['date_hdl_bl']
    days_to_response_ldl = df['date_ldl_12m'] - df['date_ldl_bl']
    
    df.loc[:,'days_hba1c'] = [x.days for x in days_to_response_hba1c]
    df.loc[:,'days_bmi'] = [x.days for x in days_to_response_bmi]
    df.loc[:,'days_hdl'] = [x.days for x in days_to_response_hdl]
    df.loc[:,'days_ldl'] = [x.days for x in days_to_response_ldl]
    
    print('Shape of full data with change + days', np.shape(df))

    #convert other "object" columns to numeric 
    convert = df.select_dtypes('object').columns
    df.loc[:, convert] = df[convert].apply(pd.to_numeric, downcast='float', errors='coerce')
    
    # select time interval
    start = 21
    end = 365 #426
#     df = df[(df['days_hba1c'] >= start)]
#     df = df[(df['days_bmi'] >= start)]
#     df = df[(df['days_hdl'] >= start)]
#     df = df[(df['days_ldl'] >= start)]
    print('Shape of full data after selecting date range dates > 21 days', np.shape(df))
    
#     df = df.drop(date_cols, axis=1)
#     df = df.drop(['days_hba1c','days_bmi','days_hdl','days_ldl'], axis=1)

#     df = df.drop(['days_hba1c', 'days_bmi', 'days_hdl','days_ldl'], axis = 1)
    # remove outliers
    
#     df = df.astype(float)
    
    
    # filter by hba1c baseline levels and egfr levels
    df['hba1c_bl_6m'] = df['hba1c_bl_6m'].apply(pd.to_numeric, downcast='float', errors='coerce')
    df['eGFR'] = df['eGFR'].apply(pd.to_numeric, downcast='float', errors='coerce')
    
    criteria = (df['hba1c_bl_6m'] < 53) | (df['hba1c_bl_6m'] > 119) | (df['eGFR'] < 45)
    print('baseline hba1c min ', df['hba1c_bl_6m'].min(), ' and max ', df['hba1c_bl_6m'].max())
    
    print(f'\n Number of samples after the filteration of baseline hba1c and eGFR: {criteria.sum()}')

    print('df shape before the filteration ', df.shape)

    # Invert the criteria to keep only the rows that do not meet any of the criteria
    df = df[~criteria]
    print('df shape after the filteration ', df.shape)
    
    
    # split data
    random.seed(42)
    # Save original data set
    original = df
    Y = df[response_variable_list]
    X = df.drop(response_variable_list, axis=1)
    random.seed(42)
    
    # Split into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=123)
    # save preprocessed df to csv
    result_df = pd.concat([X_train, Y_train], axis=1)
    test_df = pd.concat([X_test, Y_test], axis = 1)

    result_df.to_csv('data/X_train.csv', index=True)
    test_df.to_csv('data/X_test.csv', index=True)
    
    
#     # data imputation
#     original_X_train = X_train
#     original_X_test = X_test
#     random.seed(42)
#     imputer = SimpleImputer(missing_values=np.nan, strategy = "most_frequent")
#     # imputeX = KNNImputer(missing_values=np.nan, n_neighbors = 3, weights='distance')
#     # imputeX = IterativeImputer(max_iter=5, random_state=0)
#     X_train = imputer.fit_transform(X_train)
#     X_test = imputer.transform(X_test)
    
#     X_train = pd.DataFrame(X_train, columns = original_X_train.columns, index=original_X_train.index)
#     X_test = pd.DataFrame(X_test, columns = original_X_train.columns, index=original_X_test.index)
    
#     #     columns_to_skip_normalization = ['drug_class']
#     columns_to_skip_normalization = []
#     # List of columns to normalize
#     columns_to_normalize = [col for col in X_train.columns if col not in columns_to_skip_normalization]

#     # scale data 
# #     scaler = StandardScaler()
#     scaler = MinMaxScaler()
#     select = {}
#     X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
#     X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])
    
    # random oversampling 
#     combined_df = pd.concat([X_train, Y_train], axis=1)
#     X_oversamp = combined_df.drop(['drug_class'], axis = 1)
#     Y_oversamp = combined_df['drug_class']
# #     ros = RandomOverSampler(random_state=0)
#     smote = SMOTE()
#     X_resampled, y_resampled = smote.fit_resample(X_oversamp, Y_oversamp)
#     print(sorted(Counter(Y_oversamp).items()))
#     print(sorted(Counter(y_resampled).items()))
#     combined = pd.concat([X_resampled, y_resampled], axis=1)
#     X_train = combined_df.drop([response_variable], axis = 1)
#     Y_train = combined_df[response_variable]
    
#     X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
#     X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])
    
    return df, X_train, X_test, Y_train, Y_test, X, Y



In [6]:
df, X_train, X_test, Y_train, Y_test, X, Y = preprocess(df, 0.25)

original shape:  (13882, 238)
Shape of data after removing other drug types: (5480, 238)

 NaN counts in resonse variables:
             Feature  NaN Count
hba1c_12m  hba1c_12m        697
ldl_12m      ldl_12m       2276
hdl_12m      hdl_12m       3982
bmi_12m      bmi_12m       2003

Shape of data after excluding missing response: (5480, 238)
Shape of data after removing cols with less than 40.00 percent values missing: (5480, 184)
Shape of data after removing correlated features: (5480, 122)
Shape of full data with change + days (5480, 126)
Shape of full data after selecting date range dates > 21 days (5480, 126)
baseline hba1c min  23.0  and max  164.0

 Number of samples after the filteration of baseline hba1c and eGFR: 1933
df shape before the filteration  (5480, 126)
df shape after the filteration  (3547, 126)


  if missing_per[i] <= 42: #setting the threshold as 40%


In [7]:
df['ldl_12m'].isna().sum()

1491

In [8]:
df['ldl_12m']

3        1.5
8        NaN
10       3.6
19       NaN
30       1.8
        ... 
13858    2.2
13862    NaN
13863    NaN
13878    NaN
13880    3.9
Name: ldl_12m, Length: 3547, dtype: object