In [1]:
import numpy as np
import csv
import pandas as pd
import miceforest as mf

import joblib
from joblib import load

import sklearn
from sklearn.preprocessing import MinMaxScaler

import scipy.stats as stats

import importlib
import sys
import feature_sets
importlib.reload(feature_sets)

import model_util
importlib.reload(model_util)

<module 'model_util' from '/mnt/shared_folders/eResearch_glaucoma_project/andrewholmes2024/Aug2024/model_util.py'>

# Load data

In [None]:
# Names of all ODSL features, taken from the 'feature' column of the
# feature_sets.ODSL_features table.
odsl_features_table = feature_sets.ODSL_features
odsl_feature_list = odsl_features_table['feature'].values

In [2]:
# Load the merged table from its pickle file.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
merged_pickle_path = '/mnt/shared_folders/eResearch_glaucoma_project/andrewholmes2024/Aug2024/data/derived/derived_cols_merged.pkl'
merged_df = pd.read_pickle(merged_pickle_path)

In [4]:
# Cast binary and nominal ODSL features to pandas' 'category' dtype.
# Note: this rewrites the selected columns of merged_df in place.
is_categorical_coding = feature_sets.ODSL_features['coding_type'].isin(['binary', 'nominal'])
odsl_categorical_features = feature_sets.ODSL_features.loc[is_categorical_coding, 'feature'].values

categorical_block = merged_df[odsl_categorical_features].astype('category')
merged_df[odsl_categorical_features] = categorical_block

In [3]:
# Keep only the rows flagged as belonging to the IOP subcohort.
in_iop_subcohort = merged_df['IOP subcohort'] == 1
IOP_subcohort_df = merged_df.loc[in_iop_subcohort]

In [6]:
# Split the IOP subcohort into train/test features and labels.
# The arguments are passed positionally; their exact meaning is defined in
# model_util.get_train_test_datasets (not visible here).
split_name = 'training_test_split_90_10'  # presumably the column holding the 90/10 assignment -- confirm in model_util
outcome_name = 'Glaucoma (prevalent D|TD)'

X_train, y_train, X_test, y_test = model_util.get_train_test_datasets(
    IOP_subcohort_df, split_name, outcome_name, odsl_feature_list
)

In [14]:
# Number of rows where IOP_available == 1.
merged_df.loc[merged_df['IOP_available'] == 1].shape[0]

113264

In [None]:
# 112,156 rows with Exclusion == 0 (out of 113,264 rows with IOP_available == 1)

# 1,108 rows with Exclusion == 1

In [13]:
# Outcome counts among rows where IOP_available == 1.
merged_df.loc[merged_df['IOP_available'] == 1, 'Glaucoma (prevalent D|TD)'].value_counts()

Glaucoma (prevalent D|TD)
Control     108428
Glaucoma      3728
Name: count, dtype: int64

In [15]:
# Exclusion flag counts among rows where IOP_available == 1.
merged_df.loc[merged_df['IOP_available'] == 1, 'Exclusion'].value_counts()

Exclusion
0.0    112156
1.0      1108
Name: count, dtype: int64

In [10]:
# Outcome counts within the IOP subcohort.
IOP_subcohort_df.loc[:, 'Glaucoma (prevalent D|TD)'].value_counts()

Glaucoma (prevalent D|TD)
Control     108428
Glaucoma      3728
Name: count, dtype: int64

# Missing feature stats

In [8]:
# Show the outcome column of the IOP subcohort.
IOP_subcohort_df.loc[:, 'Glaucoma (prevalent D|TD)']

5          Control
7          Control
9          Control
20         Control
22         Control
            ...   
502406     Control
502409     Control
502410     Control
502415    Glaucoma
502416     Control
Name: Glaucoma (prevalent D|TD), Length: 112156, dtype: object

In [39]:
def _format_count_percent(count, total):
    """Return ``count`` rendered as 'N (P%)', where P = 100 * count / total.

    The thousands separator is applied only when ``count`` itself is at least
    10,000, so the same rule holds for every cell of the table (previously the
    per-group cells were formatted according to the overall missing count).
    """
    percent = (count / total) * 100
    count_text = f'{count:,}' if count >= 10000 else f'{count}'
    return f'{count_text} ({percent:0.2f}%)'


def _format_p_value(p_val):
    """Format a p-value with three decimals; values below 0.001 become '<0.001'."""
    return '<0.001' if p_val < 0.001 else f'{p_val:0.3f}'


# Per-feature missingness table for the IOP subcohort: overall, by outcome
# group, and a chi-square test of missingness vs. outcome.
outcome_column = 'Glaucoma (prevalent D|TD)'

glaucoma_df = IOP_subcohort_df[IOP_subcohort_df[outcome_column] == 'Glaucoma']
control_df = IOP_subcohort_df[IOP_subcohort_df[outcome_column] == 'Control']

total_n = len(IOP_subcohort_df)
glaucoma_n = len(glaucoma_df)
control_n = len(control_df)

# Collect one record per feature and build the frame once, instead of
# enlarging a DataFrame row by row with .loc.
missing_feature_rows = []
for feature in odsl_feature_list:
    is_missing = IOP_subcohort_df[feature].isna()
    n_missing = is_missing.sum()
    n_missing_glaucoma = glaucoma_df[feature].isna().sum()
    n_missing_control = control_df[feature].isna().sum()

    if n_missing_glaucoma == 0 and n_missing_control == 0:
        # Nothing missing in either group: no test is run and 'p' stays NaN.
        p_val = np.nan
    else:
        # Contingency table: missing (True/False) x outcome group.
        ctb = pd.crosstab(index=is_missing, columns=IOP_subcohort_df[outcome_column])
        p_val = _format_p_value(stats.chi2_contingency(ctb).pvalue)

    missing_feature_rows.append({
        'Feature': feature,
        'N missing': n_missing,  # used for sorting
        'N missing (%)': _format_count_percent(n_missing, total_n),
        'N missing, glaucoma (%)': _format_count_percent(n_missing_glaucoma, glaucoma_n),
        'N missing, control (%)': _format_count_percent(n_missing_control, control_n),
        'p': p_val,
    })

missing_feature_df = pd.DataFrame(missing_feature_rows, columns=[
    'Feature',
    'N missing',
    'N missing (%)',
    'N missing, glaucoma (%)',
    'N missing, control (%)',
    'p',
])
missing_feature_df = missing_feature_df.set_index('Feature', drop=True)

# Most-missing features first; a stable sort keeps tied features in
# odsl_feature_list order so the exported table is deterministic.
missing_feature_df = missing_feature_df.sort_values(by='N missing', axis=0, ascending=False, kind='stable')
missing_feature_df = missing_feature_df.drop(columns=['N missing'])

In [40]:
# Export the missingness table as a tab-separated file.
missing_tsv_path = './data/imputed/missing_feature_count.tsv'
missing_feature_df.to_csv(missing_tsv_path, sep='\t')

In [41]:
# Export the missingness table as an HTML file.
missing_html_path = './data/imputed/missing_feature_count.html'
missing_feature_df.to_html(missing_html_path)

In [38]:
# Preview the first 25 rows of the missingness table.
missing_feature_df.iloc[:25]

Unnamed: 0_level_0,N missing (%),"N missing, glaucoma (%)","N missing, control (%)",p
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Exercise (summed MET minutes per week),"24,704 (22.03%)","24,704 (25.00%)","24,704 (21.92%)",<0.001
Total household income,"16,339 (14.57%)","16,339 (17.68%)","16,339 (14.46%)",<0.001
Plasma oestradiol,"15,437 (13.76%)","15,437 (14.16%)","15,437 (13.75%)",0.487
Plasma glucose,"15,351 (13.69%)","15,351 (14.40%)","15,351 (13.66%)",0.203
HDL,"15,283 (13.63%)","15,283 (14.38%)","15,283 (13.60%)",0.182
Plasma albumin,"15,205 (13.56%)","15,205 (14.30%)","15,205 (13.53%)",0.187
Plasma Vitamin D,"12,980 (11.57%)","12,980 (11.96%)","12,980 (11.56%)",0.464
HbA1c,"10,086 (8.99%)","10,086 (9.82%)","10,086 (8.96%)",0.078
Plasma testosterone,9252 (8.25%),9252 (8.93%),9252 (8.23%),0.131
Snoring,8915 (7.95%),8915 (8.32%),8915 (7.94%),0.417


# Run imputation

In [7]:
# Build the miceforest imputation kernel on the training features (X_train).
# The keyword settings are collected in one place and passed through unchanged.
kernel_settings = dict(
    num_datasets=1,
    random_state=2024,
    mean_match_strategy='normal',
    mean_match_candidates=10,
    save_all_iterations_data=True,
    imputation_order='descending',
)
imputation_kernel = mf.ImputationKernel(X_train, **kernel_settings)

In [9]:
# LGBM parameters passed along with the MICE call.
# max_depth, num_leaves and learning_rate are deliberately not set here.
lgbm_parameters = {
    'n_estimators': 200,
    'max_bin': 512,
}

# Run 20 MICE iterations with progress logging enabled.
imputation_kernel.mice(iterations=20, verbose=True, **lgbm_parameters)

Initialized logger with name MICE Iterations 1 - 5 and 4 levels
1 Dataset 0
 | Exercise (summed MET minutes per week) | Total household income | Plasma oestradiol | Plasma glucose | HDL | Plasma albumin | Plasma Vitamin D | HbA1c | Plasma testosterone | Snoring | Past smoking frequency | Plasma total bilirubin | LDL | C-reactive protein | Triglycerides | Plasma urate | eGFR serum creatinine | Total cholesterol | Hearing difficulty (self-reported) | Diet score | Systemic immune inflammation index | Urinary sodium-creatinine ratio | Albumin-creatinine ratio | Polygenic risk score | Speech reception threshold | Tinnitus frequency (self-reported) | Corneal hysteresis inter-eye difference | IOPg pre-treatment inter-eye difference | Education | PM2.5 exposure | Private healthcare utilisation | Arterial stiffness index | Spherical equivalent | Urban residence | Daytime sleeping frequency | Normal sleep duration | Vitamin C supplementation | Multivitamin supplementation | Ethnicity | Poor oral

In [10]:
# Persist the imputation kernel to disk.
kernel_dump_path = './data/imputed/imputation_kernel_20iter.pkl'
joblib.dump(imputation_kernel, kernel_dump_path)

['./data/imputation/imputation_kernel_5iter.pkl']

# Save imputed data & train/test

In [12]:
# Reload the imputation kernel from disk.
# NOTE(review): joblib files are pickles; only load files from a trusted source.
imputation_kernel = joblib.load('./data/imputed/imputation_kernel_20iter.pkl')

In [18]:
# Completed training features from the kernel, re-indexed from 0.
train_completed = imputation_kernel.complete_data()
X_train_imputed = train_completed.reset_index(drop=True)

# Impute the test features with the same kernel, then re-index from 0.
test_imputation = imputation_kernel.impute_new_data(X_test)
X_test_imputed = test_imputation.complete_data().reset_index(drop=True)

# Save the imputed features.
joblib.dump(X_train_imputed, './data/imputed/IOPsubcohort_X_train_imputed.pkl')
joblib.dump(X_test_imputed, './data/imputed/IOPsubcohort_X_test_imputed.pkl')

# Save the labels with the same 0-based index as the features.
y_train_reindexed = y_train.reset_index(drop=True)
y_test_reindexed = y_test.reset_index(drop=True)
joblib.dump(y_train_reindexed, './data/imputed/IOPsubcohort_y_train.pkl')
joblib.dump(y_test_reindexed, './data/imputed/IOPsubcohort_y_test.pkl')

['./data/imputation/IOPsubcohort_y_test.pkl']

In [23]:
# Stack train then test (features and labels in the same order), with a fresh 0-based index.
X_merged_imputed = pd.concat([X_train_imputed, X_test_imputed], axis=0, ignore_index=True)
y_merged = pd.concat([y_train, y_test], axis=0, ignore_index=True)

# Save the stacked features and labels.
merged_features_path = './data/imputed/IOPsubcohort_X_merged_imputed.pkl'
merged_labels_path = './data/imputed/IOPsubcohort_y_merged.pkl'
joblib.dump(X_merged_imputed, merged_features_path)
joblib.dump(y_merged, merged_labels_path)

['./data/imputed/IOPsubcohort_y_merged.pkl']

In [19]:
# Apply scaling
#
# The min-max scaler is fitted on the imputed training features only and then
# applied to both the training and the test features.

scaler = MinMaxScaler()
scaler.fit(X_train_imputed)

# Label each scaled array with the columns and index of the frame that was
# actually transformed, so the labels cannot drift from the data if the imputed
# frame's column order ever differs from the raw X_train / X_test.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train_imputed),
    columns=X_train_imputed.columns,
    index=X_train_imputed.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_imputed),
    columns=X_test_imputed.columns,
    index=X_test_imputed.index,
)

# Persist the fitted scaler together with the scaled feature tables.
joblib.dump(scaler, './data/imputed/min_max_scaler.pkl')
joblib.dump(X_train_scaled, './data/imputed/IOPsubcohort_X_train_imputed_scaled.pkl')
joblib.dump(X_test_scaled, './data/imputed/IOPsubcohort_X_test_imputed_scaled.pkl')

['./data/imputation/IOPsubcohort_X_test_imputed_scaled.pkl']