In [None]:
import pandas as pd
import optuna
from optuna import pruners
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.samplers import TPESampler

import imblearn
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.combine import SMOTEENN

import sklearn
import xgboost as xgb
import lightgbm as lgb

import numpy as np
import matplotlib.pyplot as plt
import importlib
from joblib import dump, load
import os
import math
from functools import reduce

import torch
import torch.nn as nn
from torch.nn import ReLU
import random

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector

import sys
sys.path.append('../')

import model_util
importlib.reload(model_util)
from model_util import get_scoring_metrics

import optuna_util
importlib.reload(optuna_util)
from optuna_util import run_optuna_studies

import feature_sets
importlib.reload(feature_sets)

import neural_net
importlib.reload(neural_net)
from neural_net import MLP

from sklearnex import patch_sklearn
patch_sklearn()

# Load data

- Use min-max scaled X_train and X_test for all models for consistency, even if not required
- Use miceforest imputed data for all models (same reasoning)

In [None]:
# Training design matrix and labels, read back from joblib pickles.
# NOTE(review): joblib.load unpickles arbitrary objects - only use with trusted, locally produced files.
x_train_pickle_path = '../data/imputed/IOPsubcohort_X_train_imputed_scaled.pkl'
y_train_pickle_path = '../data/imputed/IOPsubcohort_y_train.pkl'

X_train_imputed_scaled = load(x_train_pickle_path)
y_train = load(y_train_pickle_path)

In [None]:
# Feature tables (defined in feature_sets) for every model variant to be tuned.
# Each table lists its feature names in a 'feature' column.
tuning_feature_tables = {
    'ophthalmic': feature_sets.ophthalmic_features,
    'demographic': feature_sets.demographic_features,
    'systemic': feature_sets.systemic_features,
    'lifestyle': feature_sets.lifestyle_features,

    'OD': feature_sets.OD_features,
    'SL': feature_sets.SL_features,
    'ODSL': feature_sets.ODSL_features,
    'ODS': feature_sets.ODS_features,
    'DSL': feature_sets.DSL_features,  # Primary-care focused
}

# Model name -> array of feature names; passed as `feature_dict` to the tuning runs below.
model_feature_dict = {
    model_name: feature_table['feature'].values
    for model_name, feature_table in tuning_feature_tables.items()
}

# Run hyperparameter studies

- For each model type, `run_optuna_studies` tunes hyperparameters separately for each individual model, i.e. one study per feature set
- 100 trials for each

In [None]:
# Shared settings for the run_optuna_studies calls below.
n_trials = 100  # Optuna trials per study (the first MLP run below hard-codes 50 instead)
n_cv_folds = 5  # forwarded to each run as `n_cv_folds`
scoring_metric = 'roc_auc'  # forwarded to each run as `scoring_metric`

### LightGBM

In [None]:
# LightGBM: one Optuna study per entry of `model_feature_dict`.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

### XGBoost

In [None]:
# XGBoost: one Optuna study per entry of `model_feature_dict`.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

### Random forest

In [None]:
%%time

# Random forest: one Optuna study per entry of `model_feature_dict`.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

### KNN

In [None]:
%%time

# k-nearest neighbours: one Optuna study per entry of `model_feature_dict`.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

### SVM

In [None]:
%%time

# Support vector classifier: one Optuna study per entry of `model_feature_dict`.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

### Logistic regression (SGD)

In [None]:
%%time

# Logistic regression fitted by SGD: one Optuna study per entry of `model_feature_dict`.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

### MLP

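- Note: the first run below uses only 50 trials per model (not enough compute); these `mlp_50trials` studies are the ones loaded in the refit section. A separate 100-trial run follows it.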

In [None]:
%%time

# 50 trials

# MLP, reduced budget: 50 trials per feature set instead of the shared `n_trials`.
mlp_studies = run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict=model_feature_dict,
    n_trials=50,  # deliberately lower than n_trials for the MLP
    objective_class=optuna_util.MLP_OptunaObjective,
    save_dir='./optuna_results/mlp_50trials',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# %%time

# 100 trials

# MLP, full budget: uses the shared `n_trials` and a separate output folder.
# Note: this rebinds `mlp_studies`, replacing the result of the 50-trial run.
mlp_studies = run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict=model_feature_dict,
    n_trials=n_trials,  # shared trial budget (not the reduced 50-trial setting)
    objective_class=optuna_util.MLP_OptunaObjective,
    save_dir='./optuna_results/mlp',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

### Minimal model by RFECV with LightGBM

In [None]:
# Recursive feature elimination with cross-validation over the ODSL feature set.
# LightGBM with strong L2 regularisation is the ranking estimator, chosen to
# deal with collinearity between features.
lgbm_rfe_params = {
    'n_jobs': -1,
    'bagging_freq': 1,
    'force_row_wise': True,
    'bagging_seed': 2024,
    'verbosity': -100,
    'extra_trees': False,

    'n_estimators': 500,
    'importance_type': 'gain',
    'lambda_l2': 1000,
}
rfe_estimator = lgb.LGBMClassifier(**lgbm_rfe_params)

# Seeded, shuffled 5-fold stratified split so the elimination is repeatable.
rfe_cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=2024)

rfe_obj = RFECV(
    estimator=rfe_estimator,
    scoring='roc_auc',
    importance_getter='feature_importances_',
    cv=rfe_cv_splitter,
    n_jobs=1,
    step=1,  # drop one feature per elimination round
    verbose=100,
)

odsl_feature_names = feature_sets.ODSL_features['feature'].values
rfe_obj.fit(X_train_imputed_scaled[odsl_feature_names], y_train)

In [None]:
# Persist the fitted RFECV object next to the notebook so the selection can be reloaded.
rfecv_pickle_path = './rfecv_fitted.pkl'
dump(rfe_obj, rfecv_pickle_path)

In [None]:
# ODSL feature names retained by RFECV (`support_` is used as a mask over the names).
# NOTE(review): the studies below read feature_sets.minimal_features_rfecv rather than
# this variable - presumably that table was written from this output; confirm they match.
odsl_feature_names = feature_sets.ODSL_features['feature'].values
minimal_features = odsl_feature_names[rfe_obj.support_]
minimal_features

In [None]:
# LightGBM on the minimal (RFECV-selected) feature set, with a larger budget of
# 1000 trials. Uses the same save_dir as the other LightGBM studies.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict={
        'minimal_features_rfecv': feature_sets.minimal_features_rfecv['feature'].values,
    },
    n_trials=1000,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

### Testing class imbalance with LGBM minimal model

In [None]:
# SMOTE 1:3 (ratio as stated by the author; it is not set in this cell).
# LightGBM on the minimal feature set, resampling with SMOTE.
# To also tune the full ODSL set, add
# 'ODSL': feature_sets.ODSL_features['feature'].values to feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict={
        'minimal_features_rfecv': feature_sets.minimal_features_rfecv['feature'].values,
    },
    n_trials=100,
    objective_class=optuna_util.LGBM_OptunaObjective_SMOTE,
    save_dir='./optuna_results/imbalanced_lightgbm_SMOTE',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
    sampler=SMOTE,  # the sampler class itself is passed, not an instance
)

In [None]:
# Random oversampling 1:3 (ratio as stated by the author; it is not set in this cell).
# LightGBM on the minimal feature set, resampling with RandomOverSampler.
# To also tune the full ODSL set, add
# 'ODSL': feature_sets.ODSL_features['feature'].values to feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict={
        'minimal_features_rfecv': feature_sets.minimal_features_rfecv['feature'].values,
    },
    n_trials=100,
    objective_class=optuna_util.LGBM_OptunaObjective_RandomOverUnderSampler,
    save_dir='./optuna_results/imbalanced_lightgbm_RandomOverSampler',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
    sampler=RandomOverSampler,  # the sampler class itself is passed, not an instance
)

In [None]:
# Random undersampling 1:3 (ratio as stated by the author; it is not set in this cell).
# LightGBM on the minimal feature set, resampling with RandomUnderSampler.
# To also tune the full ODSL set, add
# 'ODSL': feature_sets.ODSL_features['feature'].values to feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled,
    y=y_train,
    feature_dict={
        'minimal_features_rfecv': feature_sets.minimal_features_rfecv['feature'].values,
    },
    n_trials=100,
    objective_class=optuna_util.LGBM_OptunaObjective_RandomOverUnderSampler,
    save_dir='./optuna_results/imbalanced_lightgbm_RandomUnderSampler',
    # Remaining keyword arguments are forwarded to the study/objective class.
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
    sampler=RandomUnderSampler,  # the sampler class itself is passed, not an instance
)

# Refit best models on entire train set

In [None]:
# Root folders: saved Optuna studies are read from the first, refitted models
# and their parameter files are written under the second.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Algorithm key -> estimator class. Each key doubles as the sub-folder of
# `optuna_results_dir` from which that algorithm's studies are loaded, and the
# insertion order is reused later to order the hyperparameter table.
# NOTE(review): only `import sklearn` plus two submodules are imported explicitly in
# this notebook; sklearn.linear_model / svm / neighbors / ensemble are presumably
# loaded as a side effect of other imports - consider importing them explicitly.
algorithms = dict(
    logistic_regression_sgd=sklearn.linear_model.SGDClassifier,
    svm=sklearn.svm.SVC,
    knn=sklearn.neighbors.KNeighborsClassifier,
    randomforest=sklearn.ensemble.RandomForestClassifier,
    xgboost=xgb.XGBClassifier,
    lightgbm=lgb.LGBMClassifier,
    mlp_50trials=MLP,
)

In [None]:
# Feature tables for the models that are refitted and reported below.
# Note: this rebinds `model_feature_dict` (different order, and without 'ODS').
# Not included here: feature_sets.OSL_features ('OSL') and
# feature_sets.minimal_features_rfecv ('minimal_features_rfecv').
refit_feature_tables = {
    'ophthalmic': feature_sets.ophthalmic_features,
    'demographic': feature_sets.demographic_features,
    'systemic': feature_sets.systemic_features,
    'lifestyle': feature_sets.lifestyle_features,

    'ODSL': feature_sets.ODSL_features,
    'DSL': feature_sets.DSL_features,  # Primary-care focused
    'OD': feature_sets.OD_features,
    'SL': feature_sets.SL_features,
}

# Model name -> array of feature names used to select columns of the training data.
model_feature_dict = {
    model_name: feature_table['feature'].values
    for model_name, feature_table in refit_feature_tables.items()
}

In [None]:
# Sanity check: list the algorithm keys that will be refitted.
for algorithm in algorithms.keys():
    print(algorithm)

In [None]:
# Refit every (feature set, algorithm) pair on the entire training set using the
# best hyperparameters found by Optuna. For each pair this writes:
#   <fitted_models_dir>/<model_name>/<algorithm>_best_params.txt  (one "key: value" per line)
#   <fitted_models_dir>/<model_name>/<algorithm>.pkl              (fitted estimator)

for model_name, feature_set in model_feature_dict.items():
    model_save_dir = os.path.join(fitted_models_dir, model_name)
    # exist_ok=True keeps the cell re-runnable without a separate existence check.
    os.makedirs(model_save_dir, exist_ok=True)

    X = X_train_imputed_scaled[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'Fitting {model_name} {algorithm}')

        study_path = os.path.join(
            optuna_results_dir, algorithm, model_name, f'optuna_study_{model_name}.pkl'
        )
        # NOTE(review): joblib.load unpickles arbitrary objects - only load trusted study files.
        study = load(study_path)

        # The constructor arguments are read from the trial's 'all_params' user
        # attribute (not from trial.params) and passed to the estimator unchanged.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt
        with open(os.path.join(model_save_dir, f'{algorithm}_best_params.txt'), 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train)
        dump(estimator, os.path.join(model_save_dir, f'{algorithm}.pkl'))

In [None]:
# Build one table per feature set listing the best hyperparameters that Optuna
# found for every algorithm. Each table has the columns
# ['Algorithm', 'Hyperparameter', <model_name>]; they are collected in
# `feature_set_dfs` for merging.

feature_set_dfs = []

for model_name in model_feature_dict:
    hyperparam_rows = []

    for algorithm in algorithms:
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        # Report trial.params here (the refit cell reads user_attrs['all_params'] instead).
        best_params = study.best_trial.params

        for param_name, param_value in best_params.items():
            if isinstance(param_value, float):
                # Round floats to 4 significant figures for a compact table.
                param_text = '%s' % float('%.4g' % param_value)
            else:
                # Strings, ints, bools and None are written verbatim: pushing them
                # through '%.4g' would turn True into '1.0', truncate large ints
                # and raise TypeError on None.
                param_text = str(param_value)
            hyperparam_rows.append([algorithm, param_name, param_text])

    # Build the frame once from the collected rows instead of growing it row by row.
    feature_set_hyperparam_df = pd.DataFrame(
        hyperparam_rows, columns=['Algorithm', 'Hyperparameter', model_name]
    )
    feature_set_dfs.append(feature_set_hyperparam_df)

In [None]:
def _merge_hyperparam_tables(left, right):
    """Outer-join two per-feature-set tables on (Algorithm, Hyperparameter)."""
    return pd.merge(left, right, on=['Algorithm', 'Hyperparameter'], how='outer')


# Fold all per-feature-set tables into one wide table: one column per feature set.
combo_df = reduce(_merge_hyperparam_tables, feature_set_dfs)

In [None]:
# Index the combined table by algorithm and order the rows like `algorithms`.
# The guard makes the cell safe to re-run (after the first run 'Algorithm' is
# already the index), and the non-inplace set_index avoids mutating the source
# table when `combo_df` is the very same object as feature_set_dfs[0].
if 'Algorithm' in combo_df.columns:
    combo_df = combo_df.set_index('Algorithm')
combo_df = combo_df.loc[list(algorithms.keys())]

In [None]:
# Write the combined table as a tab-separated file; the Algorithm index is kept.
combo_df.to_csv(
    './optuna_results/best_hyperparameter_results.tsv',
    sep='\t',
    index=True,
)

In [None]:
# Display the combined best-hyperparameter table.
combo_df

In [None]:
# Final model: best hyperparameters of the LightGBM study on the minimal (RFECV) feature set

In [None]:
# Best-hyperparameter table for the final model: the LightGBM study run on the
# 'minimal_features_rfecv' feature set.
# Note: this rebinds `feature_set_dfs`, discarding the per-feature-set tables above.

feature_set_dfs = []

model_name = 'minimal_features_rfecv'
algorithm = 'lightgbm'

model_save_dir = f'{fitted_models_dir}/{model_name}'

study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
study = load(study_dir)
# Report trial.params here (the refit cell reads user_attrs['all_params'] instead).
best_params = study.best_trial.params

hyperparam_rows = []
for param_name, param_value in best_params.items():
    if isinstance(param_value, float):
        # Round floats to 4 significant figures for a compact table.
        param_text = '%s' % float('%.4g' % param_value)
    else:
        # Strings, ints, bools and None are written verbatim: pushing them through
        # '%.4g' would turn True into '1.0', truncate large ints and raise
        # TypeError on None.
        param_text = str(param_value)
    hyperparam_rows.append([algorithm, param_name, param_text])

# Build the frame once from the collected rows instead of growing it row by row.
feature_set_hyperparam_df = pd.DataFrame(
    hyperparam_rows, columns=['Algorithm', 'Hyperparameter', model_name]
)

feature_set_dfs.append(feature_set_hyperparam_df)

In [None]:
# Display the final-model hyperparameter table.
feature_set_hyperparam_df

In [None]:
# Write the final-model table as a tab-separated file (row index included).
feature_set_hyperparam_df.to_csv(
    './optuna_results/final_model_hyperparameter_results.tsv',
    sep='\t',
    index=True,
)