In [1]:
pip install comet_ml -q

[K     |████████████████████████████████| 441 kB 7.6 MB/s 
[K     |████████████████████████████████| 54 kB 1.2 MB/s 
[K     |████████████████████████████████| 130 kB 13.4 MB/s 
[K     |████████████████████████████████| 168 kB 44.2 MB/s 
[K     |████████████████████████████████| 54 kB 686 kB/s 
[K     |████████████████████████████████| 498 kB 30.3 MB/s 
[K     |████████████████████████████████| 140 kB 39.1 MB/s 
[K     |████████████████████████████████| 127 kB 22.4 MB/s 
[K     |████████████████████████████████| 168 kB 29.9 MB/s 
[K     |████████████████████████████████| 166 kB 31.7 MB/s 
[K     |████████████████████████████████| 166 kB 22.0 MB/s 
[K     |████████████████████████████████| 162 kB 14.3 MB/s 
[K     |████████████████████████████████| 162 kB 51.2 MB/s 
[K     |████████████████████████████████| 158 kB 41.2 MB/s 
[K     |████████████████████████████████| 157 kB 9.9 MB/s 
[K     |████████████████████████████████| 157 kB 17.0 MB/s 
[K     |█████████████████████

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42

# Load data
data_df = pd.read_csv('/content/drive/MyDrive/MMD6020_Final_Project/data/processed/chbmit_preprocessed_data.csv') 

# Separate X and y
y = data_df['Outcome']
X = data_df.drop(['Outcome'], axis=1)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, stratify=y, random_state=RANDOM_SEED)

In [4]:
import os
from getpass import getpass

API_KEY = getpass('Enter Comet. ml API key:')

os.environ['COMET_API_KEY'] = API_KEY

Enter Comet. ml API key:··········


# AdaBoost

In [5]:
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.decomposition import PCA

from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, chi2

from imblearn.pipeline import Pipeline

from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

from comet_ml import Experiment
from comet_ml import Optimizer
from comet_ml import API

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

def run_search(experiment, model, X, y, cv):
  # fit the model on the whole dataset
  results = cross_validate(
      model, X, y, cv=cv, 
      scoring=[
          "accuracy",
          "precision_macro", 
          "recall_macro", 
          "f1_macro", 
          "roc_auc",
      ], return_train_score=True)

  for k in results.keys():
    scores = results[k]
    for idx, score in enumerate(scores):
      experiment.log_metrics({f"cv_{k}": score}, step=idx)

    experiment.log_metrics({f"cv_mean_{k}": np.mean(scores)})
    experiment.log_metrics({f"cv_std_{k}": np.std(scores)})

    experiment.log_parameter("random_state", RANDOM_SEED)
    
def HyperParametersTuning(project_name, X_train, y_train):

    # setting the spec for bayes algorithm
    spec = {
        "objective": "minimize",
        "metric": "loss",
        "seed": RANDOM_SEED
    }

    # setting the parameters we are tuning
    model_params = {
        "n_estimators": {
            "type": "integer",
            "scaling_type": "uniform",
            "min": 20,
            "max": 100
        },
        "learning_rate": {
            "type": "discrete",
            "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        },
    }


    # defining the configuration dictionary
    config_dict = {
        "algorithm": "bayes",
        "spec": spec, 
        "parameters": model_params,
        "name": "Bayes Optimization", 
        "trials": 3
    }

    cv = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True) # use 5-fold stratified cv

    # initializing the comet ml optimizer
    opt = Optimizer(
        api_key=os.environ.get('COMET_API_KEY'), # create an env var called 'COMET_API_KEY' containing the API key
        config=config_dict,
        project_name=project_name, # change name to model-selector
        workspace="mmd6020-projet-pratique")

   
    for experiment in opt.get_experiments():

        n_estimators   = experiment.get_parameter("n_estimators")
        learning_rate  = experiment.get_parameter("learning_rate")

        selector = PCA(n_components=12) # change selector for feature selection

        clf_adaboost = AdaBoostClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=RANDOM_SEED)

        # Pipeline
        steps = [('selector', selector), ("clf_adaboost", clf_adaboost)]
        pipeline = Pipeline(steps=steps)

        run_search(experiment, pipeline, X_train, y_train, cv)

        pipeline.fit(X_train, y_train)
        
        experiment.log_parameter("random_state", RANDOM_SEED)
        experiment.end()
  
HyperParametersTuning("adaboost-pca", X_train, y_train) 

COMET INFO: COMET_OPTIMIZER_ID=4540cbed00334cc893959e28a070cc73
COMET INFO: Using optimizer config: {'algorithm': 'bayes', 'configSpaceSize': 'infinite', 'endTime': None, 'id': '4540cbed00334cc893959e28a070cc73', 'lastUpdateTime': None, 'maxCombo': 0, 'name': 'Bayes Optimization', 'parameters': {'learning_rate': {'type': 'discrete', 'values': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}, 'n_estimators': {'max': 100, 'min': 20, 'scalingType': 'uniform', 'scaling_type': 'uniform', 'type': 'integer'}}, 'predictor': None, 'spec': {'gridSize': 10, 'maxCombo': 0, 'metric': 'loss', 'minSampleSize': 100, 'objective': 'minimize', 'retryAssignLimit': 0, 'retryLimit': 1000, 'seed': 42}, 'startTime': 12946187475, 'state': {'mode': None, 'seed': None, 'sequence': [], 'sequence_i': 0, 'sequence_pid': None, 'sequence_retry': 0, 'sequence_retry_count': 0}, 'status': 'running', 'suggestion_count': 0, 'trials': 3, 'version': '2.0.1'}
COMET ERROR: Failed to calculate active processors count. Fall back 

KeyboardInterrupt: ignored

# Logistic Regression

In [None]:
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2

from imblearn.pipeline import Pipeline

from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

from comet_ml import Experiment
from comet_ml import Optimizer
from comet_ml import API

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

def run_search(experiment, model, X, y, cv):
  # fit the model on the whole dataset
  results = cross_validate(
      model, X, y, cv=cv, 
      scoring=[
          "accuracy",
          "precision_macro", 
          "recall_macro", 
          "f1_macro", 
          "roc_auc",
      ], return_train_score=True)

  for k in results.keys():
    scores = results[k]
    for idx, score in enumerate(scores):
      experiment.log_metrics({f"cv_{k}": score}, step=idx)

    experiment.log_metrics({f"cv_mean_{k}": np.mean(scores)})
    experiment.log_metrics({f"cv_std_{k}": np.std(scores)})

    experiment.log_parameter("random_state", RANDOM_SEED)
    
def HyperParametersTuning(project_name, X_train, y_train):

    # setting the spec for bayes algorithm
    spec = {
        "objective": "minimize",
        "metric": "loss",
        "seed": RANDOM_SEED
    }

    model_params = {
        "penalty": {
            "type": "categorical",
            "values": ["l1", "l2", "elasticnet", "none"]
        },
        # "solver": {
        #     "type": "categorical",
        #     "values": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
        # },
        "C": {
            "type": "discrete",
            "values": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50, 100]
        },
        "max_iter": {
            "type": "integer",
            "scaling_type": "uniform",
            "min": 100,
            "max": 1000
        },
        "l1_ratio": {
            "type": "discrete",
            "values": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
        },
    }


    # defining the configuration dictionary
    config_dict = {
        "algorithm": "bayes",
        "spec": spec, 
        "parameters": model_params,
        "name": "Bayes Optimization", 
        "trials": 3
    }

    cv = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True) # use 5-fold stratified cv

    # initializing the comet ml optimizer
    opt = Optimizer(
        api_key=os.environ.get('COMET_API_KEY'), # create an env var called 'COMET_API_KEY' containing the API key
        config=config_dict,
        project_name=project_name, # change name to model-selector
        workspace="mmd6020-projet-pratique")

   
    for experiment in opt.get_experiments():

        penalty      = experiment.get_parameter("penalty")
        # solver       = experiment.get_parameter("solver")
        C            = experiment.get_parameter("C")
        max_iter     = experiment.get_parameter("max_iter")
        l1_ratio     = experiment.get_parameter("l1_ratio")
        
        selector = PCA(n_components=12) # change selector for feature selection

        clf_logreg = LogisticRegression(
            penalty=penalty,
            solver='saga',
            C=C,
            max_iter=max_iter,
            class_weight=None,
            l1_ratio=l1_ratio,
            random_state=RANDOM_SEED)


        # Pipeline
        steps = [('selector', selector), ("logreg", clf_logreg)]
        pipeline = Pipeline(steps=steps)

        run_search(experiment, pipeline, X_train, y_train, cv)

        pipeline.fit(X_train, y_train)
        
        experiment.log_parameter("random_state", RANDOM_SEED)
        experiment.end()
  
HyperParametersTuning("logreg-pca", X_train, y_train) 

COMET INFO: COMET_OPTIMIZER_ID=d79e0b9cf9aa44a9ac4b9d448e30db4a
COMET INFO: Using optimizer config: {'algorithm': 'bayes', 'configSpaceSize': 'infinite', 'endTime': None, 'id': 'd79e0b9cf9aa44a9ac4b9d448e30db4a', 'lastUpdateTime': None, 'maxCombo': 0, 'name': 'Bayes Optimization', 'parameters': {'C': {'type': 'discrete', 'values': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50, 100]}, 'l1_ratio': {'type': 'discrete', 'values': [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]}, 'max_iter': {'max': 1000, 'min': 100, 'scalingType': 'uniform', 'scaling_type': 'uniform', 'type': 'integer'}, 'penalty': {'type': 'categorical', 'values': ['l1', 'l2', 'elasticnet', 'none']}}, 'predictor': None, 'spec': {'gridSize': 10, 'maxCombo': 0, 'metric': 'loss', 'minSampleSize': 100, 'objective': 'minimize', 'retryAssignLimit': 0, 'retryLimit': 1000, 'seed': 42}, 'startTime': 58266295606, 'state': {'mode': None, 'seed': None, 'sequence': [], 'sequence_i': 0, 'sequence_pid': None, 'sequence_retry': 0, 'sequence_retry_count': 0}, '

# Random Forest

# KNN

# XGBoost

In [None]:
# Use "gpu_hist" for training the model.
classi = xgb.XGBClassifier(tree_method="gpu_hist")
# Fit the model using predictor X and response y.
model_xgb = classi.fit(X_train, y_train)
print('Accuracy on training data: ', round(model_xgb.score(X_train,y_train),2))
result_xgb = model_xgb.score(X_test,y_test)
print('Accuracy on testing data: ', round(result_xgb*100,2))

In [None]:
from sklearn.model_selection import RandomizedSearchCV

tuned_parameters = {
  'gamma': np.linspace(0, 2, num=50),
  'max_depth': np.linspace(0, 12, num=13, dtype=int),
    'max_delta_step': np.linspace(0,10, num=11),
     'scale_pos_weight': [0.1, 1.0]
  }

cv = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True)

random_search = RandomizedSearchCV(
    xgb.XGBClassifier(tree_method="gpu_hist"), 
    tuned_parameters, 
    n_iter=150, 
    scoring='f1_macro', 
    cv=cv,
)
random_search.fit(X_train, y_train);
print('Finished!')

In [None]:
print("Meilleur ensemble de paramètres trouvé")
print()
print(random_search.best_params_)
print(random_search.best_score_)

In [None]:
# Use "gpu_hist" for training the model.
classi = xgb.XGBClassifier(tree_method="gpu_hist", scale_pos_weight = random_search.best_params_['scale_pos_weight'], max_depth = random_search.best_params_['max_depth'], max_delta_step = random_search.best_params_['max_delta_step'], gamma = random_search.best_params_['gamma'])
# Fit the model using predictor X and response y.
model_xgb = classi.fit(X_train, y_train)
print('Accuracy on training data: ', round(model_xgb.score(X_train,y_train),2))
result_xgb = model_xgb.score(X_test,y_test)
print('Accuracy on testing data: ', round(result_xgb*100,2))

In [None]:
from sklearn.calibration import CalibrationDisplay
disp_test = CalibrationDisplay.from_estimator(model_xgb,X_test, y_test, n_bins = 105)
disp_test.plot(ax=axes[1,1])
axes[1,1].set_title("Calibration curve")

# Plot ROC

In [None]:
from sklearn.metrics import roc_curve, auc

colors = ['red', 'blue', 'green', 'yellow', 'orange']

def plot_ROC(classifiers_tuple, plot_name, add_random=True):

    plt.figure(figsize=(8, 8))

    for count, classifier in enumerate(classifiers_tuple):

        
        clf = classifier[0]
        clf_name = classifier[1]
        X = classifier[2]
        y = classifier[3]

        y_pred = clf.predict_proba(X)[:,1]
        fpr, tpr, _ = roc_curve(y.ravel(), y_pred.ravel())
        roc_auc = auc(fpr, tpr)

        plt.plot(fpr, tpr, color=colors[count], label=f"{clf_name}: AUC = %0.2f" % roc_auc)
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curves")

    if add_random:
        plt.plot([0, 1], [0, 1], color="black", label='Random Uniform (AUC = 0.5)', linestyle="--")
    
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.legend(loc="lower right")

In [None]:
classifiers = [(clf, 'AdaBoost', X_val, y_val),
               (clf3, 'Logistic Regression', X_val, y_val),
               (clf4, 'Random Forest', X_val, y_val),
               
               (model_xgb, 'XGBoost', X_val, y_val)]
plot_ROC(classifiers, plot_name='Testing', add_random=True)