In [1]:
pip install comet_ml -q

[K     |████████████████████████████████| 441 kB 7.7 MB/s 
[K     |████████████████████████████████| 54 kB 1.2 MB/s 
[K     |████████████████████████████████| 54 kB 1.2 MB/s 
[K     |████████████████████████████████| 168 kB 35.2 MB/s 
[K     |████████████████████████████████| 498 kB 46.3 MB/s 
[K     |████████████████████████████████| 130 kB 11.3 MB/s 
[K     |████████████████████████████████| 140 kB 14.3 MB/s 
[K     |████████████████████████████████| 127 kB 41.8 MB/s 
[K     |████████████████████████████████| 168 kB 25.8 MB/s 
[K     |████████████████████████████████| 166 kB 18.9 MB/s 
[K     |████████████████████████████████| 166 kB 9.0 MB/s 
[K     |████████████████████████████████| 162 kB 4.6 MB/s 
[K     |████████████████████████████████| 162 kB 10.6 MB/s 
[K     |████████████████████████████████| 158 kB 20.2 MB/s 
[K     |████████████████████████████████| 157 kB 2.8 MB/s 
[K     |████████████████████████████████| 157 kB 5.3 MB/s 
[K     |████████████████████████

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42

# Load data
data_df = pd.read_csv('/content/drive/MyDrive/MMD6020_Final_Project/data/processed/chbmit_preprocessed_data.csv') 

# Separate X and y
y = data_df['Outcome']
X = data_df.drop(['Outcome'], axis=1)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, stratify=y, random_state=RANDOM_SEED)

# AdaBoost

In [7]:
import os

os.environ['COMET_API_KEY'] = "gNf7yNnQQzusQKP74RiFgyQX8"

In [None]:
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.decomposition import PCA

from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, chi2

from imblearn.pipeline import Pipeline

from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

from comet_ml import Experiment
from comet_ml import Optimizer
from comet_ml import API

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

def run_search(experiment, model, X, y, cv):
  # fit the model on the whole dataset
  results = cross_validate(
      model, X, y, cv=cv, 
      scoring=[
          "accuracy",
          "precision_macro", 
          "recall_macro", 
          "f1_macro", 
          "roc_auc",
      ], return_train_score=True)

  for k in results.keys():
    scores = results[k]
    for idx, score in enumerate(scores):
      experiment.log_metrics({f"cv_{k}": score}, step=idx)

    experiment.log_metrics({f"cv_mean_{k}": np.mean(scores)})
    experiment.log_metrics({f"cv_std_{k}": np.std(scores)})

    experiment.log_parameter("random_state", RANDOM_SEED)
    
def HyperParametersTuning(project_name, X_train, y_train):

    # setting the spec for bayes algorithm
    spec = {
        "objective": "minimize",
        "metric": "loss",
        "seed": RANDOM_SEED
    }

    # setting the parameters we are tuning
    model_params = {
        "n_estimators": {
            "type": "integer",
            "scaling_type": "uniform",
            "min": 20,
            "max": 100
        },
        "learning_rate": {
            "type": "discrete",
            "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        },
    }


    # defining the configuration dictionary
    config_dict = {
        "algorithm": "bayes",
        "spec": spec, 
        "parameters": model_params,
        "name": "Bayes Optimization", 
        "trials": 3
    }

    cv = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True) # use 5-fold stratified cv

    # initializing the comet ml optimizer
    opt = Optimizer(
        api_key=os.environ.get('COMET_API_KEY'), # create an env var called 'COMET_API_KEY' containing the API key
        config=config_dict,
        project_name=project_name, # change name to model-selector
        workspace="mmd6020-projet-pratique")

   
    for experiment in opt.get_experiments():

        n_estimators   = experiment.get_parameter("n_estimators")
        learning_rate  = experiment.get_parameter("learning_rate")

        selector = PCA(n_components=12) # change selector for feature selection

        clf_adaboost = AdaBoostClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=RANDOM_SEED)

        # Pipeline
        steps = [('selector', selector), ("clf_adaboost", clf_adaboost)]
        pipeline = Pipeline(steps=steps)

        run_search(experiment, pipeline, X_train, y_train, cv)

        pipeline.fit(X_train, y_train)
        
        experiment.log_parameter("random_state", RANDOM_SEED)
        experiment.end()
  
HyperParametersTuning("adaboost-pca", X_train, y_train) 

COMET INFO: COMET_OPTIMIZER_ID=0ff1602f729f4331a051051397a2cb7b
COMET INFO: Using optimizer config: {'algorithm': 'bayes', 'configSpaceSize': 'infinite', 'endTime': None, 'id': '0ff1602f729f4331a051051397a2cb7b', 'lastUpdateTime': None, 'maxCombo': 0, 'name': 'Bayes Optimization', 'parameters': {'learning_rate': {'type': 'discrete', 'values': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}, 'n_estimators': {'max': 100, 'min': 20, 'scalingType': 'uniform', 'scaling_type': 'uniform', 'type': 'integer'}}, 'predictor': None, 'spec': {'gridSize': 10, 'maxCombo': 0, 'metric': 'loss', 'minSampleSize': 100, 'objective': 'minimize', 'retryAssignLimit': 0, 'retryLimit': 1000, 'seed': 42}, 'startTime': 58264035241, 'state': {'mode': None, 'seed': None, 'sequence': [], 'sequence_i': 0, 'sequence_pid': None, 'sequence_retry': 0, 'sequence_retry_count': 0}, 'status': 'running', 'suggestion_count': 0, 'trials': 3, 'version': '2.0.1'}
COMET INFO: Optimizer metrics is 'loss' but no logged values found. 

# Logistic Regression

# Random Forest

# KNN

# XGBoost