# Step 4: Create models with SD

In [1]:
# importing all packages needed in this section
import os
import sys 
import pandas
import cudf as pd

from sklearn.metrics import (classification_report, 
                             roc_auc_score, 
                             matthews_corrcoef,
                             cohen_kappa_score)

from cuml.model_selection import train_test_split, StratifiedKFold


# utility functions for the experiment
sys.path.append('../src')

from mlflow_manager import MLFlowManager
from tuning_grids import Grids
from utils import getPicklesFromDir, getExperimentConfig, translate_model_name
from gpuclassification import GPUClassifierPipeline, GPUModels, opt_tune_model

# Load the global experiment configuration through the project helper and keep
# its 'folders' entry at hand; the cells below index `folders` for every path
# they need ('settings_dir', 'real_dir', 'model_perf_filepath').
config = getExperimentConfig()
folders = config['folders']
# Collect the settings objects of the datasets prepared beforehand.
# Later cells iterate over this collection and index each item by 'meta' and
# 'setup_param' (presumably one unpickled dict per dataset - TODO confirm
# against utils.getPicklesFromDir).
dataset_settings = getPicklesFromDir(folders['settings_dir'])

ModuleNotFoundError: No module named 'cudf'

Create the dataset used to save the model performance. Initially, MLflow was going to be used for this.
However, a bug surfaced when Google Colab was used: it got stuck in an endless loop
trying to read the logs via the Colab cell. Hence this implementation.
```
Columns:
    Dataset id: str
        the id of the dataset that the model was evaluated on.
    model: str
        the shortened model name/id (e.g. lr = logistic regression, rf = random forest, etc.)
    F1, Accuracy, AUC, MCC, Kappa: float
        performance metrics from evaluating the model on the hold-out data.
    Params: dict
        the hyperparameters for the model.
    Tuned on: str
        whether the hyperparameters come from tuning on original or synthetic data
    Trained on: str
        the type of data that the model was trained on, "original" or "synthetic"
    Quality: str
        if synthetic, the quality id of the generator
    SDG: str
        the synthetic data generator id.
    Dataset type: str
        whether the dataset that the model was trained on is "original" or "synthetic"
    USI: str
        Unique Settings Identifier, a unique string generated by pycaret setup at each initialization
```

In [None]:
"""
Create the dataset used to save the model performance. Initially, MLflow was going to be used for this.
However, a bug surfaced when Google Colab was used: it got stuck in an endless loop
trying to read the logs via the Colab cell. Hence this implementation.

Columns:
    Dataset id: str
        the id of the dataset that the model was evaluated on.
    model: str
        the shortened model name/id (e.g. lr = logistic regression, rf = random forest, etc.)
    F1, Accuracy, AUC, MCC, Kappa: float
        performance metrics from evaluating the model on the hold-out data.
    Params: dict
        the hyperparameters for the model.
    Tuned on: str
        whether the hyperparameters come from tuning on original or synthetic data
    Trained on: str
        the type of data that the model was trained on, "original" or "synthetic"
    Quality: str
        if synthetic, the quality id of the generator
    SDG: str
        the synthetic data generator id.
    Dataset type: str
        whether the dataset that the model was trained on is "original" or "synthetic"
    USI: str
        Unique Settings Identifier, a unique string generated by pycaret setup at each initialization
    
    
"""

# Column layout of the performance table; one row is stored per evaluated model.
columns = ["Dataset id", "model", "F1", "Accuracy", "AUC", "MCC", "Kappa", "Params", "Tuned on", "Trained on", "USI", "Quality", "SDG"]

# The table is deliberately kept on the host with pandas instead of cuDF
# (aliased as `pd` in this notebook): it is a small bookkeeping table whose
# "Params" column holds Python dicts, and cuDF has no object dtype for
# arbitrary Python objects.
# Resume from a previous run if the CSV already exists, else start empty.
if os.path.isfile(folders['model_perf_filepath']):
    model_performance_df = pandas.read_csv(folders['model_perf_filepath'])
else:
    model_performance_df = pandas.DataFrame(columns=columns)

# Placeholder for the row describing the most recently evaluated model;
# it is overwritten inside the training loop.
performance_row = {}

In [None]:
def _positive_class_scores(y_prob):
    """Return 1-D, host-side scores for the positive class of a binary task.

    ``sklearn.metrics.roc_auc_score`` expects, for binary targets, a 1-D array
    with the score of the class with the greater label, living in host memory.

    Parameters
    ----------
    y_prob : array-like
        Output of ``predict_proba``. Its concrete type is not visible from this
        notebook, so cuDF objects (``to_pandas``), CuPy-style arrays (``get``)
        and host pandas/NumPy objects are all accepted.

    Returns
    -------
    array-like
        ``y_prob`` moved to the host. If it was 2-D only its last column is
        returned (assumed to be the class with the greater label - TODO
        confirm the column order of GPUClassifierPipeline.predict_proba).
    """
    if hasattr(y_prob, 'to_pandas'):
        # cuDF DataFrame / Series
        y_prob = y_prob.to_pandas()
    elif not isinstance(y_prob, (pandas.DataFrame, pandas.Series)) and hasattr(y_prob, 'get'):
        # CuPy-style device array; pandas objects are excluded because their
        # `.get` is a dict-like lookup, not a device-to-host copy
        y_prob = y_prob.get()

    if getattr(y_prob, 'ndim', 1) == 2:
        y_prob = y_prob.iloc[:, -1] if hasattr(y_prob, 'iloc') else y_prob[:, -1]
    return y_prob


run_dataset = config['run_dataset']

# Result rows are collected here and merged into the performance table once,
# after the loop, instead of growing a DataFrame row by row
# (DataFrame.append was removed in pandas 2.0).
performance_rows = []

for settings in dataset_settings:

    if run_dataset is not None and settings['meta']['id'] not in run_dataset:
        # Checks if run_dataset contains dataset_id's
        # if it does, run the experiment only on specified datasets
        continue

    experiment_name = f"{settings['meta']['id']}-{settings['meta']['name']}"
    print(f"Start models for {experiment_name}")

    dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
    target_label = settings['meta']['target']
    train_size = settings['setup_param']['train_size']
    settings['setup_param']['preprocess'] = False

    #### Define features (use meta) ####
    ordinal_features = settings['meta']['ordinal_features']
    numeric_features = settings['meta']['numeric_features']
    text_features = settings['meta']['text_features']
    categorical_features = settings['meta']['categorical_features']

    # Optional explicit column dtypes for reading the CSV
    cols_dtype = settings['meta']['cols_dtype'] if 'cols_dtype' in settings['meta'] else None

    # Load your dataset into a cuDF DataFrame
    original_data = pd.read_csv(dataset_path, dtype=cols_dtype)

    # Split the dataset into a train set and a test set using cuML's train_test_split function
    # NOTE(review): no random_state is passed, so the split differs between
    # runs - consider seeding it from the experiment config.
    print("Train test split ======")
    x_train, x_test, y_train, y_test = train_test_split(
        X=original_data.drop(target_label, axis=1),
        y=original_data[target_label],
        train_size=train_size,
        stratify=original_data[target_label],
        shuffle=True)

    # scikit-learn metrics run on the host; copy the hold-out labels once per
    # dataset instead of once per metric
    y_test_host = y_test.to_pandas()
    is_binary_target = y_test_host.nunique() == 2

    # Init experiment logging
    mlflow = MLFlowManager(experiment_name)

    logg_tags = {
        'Dataset id': settings['meta']['id'],
        'Tuned on': 'original',
        'Trained on': 'original',
    }

    mlflow.start_run(mlflow.run_name_with_original_data, tags=logg_tags)

    # for each defined model in the global config
    # create specified model and tune it
    for ml_model in config['clf']['ml_models']:
        model_name = f"{settings['meta']['id']}-{translate_model_name(ml_model)}"

        print(f"Model: {translate_model_name(ml_model)}")
        print("Logg model name")

        logg_tags['model'] = ml_model
        print("start logging")

        mlflow.start_run(model_name, tags=logg_tags, nested=True)
        # create model and pipeline
        print("Get GPU model")
        estimator = GPUModels(ml_model)
        print("Create pipeline")
        model = GPUClassifierPipeline(
            classifier=estimator,
            numeric_features=numeric_features,
            categorical_features=categorical_features,
            ordinal_features=ordinal_features
        )

        cv = StratifiedKFold(n_splits=config['clf']['cv_folds'])
        optimize = config['clf']['tuning_param']['optimize']
        tune_grid = Grids.get_tuning_grid(ml_model, 'cuml')

        print(f"Tune grid: {tune_grid}")

        tuned_model, val_score = opt_tune_model(X=x_train,
                                                y=y_train,
                                                cv=cv,
                                                model=model,
                                                optimize=optimize,
                                                tune_grid=tune_grid,
                                                n_trials=config['clf']['tuning_param']['early_stopping_max_iters'])

        # Evaluate on the hold-out data
        y_pred = tuned_model.predict(x_test).to_pandas()

        metrics = classification_report(y_true=y_test_host, y_pred=y_pred, output_dict=True, digits=4)
        holdout_score = pandas.DataFrame.from_dict(metrics).transpose()

        test_metrics = {
            "Accuracy": metrics['accuracy'],
            "F1": metrics['macro avg']['f1-score'],
            "MCC": matthews_corrcoef(y_true=y_test_host, y_pred=y_pred),
            "Kappa": cohen_kappa_score(y1=y_test_host, y2=y_pred)
        }

        # AUC is only computed for binary targets; roc_auc_score needs the
        # host-side labels and the 1-D score of the positive class
        if is_binary_target:
            y_prob = tuned_model.predict_proba(x_test)
            test_metrics['AUC'] = roc_auc_score(y_true=y_test_host,
                                                y_score=_positive_class_scores(y_prob))

        # log parameters
        mlflow.log_params(tuned_model.get_classifier().get_params())
        # log performance
        mlflow.log_tag('model', ml_model)
        mlflow.log_metrics(test_metrics)
        mlflow.log_metric(f"val_{optimize}", val_score)
        mlflow.log_score_report_to_html(holdout_score, "Holdout")
        # log model
        mlflow.log_model(model=tuned_model)
        # end run for the model
        mlflow.end_run()

        # quick fix for colab issue: also keep the results in a plain table
        performance_row = {**logg_tags, **test_metrics}
        # NOTE(review): this stores the pipeline's params while the MLflow log
        # above stores the classifier's params - confirm which one is intended.
        performance_row['Params'] = tuned_model.get_params()
        performance_rows.append(performance_row)

    # end run for this dataset
    mlflow.end_run()

# Merge the new rows into the performance table on the host. The existing
# table may be a cuDF frame (it is created through the `pd` alias), so it is
# converted first when it offers `to_pandas`.
existing_performance_df = (
    model_performance_df.to_pandas()
    if hasattr(model_performance_df, 'to_pandas')
    else model_performance_df
)
new_performance_df = pandas.DataFrame(performance_rows)

# Keep the existing column order and append any column seen only in new rows
ordered_columns = list(existing_performance_df.columns) + [
    col for col in new_performance_df.columns
    if col not in existing_performance_df.columns
]
# Empty frames are left out of the concat so they cannot alter result dtypes
non_empty_frames = [
    frame for frame in (existing_performance_df, new_performance_df)
    if len(frame) > 0
]
if non_empty_frames:
    model_performance_df = pandas.concat(non_empty_frames, ignore_index=True)
else:
    model_performance_df = existing_performance_df
model_performance_df = model_performance_df.reindex(columns=ordered_columns)

# Save model performance to csv
model_performance_df.to_csv(folders['model_perf_filepath'], index=False)