# todo
- create new environment with limited packages
- separate model training from roc curve plot
- random forest repeatable 
- svm repeatable
- adaboost repeatable 
- add the notes

## contents
* [introduction](#introduction)
* [imports, load and model tester](#imports)
* [naive bayes](#naive_bayes)
* [logistic regression](#logistic_regression)
* [random forest](#random_forest)
* [AdaBoost](#adaboost)
* [support vector machine](#svm)
* [model selection and conclusion](#conclusion)

## introduction <a id='introduction'></a>

## imports, load and model tester<a id='imports'></a>

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


from sklearnex import patch_sklearn, config_context
# NOTE(review): sklearnex's documentation says patch_sklearn() should be called
# before the sklearn estimators are imported. Here it runs after the sklearn
# imports earlier in this cell, so the already-imported estimator classes may
# not be the accelerated versions - confirm and reorder if acceleration is wanted.
patch_sklearn()

# Suppress every warning for the remainder of the notebook session.
warnings.filterwarnings('ignore')
%matplotlib inline


In [None]:
# Read the interim feature and target splits from parquet.
X_train = pd.read_parquet('../data/interim/X_train.parquet')
y_train = pd.read_parquet('../data/interim/y_train.parquet')
X_test = pd.read_parquet('../data/interim/X_test.parquet')
y_test = pd.read_parquet('../data/interim/y_test.parquet')

# Carve out the rows flagged is_independent == 1 while the flag column is still
# present; .copy() gives each subset its own data so the in-place drop below
# operates on separate objects.
X_train_ind = X_train.loc[X_train['is_independent'] == 1].copy()
y_train_ind = y_train.loc[y_train['is_independent'] == 1].copy()
X_test_ind = X_test.loc[X_test['is_independent'] == 1].copy()
y_test_ind = y_test.loc[y_test['is_independent'] == 1].copy()

# The flag is only used for the subsetting above, so strip it from every frame.
for frame in (X_train, X_test, y_train, y_test,
              X_train_ind, X_test_ind, y_train_ind, y_test_ind):
    frame.drop(columns='is_independent', inplace=True)

# Flatten the target frames into 1-D arrays
# (assumes a single target column remains after the drop - TODO confirm).
y_train, y_test = np.ravel(y_train), np.ravel(y_test)
y_train_ind, y_test_ind = np.ravel(y_train_ind), np.ravel(y_test_ind)

# Shared, seeded 5-fold stratified splitter passed to the grid searches below.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def _fit_best_model(param_grid, model, cv, X, y, target_offload):
    """Grid-search ``model`` over ``param_grid`` and return the best estimator.

    The search is fitted inside a sklearnex ``config_context`` so the requested
    offload target applies to the fit.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv)

    with config_context(target_offload=target_offload):
        grid_search.fit(X, y)

    return grid_search.best_estimator_


def _plot_roc_curve(y_true, y_score):
    """Draw the ROC curve of ``y_score`` against ``y_true`` with its AUC in the legend."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0, 1])
    plt.ylim([0, 1.05])
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.legend(loc='lower right')
    plt.show()


def sk_model(param_grid, model, cv, independent=False, target_offload="gpu:0"):
    """Tune ``model`` with a grid search and plot its ROC curve on the test split.

    Training and plotting are delegated to ``_fit_best_model`` and
    ``_plot_roc_curve`` so each step can be changed on its own.

    Parameters
    ----------
    param_grid : dict or list of dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    model : estimator
        Unfitted classifier; the selected estimator must expose ``predict_proba``.
    cv : int or cross-validation splitter
        Cross-validation strategy handed to ``GridSearchCV``.
    independent : bool, default False
        When True, use the module-level ``*_ind`` train/test subsets instead of
        the full train/test data.
    target_offload : default "gpu:0"
        Value forwarded to sklearnex ``config_context(target_offload=...)``.
        The default keeps the previous hard-coded behaviour.

    Returns
    -------
    None
        The ROC curve is shown with matplotlib; nothing is returned.
    """
    if independent:
        X_fit, y_fit = X_train_ind, y_train_ind
        X_eval, y_eval = X_test_ind, y_test_ind
    else:
        X_fit, y_fit = X_train, y_train
        X_eval, y_eval = X_test, y_test

    best_model = _fit_best_model(param_grid, model, cv, X_fit, y_fit, target_offload)

    # Column 1 of predict_proba is used as the score for the ROC curve.
    y_score = best_model.predict_proba(X_eval)[:, 1]
    _plot_roc_curve(y_eval, y_score)


## naive bayes<a id='naive_bayes'></a>

In [None]:
# No hyper-parameters are searched for naive Bayes, so the grid is left empty.
param_grid = {}

model = GaussianNB()
# Use the shared seeded splitter (rather than a bare cv=5) so this cell is
# cross-validated the same way as the other models in the notebook.
sk_model(param_grid=param_grid,
         model=model,
         cv=cross_validation,
         independent=True
)

## logistic regression<a id='logistic_regression'></a>

In [None]:
# Candidate regularisation strengths (C) and penalty types for the grid search.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
}

# Seeded liblinear logistic regression, evaluated on the independent subset.
model = LogisticRegression(random_state=42, solver='liblinear')
sk_model(param_grid, model, cv=cross_validation, independent=True)

## random forest<a id='random_forest'></a>

In [None]:
# Forest size and tree-shape limits explored by the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seed the forest so repeated runs give the same result; the seed matches the
# one used for the logistic regression and the CV splitter.
model = RandomForestClassifier(random_state=42)
sk_model(param_grid=param_grid,
         model=model,
         cv=cross_validation,
         independent=True)

## AdaBoost<a id='adaboost'></a>

In [None]:
# Number of boosting rounds and learning rates explored by the grid search.
param_grid = {
    'n_estimators': [25, 50, 100, 200],
    'learning_rate': [0.5, 1, 1.5]
}

# Seed the booster so repeated runs give the same result.
model = AdaBoostClassifier(random_state=42)

sk_model(param_grid=param_grid,
         model=model,
         cv=cross_validation,
         independent=True
)


## support vector machine<a id='svm'></a>

In [None]:
# Regularisation strength, kernel coefficient and kernel type explored by the
# grid search.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'poly']
}

# sk_model scores the best estimator with predict_proba, which SVC only
# provides when probability=True. random_state seeds the probability
# estimation so the run is repeatable.
model = SVC(probability=True, random_state=42)

# NOTE(review): this is the only model evaluated with independent=False -
# confirm that using the full data here is intended.
sk_model(param_grid=param_grid,
         model=model,
         cv=cross_validation,
         independent=False
)


## conclusion <a id='conclusion'></a>