# Model tuning

## Data preparation

In [56]:
# Ignore all warnings
from warnings import simplefilter
simplefilter(action='ignore')

# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import pickle
import imblearn.over_sampling as OS

# Import the models from SKLearn (Model 1 through Model 6)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Import the modules to evaluate the models
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve
from sklearn.metrics import roc_curve, auc

# GridSearch
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [12]:
# Local module
from ml_classification import model_performance

### Functions

In [66]:
def upsample_smote(X_train, y_train, verbose=False, n_times=50, random_state=42):
    """Oversample the class labelled 1 of a training set with SMOTE.

    The class labelled 0 keeps its original size; the class labelled 1 is
    grown to ``n_times`` times its original count.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to ``SMOTE.fit_resample``.
    y_train : array-like
        Training labels. The counts below look for the values 0 and 1 only.
    verbose : bool, default False
        When True, print the label-1 count before and after resampling.
    n_times : int or float, default 50
        Multiplier applied to the original label-1 count. Must be >= 1,
        because the resampler is only asked to add samples here.
    random_state : int or None, default 42
        Seed forwarded to ``SMOTE``.

    Returns
    -------
    tuple
        ``(X_train_rs, y_train_rs)`` as returned by ``SMOTE.fit_resample``.

    Raises
    ------
    ValueError
        If ``n_times`` is smaller than 1.
    """
    if n_times < 1:
        raise ValueError(f"n_times must be >= 1, got {n_times!r}")

    # np.asarray makes the element-wise comparison work for plain lists too
    labels = np.asarray(y_train)
    n_pos = int(np.sum(labels == 1))
    n_neg = int(np.sum(labels == 0))

    # Explicit per-class target counts: grow class 1, leave class 0 as it is
    target_counts = {1: int(n_pos * n_times), 0: n_neg}

    # SMOTE (not plain random oversampling) generates the extra samples
    smote = OS.SMOTE(
        sampling_strategy=target_counts,
        random_state=random_state
    )

    X_train_rs, y_train_rs = smote.fit_resample(X_train, y_train)

    if verbose:
        print(f"Original Pos Class Count: {n_pos}")
        print(f"Oversample Pos Class Count: {int(np.sum(np.asarray(y_train_rs) == 1))}")

    return X_train_rs, y_train_rs

In [65]:
def get_predictions(model, X):
    """Return a fitted model's class predictions and its column-1 probabilities.

    Parameters
    ----------
    model : object
        Any estimator exposing ``predict`` and ``predict_proba``.
    X : array-like
        Samples to score; forwarded unchanged to both methods.

    Returns
    -------
    tuple
        ``(labels, scores)`` where ``labels`` is the output of
        ``model.predict(X)`` and ``scores`` is column 1 of
        ``model.predict_proba(X)``.
    """
    labels = model.predict(X)
    # Keep only the second probability column (index 1)
    scores = model.predict_proba(X)[:, 1]
    return labels, scores

In [14]:
# REPEATED FROM MODEL PERFORMANCE NOTEBOOK
def model_performance_metrics(y_test, y_proba, y_pred, model_name='Unnamed'):
    """Compute summary metrics plus precision-recall and ROC curve data.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_proba : array-like
        Scores / probabilities used to build the curves.
    y_pred : array-like
        Hard class predictions used for the summary metrics.
    model_name : str, default 'Unnamed'
        Label stored under the ``'model'`` key of the metrics dict.

    Returns
    -------
    tuple
        ``(metrics, pr_curve, roc_param)``:

        * ``metrics`` - dict with ``model``, ``accuracy``, ``precision``,
          ``recall``, ``f1_score`` and ``auc_score`` (all scores multiplied by 100).
        * ``pr_curve`` - dict with ``p_curve``, ``r_curve``, ``t_curve`` and
          ``f1_curve`` (one F1 value per entry of ``t_curve``).
        * ``roc_param`` - dict with ``fpr``, ``tpr`` and ``thr``.
    """
    # Calculate accuracy, precision, recall and F1-score (as percentages)
    accuracy = accuracy_score(y_test, y_pred)*100
    precision = precision_score(y_test, y_pred)*100
    recall = recall_score(y_test, y_pred)*100
    f1 = f1_score(y_test, y_pred)*100

    # Get P-R curve parameters (numpy arrays)
    p_curve, r_curve, t_curve = precision_recall_curve(y_test, y_proba)

    # F1 at every threshold, derived directly from the curve instead of calling
    # f1_score once per threshold. p_curve / r_curve carry one extra trailing
    # point that has no threshold, so it is dropped to stay aligned with t_curve.
    # A zero precision+recall sum maps to F1 = 0.
    f1_scores = [
        float(2 * p * r / (p + r)) if (p + r) > 0 else 0.0
        for p, r in zip(p_curve[:-1], r_curve[:-1])
    ]

    # ROC curve and AUC
    fpr, tpr, thresholds = roc_curve(y_test, y_proba)
    auc_score = auc(fpr, tpr)*100

    # ROC curve parameters
    roc_param = {
        'fpr': fpr,
        'tpr': tpr,
        'thr': thresholds
    }

    # Precision-Recall curve parameters
    pr_curve = {
        'p_curve': p_curve,
        'r_curve': r_curve,
        't_curve': t_curve,
        'f1_curve': f1_scores
    }

    # Group all metrics
    metrics = {
        'model': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc_score': auc_score
    }

    return metrics, pr_curve, roc_param

### Data import

In [15]:
# Load the pickled bundle (presumably the feature matrix and target vector - confirm
# against the notebook that wrote 'feat_target.bin').
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('feat_target.bin','rb') as f:
    features_target = pickle.load(f)

# The bundle is indexed by the keys 'X' and 'y'
X = features_target['X']
y = features_target['y']

In [16]:
# Hold out 20% of the data as a stratified test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1,stratify=y, test_size=0.2)
# Oversample the training portion only; X_test / y_test are left untouched.
# NOTE(review): search_cv later cross-validates on this already-oversampled X_train,
# so synthetic samples can end up in validation folds - confirm this is acceptable
# or move the resampling inside each fold.
X_train, y_train = upsample_smote(X_train, y_train)

## Evaluate models

In [77]:
def search_cv(model_type, search_type, params, scoring, n_iter=100, X=None, y=None, random_state=42):
    """Run a cross-validated hyper-parameter search and return the best parameters.

    Parameters
    ----------
    model_type : str
        ``'DecisionTreeClassifier'`` or ``'LogisticRegression'``.
    search_type : str
        ``'Random'`` (``RandomizedSearchCV``) or ``'Grid'`` (``GridSearchCV``).
    params : dict or list of dict
        Parameter grid / distributions forwarded to the search object.
    scoring : str
        Scoring name forwarded to the search object.
    n_iter : int, default 100
        Number of sampled candidates; only used when ``search_type='Random'``.
    X, y : array-like, optional
        Training data. When omitted, the notebook-level ``X_train`` /
        ``y_train`` are used, which keeps existing calls working.
    random_state : int or None, default 42
        Seed for the estimator and for the randomized search, so repeated
        runs return the same result.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search object.

    Raises
    ------
    ValueError
        If ``model_type`` or ``search_type`` is not one of the supported names.
    """
    supported_models = ('DecisionTreeClassifier', 'LogisticRegression')
    supported_searches = ('Random', 'Grid')

    # Fail early with a clear message instead of hitting an unbound local later
    if model_type not in supported_models:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_models}"
        )
    if search_type not in supported_searches:
        raise ValueError(
            f"Unknown search_type {search_type!r}; expected one of {supported_searches}"
        )

    # Select model type
    if model_type == 'DecisionTreeClassifier':
        model = DecisionTreeClassifier(random_state=random_state)
    else:
        model = LogisticRegression(random_state=random_state)

    # Select search type
    if search_type == 'Random':
        grid = RandomizedSearchCV(
            model, params, scoring=scoring, n_iter=n_iter, random_state=random_state
        )
    else:
        grid = GridSearchCV(model, params, scoring=scoring)

    # Fall back to the notebook-level training data when none is passed in
    X_fit = X_train if X is None else X
    y_fit = y_train if y is None else y

    # Train models
    grid.fit(X_fit, y_fit)

    return grid.best_params_

### Model 0: Classification Tree

#### Random search

In [69]:
# Candidate hyper-parameters for the randomized search over the tree
# NOTE(review): this grid has 10 x 5 = 50 combinations, fewer than n_iter=100 -
# confirm the intended sampling budget.
params = {
    'max_depth': np.arange(1, 11),
    'min_samples_leaf': [2, 5, 10, 20, 100],
}

# Metric the search optimises
scoring = 'precision'

best_params = search_cv(
    'DecisionTreeClassifier',
    'Random',
    params,
    scoring,
    n_iter=100,
)
best_params

{'min_samples_leaf': 5, 'max_depth': 5}

#### Grid search

In [84]:
# Exhaustive grid for the tree: depths 1..10 crossed with leaf sizes 1..20
params = {
    'max_depth': np.arange(1, 11),
    'min_samples_leaf': np.arange(1, 21),
}

# Metric the search optimises
scoring = 'precision'

best_params = search_cv(
    'DecisionTreeClassifier',
    'Grid',
    params,
    scoring,
)
best_params

{'max_depth': 5, 'min_samples_leaf': 4}

#### Model with best parameters

In [86]:
# Initiate and fit model (unpack best parameters).
# best_params holds the result of whichever tree search cell ran last (random or grid).
# random_state is fixed so the fitted tree, and the metrics below, are reproducible.
model_0_tuned = DecisionTreeClassifier(**best_params, random_state=42).fit(X_train, y_train)

# Get predictions
y_pred_0, y_proba_0 = get_predictions(model_0_tuned, X_test)

# Get model performance metrics
metrics_0, pr_curve_0, roc_param_0 = model_performance_metrics(y_test, y_proba_0, y_pred_0, model_name='Tuned Class. Tree')
display(pd.DataFrame([metrics_0]))

Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,Tuned Class. Tree,98.72549,65.116279,71.794872,68.292683,88.42694


### Model 1: Logistic Regression

#### Random search

In [88]:
# Define search parameters.
# 'elasticnet' (note the spelling) is only valid together with an l1_ratio, so it
# gets its own sub-grid; 'l1' and 'l2' are searched without one.
# Total: 20 x 2 + 20 x 3 = 100 combinations, matching n_iter below.
params = [
    {
        'C': np.linspace(.1, 2, 20),
        'penalty': ['l1', 'l2'],
        'solver': ['saga']
    },
    {
        'C': np.linspace(.1, 2, 20),
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'solver': ['saga']
    }
]

scoring='precision'

best_params_1 = search_cv('LogisticRegression', 'Random', params, scoring, n_iter=100)
best_params_1

{'solver': 'saga', 'penalty': 'l2', 'C': 0.4}

In [89]:
# Initiate and fit model (unpack best parameters).
# random_state is fixed because the 'saga' solver shuffles the data, so an
# unseeded fit would not give repeatable metrics.
model_1_tuned = LogisticRegression(**best_params_1, random_state=42).fit(X_train, y_train)

# Get predictions
y_pred_1, y_proba_1 = get_predictions(model_1_tuned, X_test)

# Get model performance metrics
metrics_1, pr_curve_1, roc_param_1 = model_performance_metrics(y_test, y_proba_1, y_pred_1, model_name='Tuned Log. Reg.')
display(pd.DataFrame([metrics_1]))

Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,Tuned Log. Reg.,97.990196,48.076923,64.102564,54.945055,89.020874


## Compare models

In [90]:
# One row per tuned model, built from the metric dicts of the two models above
compare_models = pd.DataFrame([metrics_0, metrics_1])

display(compare_models)

Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,Tuned Class. Tree,98.72549,65.116279,71.794872,68.292683,88.42694
1,Tuned Log. Reg.,97.990196,48.076923,64.102564,54.945055,89.020874
