<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 15px; height: 80px">

# Project 4 Final Model

This notebook contains the evaluation of our final model and recommendations, using our entire dataset created from our simulation.

In [1]:
# You will need to run the following command from your terminal to run this notebook
# or uncomment the line below:
# conda install -c conda-forge imbalanced-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, \
accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
    classification_report, RocCurveDisplay)

from sklearn.metrics import make_scorer, precision_recall_fscore_support

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the preprocessed dataset (the file name suggests a 100k-row subsample --
# TODO confirm) and preview its first rows.
df = pd.read_csv('../data/preprocessed_subsample_100k.csv')
df.head()

In [None]:
# Drop the probability density feature for now.
# Rebinding `df` (instead of dropping in place) and passing errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises a KeyError.
df = df.drop(columns=['probability_density'], errors='ignore')

In [None]:
# Separate the `is_fraud` target from the remaining feature columns.
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])

In [None]:
# Absolute number of rows in each target class.
y.value_counts()

In [None]:
# Class balance of the target as percentages, rounded to two decimals.
(y.value_counts(normalize=True) * 100).round(2)

In [None]:
# Hold out 10% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [None]:
# Class balance of the training target: percentages first, then raw counts.
print(
    y_train.value_counts(normalize=True).mul(100).round(2),
    y_train.value_counts(),
    sep='\n',
)

In [None]:
# Standardize the features. The scaler is fitted on the training split only,
# so it never sees X_test (our stand-in for unseen data).
ss = StandardScaler().fit(X_train)

# Apply the scaling learned from X_train to both splits.
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
# leverage function from GA lesson on smote
def metrics(y_test, y_predict):
    """Print an evaluation summary for binary fraud predictions.

    Prints, in order: the accuracy score, the confusion matrix as a labelled
    DataFrame, and scikit-learn's classification report, each separated by a
    horizontal rule.

    Parameters
    ----------
    y_test : array-like
        True target values. Assumed to be encoded as 0 (not fraud) / 1 (fraud),
        matching the row/column labels used for the confusion matrix.
    y_predict : array-like
        Predicted target values, same encoding and length as ``y_test``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(f"Accuracy score: {accuracy_score(y_test, y_predict)} \n")
    print('----------------------------------------------------------------')
    # labels=[0, 1] pins the matrix to 2x2 in a fixed (not fraud, fraud) order.
    # Without it, inputs in which only one class appears yield a smaller matrix
    # and the two-label DataFrame construction below raises.
    print(pd.DataFrame(confusion_matrix(y_test, y_predict, labels=[0, 1]), 
                            index=['not_fraud', 'Actually_fraud'], 
                            columns=['Predicted_not_Fraud', 'Predicted_fraud']), '\n')
    print('-----------------------------------------------------------------')
    print(classification_report(y_test, y_predict))
    print('-----------------------------------------------------------------')

In [None]:
# Now we can create synthetic data for our training set.
# NOTE: `n_jobs` is deliberately not passed to SMOTE. imbalanced-learn
# deprecated that argument in 0.10 and removed it in later releases, so passing
# it fails on current versions. It only controlled parallelism of the
# neighbour search, not which samples are generated.
sm = SMOTE(random_state=42)

X_train_smote, y_train_smote = sm.fit_resample(X_train_sc, y_train)

In [None]:
# Shapes of the resampled training features and target.
X_train_smote.shape, y_train_smote.shape

In [None]:
# Class counts of the training target after SMOTE resampling.
y_train_smote.value_counts()

## Random Forest model

In [None]:
def weighted_recall_scorer(y_true, y_pred, beta=10, **kwargs):
    """
    Custom scoring function to prioritize recall for the minority class.

    Computes the F-beta score of the positive class (label 1). With beta > 1
    recall is weighted more heavily than precision.

    Parameters
    ----------
    y_true : array-like
        True target values (binary, with the positive class encoded as 1).
    y_pred : array-like
        Predicted target values.
    beta : float, default=10
        Weight of recall relative to precision in the F-beta score.
    **kwargs
        Accepted for call compatibility and ignored.

    Returns
    -------
    float
        The F-beta score of the positive class; 0 when the score is undefined
        (e.g. no positive samples were present or predicted).
    """
    # average='binary' makes scikit-learn honour pos_label and return scalars.
    # With the default average=None, pos_label is ignored and the result is a
    # per-class array whose positional index [1] raises IndexError whenever
    # only one class shows up in a fold.
    # zero_division=0 returns 0 for an undefined score without emitting a
    # warning from every parallel worker.
    _, _, fbeta, _ = precision_recall_fscore_support(
        y_true, y_pred,
        beta=beta,
        average='binary',
        pos_label=1,
        zero_division=0,
    )
    return fbeta


custom_scorer = make_scorer(weighted_recall_scorer)

In [None]:
# Scale -> oversample -> classify. imblearn's Pipeline applies the sampler
# only while fitting, so SMOTE never touches the data being scored.
# SMOTE and the random forest are seeded (42, matching the train/test split)
# so that a Restart & Run All reproduces the same scores.
pipe_rf_cs = Pipeline([
        ('scale', StandardScaler()),
        ('sampling', SMOTE(random_state=42)),
        ('rfc', RandomForestClassifier(random_state=42))
    ])

# Single-point grid holding the best parameters found by the earlier, wider search.
pipe_params_rf_cs = {
     'sampling__sampling_strategy': ['minority'],
     'sampling__k_neighbors': [15],
     'rfc__max_depth': [20],
}

# pipe_params_rf_cs = {
#      'sampling__sampling_strategy': ['minority', 'not minority', 'auto'],
#      'sampling__k_neighbors': [9, 12, 15],
#      'rfc__max_depth': [20, 50, 100],
# }

# Best params: 
# {'rfc__max_depth': 20,
#  'sampling__k_neighbors': 15,
#  'sampling__sampling_strategy': 'minority'}

In [None]:
# Cross-validated search over the pipeline's parameter grid, ranked by the
# custom F-beta scorer and run on all available cores.
grid_rf_cs = GridSearchCV(
    estimator=pipe_rf_cs,
    param_grid=pipe_params_rf_cs,
    scoring=custom_scorer,
    n_jobs=-1,
)
grid_rf_cs.fit(X_train, y_train)

In [None]:
# Custom-scorer value on the training split vs. the held-out test split.
grid_rf_cs.score(X_train, y_train), grid_rf_cs.score(X_test, y_test)

In [None]:
# Parameter combination selected by the grid search.
grid_rf_cs.best_params_

In [None]:
# Predict the test split with the pipeline refitted on the best parameters.
grid_preds_rf_cs = grid_rf_cs.best_estimator_.predict(X_test)

In [None]:
# Accuracy, confusion matrix and classification report for the test predictions.
metrics(y_test=y_test, y_predict=grid_preds_rf_cs)

## Final model evaluation

In [None]:
# Confusion matrix of the tuned model on the held-out test split.
ConfusionMatrixDisplay.from_estimator(
    grid_rf_cs, X_test, y_test,
    display_labels=['Not Fraud', 'Fraud'],
    cmap='viridis',
    colorbar=False,
)
# The trailing semicolon suppresses the Text(...) repr that would otherwise be
# echoed under the figure, matching the ROC cell.
plt.title('Test');

In [None]:
# Grab the current axes so every artist below is drawn on the same plot
ax = plt.gca()

# ROC curve of the tuned random forest on the test split
RocCurveDisplay.from_estimator(grid_rf_cs, X_test, y_test,
                               name='Random Forest', ax=ax)

# Diagonal reference line: a classifier with no predictive value
ax.plot([0,1], [0,1], linestyle='--', color='gray',
        label='no prediction value')

ax.set_title('Receiver Operating Characteristic Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
# Redraw the legend so the baseline is labelled too
ax.legend();

# Summary and Recommendations