# Guideline

#### Export model
#### Instantiate preprocessing pipelines
#### Instantiate trained imported model
#### Get predicted results / model performance score

# Importing libraries

In [5]:
import os

# Visualisation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Algorithms
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
# Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE

# Pipelines
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from imblearn import FunctionSampler
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import set_config; set_config(display='diagram')

# Metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, plot_confusion_matrix

from sklearn.metrics import roc_auc_score, plot_roc_curve, roc_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve

from sklearn.model_selection import learning_curve
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from sklearn.inspection import permutation_importance

# Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

# Deploy
import pickle

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Introduction

This guide shows how to feed a raw dataset to the exported, trained model and evaluate its predictions.

# Data Fetching

In [27]:
# Relative path of the raw input CSV file
data_path = '../data/data_raw.csv'

In [8]:
# Load the raw dataset. The location comes from `data_path` (defined in the
# previous cell) rather than a repeated literal, so it only has to be
# changed in one place.
df = pd.read_csv(data_path)

# Data Cleaning

In [None]:
# Strip the single leading space that some column titles start with;
# titles without a leading space are kept unchanged.
df.columns = [
    title[1:] if title.startswith(' ') else title
    for title in df.columns
]

In [None]:
# Remove the 'Net Income Flag' column from the dataset
df = df.drop(columns=['Net Income Flag'])

In [None]:
# Write the cleaned dataset to disk as CSV, without the index column.
# NOTE(review): DataFrame.to_csv returns None when it is given a path, so
# `df_cleaned` is bound to None here rather than to a DataFrame - confirm
# that nothing downstream expects it to hold the data.
df_cleaned = df.to_csv('../data/data_cleaned.csv', index=False)

# Data Preprocessing

## Feature Selections

In [9]:
# Target: the 'Bankrupt?' label column
y = df['Bankrupt?']
# Features: every column except the first one
# NOTE(review): presumably the first column is 'Bankrupt?' - confirm
X = df.iloc[:, 1:]

---------------

## Preprocessing pipelines

In [None]:
# Select features by skewness (used as a rough proxy for "not normally distributed")
def check_not_normal(X):
    """Return the names of the columns of `X` that are strongly skewed.

    A column is selected when its skewness (as computed by `Series.skew()`)
    is strictly below -0.9 or strictly above 0.9.

    Parameters
    ----------
    X : pandas.DataFrame
        Data whose columns are inspected one by one.

    Returns
    -------
    list
        Selected column names, in the order they appear in `X.columns`.
        Columns whose skewness is NaN are never selected, because every
        comparison with NaN is false.
    """
    # abs(skew) > 0.9 is the same test as (skew < -0.9 or skew > 0.9),
    # but the skewness is computed only once per column.
    return [col for col in X.columns if abs(X[col].skew()) > 0.9]

def check_normal(X):
    """Return the names of the columns of `X` that are roughly symmetric.

    A column is selected when its skewness (as computed by `Series.skew()`)
    lies strictly between -0.9 and 0.9.

    Parameters
    ----------
    X : pandas.DataFrame
        Data whose columns are inspected one by one.

    Returns
    -------
    list
        Selected column names, in the order they appear in `X.columns`.
        Columns whose skewness is exactly -0.9 or 0.9, or NaN, are not
        selected.
    """
    # abs(skew) < 0.9 is the same test as (-0.9 < skew < 0.9),
    # but the skewness is computed only once per column.
    return [col for col in X.columns if abs(X[col].skew()) < 0.9]

# Split the feature names by skewness:
# roughly symmetric columns (|skew| < 0.9) ...
ftr_to_norm = check_normal(X)
# ... and strongly skewed columns (|skew| > 0.9)
ftr_to_scale = check_not_normal(X)

In [None]:
# Standard-scale the strongly skewed features; all other columns pass through unchanged
scaling_itr = ColumnTransformer(
    transformers=[('scaling', StandardScaler(), ftr_to_scale)],
    remainder='passthrough',
)

In [None]:
# Min-max scale the roughly symmetric features; all other columns pass through unchanged
normalize_itr = ColumnTransformer(
    transformers=[('normal scaling', MinMaxScaler(), ftr_to_norm)],
    remainder='passthrough',
)

In [None]:
# Combined preprocessing: min-max scaling for the roughly symmetric features,
# standard scaling for the strongly skewed ones, pass-through for anything else
_minmax_step = ('normal scaling', MinMaxScaler(), ftr_to_norm)
_standard_step = ('standard scaling', StandardScaler(), ftr_to_scale)
scaling_nrm_itr = ColumnTransformer(
    transformers=[_minmax_step, _standard_step],
    remainder='passthrough',
)

In [None]:
# Single-step pipeline that applies RobustScaler to every column
robust_sc_itr = Pipeline(steps=[('robust scaling', RobustScaler())])

## Performance metrics

In [19]:
# Plot learning curves: mean cross-validated training/validation score
# (recall by default) against the training set size, using `learning_curve`
def learning_curves(model, features, target, train_sizes=None, cv=5,
                    scoring='recall', title='Learning curves - log model'):
    """Plot the learning curves of `model` and show the figure.

    Parameters
    ----------
    model : estimator
        Estimator passed to `learning_curve`.
    features, target :
        Data and labels passed to `learning_curve` as `X` and `y`.
    train_sizes : sequence, optional
        Training set sizes to evaluate. Defaults to
        [5, 10, 50, 100, 200, 500, 1000, 2000, 3000, 5000].
        NOTE(review): these are absolute sample counts; the largest one
        presumably must fit inside a single cross-validation training
        fold - confirm against the dataset size.
    cv : int, default 5
        Cross-validation setting forwarded to `learning_curve`.
    scoring : str, default 'recall'
        Metric forwarded to `learning_curve`; also used for the y-axis label.
    title : str, default 'Learning curves - log model'
        Title of the figure.

    Returns
    -------
    None
        The function only draws and shows the plot.
    """
    if train_sizes is None:
        train_sizes = [5, 10, 50, 100, 200, 500, 1000, 2000, 3000, 5000]

    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model,
        X=features,
        y=target,
        train_sizes=train_sizes,
        cv=cv,
        scoring=scoring,
        shuffle=True,
        random_state=3,
    )

    # Average over the cross-validation folds (axis 1) for each training size
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    # Keep the y-axis label in sync with the metric actually computed
    y_label = scoring.capitalize() if isinstance(scoring, str) else 'Score'

    plt.plot(train_sizes, train_scores_mean, label='Training score')
    plt.plot(train_sizes, test_scores_mean, label='Test score')
    plt.ylabel(y_label, fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    plt.title(title, fontsize=18, y=1.03)
    plt.legend()
    plt.show()


In [20]:
# ROC curve
def roc_auc(model, X_tst, y_tst):
    """Draw the ROC curve of `model` on (`X_tst`, `y_tst`) and show it.

    The curve is produced by `plot_roc_curve`; a red dashed diagonal from
    (0, 0) to (1, 1) is added to the same axes. Nothing is returned.
    """
    display = plot_roc_curve(model, X_tst, y_tst)
    axes = display.ax_
    axes.set_title('ROC Curve')
    axes.plot([0, 1], [0, 1], 'r--')
    plt.show()


In [21]:
# Confusion matrix heatmap
# NOTE(review): this definition rebinds the name `plot_confusion_matrix`
# that is also imported from sklearn.metrics at the top of the file.
def plot_confusion_matrix(y_tst, y_hat):
    """Show the confusion matrix of `y_tst` vs. `y_hat` as an annotated heatmap.

    The figure title reports the recall score rounded to two decimals.
    Nothing is returned.
    """
    matrix = confusion_matrix(y_tst, y_hat)
    recall = round(recall_score(y_tst, y_hat), 2)

    axes = sns.heatmap(matrix, annot=True, fmt=".0f")
    axes.set_xlabel('y_pred')
    axes.set_ylabel('y')
    axes.set_title('Recall Score: {0}'.format(recall), size=20)
    plt.show()


# Model Import

In [10]:
# Relative path of the pickled model file
model_path = '../pickle/model.pkl'

In [11]:
# Load the pickled model. The `with` block closes the file even if
# unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code while loading;
# only load model files that come from a trusted source.
with open(model_path, 'rb') as model_pickle:
    model = pickle.load(model_pickle)

# Model Instantiation

## Data Splitting

In [23]:
# Split the data into train/test sets: 20% held out, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

In [None]:
# Predict labels for the held-out test features with the imported model.
# NOTE(review): X_test is passed without any of the preprocessing defined
# above - presumably the pickled model handles its own preprocessing; confirm.
y_pred = model.predict(X_test)

In [None]:
# Print the classification report comparing the test labels with the predictions
print(classification_report(y_test, y_pred))

In [None]:
# Plot the learning curves of the model on the full dataset (X, y)
learning_curves(model, X, y)

In [None]:
# Plot the ROC curve of the model on the test set
roc_auc(model, X_test, y_test)

In [None]:
# Plot the confusion matrix heatmap (with recall in the title) for the test predictions
plot_confusion_matrix(y_test, y_pred)

--------------------------------