# Model Pipeline

Author: Jasmine Qin  
Date: 2020-05-27

In [3]:
# Basics
import pandas as pd
import numpy as np
import seaborn as sns
import time
import re
from collections import defaultdict, Counter
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE, SelectFromModel

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

# Pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Evaluation
from sklearn.metrics import plot_confusion_matrix, f1_score, recall_score, precision_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score, r2_score

# Model Explanation
import eli5
import shap

# Imbalanced learning (Pipeline is aliased so sklearn's Pipeline is not shadowed)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

In [2]:
# Options
# Raise pandas' display limits to 100 columns and 100 rows.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

## Table of Contents
- [1. Load Data](#ld)
- [2. Summary](#s)
- [3. Feature Engineering](#fe)
- [4. Split Data](#sd)
- [5. Preprocessing](#pp)
- [6. Baseline Model](#bm)
- [7. Feature Importance and Selection](#fi)
- [8. Other Models](#om)
- [9. Hyperparameter Tuning](#ht)
- [10. Other Tests](#ot)

## 1. Load Data <a name="ld"></a>

- Run all scripts 01 to 04 to have the combined and cleaned data
- Load `04_combined_train.csv` (the load of `04_combined_validate.csv` is currently commented out)

In [4]:
# Read the combined training set produced by the processing scripts.
# low_memory=False makes pandas read the whole file at once rather than in
# chunks, which avoids mixed-dtype inference within a column.
train = pd.read_csv('../../data/processed/04_combined_train.csv',
                    low_memory=False)
# The validation-set load below is currently disabled.
#validation = pd.read_csv('../../data/processed/04_combined_validate.csv',
#                         low_memory=False)

## 2. Summary <a name="s"></a>

In [7]:
# Preview the first three rows of the training data.
train.head(3)

Unnamed: 0,FOLDERYEAR,LocalArea,NumberofEmployees,FeePaid,label,BusinessIndustry,Parking meters,Disability parking,Unemployment_rate,Without children at home,1 child,2 children,3 or more children,English,French,Chinese,Italian,German,Spanish,other language,Married or living with a or common-law partner,Not living with a married spouse or common-law partner,age below 20,age between 20 and 35,age between 35 and 60,age above 60,female,male,Not a visible minority,Total visible minority population,dwelling_House,dwelling_Apartment,dwelling_Other,Owned shelter,Rented shelter,Female lone parent,Male lone parent,Canadian citizens,Not Canadian citizens,1st generation,2nd generation,3rd generation and over,1 person household,2 persons household,3 persons household,4 to 5 persons household,6 or more persons household,One-family households,Multiple-family households,Non-family households,...,"Arts, entertainment and recreation",Construction,Educational services,Finance and insurance,Health care and social assistance,Information and cultural industries,Management of companies and enterprises,Manufacturing,"Mining, quarrying, and oil and gas extraction",Other services (except public administration),"Professional, scientific and technical services",Public administration,Real estate and rental and leasing,Retail trade,Transportation and warehousing,Utilities,Wholesale trade,Employment rate,Unemployment rate,Non-movers 1 yr ago,Non-migrants 1 yr ago,Migrants 1 yr ago,Occupations n/a,Management,Business and finance,Natural and applied sciences,Health,Social Science and education,Art,Sales and service,Trades and transport,Natural resources and agriculture,Manufacturing and utilities,full time,part time,car as driver,car as passenger,public transportation,walked,bicycle,other transportation,Worked at home,Worked at usual place,Worked outside Canada,No fixed workplace,education below postsecondary,education above postsecondary,Non-immigrants,Non-permanent residents,Immigrants
0,2018,Downtown,1.0,189.0,0,"Professional, scientific and technical services",3771.0,19.0,4.4,0.730159,0.173055,0.081688,0.015099,0.559279,0.021882,0.120096,0.004872,0.010514,0.02872,0.254637,0.466052,0.534034,0.083427,0.364501,0.388764,0.163308,0.466468,0.533613,0.563466,0.436534,0.025967,0.973891,0.000143,0.428205,0.571795,0.831288,0.171779,0.7939,0.2061,0.505055,0.216209,0.278651,0.515121,0.352068,0.086733,0.03709,0.008987,0.408475,0.003281,0.588244,...,0.026743,0.033117,0.060485,0.075981,0.083354,0.08048,0.004749,0.029493,0.007623,0.032117,0.202699,0.032742,0.037741,0.084979,0.028118,0.005749,0.034366,68.8,5.6,0.750343,0.130487,0.11917,0.018373,0.177728,0.195476,0.124609,0.058868,0.109861,0.084614,0.188851,0.032496,0.002875,0.006499,0.392405,0.349968,0.303788,0.021667,0.212424,0.413788,0.030909,0.017424,0.111626,0.787606,0.014698,0.08607,0.232857,0.767143,0.50446,0.094724,0.400901
1,2004,Grandview-Woodland,,100.0,1,Wholesale trade,312.0,7.0,6.8,0.54888,0.237271,0.155804,0.057026,0.629542,0.023256,0.146621,0.022892,0.013445,0.023801,0.140443,0.415306,0.584694,0.160787,0.281865,0.416416,0.140932,0.508243,0.491757,0.669828,0.330172,0.142807,0.186802,0.670391,0.33764,0.66236,0.797688,0.202312,0.906781,0.093219,0.384082,0.223469,0.392449,0.453561,0.318087,0.122207,0.09148,0.014316,0.440098,0.011876,0.548027,...,0.052213,0.057605,0.093927,0.022418,0.089955,0.068388,0.001419,0.081442,0.001419,0.059024,0.080874,0.031215,0.017877,0.078036,0.039444,0.00454,0.039728,66.3,7.8,0.791817,0.141853,0.06633,0.020999,0.075199,0.145573,0.073212,0.033201,0.129115,0.104427,0.238365,0.11748,0.016742,0.045403,0.427602,0.520362,0.424283,0.058685,0.341315,0.081282,0.07285,0.021248,0.081639,0.762785,0.004929,0.150647,0.460827,0.539173,0.658794,0.01986,0.321346
2,2005,Grandview-Woodland,,103.0,1,Wholesale trade,312.0,7.0,5.7,0.54888,0.237271,0.155804,0.057026,0.629542,0.023256,0.146621,0.022892,0.013445,0.023801,0.140443,0.415306,0.584694,0.160787,0.281865,0.416416,0.140932,0.508243,0.491757,0.669828,0.330172,0.142807,0.186802,0.670391,0.33764,0.66236,0.797688,0.202312,0.906781,0.093219,0.384082,0.223469,0.392449,0.453561,0.318087,0.122207,0.09148,0.014316,0.440098,0.011876,0.548027,...,0.052213,0.057605,0.093927,0.022418,0.089955,0.068388,0.001419,0.081442,0.001419,0.059024,0.080874,0.031215,0.017877,0.078036,0.039444,0.00454,0.039728,66.3,7.8,0.791817,0.141853,0.06633,0.020999,0.075199,0.145573,0.073212,0.033201,0.129115,0.104427,0.238365,0.11748,0.016742,0.045403,0.427602,0.520362,0.424283,0.058685,0.341315,0.081282,0.07285,0.021248,0.081639,0.762785,0.004929,0.150647,0.460827,0.539173,0.658794,0.01986,0.321346


In [9]:
# Partition the columns into categorical features, the target, and numeric
# features.
cat_vars = ['FOLDERYEAR', 'BusinessIndustry', 'LocalArea']
label = ['label']

# Only numeric/bool columns are eligible as numeric features. Other cells
# read string columns from `train` (e.g. `Status == 'Issued'`,
# `BusinessType`); taking "every remaining column" would send those strings
# into median imputation / scaling and break the `describe()` summary.
# NOTE(review): if the CSV carries a saved index column (e.g. 'Unnamed: 0'),
# it is numeric and would still be picked up here - confirm and drop if so.
_non_numeric_roles = set(cat_vars) | set(label)
num_vars = [col
            for col in train.select_dtypes(include=['number', 'bool']).columns
            if col not in _non_numeric_roles]

In [None]:
def describe_data(df, num_vars, cat_vars):
    """
    Report the shape of ``df`` and summarise its columns.

    Prints the number of rows and columns, then returns a tuple
    ``(num_summary, cat_summary)``:

    - ``num_summary`` is ``df[num_vars].describe()`` with an extra
      ``'missing'`` row holding the null count of each numeric variable.
    - ``cat_summary`` has one row per categorical variable with its name
      (``cat_var``), null count (``missing``) and number of distinct values
      (``unique``; a null counts as one distinct value).
    """
    n_rows, n_cols = df.shape
    print("The dataset has", n_rows, "rows and", n_cols, "columns.\n")

    # Numeric report: describe() statistics plus a null count per column.
    numeric_report = df[num_vars].describe()
    numeric_report.loc['missing'] = [df[name].isnull().sum() for name in num_vars]

    # Categorical report: built column-wise, one entry per variable.
    categorical_report = pd.DataFrame({
        'cat_var': list(cat_vars),
        'missing': [int(df[name].isnull().sum()) for name in cat_vars],
        'unique': [df[name].nunique(dropna=False) for name in cat_vars],
    })

    return numeric_report, categorical_report

In [None]:
# Summarise the training data and show the numeric and categorical reports.
num_df, cat_df = describe_data(train, num_vars, cat_vars)
display(num_df, cat_df)

In [None]:
# List the BusinessType values found on rows where BusinessIndustry is missing.
train[train.BusinessIndustry.isnull()].BusinessType.unique()

In [None]:
# Class balance of the target: number of rows per label value.
train['label'].value_counts()

## 3. Feature Engineering <a name="fe"></a>

- Including just cleaning for now, other steps to add in the future
- Will be moved to a separate feature engineering script

In [None]:
def feature_engineering(df):
    """
    Filter rows of ``df`` and split it into features and target.

    Keeps only rows whose ``LocalArea`` is not null and whose ``Status``
    equals ``'Issued'``, restricts the columns to the module-level
    ``num_vars + cat_vars + label`` lists, and returns ``(X, y)`` where
    ``X`` is the frame without the label column(s) and ``y`` is the
    ``'label'`` Series.
    """
    keep_rows = df.LocalArea.notnull() & (df.Status == 'Issued')
    selected = df.loc[keep_rows, num_vars + cat_vars + label]

    features = selected.drop(columns=label)
    target = selected['label']
    return features, target


## 4. Split Data <a name="sd"></a>

- The separate validation file is reserved for use as the test set, so the training data is split here into train and validation subsets

In [None]:
# Filter the training frame and separate the features from the target.
X_train_valid, y_train_valid = feature_engineering(train)
#X_test, y_test = feature_engineering(validation)

# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.2, random_state=2020)

## 5. Preprocessing <a name="pp"></a>

- TODO: include polynomial features (interactions, e.g., local area x business type); not implemented in the preprocessor below yet

In [None]:
# Numeric columns: fill missing values with the column median, then
# standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: replace missing values with the literal 'missing'
# category, then one-hot encode. handle_unknown='ignore' means categories not
# seen during fit do not raise at transform time.
# The step name 'onehot' is looked up by explain_model.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',
                              fill_value='missing')),
    ('onehot', OneHotEncoder(
        handle_unknown='ignore'))
])

# Route each column group to its transformer. The output column order is
# numeric features first, then the one-hot columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_vars),
        ('cat', categorical_transformer, cat_vars)
    ])

## 6. Baseline Model <a name="bm"></a>

[ROC AUC](https://github.com/dariyasydykova/open_projects/tree/master/ROC_animation) source

In [None]:
def evaluate_model(model, X_train=X_train, X_test=X_valid, y_train=y_train, y_test=y_valid, verbose=True):
    """
    Fit ``model`` on the training data and report its performance.

    Note: the default data arguments are bound when this cell is executed.
    After re-splitting the data, re-run this cell (or pass the data
    explicitly) so the defaults do not point at a stale split.

    Parameters
    ----------
    model : estimator or pipeline exposing ``fit``, ``score`` and ``predict``.
        It is (re)fitted in place.
    X_train, y_train : data used for fitting and for the train accuracy.
    X_test, y_test : data used for all validation metrics.
    verbose : bool
        If True, print train/validation accuracy and the classification
        report, draw the confusion matrix, and return ``None``.
        If False, print nothing and return a report dict.

    Returns
    -------
    None or dict
        When ``verbose`` is False, a dict with keys ``'renewed'``
        (``[f1, recall, precision]`` with the default positive label),
        ``'not_renewed'`` (the same metrics with ``pos_label=0``) and
        ``'accuracy'`` (``[train_acc, test_acc]``).
    """
    model.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)

    # Predict once and reuse the result for every metric below.
    y_pred = model.predict(X_test)

    if verbose:
        print("Train Accuracy:", train_acc)
        print("Validation Accuracy", test_acc, "\n")

        print(classification_report(y_test, y_pred))
        # Draw the confusion matrix. The returned display object is not
        # printed: its repr is just an object address.
        plot_confusion_matrix(model, X_test, y_test,
                              display_labels=[0, 1],
                              cmap=plt.cm.Blues,
                              values_format='d')
    else:
        report = {
            'renewed': [f1_score(y_test, y_pred),
                        recall_score(y_test, y_pred),
                        precision_score(y_test, y_pred)],
            'not_renewed': [f1_score(y_test, y_pred, pos_label=0),
                            recall_score(y_test, y_pred, pos_label=0),
                            precision_score(y_test, y_pred, pos_label=0)],
            'accuracy': [train_acc, test_acc],
        }

        return report

In [None]:
def roc_curve_auc(model, X_train=X_train, X_test=X_valid, y_train=y_train, y_test=y_valid):
    """
    Plot the ROC curve of an already-fitted ``model`` and annotate the AUC.

    The model is NOT fitted here; call this after the model has been fitted
    (for example by ``evaluate_model``). ``X_train`` and ``y_train`` are
    unused and kept only so the signature stays compatible with existing
    calls. The default data arguments are bound when this cell is executed.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict_proba``.
    X_test, y_test : data the curve and the AUC score are computed on.
    """
    # Score the positive class (column 1) once; reuse for curve and AUC.
    positive_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_proba)
    auc = roc_auc_score(y_test, positive_proba)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr)
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot((0, 1), (0, 1), '--k')
    ax.set_xlabel('false positive rate')
    ax.set_ylabel('true positive rate')
    ax.annotate(f'AUC score = {auc: .4f}', (0.0, 0.9))

In [None]:
# Baseline: class-weighted logistic regression. The 'saga' solver shuffles
# the data, so random_state is fixed (same seed as the train/validation
# split) to make the fitted coefficients reproducible across runs.
lr = LogisticRegression(solver='saga', class_weight='balanced',
                        random_state=2020)

lr_pip = Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', lr)])

# Fits the pipeline and prints the train/validation metrics.
evaluate_model(lr_pip)

In [None]:
# ROC curve for the logistic-regression pipeline (fitted by evaluate_model in the previous cell).
roc_curve_auc(lr_pip)

## 7. Feature Importance and Selection <a name="fi"></a>

- weights
- RFE
- L1, L2
- PCA

In [None]:
def explain_model(pip, df, verbose=True):
    """
    Show the top 30 feature weights of a fitted pipeline with eli5.

    Feature names are the module-level ``num_vars`` followed by the one-hot
    column names, matching the column order produced by the preprocessor
    (numeric transformer first, categorical second).

    Parameters
    ----------
    pip : fitted pipeline with a ``'preprocessor'`` step (a ColumnTransformer
        whose ``'cat'`` transformer contains an ``'onehot'`` step) and a
        ``'classifier'`` step.
    df, verbose : unused; kept so existing calls keep working.

    Returns
    -------
    The object returned by ``eli5.show_weights``.
    """
    # Look the encoder up by name rather than by position in transformers_.
    onehot = pip['preprocessor'].named_transformers_['cat']['onehot']

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; support both. Passing cat_vars yields readable
    # names such as 'LocalArea_Downtown' instead of 'x2_Downtown'.
    if hasattr(onehot, 'get_feature_names_out'):
        onehot_features = onehot.get_feature_names_out(cat_vars)
    else:
        onehot_features = onehot.get_feature_names(cat_vars)

    pp1_features = num_vars + list(onehot_features)

    return eli5.show_weights(pip['classifier'],
                             feature_names=pp1_features,
                             top=30)


In [None]:
# Show the largest feature weights of the fitted logistic-regression pipeline.
explain_model(lr_pip, X_train)

## 8. Other Models <a name="om"></a>

In [None]:
# LightGBM classifier; class_weight='balanced' weights classes inversely to
# their frequency in the training labels.
lgbm = LGBMClassifier(class_weight='balanced')

# Same preprocessing as the baseline, with LightGBM as the final step.
lgbm_pip = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', lgbm)])

# Fits the pipeline and prints the train/validation metrics.
evaluate_model(lgbm_pip)

In [None]:
# ROC curve for the LightGBM pipeline (fitted by evaluate_model in the previous cell).
roc_curve_auc(lgbm_pip)

In [None]:
# Show the top feature importances of the fitted LightGBM pipeline.
explain_model(lgbm_pip, X_train)

## 9. Hyperparameter Tuning <a name="ht"></a>

- regularization strength
- tree max depth
- sampling strategy for SMOTE
- missing value imputing strategy
- degree of polynomial terms

In [None]:
# Pipeline used for hyperparameter search.
# NOTE(review): despite the `lrhp` name, the classifier here is LightGBM,
# not logistic regression.
lrhp = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', LGBMClassifier(class_weight='balanced'))])

# Candidate values for the classifier's reg_lambda (L2 regularisation);
# the 'classifier__' prefix addresses the pipeline step of that name.
param_grid = {
    'classifier__reg_lambda': [0.1, 1],
}

# The grid search itself is currently disabled, so this cell only defines
# the pipeline and the grid.
#gs = GridSearchCV(lrhp, param_grid, cv=5, scoring='f1')
#gs.fit(X_train, y_train);

## 10. Other Tests <a name="ot"></a>

- SMOTE

In [None]:
# SMOTE has to sit inside an imblearn pipeline (imported at the top of the
# notebook as ImbPipeline), which applies samplers only while fitting and
# never at prediction time. Using the alias keeps `Pipeline` bound to
# scikit-learn's class for every other cell.
smote_pip = ImbPipeline(steps=[('preprocessor', preprocessor),
                               # random_state makes the synthetic samples
                               # reproducible (same seed as the split).
                               ('smote', SMOTE(sampling_strategy=0.85,
                                               random_state=2020)),
                               # A fresh classifier instead of the shared
                               # `lgbm` object, so fitting this pipeline does
                               # not overwrite the model inside `lgbm_pip`.
                               ('classifier', LGBMClassifier(class_weight='balanced'))])

# Candidate resampling ratios for a (currently disabled) grid search.
# NOTE: this rebinds the `param_grid` name defined in the tuning section.
param_grid = {
    'smote__sampling_strategy': [0.1, 0.5, 0.8, 1.0],
}

#gs = GridSearchCV(smote_pip, param_grid, cv=5, scoring='f1')
#gs.fit(X_train_valid, y_train_valid)
evaluate_model(smote_pip)