# Model Notebook

## Contents
- Util.py
- Preprocess.py
    - Encoding categorical columns
    - Preprocessed data
    - Rescaling numerical columns
- Data.py
    - Dependent and independent variables
    - Dimensionality reduction: PCA, TSNE
    - Class imbalance: SMOTE
- Model.py
    - Building model
    - Cross validation: KFold, StratifiedKfold
    - Evaluation metrics
    - Prediction
- Main.py
    - Dataset
    - Preprocessing
    - Data
    - Machine Learning model
        - Logistic Regression
        - XGBoost
        - CatBoost
        - Multi Layer Perceptron

## Util

In [None]:
%%writefile Util.py
# import system libraries
import warnings
warnings.filterwarnings('ignore')

### Preprocessing

In [1]:
%%writefile Preprocess.py

# import library for preprocessing
from sklearn.preprocessing import StandardScaler

# import libraries for data manipulation
import pandas as pd

import Util

# preprocessing class
class preprocess:
    """Preprocessing helpers for a dataframe holding the columns listed below.

    All three helpers are class methods: they only read the class-level
    column lists, so they can be called on the class, on an instance, or
    through the module-level aliases defined right after the class.
    """

    # create list containing categorical columns
    cat_cols = ['job', 'marital', 'education', 'default', 'housing',
                'loan', 'contact', 'month', 'day_of_week', 'poutcome']
    # create list containing numerical columns
    num_cols = ['duration', 'campaign', 'emp.var.rate', "pdays", "age", 'cons.price.idx',
                'cons.conf.idx', 'euribor3m', 'nr.employed', 'previous']

    # function to encode categorical columns
    @classmethod
    def encode(cls, data):
        """Return the one-hot encoding of the categorical columns of `data`.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame containing every column named in `cat_cols`.

        Returns
        -------
        pandas.DataFrame
            One dummy column per (column, category) pair; no level is dropped.
        """
        return pd.get_dummies(data[cls.cat_cols], drop_first=False)

    # function to build the model-ready dataframe
    @classmethod
    def preprocessed(cls, data):
        """Return numerical columns + encoded categorical columns + target.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame containing `cat_cols`, `num_cols` and a 'subscribed' column.

        Returns
        -------
        pandas.DataFrame
            A new, independent frame; `data` itself is not modified.
        """
        # encode once and reuse the result for both the concat and the names
        encoded = cls.encode(data)
        combined = pd.concat([data, encoded], axis=1)
        # features first (numerical, then encoded categorical), output last
        cols_input = cls.num_cols + list(encoded.columns)
        # copy so that later column assignments do not act on a slice
        return combined[cols_input + ['subscribed']].copy()

    # function to rescale numerical columns
    @classmethod
    def rescale(cls, data):
        """Return a copy of `data` with the numerical columns standardised.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame containing every column named in `num_cols`.

        Returns
        -------
        pandas.DataFrame
            Copy of `data` whose `num_cols` were passed through a freshly
            fitted StandardScaler. The caller's frame is left untouched.
        """
        scaled = data.copy()
        # creating an instance of the scaler object
        scaler = StandardScaler()
        scaled[cls.num_cols] = scaler.fit_transform(scaled[cls.num_cols])
        return scaled


# module-level aliases so callers can import the helpers directly
encode = preprocess.encode
preprocessed = preprocess.preprocessed
rescale = preprocess.rescale

Writing Preprocess.py


### Data Loader

In [2]:
%%writefile Data.py

# importing all necessary libraries

# import methods from Preprocess.py
from Preprocess import encode, preprocessed, rescale

# import libraries for data manipulation
import pandas as pd

# import libraries for visualization
import matplotlib.pyplot as plt

# import library for splitting dataset
from sklearn.model_selection import train_test_split

# import library for dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# import library for dealing with class imbalance
from imblearn.over_sampling import SMOTE

# function to get the dependent and independent variable
def data_loader(data):
    """Split `data` into the feature frame and the target series.

    The target is the 'subscribed' column; the features are every other
    column except 'duration', which is dropped together with the target.
    The shapes of both parts are printed before they are returned.
    """
    target = data["subscribed"]
    features = data.drop(columns=[ "subscribed", 'duration'])
    print("X shape:", features.shape)
    print("y shape:", target.shape)
    return features, target

# function to split dataset
def split_data(X, y):
    """Hold out 10% of the rows as a test set and report the resulting shapes.

    The split uses a fixed random_state of 1. Returns the tuple
    (X_train, X_test, y_train, y_test).
    """
    parts = train_test_split(X, y, test_size=0.1, random_state=1)
    X_train, X_test, y_train, y_test = parts
    # shapes of the training part
    print(f'Train set X shape: {X_train.shape}')
    print(f'Train set y shape: {y_train.shape}')
    # shapes of the held-out part
    print(f'Test set X shape: {X_test.shape}')
    print(f'Test set y shape: {y_test.shape}')
    return X_train, X_test, y_train, y_test

# function to get the number of components for dimensionality reduction
def pca(data):
    """Plot the cumulative explained variance of a full PCA fitted on `data`.

    The curve is used to choose how many components to keep. The figure is
    saved as 'pca.png' in the current working directory and then shown.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Data the PCA is fitted on.

    Returns
    -------
    None
    """
    # fit a PCA that keeps every component
    reducer = PCA()
    reducer.fit(data)
    # cumulative share of variance explained by the first k components
    cumulative = reducer.explained_variance_ratio_.cumsum()
    # the x axis must follow the number of components actually produced;
    # a fixed range only works for one specific feature count
    n_components = len(cumulative)
    plt.figure(figsize=(12,6))
    plt.plot(range(1, n_components + 1), cumulative, marker='o', linestyle='--')
    plt.title('Explained Variance by Components')
    plt.xlabel('Number of Components')
    plt.ylabel('Cummulative Explained Variance')
    plt.savefig('pca.png')
    plt.show()

# function to reduce dimensions
def dimension_reduction(method, components, train_data, test_data):
    """Reduce the train and test features to `components` dimensions.

    Parameters
    ----------
    method : str
        'PCA' or 'TSNE'.
    components : int
        Number of output dimensions, passed as `n_components`.
    train_data, test_data : array-like
        Feature matrices to reduce.

    Returns
    -------
    tuple of pandas.DataFrame
        (X_train_reduced, X_test_reduced), both with a fresh default index.

    Raises
    ------
    ValueError
        If `method` is neither 'PCA' nor 'TSNE'.
    """
    # PCA
    if method == 'PCA':
        pca = PCA(n_components=components)
        pca.fit(train_data)
        pca_train = pca.transform(train_data)
        X_train_reduced = pd.DataFrame(pca_train)
        print("original shape:   ", train_data.shape)
        print("transformed shape:", X_train_reduced.shape)
        print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))
        # the projection learned on the training set is reused for the test set
        pca_test = pca.transform(test_data)
        X_test_reduced = pd.DataFrame(pca_test)

    # TSNE
    elif method == 'TSNE':
        tsne = TSNE(n_components=components)
        tsne_train = tsne.fit_transform(train_data)
        X_train_reduced = pd.DataFrame(tsne_train)
        print("original shape:   ", train_data.shape)
        print("transformed shape:", X_train_reduced.shape)
        # NOTE(review): the estimator is refitted on the test set here, so the
        # train and test embeddings come from two separate fits - confirm this
        # is acceptable for the downstream model
        tsne_test = tsne.fit_transform(test_data)
        X_test_reduced = pd.DataFrame(tsne_test)

    else:
        # fail with a clear error instead of falling through to the return
        # statement with both result variables undefined
        raise ValueError(
            'Dimensionality reduction method not found! '
            f"Got {method!r}, expected 'PCA' or 'TSNE'."
        )

    return X_train_reduced, X_test_reduced

# function to deal with imbalanced class
def class_imbalance(X_data, y_data):
    """Oversample with SMOTE and return the resampled data.

    Parameters
    ----------
    X_data : array-like
        Feature matrix.
    y_data : array-like
        Class labels matching the rows of `X_data`.

    Returns
    -------
    tuple
        (X_resampled, y_resampled) as returned by SMOTE.
    """
    # creating an instance with a fixed seed so the result is reproducible
    sm = SMOTE(random_state=27)
    # fit_resample is the supported name for this operation; the older
    # fit_sample alias is no longer available in current imbalanced-learn
    X_train_smote, y_train_smote = sm.fit_resample(X_data, y_data)
    return X_train_smote, y_train_smote

Writing Data.py


## Machine Learning Model

In [3]:
%%writefile Model.py

# import all necessary libraries

# import methods from data.py
from Data import data_loader, split_data, pca
from Data import dimension_reduction, class_imbalance

# import libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# import machine learning model libraries
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# import libraries for cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# import evaluation metrics
from sklearn.metrics import accuracy_score,recall_score,precision_recall_curve, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

# function to build machine learning models
# helper: print the mean of every entry returned by cross_validate
def _print_cv_scores(cv_scores):
    for name, values in cv_scores.items():
        mean_value = values.mean()
        if name.endswith('_time'):
            # fit_time / score_time are durations in seconds, not ratios
            print(name+':', '%.2fs' % mean_value)
        else:
            print(name+':', '%.2f%%' % (mean_value*100))
        print('')


# function to build machine learning models
def model(model, cv_method, metrics, X_train, X_test, y_train):
    """Create a classifier, cross-validate it on the training data and return it.

    Parameters
    ----------
    model : str
        'LR', 'XGB', 'MLP' or 'SVM'. For 'SVM' the kernel is read from
        standard input.
    cv_method : str
        'KFold' or 'StratifiedKFold'; both use 10 shuffled splits.
    metrics : list of str
        Scorer names passed to cross_validate. If scoring with them raises,
        the run is repeated with accuracy, f1, precision and recall.
    X_train, y_train : array-like
        Training features and labels used for cross validation.
    X_test : array-like
        Not used; kept so existing callers keep working.

    Returns
    -------
    estimator
        The classifier instance. cross_validate works on clones, so the
        returned instance is not fitted.

    Raises
    ------
    ValueError
        If `model` or `cv_method` is not one of the supported names.
    """
    # validate both selectors before any work is done, so a typo gives a
    # clear error instead of an undefined-variable failure later on
    if model not in ('LR', 'XGB', 'MLP', 'SVM'):
        raise ValueError(
            f"Model not found! Got {model!r}, expected 'LR', 'XGB', 'MLP' or 'SVM'."
        )
    if cv_method not in ('KFold', 'StratifiedKFold'):
        raise ValueError(
            'Cross validation method not found! '
            f"Got {cv_method!r}, expected 'KFold' or 'StratifiedKFold'."
        )

    if model == 'LR':
        # creating an instance of the regression
        model_inst = LogisticRegression()
        print('Logistic Regression\n----------------------')
    elif model == 'XGB':
        # creating an instance of the classifier
        model_inst = XGBClassifier()
        print('XGBoost\n----------------------')
    elif model == 'MLP':
        # creating an instance of the classifier
        model_inst = MLPClassifier()
        print('Multi Layer Perceptron\n----------------------')
    else:
        # 'SVM': the kernel is chosen interactively
        kernel = input('Enter the kernel (rbf, linear, or poly):')
        model_inst = SVC(kernel=kernel, C=1.0)
        print('Support Vector Classification\n----------------------')

    # cross validation
    # random_state only takes effect together with shuffle=True, and recent
    # scikit-learn releases reject a random_state on an unshuffled splitter
    if cv_method == 'KFold':
        print('Cross validation: KFold\n--------------------------')
        cv = KFold(n_splits=10, shuffle=True, random_state=100)
    else:
        print('Cross validation: StratifiedKFold\n--------------------------')
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)

    try:
        cv_scores = cross_validate(model_inst, X_train, y_train,
                                   cv=cv, scoring=metrics)
    except Exception as err:
        # best-effort fallback: retry once with a default metric set; if the
        # failure was not caused by the metrics, the retry raises again
        print(f'Cross validation with the requested metrics failed ({err}); '
              'falling back to default metrics')
        metrics = ['accuracy', 'f1', 'precision', 'recall']
        cv_scores = cross_validate(model_inst, X_train, y_train,
                                   cv=cv, scoring=metrics)

    # displaying evaluation metric scores
    _print_cv_scores(cv_scores)

    return model_inst
    
# function to make predictions
def prediction(model, model_name, X_train, y_train, X_test, y_test):
    """Fit `model`, predict the test set and plot the confusion matrix.

    The heatmap is titled with `model_name`, saved as
    'conf_<model_name>.png' in the current working directory and then shown.
    Nothing is returned.
    """
    # train on the full training set, then predict the held-out rows
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    # confusion matrix of true labels against predicted labels
    matrix = confusion_matrix(y_test, predicted)
    sns.heatmap(matrix, annot=True, fmt='.0f')
    plt.title(f'{model_name} Confusion Matrix')
    plt.savefig(f'conf_{model_name}.png')
    plt.show()

Writing Model.py


## Main - Original Dataset


In [4]:
%%writefile Main_org.py

# import all necessary libraries

# import library to get working directory
import os

# import libraries for data manipulation
import pandas as pd
import numpy as np

import Util

# import methods from Preprocess.py
from Preprocess import encode, preprocessed, rescale

# import methods from Data.py
from Data import data_loader, split_data, pca
from Data import dimension_reduction, class_imbalance

# import methods from Model.py
from Model import model, prediction

# Dataset
# function to change working directory
def change_dir(path):
    """Switch the working directory to `path`, printing the old and new one."""
    previous = os.getcwd()
    print("Old directory: ", previous)
    os.chdir(path)
    current = os.getcwd()
    print("New directory: ", current)
    
# changing the working directory to access the dataset
# NOTE(review): absolute, machine-specific path - the script only runs where
# this folder exists
change_dir('C:\\Users\\PC\\Desktop\\Data Science\\10 Academy\\Training\\Week 6\\Challenge\\Dataset')    

# import the original dataset (semicolon-separated file)
dataset = pd.read_csv('bank-additional-full.csv', sep=';')
# attach a label to the frame object
dataset.name = 'dataset'
print("Original Dataset\n-------------------------")
print(dataset.head())

# changing the working directory back to the notebooks folder, so the files
# written later (plots) are saved there
change_dir('C:\\Users\\PC\\Desktop\\Data Science\\10 Academy\\Training\\Week 6\\Challenge\\Notebooks')

# Preprocessing
# NOTE(review): the original comment said this step uses "the new dataset
# i.e. data without outliers", but the frame used here is the one printed
# above as "Original Dataset" - confirm which one is intended
# replacing basic.4y, basic.6y, basic.9y as basic
dataset['education'] = dataset['education'].replace(['basic.4y', 'basic.6y', 'basic.9y'], 'basic')

# defining output variable for classification: 1 when y == 'yes', else 0
dataset['subscribed'] = (dataset.y == 'yes').astype('int')

# encoding categorical columns (printed for inspection only)
encoded_data = encode(dataset)
print("Encoded Data\n-------------------------")
print(encoded_data.head())

# preprocessed data: numerical + encoded categorical columns + 'subscribed'
preprocessed_data = preprocessed(dataset)
print("Preprocessed Data\n-------------------------")
print(preprocessed_data.head())

# rescaling numerical columns
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split below - confirm this is intended
preprocessed_data = rescale(preprocessed_data)
print("Rescaled Data\n-------------------------")
print(preprocessed_data.head())

# Data
# dependent and independent variables
X, y = data_loader(preprocessed_data)

# splitting the data
X_train,X_test,y_train,y_test = split_data(X, y)

# pca visualization to get number of components
pca(X_train)

# dimensionality reduction: keep 20 principal components, fitted on the
# training set and applied to the test set
X_train_reduced, X_test_reduced = dimension_reduction('PCA', 20, X_train, X_test)

# dealing with imbalanced class: only the training data is oversampled
X_train_smote, y_train_smote = class_imbalance(X_train_reduced, y_train)

# machine learning model
# metrics reported for every cross validation run
metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']

# For each model: cross validate with both splitters on the oversampled
# training data, then fit the instance returned by the second run and plot
# its confusion matrix on the untouched test set.

# 1. Logistic Regression
# KFold cross validation
model_res = model('LR', 'KFold', metrics, X_train_smote, X_test_reduced, y_train_smote)
# StratifiedKFold cross validation
model_res = model('LR', 'StratifiedKFold', metrics, X_train_smote, X_test_reduced, y_train_smote)
# make prediction (label fixed: the model is a logistic regression, and the
# label ends up in the plot title and in the saved file name)
prediction(model_res, 'Logistic Regression', X_train_smote, y_train_smote, X_test_reduced, y_test)

# 2. XGBoost
# KFold cross validation
model_res = model('XGB', 'KFold', metrics, X_train_smote, X_test_reduced, y_train_smote)
# StratifiedKFold cross validation
model_res = model('XGB', 'StratifiedKFold', metrics, X_train_smote, X_test_reduced, y_train_smote)
# make prediction
prediction(model_res, 'XGBoost Classifier', X_train_smote, y_train_smote, X_test_reduced, y_test)

# 3. Multi Layer Perceptron
# KFold cross validation
model_res = model('MLP', 'KFold', metrics, X_train_smote, X_test_reduced, y_train_smote)
# StratifiedKFold cross validation
model_res = model('MLP', 'StratifiedKFold', metrics, X_train_smote, X_test_reduced, y_train_smote)
# make prediction
prediction(model_res, 'Multi Layer Perceptron', X_train_smote, y_train_smote, X_test_reduced, y_test)

Writing Main_org.py
