**********************************************************************************************
# Initialization
**********************************************************************************************

In [1]:
# Load librarie
import numpy as np
from scipy.stats import uniform
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10)
import time

from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


In [1]:
# ''' Remove this comment out if needs to be used
from google.colab import drive
drive.mount('/content/drive')
# ''

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# ''' Remove this comment out if needs to be used
import os
##############################################################################

os.chdir("/content/drive/My Drive/DataMining/Final_Project")
!ls
# '''

**********************************************************************************************
# Functions
**********************************************************************************************

ROC Curve plot function

This is a callable ROC curve plot function. We will use this function to plot ROC Curve for all the models. We have used Seaborn package.

In [2]:
import seaborn as sns
sns.set('talk', 'whitegrid', 'dark', font_scale=1,rc={"lines.linewidth": 2, 'grid.linestyle': '--'})
def plotAUC(truth, pred, lab):
    fpr, tpr, _ = metrics.roc_curve(truth,pred)
    roc_auc = metrics.auc(fpr, tpr)
    lw = 2
    c = (np.random.rand(), np.random.rand(), np.random.rand())
    plt.plot(fpr, tpr, color= c,lw=lw, label= lab +'(AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve') #Receiver Operating Characteristic 
    plt.legend(loc="lower right")


def plotRocCurve(pred,truth, lab):
  import matplotlib.pyplot as plt

  # AUC Curve
  # y_pred_proba = classifier.predict_proba(pred)[:,1]
  fpr, tpr, thresholds = metrics.roc_curve(truth,pred) 
  auc = metrics.auc(fpr, tpr)#metrics.roc_auc_score(fpr, tpr)
  plt.plot(fpr,tpr,label= lab + ", auc="+str(auc))
  plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
  plt.legend(loc="lower right")
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('Receiver operating characteristic (ROC) ')
  plt.show()

  print('\nArea under ROC curve is (AUC) :',str(format(auc,'.6f') ))

Confusion Matrix Viz function

This is a callable Confusion Matrix Visualization function. We have used this function to visualize True positives, True Negatives, False Positives and False Negatives for all the models.

In [None]:
import itertools
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(Y,Ypred, normalize=False): # This function prints and plots the confusion matrix.
    cm = confusion_matrix(Y, Ypred, labels=[0, 1])
    classes=["Will Pay", "Will Default"]
    cmap = plt.cm.Blues
    title = "Confusion Matrix"
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = np.around(cm, decimals=3)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
def accuracy(confusion_matrix):
   diagonal_sum = confusion_matrix.trace()
   sum_of_all_elements = confusion_matrix.sum()
   return diagonal_sum / sum_of_all_elements

**********************************************************************************************
# 1.load data
**********************************************************************************************

In [None]:
import pandas as pd
##############################################################################
 
data = pd.read_pickle('dataset.pkl')


data.columns = ['id', 'dates', 'transaction_amount', 'days_before_request',
                       'loan_amount', 'loan_date', 'isDefault']

# drop all those rows which  
# have any 'nan' value in it. 
data.dropna(inplace = True) 

print('\n')
print('*' * 80)
print('\n')
print('Bank Customer Transactions Dataset:\n')
# print(data.head())
data

## Formulate Data to new prospective

In [None]:
x_data = np.zeros([15000,6])
y_label = np.zeros(15000)

cust_id_list = []

for cust in np.arange(0,15000):
  cust_id_list.append(data['id'][cust][1])
  ts = data['transaction_amount'][cust][1]
  sum_ts = np.sum(ts)
  loan_amt = data['loan_amount'][cust][1]

  if cust <=9999:
    isdef = data['isDefault'][cust][1]
    y_label[cust] = isdef
    # print(y_label)

  x_data[cust,0] = np.mean(ts[np.where(ts>0)])
  x_data[cust,1] = np.mean(ts[np.where(ts<0)])
  x_data[cust,2] = sum_ts
  x_data[cust,3] = np.max(ts)
  x_data[cust,4] = np.min(ts)
  x_data[cust,5] = loan_amt


dataframe1 = pd.DataFrame(np.array(cust_id_list), columns = ['id'])

coulumsName = ['mean_+ts', 'mean_-ts', 'sum_ts', 'max_ts',
                       'min_ts', 'loan_amount']
dataframe2 = pd.DataFrame.from_records(x_data, columns = coulumsName)
dataframe2['isDefault'] = y_label
dataframe2

# Place the DataFrames side by side
data_formulated = pd.concat([dataframe1, dataframe2], axis=1)
data_formulated = data_formulated.drop('id', axis=1)


# drop all those rows which  
# have any 'nan' value in it. 
# data_formulated.dropna(inplace = True) 
data_formulated.fillna(data_formulated.mean(),inplace = True)


print('\n')
print('*' * 80)
print('\n')
print('New Formulated Data For Bank Customer Transactions Dataset:\n')
data_formulated

data_to_use = data_formulated.head(10000)
data_to_use

**********************************************************************************************
# 2.Summarize Data
**********************************************************************************************

**********************************************************************************************
## 2.A Understand the Data With Descriptive Statistics
**********************************************************************************************
We must understand the data in order to get the best results. We will discover 7 recipes that we can use in Python to better understand our machine learning data. 
1. Take a peek at our raw data.
2. Review the dimensions of our dataset.
3. Review the data types of attributes in our data.
4. Summarize the distribution of instances across classes in our dataset.
5. Summarize our data using descriptive statistics.
6. Understand the relationships in our data using correlations.
7. Review the skew of the distributions of each attribute.

2.A.1. Peek at our Data

In [None]:
#2.A.1. Peek at our Data
#review the first 20 rows
print('\n')
print('*' * 80)
print('\n')
print('New Formulated Data For Bank Customer Transactions Dataset:\n')
peek = data_to_use.head(5)
peek
# We can confirm that the scales for the attributes are all over the place 
# because of the differing units. We may benefit from some transforms later on.

2.A.2.Dimensions of Our Data

In [None]:
#2.A.2.Dimensions of Our Data
print(data_to_use.shape)

2.A.3.Data Type For Each Attribute

In [None]:
#2.A.3.Data Type For Each Attribute
# types = data_to_use.dtypes
# print(types)
print('\n')
print('*' * 80)
print('\n')
print('Data Type:\n')
data_to_use.info()

# We can see that all of the attributes are numeric of integers (int) type

2.A.4.Descriptive Statistics

In [None]:
#2.A.4.Descriptive Statistics
# Descriptive statistics can give us great insight into the shape of each attribute. Often we can
# create more summaries than we have time to review. The describe() function on the Pandas
# DataFrame lists 8 statistical properties of each attribute. They are:
#  Count.
#  Mean.
#  Standard Deviation.
#  Minimum Value.
#  25th Percentile.
#  50th Percentile (Median).
#  75th Percentile.
#  Maximum Value.
print('\n')
print('*' * 80)
print('\n')
print('Descriptive Statistics For Bank Customer Transactions Dataset:\n')
pd.set_option('display.width', 100)
# pd.set_option('precision', 3)
description = data_to_use.describe()
description

# We now have a better feeling for how different the attributes are. The min and max values
# as well as the means vary a lot. We are likely going to get better results by rescaling the data
# in some way.

# Data has interestingly differing mean values.
# There may be some benefit from standardizing the data.

2.A.5.Class Distribution (Classi
cation Only)

In [None]:
# 2.A.5.Class Distribution (Classi
cation Only)
print('\n')
print('*' * 80)
# print('\n')
print('Class Distribution:\n')
class_counts = data_to_use.groupby('isDefault').size()
print(class_counts)

# We can see that the classes are reasonably balanced between output values.

2.A.6.Correlations Between Attributes

In [None]:
# 2.A.6.Correlations Between Attributes
# Correlation refers to the relationship between two variables and how they may or may not
# change together. The most common method for calculating correlation is Pearson's Correlation
# Coecient, that assumes a normal distribution of the attributes involved. A correlation of -1
# or 1 shows a full negative or positive correlation respectively. Whereas a value of 0 shows no
# correlation at all. Some machine learning algorithms like linear and logistic regression can su
ffer
# poor performance if there are highly correlated attributes in your dataset. As such, it is a good
# idea to review all of the pairwise correlations of the attributes in our dataset. We can use the
# corr() function on the Pandas DataFrame to calculate a correlation matrix.

# Pairwise Pearson correlations
print('\n')
print('*' * 80)
print('\n')
print('Correlations Marix Between Attributes :\n ')
correlations = data_to_use.corr(method='pearson')
correlations

# This is interesting. We can see that many of the attributes have a weak correlation

2.A.7.Skew of Univariate Distributions

In [None]:
# 2.A.7.Skew of Univariate Distributions
# Skew refers to a distribution that is assumed Gaussian (normal or bell curve) that is shifted or
# squashed in one direction or another. Many machine learning algorithms assume a Gaussian
# distribution. Knowing that an attribute has a skew may allow you to perform data preparation
# to correct the skew and later improve the accuracy of your models. We can calculate the skew
# of each attribute using the skew() function on the Pandas DataFrame.

# Skewness refers to distortion or asymmetry in a symmetrical bell curve, 
# or normal distribution, in a set of data. If the curve is shifted to the 
# left or to the right, it is said to be skewed. Skewness can be quantified 
# as a representation of the extent to which a given distribution varies 
# from a normal distribution

# Skew for each attribute
print('\n')
print('*' * 80)
print('\n')
print('Skew of Univariate Distributions:\n ')
skew = data_to_use.skew()
print(skew)

**********************************************************************************************
## 2.B Understand the Data With Visualization
**********************************************************************************************
We must understand the data in order to get the best results from machine learning algorithms.
The fastest way to learn more about the data is to use data visualization.

plot the data using:

**Univariate Plots**
1.   Histograms.
2.   Density Plots.
3.   Box and Whisker Plots.

**Multivariate Plots**
4.   Correlation Matrix Plot.
5.   Scatter Plot Matrix.
  
  




**********************************************************************************************
### (2.B.1)Univariate Plots
**********************************************************************************************
1.   Histograms.
2.   Density Plots.
3.   Box and Whisker Plots.

**Histograms**

A fast way to get an idea of the distribution of each attribute is to look at histograms. Histograms
group data into bins and provide us a count of the number of observations in each bin. From
the shape of the bins we can quickly get a feeling for whether an attribute is Gaussian, skewed
or even has an exponential distribution. It can also help us see possible outliers.

In [None]:
# Univariate Histograms
print('\n')
print('*' * 80)
print('\n')
print('Univariate Histograms:\n ')
data_to_use.hist(figsize=(15,15))
plt.show()

# We can see that no attributes  has an exponential
# distribution. We can also see that perhaps some attributes
# have a Gaussian or nearly Gaussian distribution. This is interesting because many machine learning
# techniques assume a Gaussian univariate distribution on the input variables.
# This is useful to note as we can use algorithms that can exploit this assumption (Gaussian distribution).

#Some has binomial distribution such as Gender

#It also looks like some attributes may be skewed Gaussian distributions, which
# might be helpful later with transforms.

Density Plots
Density plots are another way of getting a quick idea of the distribution of each attribute. The plots look like an abstracted histogram with a smooth curve drawn through the top of each bin, much like your eye tried to do with the histograms.

In [None]:
# Univariate Density Plots
print('\n')
print('*' * 80)
print('\n')
print('Univariate Density Plots:\n ')
data_to_use.plot(kind='density', subplots=True, layout=(3,3), sharex=False)
plt.show()

# This is useful, you can see that some of the attributes have a skewed distribution. A power
# transform like a Box-Cox transform that can correct for the skew in distributions might be
# useful.

# This helps point out the skew in many distributions so much so that data looks like outliers
# (e.g. beyond the whisker of the plots).

**Box and Whisker Plots**

Another useful way to review the distribution of each attribute is to use Box and Whisker Plots
or boxplots for short. Boxplots summarize the distribution of each attribute, drawing a line for
the median (middle value) and a box around the 25th and 75th percentiles (the middle 50% of
the data). The whiskers give an idea of the spread of the data and dots outside of the whiskers
show candidate outlier values (values that are 1.5 times greater than the size of spread of the
middle 50% of the data).

In [None]:
# Box and Whisker Plots
print('\n')
print('*' * 80)
print('\n')
print('Box and Whisker Plots:\n ')
s = data_to_use.plot(kind='box', color='red', subplots=True, layout=(3,3), sharex=False, sharey=False)

# We can see that attributes do have quite dierent spreads. Given(if) the scales are the same, it
# may suggest some benefit in standardizing the data for modelling to get all of the means lined
# up.

**********************************************************************************************
### (2.B.2)Multivariate Plots
**********************************************************************************************
This section describes two plots that show the interactions between multiple variables
in the dataset.

1.   Correlation Matrix Plot.
2.   Scatter Plot Matrix.





**Correlation Matrix Plot**

Correlation gives an indication of how related the changes are between two variables. If two
variables change in the same direction they are positively correlated. If they change in opposite
directions together (one goes up, one goes down), then they are negatively correlated. We can
calculate the correlation between each pair of attributes. This is called a correlation matrix. we
can then plot the correlation matrix and get an idea of which variables have a high correlation with each other. This is useful to know, because some machine learning algorithms like linear and logistic regression can have poor performance if there are highly correlated input variables in our data.

In [None]:
# Correlation Matrix Plot
print('\n')
print('*' * 80)
print('\n')
print('Correlation Matrix Plot:\n ')
correlations = data_to_use.corr()
# plot correlation matrix
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0,6,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
names = list(data_to_use.columns)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

# The dark yellow color shows positive correlation whereas the dark mauve color shows negative
# correlation. if there is strong correlatoin(+ve or -ve), this suggest candidates for removal
# to better improve accuracy of models later on.

**********************************************************************************************
# 3.Prepare the Data For Machine Learning
**********************************************************************************************
Many machine learning algorithms make assumptions about the data. It is often a very good
idea to prepare the data in such way to best expose the structure of the problem to the machine
learning algorithms that we intend to use. We will prepare
the data for machine learning (Data Transforms) in Python using scikit-learn:


1.   Discretization-Criteria
2. Rescale data.
3. Standardize data.
4. Normalize data.
5. Binarize data.
6. Log Transform certain features
7. Dealing with categorical features

**********************************************************************************************
## A. Data Transforms
**********************************************************************************************

Need For Data Pre-processing
We almost always need to pre-process the data. It is a required step. A difficulty is that diff
erent algorithms make diff
erent assumptions about the data and may require diff
erent transforms. Further, when we follow all of the rules and prepare the data, sometimes algorithms can deliver better results without pre-processing. Generally, We would recommend creating many diff
erent views and transforms of the data, then exercise a handful of algorithms on each view of the dataset. This will help us to flush out which data transforms might be better at exposing the structure of our problem in general.

**Data Transforms**

We will work through 4 di
fferent data pre-processing recipes for machine learning.

Each recipe follows the same structure:
 
1.   Load the dataset.
2.   Split the dataset into the input and output variables for machine learning.
3. Apply a pre-processing transform to the input variables.
4. Summarize the data to show the change.

The scikit-learn library provides two standard idioms for transforming data. Each are useful in diff
erent circumstances. The transforms are calculated in such a way that they can be applied to the training data and any samples of data we may have in the future. The scikit-learn documentation has some information on how to use various di
fferent pre-processing methods:
 Fit and Multiple Transform.
Combined Fit-And-Transform.

**********************************************************************************************
### 3.A.2. Rescale Data
**********************************************************************************************
When the data is comprised of attributes with varying scales, many machine learning algorithms
can benefi
t from rescaling the attributes to all have the same scale. Often this is referred to
as normalization and attributes are often rescaled into the range between 0 and 1. This is
useful for optimization algorithms used in the core of machine learning algorithms like gradient
descent. It is also useful for algorithms that weight inputs like regression and neural networks
and algorithms that use distance measures like k-Nearest Neighbors. We can rescale the data
using scikit-learn using the MinMaxScaler class

In [None]:
# ''' Remove this comment out if needs to be used
no_of_coulmns = data_to_use.shape[1]
no_of_attrib = no_of_coulmns -1

array = data_to_use.values

X = array[:,0:no_of_attrib]
Y = array[:,no_of_attrib]


from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)


dataframe = pd.DataFrame.from_records(data=X_scale, 
                                      columns=data_to_use.drop(columns=['isDefault']).columns)

dataframe['isDefault'] = Y

data_to_use = dataframe
data_to_use
# '''

**********************************************************************************************
## B. Feature Selection For Machine Learning
**********************************************************************************************

The data features that we use to train our machine learning models have a huge influence on the performance we can achieve. Irrelevant or partially relevant features can negatively impact model performance. We will discover automatic feature selection techniques
that we can use to prepare our machine learning data in Python with scikit-learn. 

1. Univariate Selection.
2. Recursive Feature Elimination.
3. Principle Component Analysis.
4. Feature Importance.

**********************************************************************************************
### 3.B.5. Finding Important Features Using RandomForestClassifier
**********************************************************************************************
perform this task in the following steps:
1. First, we need to create a random forests model.
2. Second, use the feature importance variable to see feature importance scores.
3. Third, visualize these scores using the seaborn library.

In [None]:
# ''' Remove this comment out if needs to be used
from sklearn.ensemble import RandomForestClassifier
##############################################################################
no_of_coulmns = data_to_use.shape[1]
no_of_attrib = no_of_coulmns -1

array = data_to_use.values

X = array[:,0:no_of_attrib]
Y = array[:,no_of_attrib]

classifier_randomForest = RandomForestClassifier(n_estimators=100)

#Fitting the training data to the network
classifier_randomForest.fit(X, Y)

feature_imp = pd.Series(classifier_randomForest.feature_importances_,index=data_to_use.drop(columns=['isDefault']).columns).sort_values(ascending=False)

print('\n')
print('*' * 80)
print('\nFinding Important Features')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend(feature_imp)
plt.show()

print('\nImportance of Features: \n\n',feature_imp)
# '''

In [None]:
# ''' Remove this comment out if needs to be used
#Remove some features from Dataset
list_to_delete=[]; list_to_delete.clear()

# print('\n\n')
for i in range(-2,0):
  # print(i)
  list_to_delete.append(feature_imp.index[i])

# Delete multiple columns from the dataframe
data_to_use = data_to_use.drop(list_to_delete, axis=1)
data_to_use
# '''

**********************************************************************************************
# 5.Evaluate Some Algorithms
**********************************************************************************************
Now it is time to create some models of the data and estimate their accuracy on unseen data.
Here is what we are going to cover in this step:
1. Separate out a validation dataset.
2. Setup the test harness to use 10-fold cross validation.
3. Build 5 di
erent models to predict species from 
ower measurements
<!-- 4. Select the best model. -->

**********************************************************************************************
## 5.1 Create a Validation Dataset
**********************************************************************************************
We will split the loaded dataset into two, 67%
of which we will use to train our models and 33% that we will hold back as a validation dataset.


In [None]:
from sklearn.model_selection import train_test_split
##############################################################################

# Split-out validation dataset
no_of_coulmns = data_to_use.shape[1]
no_of_attrib = no_of_coulmns -1

array = data_to_use.values
# no_of_attributes = data_to_use_feature_selected.shape[1]
# array = data_to_use_feature_selected.values
# no_of_attributes = discretizedData.shape[1]
# array = discretizedData.values

X = array[0:10000,0:no_of_attrib]
Y = array[0:10000,no_of_attrib]

seed = 7

X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X, Y, test_size=0.33,random_state=seed)
X_validation, X_test, Y_validation, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5,random_state=seed)

print('\n')
print('*' * 80)
print('\nCreate a Training, Validation & Test Dataset:\n')
# Show the results of the split
print ("Training set has {} samples.".format(X_train.shape[0]))
print ("Validation set has {} samples.".format(X_validation.shape[0]))
print ("Testing set has {} samples.".format(X_test.shape[0]))

# We now have training data in the X train and Y train for preparing models and a
# X validation/test and Y validation/test sets that we can use later.

**********************************************************************************************
## 5.2 Test Harness
**********************************************************************************************
We will use 10-fold cross validation to estimate accuracy. This will split our dataset into 10
parts, train on 9 and test on 1 and repeat for all combinations of train-test splits. We are using
the metric of accuracy to evaluate models. This is a ratio of the number of correctly predicted
instances divided by the total number of instances in the dataset multiplied by 100 to give a
percentage (e.g. 95% accurate). We will be using the scoring variable when we run build and
evaluate each model next.

**********************************************************************************************
## 5.3 Build Models
**********************************************************************************************
We don't know which algorithms would be good on this problem or what con
figurations to use.
We get an idea from the plots that some of the classes are partially linearly separable in some
dimensions, so we are expecting generally good results. Let's evaluate six di
erent algorithms:

*   Logistic Regression (LR).
*   Linear Discriminant Analysis (LDA).
*   k-Nearest Neighbors (KNN).
*   Classi
cation and Regression Trees (CART).
*   Gaussian Naive Bayes (NB).
*   Support Vector Machines (SVM).

This list is a good mixture of simple linear (LR and LDA), nonlinear (KNN, CART, NB
and SVM) algorithms. We reset the random number seed before each run to ensure that the
evaluation of each algorithm is performed using exactly the same data splits. It ensures the
results are directly comparable. Let's build and evaluate our 
six models using **Spot-Check Classification Algorithms:**


**What Techniques to Use When**

This section lists some tips to consider what resampling technique to use in di
fferent circum-stances.
Generally k-fold cross validation is the gold standard for evaluating the performance of a machine learning algorithm on unseen data with k set to 3, 5, or 10.
Using a train/test split is good for speed when using a slow algorithm and produces performance estimates with lower bias when using large datasets.
Techniques like leave-one-out cross validation and repeated random splits can be useful intermediates when trying to balance variance in the estimated performance, model training speed and dataset size.
The best advice is to experiment and 
find a technique for your problem that is fast and produces reasonable estimates of performance that you can use to make decisions. If in doubt, use 10-fold cross validation.

**********************************************************************************************
### Spot-Check Classi
cation Algorithms
**********************************************************************************************
Spot-checking is a way of discovering which algorithms perform well on our machine learning
problem. We cannot know which algorithms are best suited to our problem beforehand. We
must try a number of methods and focus attention on those that prove themselves the most
promising. We will discover six machine learning algorithms that we can use
when spot-checking our classi
cation problem in Python with scikit-learn :
1. How to spot-check machine learning algorithms on a classi
cation problem.
2. How to spot-check two linear classi
cation algorithms.
3. How to spot-check four nonlinear classi
cation algorithms.

We are going to take a look at six classi
cation algorithms that we can spot-check on our
dataset. Starting with **two linear machine learning algorithms**:

1.   **Logistic Regression.**
Logistic regression assumes a Gaussian distribution for the numeric input variables and can model binary classifi
cation problems. we can construct a logistic regression model using the LogisticRegression class1.
2.   **Linear Discriminant Analysis.** Linear Discriminant Analysis or LDA is a statistical technique for binary and multiclass classi
fication. It too assumes a Gaussian distribution for the numerical input variables. We can construct an LDA model using the LinearDiscriminantAnalysis class

Then looking at **four nonlinear machine learning algorithms**:

*   **k-Nearest Neighbors.**The k-Nearest Neighbors algorithm (or KNN) uses a distance metric to 
find the k most similar instances in the training data for a new instance and takes the mean outcome of the neighbors as the prediction. You can construct a KNN model using the KNeighborsClassifier class
*   **Naive Bayes.**Naive Bayes calculates the probability of each class and the conditional probability of each class given each input value. These probabilities are estimated for new data and multiplied together,
assuming that they are all independent (a simple or naive assumption). When working with real-valued data, a Gaussian distribution is assumed to easily estimate the probabilities for input variables using the Gaussian Probability Density Function. We can construct a Naive
Bayes model using the GaussianNB class
*   **Classi
cation and Regression Trees.**Classi
cation and Regression Trees (CART or just decision trees) construct a binary tree from the training data. Split points are chosen greedily by evaluating each attribute and each value of each attribute in the training data in order to minimize a cost function (like the Gini index).
We can construct a CART model using the DecisionTreeClassifier class
*   **Support Vector Machines.**Support Vector Machines (or SVM) seek a line that best separates two classes. Those data instances that are closest to the line that best separates the classes are called support vectors
and influence where the line is placed. SVM has been extended to support multiple classes.
Of particular importance is the use of di
erent kernel functions via the kernel parameter. A powerful Radial Basis Function is used by default. We can construct an SVM model using the SVC class

In [None]:

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
##############################################################################

# Spot-Check Algorithms
start=time.time()

models = [] ; models.clear
modelName = []; modelName.clear
# Linear Regression
# models.append(('Linear Regression', LinearRegression()))
# Logistic Regression Classification
models.append(('LR', LogisticRegression(C= 0.001, solver='lbfgs', multi_class='auto', max_iter=1000)))
modelName.append('LR')
# Linear Discriminant Analysis
models.append(('LDA', LinearDiscriminantAnalysis()))
modelName.append('LDA')
# KNN Classification
models.append(('KNN', KNeighborsClassifier(n_neighbors=7)))
modelName.append('KNN')
# Gaussian Naive Bayes Classification
models.append(('NB', GaussianNB(priors=None, var_smoothing=1e-09)))
modelName.append('NB')
# Classication and Regression Trees (CART or just decision trees) Calssification
models.append(('CART', DecisionTreeClassifier()))
modelName.append('CART')
# Support Vector Machines (or SVM) Calssification
models.append(('SVM', SVC(gamma='auto', C=10, probability=True)))
modelName.append('SVM')
# XGBoost Calssification
models.append(('XGBoost', XGBClassifier()))
modelName.append('XGBoost')
# MLPClassifier
models.append(('MLP', MLPClassifier(activation='tanh', 
                                    alpha= 0.0001, hidden_layer_sizes= (100,), 
                                    learning_rate= 'adaptive', solver= 'lbfgs')))
modelName.append('MLP')
# Gradient Boosting Calssification
models.append(('Gradient Boosting', GradientBoostingClassifier()))
modelName.append('Gradient Boosting')
# ExtraTreesClassifier
models.append(('Extra Trees Classifier', ExtraTreesClassifier(n_estimators=100)))
modelName.append('Extra Trees Classifier')

# evaluate each model in turn
results = []
aucies = [] ; aucies.clear
names = [] ; names.clear

print('\n')
print('*' * 80)
print('\nApplying Spot-Check Classication Algorithms:\n')
seed = 7
for name, model in models:
  # 10-fold cross validation: run the training algorithm 10 times, 
  # with a different 1/3 of the data as test set each time
  kfold = KFold(n_splits=3, random_state=seed)
  # print(model)
  # cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy') 
  cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='roc_auc') 
  results.append(cv_results)
  names.append(name)
  aucies.append(cv_results.mean())
  msg = "%s| auc_mean:%f auc_std(%f)" % (name, cv_results.mean(), cv_results.std())
  print(msg) #mean estimated accuracy & Standard deviation

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))

**********************************************************************************************
### Training Classifiers on the training data
**********************************************************************************************

In [None]:
#  import and initialize the following classifers from sklearn:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
##############################################################################
start=time.time()

data_ = [] ; data_.clear()

#for HCV-Egy-Data
data_.append(('Bank Customer Transactions Dataset:', X_train, Y_train ))
# We now have training data in the X train and Y train for preparing models.

accuracies = [] ; accuracies.clear
names = [] ; names.clear
matrix_accuracy = np.zeros((1,len(models)))

i=0;

print('\n')
print('*' * 80)
print('\nTraining Classifiers on the training data:\n')

for dataname, Xtrain, Ytrain in data_:
  print(dataname)
  j = 0
  for name, model in models:

    #Fitting the training data to the network
    model.fit(X_train, Y_train)
    
    # Using the trained network to predict
    accuracy = model.score(X_validation, Y_validation)
    
    accuracies.append(accuracy)
    names.append(name)
    msg = "%s (accuracy): %f" % (name, accuracy)
    print(msg) # name of classifier & its accuracy after training it.
    matrix_accuracy[i][j] = accuracy
    j = j + 1
  i = i + 1
  print('\n')

print('Matrix for Accuracy-Rate (Dataset vs. Classifiers) :\n\n' , matrix_accuracy)

accuracies.index(max(accuracies))

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))

### Ensemble Methods

Another way that we can improve the performance of algorithms on this problem is by using
ensemble methods. In this section we will evaluate four different ensemble machine learning
algorithms, two boosting and two bagging methods:

*   Boosting Methods: AdaBoost (AB) and Gradient Boosting (GBM).
*   Bagging Methods: Random Forests (RF) and Extra Trees (ET).

We will use the same test harness as before, 10-fold cross validation. No data standardization is used in this case because all four ensemble algorithms are based on decision trees that are
less sensitive to data distributions.

In [None]:
# ''' Remove this comment out if needs to be used
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
##############################################################################

start=time.time()
max_features_ = data_to_use.shape[1]-1
# ensembles
ensembles = []
ensembles.append(('AB', AdaBoostClassifier()))
ensembles.append(('GBM', GradientBoostingClassifier()))
ensembles.append(('RF', RandomForestClassifier(n_estimators=100, max_features=max_features_)))
ensembles.append(('ET', ExtraTreesClassifier()))
results = []
names = []

print('\n')
print('*' * 80)
print('\nApplying Ensemble Methods to improve the performance of algorithms:\n')

num_folds = 10
scoring = 'roc_auc'
for name, model in ensembles:
  kfold = KFold(n_splits=num_folds, random_state=seed)
  cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
  results.append(cv_results)
  names.append(name)
  msg = "%s| auc_mean:%f auc_std(%f)" % (name, cv_results.mean(), cv_results.std())
  print(msg)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
  # '''

**********************************************************************************************
### Plotting Accuracy Matrix (Dataset vs. Classifiers)
**********************************************************************************************

In [None]:
import matplotlib.pyplot as plt
##############################################################################

# Limits for the extent
size = len(models)
x_start = 3.0
x_end = 9.0
y_start = 6.0
y_end = 7.0

extent = [x_start, x_end, y_start, y_end]

# The normal figure
fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot(111)
im = ax.imshow(matrix_accuracy, extent=extent, origin='lower', interpolation='None', cmap='viridis')

# Add the text
jump_x = (x_end - x_start) / (2.0 * size)
jump_y = (y_end - y_start) / (2.0 * size)
x_positions = np.linspace(start=x_start, stop=x_end, num=size, endpoint=False)
y_positions = np.linspace(start=y_start, stop=y_end, num=1, endpoint=False)

for y_index, y in enumerate(y_positions):
    for x_index, x in enumerate(x_positions):
        label = format(matrix_accuracy[y_index, x_index],'.6f')
        text_x = x + jump_x
        text_y = y + jump_y
        ax.text(text_x, text_y, label,  ha='center', va='center')

fig.colorbar(im)

ax.tick_params(direction='out', length=6, width=2, colors='r',
               grid_color='r', grid_alpha=0.5)
ax.set_title('Accuracy-Rate: \n Training each classifier on a dataset', 
             fontsize='large', fontweight='bold')
ax.set_xlabel(modelName, fontsize='large', fontweight='bold')
ax.set_ylabel('Bank Customer Transactions Dataset', fontsize='large', fontweight='bold')

print('\n')
print('*' * 80)
print('Plotting Accuracy Matrix (Dataset vs. Classifiers)\n')
plt.show()

## 5.4 Evaluate Algorithms: Standardize Data

We suspect that the differing distributions of the raw data may be negatively impacting the skill of some of the algorithms. Let's evaluate the same algorithms with a standardized copy of the dataset. This is where the data is transformed such that each attribute has a mean value of zero and a standard deviation of one. We also need to avoid data leakage when we transform the data. A good way to avoid leakage is to use pipelines that standardize the data and build the model for each fold in the cross validation test harness. That way we can get a fair estimation of how each model with standardized data might perform on unseen data.

In [None]:
# ''' Remove this comment out if needs to be used
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
##############################################################################

# Standardize the dataset

start=time.time()
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',
LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000))])))
pipelines.append(('ScaledLDA', Pipeline([('Scaler', StandardScaler()),('LDA',
LinearDiscriminantAnalysis())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN',
KNeighborsClassifier())])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()),('NB',
GaussianNB())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART',
DecisionTreeClassifier())])))
pipelines.append(('ScaledSVM', Pipeline([('Scaler', StandardScaler()),('SVM', SVC(gamma='auto'))])))
pipelines.append(('ScaledXGBoost', Pipeline([('Scaler', StandardScaler()),('XGBoost', XGBClassifier())])))
pipelines.append(('ScaledMLP', Pipeline([('Scaler', StandardScaler()),('MLP', MLPClassifier(activation='tanh', 
                                    alpha= 0.0001, hidden_layer_sizes= (100,), 
                                    learning_rate= 'adaptive', solver= 'lbfgs'))])))
pipelines.append(('ScaledGB', Pipeline([('Scaler', StandardScaler()),('Gradient Boosting',
GradientBoostingClassifier())])))
pipelines.append(('ScaledExtra Tree Classiffier', Pipeline([('Scaler', StandardScaler()),('Extra Tree Classiffier',
ExtraTreesClassifier(n_estimators=100))])))

print('\n')
print('*' * 80)
print('Evaluate Algorithms: Standardize Data\n')

results = []
names = []
scoring = 'roc_auc'
for name, model in pipelines:
  kfold = KFold(n_splits=num_folds, random_state=seed)
  cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
  results.append(cv_results)
  names.append(name)
  msg = "%s| auc_mean:%f auc_std(%f)" % (name, cv_results.mean(), cv_results.std())
  print(msg)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
  # '''

**********************************************************************************************
# Tuning XGBoost with Grid Search CV
**********************************************************************************************

In [None]:
seed = 7

**********************************************************************************************
## Step 1- Find the number of estimators for a high learning rate
*********************************************************************************************

In [None]:
# ''' Remove this comment out if needs to be used

# Tune the classiffier
start=time.time()

# tunning Parameters:
parameter_space = {
     'n_estimators' : np.arange(100, 1000, 100) #Start, Stop, Increment
}

#Start with default parameters
model = XGBClassifier()

num_folds = 5
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, n_jobs=4, cv=kfold)

grid_result = grid.fit(X_train, Y_train)
print("Best AUC : %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("auc_mean:%f| auc_std:%f param(%s)" % (mean, stdev, param))

print('\n')
print(grid_result.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''

**********************************************************************************************
## Step 2: Tune max_depth and min_child_weight
*********************************************************************************************


In [None]:
# ''' Remove this comment out if needs to be used

# Tune the classiffier
start=time.time()

# tunning Parameters:
parameter_space = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2),
}

model = XGBClassifier(n_estimators = 100)

num_folds = 5
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, n_jobs=4, cv=kfold)

grid_result = grid.fit(X_train, Y_train)
print("Best AUC : %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("auc_mean:%f| auc_std:%f param(%s)" % (mean, stdev, param))

print('\n')
print(grid_result.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''

Lets go one step deeper and look for optimum values. We’ll search for values 1 above and below the optimum values because we took an interval of two.

In [None]:
# ''' Remove this comment out if needs to be used

# Tune the classiffier
start=time.time()

# tunning Parameters:
parameter_space = {
    'max_depth':[2,3,4],
    'min_child_weight':[4,5,6]
}

#Start with default parameters
model = XGBClassifier(n_estimators = 100)

num_folds = 5
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, n_jobs=4, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best AUC : %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("auc_mean:%f| auc_std:%f param(%s)" % (mean, stdev, param))

print('\n')
print(grid_result.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''

**********************************************************************************************
## Step 3: Tune gamma
*********************************************************************************************


In [None]:
# ''' Remove this comment out if needs to be used

# Tune the classiffier
start=time.time()

# tunning Parameters:
parameter_space = {
    'gamma':[i/10.0 for i in range(0,5)]
}

#Start with default parameters
model = XGBClassifier(n_estimators = 100, max_depth = 2, min_child_weight = 5)

num_folds = 5
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, n_jobs=4, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best AUC : %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("auc_mean:%f| auc_std:%f param(%s)" % (mean, stdev, param))

print('\n')
print(grid_result.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''

**********************************************************************************************
## Step 4: Tune subsample and colsample_bytree
*********************************************************************************************


In [None]:
# ''' Remove this comment out if needs to be used

# Tune the classiffier
start=time.time()

# tunning Parameters:
parameter_space = {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}

#Start with default parameters
model = XGBClassifier(n_estimators = 100, max_depth = 2, min_child_weight = 5,
                      gamma = 0)

num_folds = 5
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, n_jobs=4, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best AUC : %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("auc_mean:%f| auc_std:%f param(%s)" % (mean, stdev, param))

print('\n')
print(grid_result.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''

Here, we found 0.7 & 0.9 as the optimum value for  subsample and colsample_bytree respectively. 
But the values tried are very widespread, we should try values closer to the optimum values here (0.7 & 0.9) to see if we get something better.

Now we should try values in 0.05 interval around these.

In [None]:
# ''' Remove this comment out if needs to be used

# Tune the classiffier
start=time.time()

# tunning Parameters:
parameter_space = {
    'subsample':[i/100.0 for i in range(65,80,4)],
    'colsample_bytree':[i/100.0 for i in range(85,100,4)]
}

#Start with default parameters
model = XGBClassifier(n_estimators = 100, max_depth = 3, min_child_weight = 4,
                      gamma = 0,
                      subsample = 0.7, colsample_bytree = 0.9)

num_folds = 5
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, n_jobs=4, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best AUC : %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("auc_mean:%f| auc_std:%f param(%s)" % (mean, stdev, param))

print('\n')
print(grid_result.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''

**********************************************************************************************
## Step 5: Tuning Regularization Parameters
*********************************************************************************************


Next step is to apply regularization to reduce overfitting.

In [None]:
# ''' Remove this comment out if needs to be used

# Tune the classiffier
start=time.time()

# tunning Parameters:
parameter_space = {
    'reg_alpha':[0, 1e-5, 1e-2, 0.1, 1, 100]
}

#Start with default parameters
# model = XGBClassifier(n_estimators = 100, max_depth = 3, min_child_weight = 4,
#                       gamma = 0,
#                       subsample = 0.7, colsample_bytree = 0.9)

model = XGBClassifier(n_estimators = 100, max_depth = 3, min_child_weight = 4,
                      gamma = 0)

num_folds = 5
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, n_jobs=4, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best AUC : %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("auc_mean:%f| auc_std:%f param(%s)" % (mean, stdev, param))

print('\n')
print(grid_result.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''


# #####################
# # ''' Remove this comment out if needs to be used
# from sklearn.ensemble import RandomForestClassifier
# # Randomized for Algorithm Tuning
# from sklearn.model_selection import RandomizedSearchCV
# ##############################################################################

# maxFeatures = range(1,data_to_use.shape[1]-1)
# param_grid = {'reg_alpha': uniform()}
# model = XGBClassifier(n_estimators = 100, max_depth = 3, min_child_weight = 4,
#                       gamma = 0,
#                       subsample = 0.7, colsample_bytree = 0.9)
# scoring = 'roc_auc'
# rsearch =RandomizedSearchCV(estimator=model, param_distributions=param_grid
#                             , cv=5, scoring=scoring, n_iter=len(maxFeatures), random_state=seed)
# rsearch.fit(X, Y)
# print("Best accuracy is "+ str(rsearch.best_score_))
# print(rsearch.best_estimator_)
# # '''

We can see that the AUC score is less than the previous case. But the values tried are very widespread, we should try values closer to the optimum here (0.1) to see if we get something better.

In [None]:
# ''' Remove this comment out if needs to be used

# Tune the classiffier
start=time.time()

# tunning Parameters:
parameter_space = {
    'reg_alpha':[0, 0.01, 0.05, 0.1, 0.5]
}

#Start with default parameters
model = XGBClassifier(n_estimators = 100, max_depth = 3, min_child_weight = 4,
                      gamma = 0,
                      subsample = 0.7, colsample_bytree = 0.9)

num_folds = 5
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, n_jobs=4, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best AUC : %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("auc_mean:%f| auc_std:%f param(%s)" % (mean, stdev, param))

print('\n')
print(grid_result.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''

**********************************************************************************************
## Step 6: Reducing Learning Rate
*********************************************************************************************


Lastly, we should lower the learning rate and add more trees

In [None]:
# ''' Remove this comment out if needs to be used

# Tune the classiffier
start=time.time()

# tunning Parameters:
parameter_space = {
    'learning_rate':[0.01, 0.1]
}

#Start with default parameters
# model = XGBClassifier(n_estimators = 100, max_depth = 3, min_child_weight = 4,
#                       gamma = 0,
#                       subsample = 0.7, colsample_bytree = 0.9)

model = XGBClassifier(n_estimators = 100, max_depth = 3, min_child_weight = 4,
                      gamma = 0)

num_folds = 5
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, n_jobs=4, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best AUC : %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("auc_mean:%f| auc_std:%f param(%s)" % (mean, stdev, param))

print('\n')
print(grid_result.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''


#####################
# ''' Remove this comment out if needs to be used
from sklearn.ensemble import RandomForestClassifier
# Randomized for Algorithm Tuning
from sklearn.model_selection import RandomizedSearchCV
##############################################################################

maxFeatures = range(1,data_to_use.shape[1]-1)
param_grid = {'learning_rate': uniform()}
model = XGBClassifier(n_estimators = 100, max_depth = 3, min_child_weight = 4,
                      gamma = 0,
                      subsample = 0.7, colsample_bytree = 0.9)
scoring = 'roc_auc'
rsearch =RandomizedSearchCV(estimator=model, param_distributions=param_grid
                            , cv=5, scoring=scoring, n_iter=len(maxFeatures), random_state=seed)
rsearch.fit(X, Y)
print("Best accuracy is "+ str(rsearch.best_score_))
print(rsearch.best_estimator_)
# '''

In [None]:
# ''' Remove this comment out if needs to be used 
##############################################################################
xGradientBoosting = grid_result.best_estimator_
xGradientBoosting.fit(X_train,Y_train)
xgbPredict = xGradientBoosting.predict(X_validation)
xgbPredictproba = xGradientBoosting.predict_proba(X_validation)[:,1] #for ROC curve
xgbAccuracy = accuracy_score(Y_validation,xgbPredict)
roc_score = metrics.roc_auc_score(Y_validation,xgbPredict)
print("(VALIDATION DATA) X-Gradient Boosting accuracy is ",xgbAccuracy)
plotRocCurve(xgbPredictproba, Y_validation, 'X-Gradient Boosting')
# '''

In [None]:
# ''' Remove this comment out if needs to be used 
##############################################################################
xGradientBoosting = grid_result.best_estimator_
xGradientBoosting.fit(X_train,Y_train)
xgbPredict = xGradientBoosting.predict(X_test)
xgbPredictproba = xGradientBoosting.predict_proba(X_test)[:,1] #for ROC curve
xgbAccuracy = accuracy_score(Y_test,xgbPredict)
roc_score = metrics.roc_auc_score(Y_test,xgbPredict)
print("(TEST DATA) X-Gradient Boosting accuracy is ",xgbAccuracy)
plotRocCurve(xgbPredictproba, Y_test, 'X-Gradient Boosting')
# '''

In [None]:
# ''' Remove this comment out if needs to be used 
##############################################################################
xGradientBoosting = grid_result.best_estimator_
xGradientBoosting.fit(X_train,Y_train)
xgbPredict = xGradientBoosting.predict(X_train)
xgbPredictproba = xGradientBoosting.predict_proba(X_train)[:,1] #for ROC curve
xgbAccuracy = accuracy_score(Y_train,xgbPredict)
roc_score = metrics.roc_auc_score(Y_train,xgbPredict)
print("(TRAINING DATA) X-Gradient Boosting accuracy is ",xgbAccuracy)
plotRocCurve(xgbPredictproba, Y_train, 'X-Gradient Boosting')
# '''

**********************************************************************************************
## Tuning XGBoost with Randomized Search
**********************************************************************************************

In [None]:
# ''' Remove this comment out if needs to be used
# Randomized for Algorithm Tuning
##############################################################################
start=time.time()

no_of_coulmns = data_to_use.shape[1]
no_of_attrib = no_of_coulmns -1

array = data_to_use.values
X = array[:,0:no_of_attrib]
Y = array[:,no_of_attrib]

scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)

maxFeatures = range(1,data_to_use.shape[1]-1)
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
max_depth_list = [i for i in range(1,11)]
estimators_list = [10**i for i in range(1,3)]
# param_grid = {'learning_rate': uniform(0,1), 'n_estimators': uniform(100,500) }
# param_grid = dict(max_features=maxFeatures,learning_rate= lr_list, 
#                   max_depth = max_depth_list, n_estimators = estimators_list )
param_grid = {'learning_rate': uniform()}
# model = XGBClassifier(n_estimators = 100, max_depth = 10)
#INITIALY
model = XGBClassifier(n_estimators = 100,  max_depth=5,
                      min_child_weight = 1, gamma = 0, subsample = 0.8,
                      colsample_bytree = 0.8, scale_pos_weight=1)


# model = RandomForestClassifier(n_estimators=100, criterion='gini')
xgbSearch =RandomizedSearchCV(estimator=model, param_distributions=param_grid
                            , cv=10, scoring='roc_auc', n_iter=len(maxFeatures), random_state=seed)
xgbSearch.fit(X_train, Y_train)
print("Best auc is "+ str(xgbSearch.best_score_))
print(xgbSearch.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''

In [None]:
# ''' Remove this comment out if needs to be used 
##############################################################################
xGradientBoosting = xgbSearch.best_estimator_
xGradientBoosting.fit(X_train,Y_train)
xgbPredict = xGradientBoosting.predict(X_validation)
xgbPredictproba = xGradientBoosting.predict_proba(X_validation)[:,1] #for ROC curve
xgbAccuracy = accuracy_score(Y_validation,xgbPredict)
roc_score = metrics.roc_auc_score(Y_validation,xgbPredict)
print("X-Gradient Boosting accuracy is ",xgbAccuracy)
plotRocCurve(xgbPredictproba, Y_validation, 'X-Gradient Boosting')
# '''

**********************************************************************************************
## Tuning SVM with Grid Search CV
**********************************************************************************************

In [None]:
# ''' Remove this comment out if needs to be used
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
##############################################################################

# Tune scaled SVM
start=time.time(
    
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# tunning Parameters:
parameter_space = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100],
}

model = SVC(gamma='auto', probability=True) 
num_folds = 10
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'accuracy'
# grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid = GridSearchCV(model, parameter_space, n_jobs=-1, cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("%f (%f) with: %r" % (mean, stdev, param))

print(model)
# model.get_params().keys()

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2))
# '''

## Tuning Support Vector Machines(SVM) with Grid Search CV

SVM (Support Vector Machines) when implemented with grid search, we got the best accuracies and minimum false negatives. We used the Grid search to find the best hyper paramters for the model.Later we used this value to find the predictions and plot the ROC curve.

In [None]:
# ''' Remove this comment out if needs to be used
# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
##############################################################################
start=time.time()

no_of_coulmns = data_to_use.shape[1]
no_of_attrib = no_of_coulmns -1

array = data_to_use.values
X = array[:,0:no_of_attrib]
Y = array[:,no_of_attrib]

powers = range(1,2)
# powers = range(-2,5)
cs = [10**i for i in powers]
param_grid = dict(C=cs)

# tunning Parameters:
parameter_space = param_grid

model = SVC(gamma='auto', probability=True) 
num_folds = 10
kfold = KFold(n_splits=num_folds, random_state=seed)
scoring = 'roc_auc'#'accuracy'
grid = GridSearchCV(estimator=model, param_grid=parameter_space, scoring=scoring, cv=kfold)

grid_result = grid.fit(X, Y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
  print("%f (%f) with: %r" % (mean, stdev, param))

print(model)
# model.get_params().keys()

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))

# '''

In [None]:
# ''' Remove this comment out if needs to be used
clf_svm = model
clf_svm.fit(X_train,Y_train)
predictions_svm = clf_svm.predict(X_test)
predictproba_svm = clf_svm.decision_function(X_test)
SVM_Accuracy = accuracy_score(Y_test,predictions_svm)
print("SVM accuracy is ",SVM_Accuracy)
plotAUC(Y_test,predictproba_svm, 'SVM')
# plotAUC(Y_test,rfPredictproba, 'Random Forest')
# plotAUC(y_test,LR_Predict,'Logistic Regression')
plt.show()
plt.figure(figsize=(6,6))
plot_confusion_matrix(predictions_svm, normalize=True)
plt.show()
# '''

## Random Forest with randomized Search

Random forest when implemented with randomized search we got the best accuracies and minimum false negatives(predicting borowwer will not default eventhough he will. This might impact on the credibility of the company). We used the randomized search to find the best hyper paramters for the model.

In [None]:
# ''' Remove this comment out if needs to be used
from sklearn.ensemble import RandomForestClassifier
# Randomized for Algorithm Tuning
from sklearn.model_selection import RandomizedSearchCV
##############################################################################
no_of_coulmns = data_to_use.shape[1]
no_of_attrib = no_of_coulmns -1

array = data_to_use.values
X = array[:,0:no_of_attrib]
Y = array[:,no_of_attrib]

maxFeatures = range(1,data_to_use.shape[1]-1)
param_grid = dict(max_features=maxFeatures)
# param_grid = {'alpha': uniform()}
model = RandomForestClassifier(n_estimators=100, criterion='gini')
rsearch =RandomizedSearchCV(estimator=model, param_distributions=param_grid
                            , cv=10, scoring='accuracy', n_iter=len(maxFeatures), random_state=seed)
rsearch.fit(X, Y)
print("Best accuracy is "+ str(rsearch.best_score_))
print(rsearch.best_estimator_)
# '''

In [None]:
''' Remove this comment out if needs to be used 
##############################################################################

randomForest = rsearch.best_estimator_
randomForest.fit(X_train,Y_train)
rfPredict = randomForest.predict(X_test)
rfPredictproba = randomForest.predict_proba(X_test)[:,1] #for ROC curve
rfAccuracy = accuracy_score(Y_test,rfPredict)
roc_score = metrics.roc_auc_score(Y_test,rfPredict)
print("Random Forest accuracy is ",rfAccuracy)
plotRocCurve(rfPredictproba, Y_test, 'Random Forest')
'''

##Tuning Gradient Boosting with randomized Search

In [None]:
# ''' Remove this comment out if needs to be used
# Randomized for Algorithm Tuning
##############################################################################
start=time.time()

no_of_coulmns = data_to_use.shape[1]
no_of_attrib = no_of_coulmns -1

array = data_to_use.values
X = array[:,0:no_of_attrib]
Y = array[:,no_of_attrib]

scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)

maxFeatures = range(1,data_to_use.shape[1]-1)
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
max_depth_list = [i for i in range(1,11)]
estimators_list = [10**i for i in range(1,5)]
# param_grid = {'learning_rate': uniform(), 'n_estimators': estimators_list }
param_grid = dict(max_features=maxFeatures,learning_rate= lr_list, 
                  max_depth = max_depth_list, n_estimators = estimators_list )
# param_grid = {'alpha': uniform()}
model = GradientBoostingClassifier()
# model = RandomForestClassifier(n_estimators=100, criterion='gini')
gbSearch =RandomizedSearchCV(estimator=model, param_distributions=param_grid
                            , cv=10, scoring='roc_auc', n_iter=len(maxFeatures), random_state=seed)
gbSearch.fit(X, Y)
print("Best auc is "+ str(gbSearch.best_score_))
print(gbSearch.best_estimator_)

end=time.time()
print("\nThis script took {} minutes to complete".format(round((end-start)/60,2)))
# '''

In [None]:
# ''' Remove this comment out if needs to be used 
##############################################################################
gradientBoosting = gbSearch.best_estimator_
gradientBoosting.fit(X_train,Y_train)
gbPredict = gradientBoosting.predict(X_test)
gbPredictproba = gradientBoosting.predict_proba(X_test)[:,1] #for ROC curve
gbAccuracy = accuracy_score(Y_test,gbPredict)
roc_score = metrics.roc_auc_score(Y_test,gbPredict)
print("Gradient Boosting accuracy is ",gbAccuracy)
plotRocCurve(gbPredictproba, Y_test, 'Gradient Boosting')
# '''

In [None]:
''' Remove this comment out if needs to be used
plt.figure(figsize=(6,6))
plot_confusion_matrix(rfPredict, normalize=True)
plt.show()
'''

**********************************************************************************************
# Select Best Model
**********************************************************************************************

In [None]:
print('\n')
print('*' * 80)
print('Selecting Best Model: \n')

model_name = models[aucies.index(max(aucies))][0]
model = models[aucies.index(max(aucies))][1]
model

**********************************************************************************************
# 6.Save and Load Machine Learning Models
**********************************************************************************************
Finding an accurate machine learning model is not the end of the project. We will discover how to save and load our machine learning model in Python using scikit-learn.
This allows us to save our model to 
file and load it later in order to make predictions:

1. The importance of serializing models for reuse.
2. How to use pickle to serialize and deserialize machine learning models.
3. How to use Joblib to serialize and deserialize machine learning models.

**********************************************************************************************
## 6.1.Finalize Your Model with pickle
**********************************************************************************************
Pickle is the standard way of serializing objects in Python. You can use the pickle1 operation
to serialize our machine learning algorithms and save the serialized format to a 
file. Later we
can load this 
file to deserialize our model and use it to make new predictions. The example
below demonstrates how we can train a logistic regression model on the Pima Indians onset of
diabetes dataset, save the model to fi
le and load it to make predictions on the unseen test set.

In [None]:
# Training the Algorithm on selected model
##############################################################################
# print(models)
# models[2][1]

# model = models[1][1] #SVC(gamma='auto')
# The fit method of SVC class is called to train the 
# algorithm on the training data, which is passed as a parameter to the fit method
model.fit(X_train, Y_train)

from pickle import dump
##############################################################################

#Save Model to disk
filename = model_name + '_finalized_model.sav'
dump(model, open(filename, 'wb'))
print(model_name, ' Model Saved.')

**********************************************************************************************
## 6.2Make Predictions: Precision,recall,F1score for all models
**********************************************************************************************

In [None]:
# load the model from disk
from pickle import load
##############################################################################

loaded_model = load(open(filename, 'rb'))

# Running the example saves the model to finalized model.sav in our local working
# directory. Load the saved model and evaluating it provides an estimate of accuracy of the model
# on unseen data.

In [None]:
print('\n')
print('*' * 80)
print("Prediction on Validation data:\n")
Y_pred = loaded_model.predict(X_validation)
cm = confusion_matrix(Y_validation,Y_pred)
print(classification_report(Y_validation,Y_pred, target_names=None))
print('Accuracy of ' , model_name, ' Model : ', accuracy(cm)*100, '%')

print('\n')
print('*' * 80)
print("Prediction on Test data:\n")
Y_pred = loaded_model.predict(X_test)
cm = confusion_matrix(Y_test,Y_pred)
print(classification_report(Y_test, Y_pred, target_names=None))
print('Accuracy of ' , model_name, ' Model : ', accuracy(cm)*100, '%')

print('\n')
print('*' * 80)
print("Prediction on Train data:\n")
Y_pred = loaded_model.predict(X_train)
cm = confusion_matrix(Y_train,Y_pred)
print(classification_report(Y_train, Y_pred, target_names=None))
print('Accuracy of ' , model_name, ' Model : ', accuracy(cm)*100, '%')


In [None]:
# Making Predictions on Validation Data
print('\n')
print('*' * 80)
print("Prediction on Validation data:\n")

# Evaluating the Algorithm
# Confusion matrix, precision, recall, and F1 measures are the most commonly 
# used metrics for classification tasks. Scikit-Learn's metrics 
# library contains the classification_report and confusion_matrix methods, 
# which can be readily used to find out the values for these important metrics.
# 

from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics 

##############################################################################

Y_pred = loaded_model.predict(X_validation)
cm = confusion_matrix(Y_validation,Y_pred)
print(cm)
print(classification_report(Y_validation,Y_pred))

# Printing the accuracy
print('Accuracy of ' , model_name, ' Model : ', accuracy(cm)*100, '%')

In [None]:
####################################################################################################
Y_pred = loaded_model.predict(X_validation)
gbPredictproba = loaded_model.predict_proba(X_validation)[:,1] #for ROC curve
gbAccuracy = accuracy_score(Y_validation,Y_pred)
roc_score = metrics.roc_auc_score(Y_validation,Y_pred)
print("\nModel %s  accuracy is %f" % (model_name , gbAccuracy))
plotRocCurve(gbPredictproba, Y_validation, model_name)#

plt.figure(figsize=(6,6))
plot_confusion_matrix(Y_validation,Y_pred, normalize=True)
plt.show()

In [None]:
# Making Predictions
print('\n')
print('*' * 80)
print("Prediction on Test data:\n")
Y_pred = loaded_model.predict(X_test)

cm = confusion_matrix(Y_test,Y_pred)
print(cm)
print(classification_report(Y_test,Y_pred))

# Printing the accuracy
print('Accuracy of ' , model_name, ' Model : ', accuracy(cm)*100, '%')

In [None]:
####################################################################################################
Y_pred = loaded_model.predict(X_test)
gbPredictproba = loaded_model.predict_proba(X_test)[:,1] #for ROC curve
gbAccuracy = accuracy_score(Y_test,Y_pred)
roc_score = metrics.roc_auc_score(Y_test,Y_pred)
print("Model %s  accuracy is %f" % (model_name , gbAccuracy))
plotRocCurve(gbPredictproba, Y_test, 'Gradient Boosting')#

plt.figure(figsize=(6,6))
plot_confusion_matrix(Y_test,Y_pred, normalize=True)
plt.show()

In [None]:
# Making Predictions
print('\n')
print('*' * 80)
print("Prediction on Train data:\n")

Y_pred = loaded_model.predict(X_train)
cm = confusion_matrix(Y_train,Y_pred)
print(cm)
print(classification_report(Y_train,Y_pred))

# Printing the accuracy
print('Accuracy of ' , model_name, ' Model : ', accuracy(cm)*100, '%')

In [None]:
####################################################################################################
Y_pred = loaded_model.predict(X_train)
gbPredictproba = loaded_model.predict_proba(X_train)[:,1] #for ROC curve
gbAccuracy = accuracy_score(Y_train,Y_pred)
roc_score = metrics.roc_auc_score(Y_train,Y_pred)
print("Model %s  accuracy is %f" % (model_name , gbAccuracy))
plotRocCurve(gbPredictproba, Y_train, 'Gradient Boosting')#

plt.figure(figsize=(6,6))
plot_confusion_matrix(Y_train,Y_pred, normalize=True)
plt.show()

**********************************************************************************************
## Printing Probabilities
**********************************************************************************************

In [None]:
last_5000_data_to_use = data_formulated.tail(5000)

no_of_columns = last_5000_data_to_use.shape[1]
no_of_attrib = no_of_columns -1

array = last_5000_data_to_use.values
X_5000 = array[:,0:no_of_attrib]

prob = loaded_model.predict_proba(X_train)
# print(X_5000.shape)
# print(prob.shape)
# print(last_5000_data_to_use.shape)


df_cali_with_prob = data_to_use.copy()

df_cali_with_prob['Prob<=3000'] = prob[:,0]
df_cali_with_prob['Prob>3000'] = prob[:,1]

print('*' * 80)
print('\nprinting probabilities...\n')
print(df_cali_with_prob)

# print('\n\nProb<=3000' , '\t\tProb>3000')
# for index, row in df_cali.iterrows():
#     print(row['Prob<=3000'], row['Prob>3000'])