In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import LocalOutlierFactor
from __init__ import *


# Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
# Notebook-wide pandas display settings: no cap on displayed columns or rows,
# floats rendered with two decimals, and a 500-character display width.
display_options = {
    "display.max_columns": None,
    "display.max_rows": None,
    "display.float_format": lambda x: "%.2f" % x,
    "display.width": 500,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Read the three CSV files into `data`, `test` and `sample` (in that order).
data, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/train.csv", "Data/test.csv", "Data/sample_submission.csv")
)

In [4]:
# Remove the "id" column from both frames.
# The result is rebound (instead of using inplace=True) and errors="ignore" is
# passed so that the cell is re-runnable: executing it a second time is a
# no-op rather than a KeyError for the already-removed column.
data = data.drop(columns="id", errors="ignore")
test = test.drop(columns="id", errors="ignore")

In [5]:
# Print an overview of the frame loaded from Data/train.csv; per the output
# below it reports shape, dtypes, null counts and describe() statistics.
# NOTE(review): data_summary is not defined in this notebook - presumably it
# comes from `from __init__ import *`; confirm.
data_summary(data)

############## SHAPE ##############
165034
############## TYPES ##############
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited               int64
dtype: object
############## NULL ##############
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64
############ DESCRIBE ############
                    count        mean      std         min          0%         25%         50%         75%         90%         95%         99%         max
CustomerId      165034.00 15692005.02 71397.8

In [6]:
# Print the same overview for the frame loaded from Data/test.csv; per the
# output below it reports shape, dtypes, null counts and describe() statistics.
# NOTE(review): data_summary is not defined in this notebook - presumably it
# comes from `from __init__ import *`; confirm.
data_summary(test)

############## SHAPE ##############
110023
############## TYPES ##############
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
dtype: object
############## NULL ##############
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64
############ DESCRIBE ############
                    count        mean      std         min          0%         25%         50%         75%         90%         95%         99%         max
CustomerId      110023.00 15692096.61 71684.99 15565701.00 15565701.00 15632859.00 15690175.0

# **Modelling with No preprocessing**

In [7]:
# Work on a copy so the loaded `data` frame is not modified by the encoding
# and column drops applied to `df` in the following cells.
df = data.copy()
# Preview the first rows of the working copy.
df.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [8]:
# Label encoding
#
# Encode the two categorical columns as integer codes. Each column gets its own
# fitted LabelEncoder, stored in `label_encoders`, so both mappings remain
# available afterwards (e.g. to encode other data identically or to decode
# values). Re-fitting one shared encoder would discard the Gender mapping as
# soon as Geography is fitted.
label_encoders = {}
for column in ("Gender", "Geography"):
    lab = LabelEncoder()
    # Fit on the untouched `data` frame (not `df`) so that re-running this cell
    # always encodes the original strings.
    df[column] = lab.fit_transform(data[column])
    label_encoders[column] = lab
    # Show which label each integer code stands for (code i -> classes_[i]);
    # using classes_ avoids hard-coding how many categories a column has.
    print(lab.classes_)

# `lab` still refers to the Geography encoder after the loop, as it did before.

['Female' 'Male']
['France' 'Germany' 'Spain']


In [9]:
# Remove the CustomerId and Surname columns from the modelling frame.
# Rebinding with errors="ignore" (instead of inplace=True) makes the cell
# idempotent: re-running it no longer raises KeyError for columns that were
# already removed.
df = df.drop(columns=["CustomerId", "Surname"], errors="ignore")

In [10]:
# Separate the feature columns (x) from the "Exited" target column (y).
x, y = df.drop(columns=["Exited"]), df["Exited"]

In [11]:
# Sanity check: print the dimensions of the features, then of the target.
for part in (x, y):
    print(part.shape)

(165034, 10)
(165034,)


In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
split_parts = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [13]:
# Baseline classifiers, keyed by the name shown as the row label in the
# results table.
# Estimators whose fitting involves randomness get a fixed random_state (the
# same seed as the train/test split) so that re-running the notebook
# reproduces the reported scores; all other settings are library defaults.
models = {
    'Logistic Regression' : LogisticRegression(),
    'Support Vector Machines' : LinearSVC(random_state=42),
    'Decision Trees' : DecisionTreeClassifier(random_state=42),
    'Random Forest' : RandomForestClassifier(random_state=42),
    'Naive Bayes' : GaussianNB(),
    'K-Nearest Neighbor' : KNeighborsClassifier()
}

In [14]:
# Run the helper over every entry in `models` with the train/test split; per
# the output below it yields one row per model with Accuracy, Precision,
# Recall and F1 Score.
# NOTE(review): evaluation is not defined in this notebook - presumably it
# comes from `from __init__ import *`; confirm how it fits and scores the models.
evaluation(models, x_train, x_test, y_train, y_test)



Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.79,0.49,0.12,0.2
Support Vector Machines,0.76,0.38,0.2,0.26
Decision Trees,0.8,0.52,0.54,0.53
Random Forest,0.86,0.72,0.54,0.61
Naive Bayes,0.79,0.53,0.19,0.28
K-Nearest Neighbor,0.76,0.31,0.12,0.17


Reminder:
* Accuracy: The proportion of all samples that are classified correctly, (TP + TN) / (TP + TN + FP + FN).
* Precision: Of all the samples we predict as positive, the proportion that are actually positive, TP / (TP + FP).
* Recall: Of all the samples that are actually positive, the proportion that we correctly predict as positive, TP / (TP + FN).