### Importing dataset

In [12]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

### Loading Dataset

In [13]:
dataset = pd.read_csv("Dataset/BankCustomers.csv")
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Exploring Data

In [20]:
dataset.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [14]:
dataset["Exited"].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

Here, we can see the labels are unbalances. In the real life time also such cases are ought to be unbalanced. So, instead of accuracy, we need to reduce the false negative. 

#### Categorical Data

In [15]:
categorical_data = dataset.select_dtypes("object")
categorical_data.head(10)

Unnamed: 0,Surname,Geography,Gender
0,Hargrave,France,Female
1,Hill,Spain,Female
2,Onio,France,Female
3,Boni,France,Female
4,Mitchell,Spain,Female
5,Chu,Spain,Male
6,Bartlett,France,Male
7,Obinna,Germany,Female
8,He,France,Male
9,H?,France,Male


In [16]:
categorical_data.drop(columns=["Surname"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [17]:
categorical_data.isnull().sum()

Geography    0
Gender       0
dtype: int64

In [18]:
for feature in categorical_data.columns:
    print(feature,"\n",categorical_data[feature].value_counts(),"\n\n")

Geography 
 France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64 


Gender 
 Male      5457
Female    4543
Name: Gender, dtype: int64 




In [19]:
dataset[["Geography", "Exited"]].value_counts()

Geography  Exited
France     0         4204
Spain      0         2064
Germany    0         1695
           1          814
France     1          810
Spain      1          413
dtype: int64

#### Numerical Data

In [22]:
numerical_data = dataset.select_dtypes(["int64", "float64"])
numerical_data.head()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,619,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,608,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,502,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,699,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,850,43,2,125510.82,1,1,1,79084.1,0


In [23]:
numerical_data.drop(columns=["RowNumber", "CustomerId"], inplace=True)
numerical_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,42,2,0.0,1,1,1,101348.88,1
1,608,41,1,83807.86,1,0,1,112542.58,0
2,502,42,8,159660.8,3,1,0,113931.57,1
3,699,39,1,0.0,2,0,0,93826.63,0
4,850,43,2,125510.82,1,1,1,79084.1,0


In [24]:
for feature in numerical_data:
    print(feature, len(numerical_data[feature].value_counts()))

CreditScore 460
Age 70
Tenure 11
Balance 6382
NumOfProducts 4
HasCrCard 2
IsActiveMember 2
EstimatedSalary 9999
Exited 2
