# Project 1 - Churn detection applied to a bank scenario
- The objective is to use customer data to predict whether a customer will churn or not

In [31]:
import numpy  as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


## Splitting data for training and evaluation of the model

In [3]:
# Load the churn dataset from the CSV file; the trailing expression displays its (rows, columns) shape.
dataset = pd.read_csv('churn.csv')
dataset.shape

(10000, 14)

In [4]:
# Preview the first rows of the raw dataset (head() returns 5 rows by default).
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Separate the feature columns (X) from the label column (y).
X = dataset.drop(columns="Exited")
y = dataset["Exited"]

In [6]:
# Preview the first labels (head() returns 5 rows by default).
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [7]:
# Preview the first feature rows (head() returns 5 rows by default).
X.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [8]:
# Hold out 20% of the rows as a test set. It stands in for real-life, unseen data
# when computing evaluation metrics; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Data exploration and cleaning

### First check: "text" columns
We cannot feed text columns directly into machine learning models, so we need to encode them as numeric representations.
We will use the simplest approach, a technique named *label encoding*.

Even though label encoding can make algorithms misinterpret the numerical labels as having an inherent order or magnitude, we use it because the objective of this project is to focus on the deployment, not on the performance of the model.

__Hint__: When the type is "object" we must understand it as "text"

In [9]:
# Preview the first training rows (head() returns 5 rows by default).
X_train.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
9254,9255,15601116,P'an,686,France,Male,32,6,0.0,2,1,1,179093.26
1561,1562,15766374,Leak,632,Germany,Male,42,4,119624.6,2,1,1,195978.86
1670,1671,15716994,Green,559,Spain,Male,24,3,114739.92,1,1,0,85891.02
6087,6088,15730759,Chukwudi,561,France,Female,27,9,135637.0,1,1,0,153080.4
6669,6670,15797900,Chinomso,517,France,Male,56,9,142147.32,1,0,0,39488.04


In [10]:
# Inspect the dtype of every column; columns reported as "object" hold text.
X_train.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
dtype: object

In [11]:
# Collect the names of the text ("object" dtype) columns and print them
object_columns = list(X_train.select_dtypes(include="object").columns)
print(f"Object columns: {object_columns}")

Object columns: ['Surname', 'Geography', 'Gender']


### Surname
As we see in the description, this column has no influence on the churn decision because it's only a name.
Because of that, we will not analyze it further; we will only remove it from the dataframe.

In [18]:
# Drop the Surname column and preview the result
X_train = X_train.drop(columns="Surname")
X_train.head()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
9254,9255,15601116,686,France,Male,32,6,0.0,2,1,1,179093.26
1561,1562,15766374,632,Germany,Male,42,4,119624.6,2,1,1,195978.86
1670,1671,15716994,559,Spain,Male,24,3,114739.92,1,1,0,85891.02
6087,6088,15730759,561,France,Female,27,9,135637.0,1,1,0,153080.4
6669,6670,15797900,517,France,Male,56,9,142147.32,1,0,0,39488.04


### Geography

In [None]:
# Count how many distinct values the column contains
len(X_train["Geography"].unique())

3

In [20]:
# List the distinct values of the column
X_train["Geography"].unique()

array(['France', 'Germany', 'Spain'], dtype=object)

In [23]:
# Integer label assigned to each country for label encoding
country_dict = dict(France=1, Germany=2, Spain=3)

In [24]:
# Convert each country to its integer label from country_dict.
# Series.map does the dictionary lookup directly, without a Python-level lambda.
# map yields NaN for a label that is missing from the dictionary, so check for
# that explicitly: an unknown country (or re-running this cell on data that is
# already encoded) still fails loudly with a KeyError instead of silently
# leaking NaN into the features.
geography_encoded = X_train["Geography"].map(country_dict)
if geography_encoded.isna().any():
    unknown_values = sorted(
        X_train.loc[geography_encoded.isna(), "Geography"].astype(str).unique()
    )
    raise KeyError(f"Geography values without an entry in country_dict: {unknown_values}")
X_train["Geography"] = geography_encoded

In [25]:
# Show how many training rows fall under each encoded country label
X_train["Geography"].value_counts()

Geography
1    3994
2    2011
3    1995
Name: count, dtype: int64

### Gender
Even though using gender information can improve model accuracy, it can also amplify biases leading to unfair or discriminatory outcomes.

__Because of that I decided to not use this column__.

In [26]:
# Drop the Gender column and preview the result
X_train = X_train.drop(columns="Gender")
X_train.head()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
9254,9255,15601116,686,1,32,6,0.0,2,1,1,179093.26
1561,1562,15766374,632,2,42,4,119624.6,2,1,1,195978.86
1670,1671,15716994,559,3,24,3,114739.92,1,1,0,85891.02
6087,6088,15730759,561,1,27,9,135637.0,1,1,0,153080.4
6669,6670,15797900,517,1,56,9,142147.32,1,0,0,39488.04


### Second check: null values
It's not a good practice to feed null values into machine learning models.
There are many ways to handle them (for example, filling with the average, the median, or another value).
The focus of the project is to show the deployment, so I will remove rows with null values to make the process quicker.

In [27]:
# Count the null values per column -- good news: no column has any
X_train.isna().sum()

RowNumber          0
CustomerId         0
CreditScore        0
Geography          0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [28]:
# Nothing needs to be dropped here. For reference, missing data could be removed with:
# X_train = X_train.drop(['RowNumber'], axis = 1) # Remove a column in which every row is missing
# X_train = X_train.dropna() # Remove all rows that contain a null value

### Third check: irrelevant columns
Let's clean the dataframe from irrelevant columns.

In [30]:
# Drop the irrelevant columns and preview the result
X_train = X_train.drop(columns=["RowNumber", "CustomerId"])
X_train.head()

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
9254,686,1,32,6,0.0,2,1,1,179093.26
1561,632,2,42,4,119624.6,2,1,1,195978.86
1670,559,3,24,3,114739.92,1,1,0,85891.02
6087,561,1,27,9,135637.0,1,1,0,153080.4
6669,517,1,56,9,142147.32,1,0,0,39488.04


### Summary of the cleaning
It's important to list these steps, because in the future we will create a preprocessing script that applies them to new requests.

A. Use only the following columns:
- CreditScore
- Geography
- Age
- Tenure
- Balance
- NumOfProducts
- HasCrCard
- IsActiveMember
- EstimatedSalary

B. The column Geography will be label encoded using a dictionary

## Applying scaler
Even though the values in the dataframe are now all numeric, we need to transform them into a similar range.

__Hint__: Applying the same scale to all columns helps the model avoid a bias toward features with larger values.

In [32]:
# Fit the scaler on the training data, then rescale the training data with it.
# The fitted scaler is kept in `scaler`.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [34]:
# Display the scaled training data; after scaling, X_train is a NumPy array rather than a DataFrame.
X_train

array([[0.672     , 0.        , 0.18918919, ..., 1.        , 1.        ,
        0.89555028],
       [0.564     , 0.5       , 0.32432432, ..., 1.        , 1.        ,
        0.98002868],
       [0.418     , 1.        , 0.08108108, ..., 1.        , 0.        ,
        0.42926087],
       ...,
       [0.77      , 0.        , 0.27027027, ..., 0.        , 0.        ,
        0.46092526],
       [0.634     , 0.        , 0.33783784, ..., 1.        , 0.        ,
        0.48738065],
       [0.694     , 0.5       , 0.44594595, ..., 1.        , 1.        ,
        0.26761502]], shape=(8000, 9))