# Project 1 - Churn detection applied to a bank scenario
- The objective is to use the data to predict whether a customer will churn or not

In [15]:
import numpy  as np
import pandas as pd
from sklearn.model_selection import train_test_split


## Splitting data for training and evaluation of the model

In [None]:
# Load the raw churn data from the CSV file sitting next to the notebook.
dataset = pd.read_csv(filepath_or_buffer='churn.csv')
# Last expression of the cell: displays (number of rows, number of columns).
dataset.shape

(10000, 14)

In [5]:
# Preview the first five rows of the raw data (head() defaults to n=5).
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [10]:
# Separate the features from the target: "Exited" is the label to predict,
# every remaining column stays in the feature frame.
X = dataset.drop(columns=['Exited'])
y = dataset['Exited']

In [13]:
# Peek at the first five target labels (head() defaults to n=5).
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [12]:
# Peek at the first five feature rows; the "Exited" column must no longer be present.
X.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [None]:
# Split the data into a training part (80%) and a held-out test part (20%).
# The test part simulates real-life, unseen customers, so metrics measured on it
# tell us how the model is likely to behave after deployment.
# A fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Data exploration and cleaning

### First check: "text" columns
We cannot feed text columns directly into machine learning models; because of that, we need to "translate" them into numeric representations.

__Hint__: When the type is "object" we must understand it as "text"

In [31]:
# Preview five rows of the training features (head() defaults to n=5);
# the index shows the original row positions after shuffling.
X_train.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
9254,9255,15601116,P'an,686,France,Male,32,6,0.0,2,1,1,179093.26
1561,1562,15766374,Leak,632,Germany,Male,42,4,119624.6,2,1,1,195978.86
1670,1671,15716994,Green,559,Spain,Male,24,3,114739.92,1,1,0,85891.02
6087,6088,15730759,Chukwudi,561,France,Female,27,9,135637.0,1,1,0,153080.4
6669,6670,15797900,Chinomso,517,France,Male,56,9,142147.32,1,0,0,39488.04


In [32]:
# List the dtype of every training column; columns reported as "object" hold text.
X_train.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
dtype: object

In [33]:
# Collect the names of the columns stored with the "object" (text) dtype.
text_like = X_train.select_dtypes(include='object')
object_columns = list(text_like.columns)
print(f"Object columns: {object_columns}")

Object columns: ['Surname', 'Geography', 'Gender']


### Second check: null values
It's not a good practice to feed null values into machine learning models.
There are many ways to handle them (e.g. filling with the average, the median, or another value).
The focus of this project is to show the deployment, so I will simply remove rows with null values to make the process quicker.

In [None]:
# Count the missing values per column.
# Good news: every count is zero, so no row contains a null.
X_train.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [None]:
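# If we needed to drop columns or rows, the commands would be:
# X_train = X_train.drop(['RowNumber'], axis = 1) # Remove a single column by name (e.g. one whose values are all missing)
# X_train = X_train.dropna() # Remove every row that contains a null (y_train would then need to be filtered to the same index)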