# Task:  
## Preprocessing:  
 1. Load the dataset
 2. Preprocess the data  
  2.1. Explore the data  
  2.2. Drop the unnecessary columns  
  2.3. Encode the categorical columns      
3. Train test split the data (y = 'Exited' column)  
4. Scale the features using StandardScaler()
5. Save encoders and scaler in pickle files (le_gender, ohe_geography, scaler)  
## ANN:  
 1. Build model Architecture  
 2. Compile the model  
 3. Setup Callbacks  
    3.1. Setup logs and Tensorboard  
    3.2. Setup early stopping  
 4. Fit the model
 5. Save the model
 6. Visualize metrics in TensorBoard

# Importing and exploring data

In [30]:
import pandas as pd

# Read the raw churn CSV (path is relative to the notebook) and preview it.
csv_path = '../data/Churn_Modelling.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [31]:
# Drop the row-number, customer-id and surname columns, which are not used as
# model features.
# Re-assigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time, when
# the columns are already gone, no longer raises a KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Encoding the categorical columns

In [32]:
## Encoding gender using label encoder
from sklearn.preprocessing import LabelEncoder

# Learn the gender categories first, then map every row to its integer code.
le_gender = LabelEncoder()
le_gender.fit(df['Gender'])
df['Gender'] = le_gender.transform(df['Gender'])

# Spot-check a handful of random rows to see the encoded column.
df.sample(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
5573,681,France,1,29,8,0.0,1,1,0,66367.33,0
612,605,France,0,64,2,129555.7,1,1,1,13601.79,0
5637,549,France,0,29,8,0.0,2,1,1,189558.44,0
1706,719,Spain,1,38,0,0.0,1,1,0,126876.47,0
6457,615,Spain,1,51,6,81818.49,1,1,1,169149.38,0


In [33]:
# Encoding geography using one hot encoder
from sklearn.preprocessing import OneHotEncoder

ohe_geography = OneHotEncoder()

# With default settings the encoder returns a sparse matrix, hence the
# .toarray() call when the indicator frame is built below.
ohe_geo = ohe_geography.fit_transform(df[['Geography']])

geo_features = ohe_geography.get_feature_names_out(['Geography'])

# Build the indicator columns on df's own index. pd.concat(axis=1) aligns rows
# by index label, so a frame carrying a fresh 0..n-1 index would be misaligned
# (and padded with NaN) whenever df's index is not the default RangeIndex,
# e.g. after rows have been filtered out upstream.
geo_df = pd.DataFrame(ohe_geo.toarray(), columns=geo_features, index=df.index)

df = pd.concat([df.drop('Geography', axis=1), geo_df], axis=1)
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


# Train test splitting

In [34]:
# Train test split
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns.
y = df['Exited']
X = df.drop(columns='Exited')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same learned
# statistics to both splits so the test data does not influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Saving encoders and scaler in pickle files

In [36]:
import os
import pickle

# open(..., 'wb') does not create missing parent folders, so make sure the
# output directory exists first; otherwise a fresh checkout fails with
# FileNotFoundError.
os.makedirs('../assets', exist_ok=True)

# One pickle file per fitted preprocessing object.
artifacts = {
  '../assets/scaler.pkl': scaler,
  '../assets/label_encoder.pkl': le_gender,
  '../assets/one_hot_encoder.pkl': ohe_geography,
}

for path, obj in artifacts.items():
  with open(path, 'wb') as f:
    pickle.dump(obj, f)


In [37]:
# Display the (rows, columns) dimensions of the scaled training matrix.
X_train.shape

(8000, 12)

# ANN

## Model Architecture

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Assemble the network layer by layer: two ReLU hidden layers (12 and 8 units)
# followed by a single sigmoid unit for the binary target.
model = Sequential()
model.add(Dense(12, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked as
# the reporting metric.
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

## Setup Callbacks

In [39]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

# Every run logs into its own timestamped sub-folder under ../log/fits/.
run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = '../log/fits/' + run_stamp
tb = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Stop once val_loss has not improved for 10 epochs and roll back to the
# best weights seen.
es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

## Fit Model

In [40]:
# Train for up to 100 epochs, logging to TensorBoard and stopping early via
# the `es` callback.
# validation_data is passed as an (inputs, targets) tuple, which is the form
# documented for Model.fit; a list is not handled uniformly across
# TensorFlow/Keras releases and can be mistaken for a list of inputs.
# NOTE(review): the held-out test split doubles as the validation set here, so
# early stopping and best-weight restoration are tuned on test data — consider
# carving a separate validation split out of the training data.
history = model.fit(
  X_train,
  y_train,
  validation_data=(X_test, y_test),
  epochs=100,
  shuffle=True,
  callbacks=[tb, es],
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7217 - loss: 0.5854 - val_accuracy: 0.8050 - val_loss: 0.4500
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7901 - loss: 0.4671 - val_accuracy: 0.8100 - val_loss: 0.4257
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7894 - loss: 0.4506 - val_accuracy: 0.8130 - val_loss: 0.4165
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8095 - loss: 0.4401 - val_accuracy: 0.8175 - val_loss: 0.4092
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8177 - loss: 0.4205 - val_accuracy: 0.8215 - val_loss: 0.4034
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8221 - loss: 0.4204 - val_accuracy: 0.8290 - val_loss: 0.3986
Epoch 7/100
[1m250/25

## Save the model

In [43]:
# Write the trained model into the same assets folder that holds the pickled
# encoders and scaler.
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format —
# confirm this is the format the consuming code expects.
model.save('../assets/churn_model.h5')



## Loading Tensorboard and visualizing

In [41]:
# Load the TensorBoard notebook extension, then open the dashboard pointed at
# the parent log directory that the timestamped run folders are written into.
%load_ext tensorboard
%tensorboard --logdir ../log/fits/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 5772), started 0:08:40 ago. (Use '!kill 5772' to kill it.)