In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import pickle

In [86]:
data = pd.read_csv('Churn_Modelling.csv')
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Preprocessing the data


In [87]:
### drop the irrelevant columns
data = data.drop(columns=['RowNumber','CustomerId','Surname'],axis=1
                 )
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [88]:
### Encode the categorical variables
label_encoder_gender = LabelEncoder()
data['Gender']= label_encoder_gender.fit_transform(data['Gender'])
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [89]:
### onehot encoding the Geography columns 
from sklearn.preprocessing import OneHotEncoder
onehot_encoder_geo = OneHotEncoder()
geo_encoder = onehot_encoder_geo.fit_transform(data[['Geography']])
print(geo_encoder)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>
  Coords	Values
  (0, 0)	1.0
  (1, 2)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 2)	1.0
  (5, 2)	1.0
  (6, 0)	1.0
  (7, 1)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 2)	1.0
  (12, 0)	1.0
  (13, 0)	1.0
  (14, 2)	1.0
  (15, 1)	1.0
  (16, 1)	1.0
  (17, 2)	1.0
  (18, 2)	1.0
  (19, 0)	1.0
  (20, 0)	1.0
  (21, 2)	1.0
  (22, 2)	1.0
  (23, 0)	1.0
  (24, 0)	1.0
  :	:
  (9975, 1)	1.0
  (9976, 0)	1.0
  (9977, 0)	1.0
  (9978, 0)	1.0
  (9979, 0)	1.0
  (9980, 2)	1.0
  (9981, 1)	1.0
  (9982, 1)	1.0
  (9983, 0)	1.0
  (9984, 1)	1.0
  (9985, 0)	1.0
  (9986, 1)	1.0
  (9987, 2)	1.0
  (9988, 0)	1.0
  (9989, 2)	1.0
  (9990, 1)	1.0
  (9991, 0)	1.0
  (9992, 2)	1.0
  (9993, 0)	1.0
  (9994, 0)	1.0
  (9995, 0)	1.0
  (9996, 0)	1.0
  (9997, 0)	1.0
  (9998, 1)	1.0
  (9999, 0)	1.0


In [90]:
geo_encoder_df = pd.DataFrame(geo_encoder.toarray(),columns =onehot_encoder_geo.get_feature_names_out(['Geography']))

In [91]:
### combine the one hot encoded columns with the original data 
data = pd.concat([data.drop('Geography',axis=1),geo_encoder_df],axis=1)
data.head(5)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [92]:
### Divivde the dataset into independent and dependent features
x = data.drop('EstimatedSalary',axis=1)
y = data['EstimatedSalary']
### Dividing the data into training and testing set 
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)

In [93]:
### scaling the Feature 
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [94]:
### save the encoders and scaler for future
with open('regression_label_encoder_gender.pkl','wb') as file:
    pickle.dump(label_encoder_gender,file)
with open('regresssion_onehot_encoder_geo.pkl','wb') as file:
    pickle.dump(onehot_encoder_geo,file)
with open('regression_scale.pkl','wb') as file:
    pickle.dump(scaler,file)        

### Training  the ANN 

In [95]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [96]:
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard

In [97]:
### Building the model
model_regression = Sequential({
Dense(64,activation='relu',input_shape=(x_train.shape[1],)),    
Dense(32,activation='relu'),
Dense(1)    ## output layer for Regression
    
    
}
                   )

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [98]:
### compile the model
model_regression.compile(optimizer='adam',loss='mean_absolute_error',metrics=['mae'])
model_regression.summary()

In [99]:
import datetime
## set up Tensorboard
log_dir = "regressionlogs/fit/"+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
TensorBoard_callback = TensorBoard(log_dir=log_dir,histogram_freq=1)

In [100]:
### set up early stoping 
early_stopping_callback = EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True)



In [101]:
### Training the Ann regression model
regression_train_model = model_regression.fit(
    x_train,y_train,
    validation_data=(x_test,y_test),
    epochs=100,
    callbacks = [early_stopping_callback,TensorBoard_callback]
    )

Epoch 1/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 100216.7109 - mae: 100216.7109 - val_loss: 98353.7812 - val_mae: 98353.7812
Epoch 2/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 100688.2500 - mae: 100688.2500 - val_loss: 98335.2344 - val_mae: 98335.2344
Epoch 3/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 98949.0234 - mae: 98949.0234 - val_loss: 98282.7188 - val_mae: 98282.7188
Epoch 4/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 100386.3672 - mae: 100386.3672 - val_loss: 98183.4141 - val_mae: 98183.4141
Epoch 5/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 100371.7734 - mae: 100371.7734 - val_loss: 98027.6719 - val_mae: 98027.6719
Epoch 6/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 99944.9375 - mae: 99944.9375 - val_loss: 97809.2188 

In [102]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [103]:
%tensorboard --logdir regressionlog/fit

Reusing TensorBoard on port 6006 (pid 10384), started 2:55:58 ago. (Use '!kill 10384' to kill it.)

In [104]:
### evaluate model on the test data
test_loss,test_mae = model_regression.evaluate(x_test,y_test)
print(f'Test MAE : {test_mae}')

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 69763.0156 - mae: 69763.0156
Test MAE : 68452.265625


In [105]:
### Save the regression model
model_regression.save('regression_model.h5')

