**ANN Practical > Regression problem**

***Dataset***
- Churn dataset
- Output feature
    - Estimated Salary
        
        
        
- Input features
    - Non-important features (dropped): RowNumber, CustomerId, Surname
    - Important features (kept as model inputs): CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, Exited

***Problem Statement***
- Predict each customer's estimated salary (the `EstimatedSalary` column) from the input features
- Type of problem: Regression problem (predict the salaries of a group of people, i.e. a continuous value)


***Steps to Follow***

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle
import datetime

# ANN related imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Tensorflow
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard


In [27]:
# Step 1 - Load the dataset
# Read the raw churn CSV into a DataFrame, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./resources/data/Churn_Modelling.csv')
data.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [28]:
# Step 2 - Preprocess the data
# Step 2.1 - Remove the identifier columns (row number, customer id, surname)
#            so they are not carried forward into the later steps.
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis='columns')
data.head(n=5)


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [29]:
# Step 2.2) Encode the categorical column Gender as integer codes.
# The preview produced by this cell shows Female encoded as 0
# (Male is presumably 1 - confirm via label_encoder_gender.classes_).
label_encoder_gender = LabelEncoder()
label_encoder_gender.fit(data['Gender'])
data['Gender'] = label_encoder_gender.transform(data['Gender'])
data.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [30]:
# Step 2.3) Encode the column Geography using OneHotEncoder
# sparse_output=False makes the encoder return a dense array that can be
# wrapped directly in a DataFrame with one column per Geography category.
onehot_encoder_geography = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
geography_encoded = onehot_encoder_geography.fit_transform(data[['Geography']])
geography_encoded_df = pd.DataFrame(
    geography_encoded,
    columns=onehot_encoder_geography.get_feature_names_out(['Geography']),
    # Reuse the row labels of `data`: pd.concat(axis=1) aligns on the index, so
    # a default RangeIndex here would misalign rows (and introduce NaNs) if
    # `data` ever carries a non-default index.
    index=data.index,
)
# Swap the original text column for its one-hot encoded columns.
data = pd.concat([data.drop(columns=['Geography']), geography_encoded_df], axis=1)
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [31]:
# Step 3 - Separate the regression target from the model inputs
# y = output = the EstimatedSalary column
# x = inputs = every column except EstimatedSalary
y = data['EstimatedSalary']
x = data.drop(columns=['EstimatedSalary'])


In [32]:
# Step 4 - Split the data into training and testing sets
# 20% of the rows are held out for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [36]:
# Step 5 - Scale the features using StandardScaler
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both the training and the test split.
standard_scaler_xtrain_xtest = StandardScaler()
standard_scaler_xtrain_xtest.fit(x_train)
x_train = standard_scaler_xtrain_xtest.transform(x_train)
x_test = standard_scaler_xtrain_xtest.transform(x_test)
x_train


array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
         1.72572313, -0.57638802]])

In [37]:
# Step 6 - Save the Encoders (LabelEncoder, OneHotEncoder) and Scaler (StandardScaler) using pickle
# Each fitted preprocessing object is written to its own .pkl file.
for pickle_path, fitted_object in (
    ('./resources/pickle/2_label_encoder_gender.pkl', label_encoder_gender),
    ('./resources/pickle/2_onehot_encoder_geography.pkl', onehot_encoder_geography),
    ('./resources/pickle/2_standard_scaler_xtrain_xtest.pkl', standard_scaler_xtrain_xtest),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_object, f)

In [43]:
# Step 7 - Build the ANN model
# Input          = one value per feature column: shape=(x_train.shape[1],)
# Hidden Layer 1 = 64 (neurons), activation='relu'
# Hidden Layer 2 = 32 (neurons), activation='relu'
# Output Layer   = 1 neuron, no activation
ann_model = Sequential([
    # Declare the input with an explicit Input object rather than passing
    # input_shape= to the first Dense layer, which triggers a Keras warning.
    tf.keras.Input(shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

## Compile the model
# Mean absolute error is used as the training loss and is also reported as the
# 'mae' metric.
ann_model.compile(optimizer='adam',loss='mean_absolute_error',metrics=['mae'])

ann_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [46]:
# Step 8 - Set up TensorBoard
# Step 8.1 - Write the logs to a log directory named after the current timestamp
log_dir = f"./resources/logs/2/{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [47]:
# Step 8.2 - Set up Early Stopping to prevent overfitting
# Monitors val_loss with a patience of 10 and restores the best weights.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [48]:
# Step 9 - Train the ANN model created in Step 7
# NOTE(review): the test split is passed as validation_data, so the val_loss
# that drives early stopping is computed on the test set - confirm this is
# intended, or hold out a separate validation split.
ann_model_trained = ann_model.fit(
    x=x_train,
    y=y_train,
    epochs=100,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping_callback, tensorboard_callback],
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 100383.5781 - mae: 100383.5781 - val_loss: 98542.7422 - val_mae: 98542.7422
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 99734.4297 - mae: 99734.4297 - val_loss: 97238.0859 - val_mae: 97238.0859
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 97461.5391 - mae: 97461.5391 - val_loss: 93901.1094 - val_mae: 93901.1094
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 92933.2031 - mae: 92933.2031 - val_loss: 88207.7109 - val_mae: 88207.7109
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 86209.0703 - mae: 86209.0703 - val_loss: 80664.7969 - val_mae: 80664.7969
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 77962.0859 - mae: 77962.0859 - val_loss: 72304.2031 - val_

In [51]:
# Step 10 - Load the TensorBoard
%load_ext tensorboard
%tensorboard --logdir ./resources/logs/2/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 43944), started 0:00:44 ago. (Use '!kill 43944' to kill it.)