In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Softmax, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [2]:
mobile_train = pd.read_csv('mobile_train.csv')
mobile_test = pd.read_csv('mobile_test.csv')
mobile_train.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
mobile_train_vif = mobile_train.drop(['price_range'], axis=1)

def calculate_vif(data_frame):
    features = data_frame.columns
    vif_data = pd.DataFrame()
    vif_data["Feature"] = features
    vif_data["VIF"] = [variance_inflation_factor(data_frame.values, i) for i in range(data_frame.shape[1])]
    return vif_data.sort_values(by='VIF', ascending=False)
    
def drop_high_vif_features(data_frame, threshold=5):
    while True:
        vif_results = calculate_vif(data_frame)
        max_vif_feature = vif_results.loc[vif_results['VIF'].idxmax(), 'Feature']
        max_vif_value = vif_results.loc[vif_results['VIF'].idxmax(), 'VIF']
        
        if max_vif_value > threshold:
            print(f"Dropping feature '{max_vif_feature}' with VIF {max_vif_value}")
            data_frame = data_frame.drop(columns=max_vif_feature)
        else:
            break
    return data_frame
mobile_train_vif = drop_high_vif_features(mobile_train_vif)

Dropping feature 'mobile_wt' with VIF 12.972548425819065
Dropping feature 'px_width' with VIF 11.470014131904488
Dropping feature 'sc_h' with VIF 11.086593845458365
Dropping feature 'battery_power' with VIF 7.543843177190293
Dropping feature 'pc' with VIF 6.050059878559392
Dropping feature 'three_g' with VIF 5.930418164840767


It is a good practice to drop correlated input features or handle them. However, in case of  non linear models like neural networks, they are fairly robust to multicollinearity and will not need special attention to mitigate it. 

So we will progress with all the features in the dataset without removing the correlated features. If overfitting proves to be an issue down the line we can use techniques like regularization/batch normalization to mitigate it, and in spite of all this if overfitting isnt resolved, we can come back to dropping the correlated input features.

In [4]:
### edit this cell to drop correlated variables
X = mobile_train.drop(['price_range'], axis=1) # use mobile_train_vif if handling correlated inputs
y = mobile_train['price_range']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# standardize the inputs
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 2 layer NN using tensorflow (Sequential API)

In [5]:
# keras expected the inputs to be of the shape: x-(m,n), y-(m,)
X_train_nn, X_test_nn = X_train_scaled, X_test_scaled
y_train_nn,y_test_nn = y_train, y_test

we are trying to replicate the same thing done on the previous notebook (4 hidden units in the hidden layer)

In [26]:
np.random.seed(3)
input_size = X_train_nn.shape[1]

# Define the model
model = models.Sequential()
model.add(layers.Dense(4, activation='relu', input_shape=(input_size,))) # expects the (number of features, none) 
model.add(layers.Dense(4, activation='softmax')) 

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # For integer-encoded labels
              metrics=['accuracy'])

# Train the model
model.fit(X_train_nn, y_train_nn, epochs=90, batch_size=64)

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test_nn, y_test_nn)
print(f'Test accuracy: {test_acc}')

Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90
Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90
Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoch 83/90
Epoch 84/90
E

training the model for different epochs of time gives us different accuracies

We see certain differences in convergence rate of the algorithm (basically how fast the loss reduces) based on the batch size used for training

1. using a batch size of 1600 (the entire size of the training data) basically performing batch gradient descent, the algorithm requires 1000 epochs over the entire data to achieve an accuracy of 89% in the test data
2. using a batch size of 64, the algorithm needs around 57 epochs to reach a an accuracy of 89% in test data.

we see this difference because the mini batch gradient descent (2) updates the parameters 25 {1600/64} times while passing through one epoch, whereas the bacth gradient descent updates it once in one epoch. so the parameters converge much faster.

In [23]:
# Make predictions on the training set
train_predictions = model.predict(X_train_nn)
train_predictions = np.argmax(train_predictions, axis=1)

# Make predictions on the test set
test_predictions = model.predict(X_test_nn)
test_predictions = np.argmax(test_predictions, axis=1)

# Evaluate the model on the training set
print("Training Set Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(np.squeeze(y_train_nn), np.squeeze(train_predictions)))

print("\nClassification Report:")
print(classification_report(np.squeeze(y_train_nn), np.squeeze(train_predictions)))

# Evaluate the model on the test set
print("\nTest Set Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(np.squeeze(y_test_nn), np.squeeze(test_predictions)))

print("\nClassification Report:")
print(classification_report(np.squeeze(y_test_nn), np.squeeze(test_predictions)))

Training Set Evaluation:
Confusion Matrix:
[[392   3   0   0]
 [ 19 382   8   0]
 [  0  11 376  21]
 [  0   0   2 386]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       395
           1       0.96      0.93      0.95       409
           2       0.97      0.92      0.95       408
           3       0.95      0.99      0.97       388

    accuracy                           0.96      1600
   macro avg       0.96      0.96      0.96      1600
weighted avg       0.96      0.96      0.96      1600


Test Set Evaluation:
Confusion Matrix:
[[103   2   0   0]
 [  6  85   0   0]
 [  0   2  86   4]
 [  0   0   0 112]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       105
           1       0.96      0.93      0.94        91
           2       1.00      0.93      0.97        92
           3       0.97      1.00      0.98       112

    accu

### 2 layer NN using tensorflow (Functional API)

The same thing done above can also be done using a functional API and this proves to be more useful in cases where we want there to be shared layers among other advantages. over here it should make any difference

In [27]:
# we will be using the same inputs from previous implementation
np.random.seed(3)
input_size = X_train_nn.shape[1]

# define the model
input_layer = layers.Input(shape=(input_size,))
hidden_layer = layers.Dense(4, activation='relu')(input_layer) ## Hidden layer
output_layer = layers.Dense(4, activation='softmax')(hidden_layer) ## output layer

# Create the model
model2 = models.Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model2.fit(X_train_nn, y_train_nn, epochs=100, batch_size=64)

# Evaluate the model
test_loss, test_acc = model2.evaluate(X_test_nn, y_test_nn)
print(f'Test accuracy: {test_acc}')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78