In [26]:
#importing libraries
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from summarytools import dfSummary
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

In [2]:
#readind the data 
titanic = sns.load_dataset("titanic")
print(titanic.columns)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')


In [3]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [4]:
dfSummary(titanic, is_collapsible=False)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,survived [int64],Mean (sd) : 0.4 (0.5) min < med < max: 0.0 < 0.0 < 1.0 IQR (CV) : 1.0 (0.8),2 distinct values,,0 (0.0%)
2,pclass [int64],Mean (sd) : 2.3 (0.8) min < med < max: 1.0 < 3.0 < 3.0 IQR (CV) : 1.0 (2.8),3 distinct values,,0 (0.0%)
3,sex [object],1. male 2. female,577 (64.8%) 314 (35.2%),,0 (0.0%)
4,age [float64],Mean (sd) : 29.7 (14.5) min < med < max: 0.4 < 28.0 < 80.0 IQR (CV) : 17.9 (2.0),88 distinct values,,177 (19.9%)
5,sibsp [int64],Mean (sd) : 0.5 (1.1) min < med < max: 0.0 < 0.0 < 8.0 IQR (CV) : 1.0 (0.5),7 distinct values,,0 (0.0%)
6,parch [int64],Mean (sd) : 0.4 (0.8) min < med < max: 0.0 < 0.0 < 6.0 IQR (CV) : 0.0 (0.5),7 distinct values,,0 (0.0%)
7,fare [float64],Mean (sd) : 32.2 (49.7) min < med < max: 0.0 < 14.5 < 512.3 IQR (CV) : 23.1 (0.6),248 distinct values,,0 (0.0%)
8,embarked [object],1. S 2. C 3. Q 4. nan,644 (72.3%) 168 (18.9%) 77 (8.6%) 2 (0.2%),,2 (0.2%)
9,class [category],1. Third 2. First 3. Second,491 (55.1%) 216 (24.2%) 184 (20.7%),,0 (0.0%)
10,who [object],1. man 2. woman 3. child,537 (60.3%) 271 (30.4%) 83 (9.3%),,0 (0.0%)


In [6]:
#We see these have missing values 
#deck, embark_town,embarked,age and duplicates
print(titanic.columns)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')


Preprocessing data

In [7]:
#dropping some columns 
titanic.drop(columns=['deck', 'embark_town', 'alive', 'who', 'class', 'adult_male'], inplace=True, errors='ignore')

In [8]:
print(titanic.columns)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'alone'],
      dtype='object')


In [10]:
#Filling the missing values 
titanic['age'] = titanic['age'].fillna(titanic['age'].median())
titanic['embarked'] = titanic['embarked'].fillna(titanic['embarked'].mode()[0])

In [11]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,male,22.0,1,0,7.2500,S,False
1,1,1,female,38.0,1,0,71.2833,C,False
2,1,3,female,26.0,0,0,7.9250,S,True
3,1,1,female,35.0,1,0,53.1000,S,False
4,0,3,male,35.0,0,0,8.0500,S,True
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,True
887,1,1,female,19.0,0,0,30.0000,S,True
888,0,3,female,28.0,1,2,23.4500,S,False
889,1,1,male,26.0,0,0,30.0000,C,True


In [12]:
#removing duplicates 
titanic = titanic.drop_duplicates()

In [16]:
# Encode categorical variables
titanic.loc[:, 'sex'] = label_encoder.fit_transform(titanic['sex']).astype(int)
titanic.loc[:, 'embarked'] = label_encoder.fit_transform(titanic['embarked']).astype(int)

In [17]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,1,22.0,1,0,7.2500,2,False
1,1,1,0,38.0,1,0,71.2833,0,False
2,1,3,0,26.0,0,0,7.9250,2,True
3,1,1,0,35.0,1,0,53.1000,2,False
4,0,3,1,35.0,0,0,8.0500,2,True
...,...,...,...,...,...,...,...,...,...
885,0,3,0,39.0,0,5,29.1250,1,False
887,1,1,0,19.0,0,0,30.0000,2,True
888,0,3,0,28.0,1,2,23.4500,2,False
889,1,1,1,26.0,0,0,30.0000,0,True


In [19]:
# Convert boolean values to integers
titanic.loc[:, 'alone'] = titanic['alone'].astype(int)

In [20]:
# Define features (X) and target variable (y)
X = titanic.drop(columns=['survived'])
y = titanic['survived']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Normalize Features
Since neural networks perform better with scaled data, we standardize the numeric features.

In [22]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Build the ANN Model

In [23]:
def build_model():
    model = keras.Sequential([
        layers.Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.2),  # Dropout to prevent overfitting
        layers.Dense(16, activation='relu'),
        layers.Dense(8, activation='relu'),
        layers.Dense(1, activation='sigmoid')  # Sigmoid activation for binary classification
    ])

    model.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model

model = build_model()


Train the Model with Early Stopping

In [24]:
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(X_train_scaled, y_train, 
                    validation_data=(X_test_scaled, y_test),
                    epochs=100, batch_size=32, 
                    callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100


Evaluate the Model

In [25]:
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

Test Loss: 0.4139, Test Accuracy: 0.8258


Loss (0.4139): A lower loss generally indicates a better model fit, though the optimal value depends on the problem and the loss function used (e.g., cross-entropy for classification).

Accuracy (82.58%): This suggests the model correctly predicted around 83% of the test data, which is a decent result depending on the complexity of the problem.

### Analyzing misclassifications to help identify where the model struggles.

In [27]:
# Assuming y_test are true labels and y_pred are model predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert probabilities to class labels

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_classes))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))

Confusion Matrix:
[[95  0]
 [60  0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        95
           1       0.00      0.00      0.00        60

    accuracy                           0.61       155
   macro avg       0.31      0.50      0.38       155
weighted avg       0.38      0.61      0.47       155



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Severe Class Imbalance:
The model predicts only class 0, meaning it never identifies class 1.

Precision, recall, and F1-score for class 1 are 0.00, which is a red flag.

Bias Toward the Majority Class:
If class 0 is much more frequent than class 1, the model learns to always predict 0 because it minimizes overall error.

Even though accuracy is 61%, it’s misleading because the model isn’t learning class 1 at all.