Import Required libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

Data Preprocessing

In [None]:

# Load data
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)

# Preprocessing
def preprocess_data(df):
    # Drop unnecessary columns
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    
    # Handle missing values
    df['Age'] = SimpleImputer(strategy='median').fit_transform(df[['Age']])
    
    # Fix for Embarked column
    embarked_imputer = SimpleImputer(strategy='most_frequent')
    df['Embarked'] = embarked_imputer.fit_transform(df[['Embarked']]).ravel()  # <- Add .ravel()
    
    # Convert categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')
    
    # Convert Fare to float and fill missing
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    
    return df

processed_data = preprocess_data(data)
X = processed_data.drop('Survived', axis=1)
y = processed_data['Survived']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features (for models that require scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

print("Logistic Regression:")
print(classification_report(y_test, y_pred_lr))

Logistic Regression:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)  # No scaling needed
y_pred_rf = rf.predict(X_test)

print("Random Forest:")
print(classification_report(y_test, y_pred_rf))

Random Forest:
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       105
           1       0.77      0.77      0.77        74

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



Neural Network

In [5]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Build model
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train model
history = model.fit(X_train_scaled, y_train,
                    epochs=100,
                    batch_size=32,
                    validation_split=0.2,
                    verbose=0)

# Evaluate
y_pred_nn = (model.predict(X_test_scaled) > 0.5).astype(int)
print("Neural Network:")
print(classification_report(y_test, y_pred_nn))

2025-02-02 15:41:18.040880: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Neural Network:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       105
           1       0.83      0.73      0.78        74

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.82       179



### **Summary of Performance Metrics for Different Models**

| Metric       | Logistic Regression | Random Forest | Neural Network |
|-------------|---------------------|--------------|---------------|
| **Accuracy** | 0.81                | 0.81         | **0.83**      |
| **Precision (Class 0)** | 0.83 | **0.84** | 0.82 |
| **Precision (Class 1)** | 0.79 | 0.77 | **0.83** |
| **Recall (Class 0)** | 0.86 | 0.84 | **0.90** |
| **Recall (Class 1)** | 0.74 | 0.77 | **0.73** |
| **F1-score (Class 0)** | 0.84 | 0.84 | **0.86** |
| **F1-score (Class 1)** | 0.76 | 0.77 | **0.78** |
| **Macro Average (F1-score)** | 0.80 | 0.80 | **0.82** |
| **Weighted Average (F1-score)** | 0.81 | 0.81 | **0.82** |

### **Key Observations:**
1. **Neural Network** achieved the highest accuracy (**0.83**) compared to Logistic Regression and Random Forest (both **0.81**).
2. **Neural Network** also has the best F1-score for Class 0 (**0.86**) and Class 1 (**0.78**), making it more balanced.
3. **Random Forest and Logistic Regression** performed similarly, but Random Forest had a slight advantage in precision for Class 0 (**0.84** vs. 0.83).
4. **Neural Network excels in recall for Class 0** (**0.90**), meaning it identifies more positive cases but sacrifices recall for Class 1.

### **Conclusion:**
- If **overall accuracy and balanced performance** are the priorities, **Neural Network** is the best choice.
- If **precision for Class 0** (negative class) is crucial, **Random Forest** performs slightly better.
- If **recall for Class 1** (positive class) is more important, **Logistic Regression or Random Forest** might be preferred.
