In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ===============================
# 1. Load and preprocess dataset
# ===============================

# Adjust the path if needed
file_path = "titanic.csv"
df = pd.read_csv(file_path)

# Drop non-numeric / ID-heavy columns
df_model = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Handle missing values
df_model["Age"] = df_model["Age"].fillna(df_model["Age"].median())
df_model["Embarked"] = df_model["Embarked"].fillna(df_model["Embarked"].mode()[0])

# One-hot encode categorical variables
df_model = pd.get_dummies(df_model, columns=["Sex", "Embarked"], drop_first=True)

# Separate features and target
X = df_model.drop(columns=["Survived"]).values
y = df_model["Survived"].values

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

print("Train shape:", X_train_std.shape)
print("Test shape:", X_test_std.shape)

# =========
# 2.  PCA
# =========

# Keep 95% of the variance
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

print("PCA train shape:", X_train_pca.shape)
print("PCA test shape:", X_test_pca.shape)
print("Explained variance (PCA):", pca.explained_variance_ratio_.sum())

# ============================
# 3. SVM from scratch (Linear)
# ============================

class LinearSVM:
    """
    Simple linear SVM with hinge loss and L2 regularization,
    optimized via (very basic) gradient descent.
    """
    def __init__(self, C=1.0, lr=0.001, n_iters=1000):
        self.C = C
        self.lr = lr
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Map labels from {0,1} to {-1, +1}
        y_ = np.where(y <= 0, -1, 1)

        self.w = np.zeros(n_features)
        self.b = 0.0

        lambda_param = 1.0 / self.C

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1

                if condition:
                    # Only regularization term
                    dw = 2 * lambda_param * self.w
                    db = 0.0
                else:
                    # Regularization + hinge loss gradient
                    dw = 2 * lambda_param * self.w - y_[idx] * x_i
                    db = -y_[idx]

                # Gradient descent update
                self.w -= self.lr * dw
                self.b -= self.lr * db

    def decision_function(self, X):
        return np.dot(X, self.w) - self.b

    def predict(self, X):
        approx = self.decision_function(X)
        # Map back to {0,1}
        return np.where(approx >= 0, 1, 0)


# ================================
# 4. Train & evaluate all models
# ================================

results = {}

# (a) Logistic Regression without PCA
log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train_std, y_train)
y_pred_lr = log_reg.predict(X_test_std)
acc_lr = accuracy_score(y_test, y_pred_lr)
results["LogReg (no PCA)"] = acc_lr

print("\n=== Logistic Regression (no PCA) ===")
print("Accuracy:", acc_lr)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

# (b) Logistic Regression with PCA
log_reg_pca = LogisticRegression(max_iter=2000)
log_reg_pca.fit(X_train_pca, y_train)
y_pred_lr_pca = log_reg_pca.predict(X_test_pca)
acc_lr_pca = accuracy_score(y_test, y_pred_lr_pca)
results["LogReg (with PCA)"] = acc_lr_pca

print("\n=== Logistic Regression (with PCA) ===")
print("Accuracy:", acc_lr_pca)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr_pca))
print("Classification Report:\n", classification_report(y_test, y_pred_lr_pca))

# (c) SVM from scratch without PCA
svm = LinearSVM(C=1.0, lr=0.001, n_iters=500)
svm.fit(X_train_std, y_train)
y_pred_svm = svm.predict(X_test_std)
acc_svm = accuracy_score(y_test, y_pred_svm)
results["SVM (no PCA, scratch)"] = acc_svm

print("\n=== SVM from scratch (no PCA) ===")
print("Accuracy:", acc_svm)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

# (d) SVM from scratch with PCA
svm_pca = LinearSVM(C=1.0, lr=0.001, n_iters=500)
svm_pca.fit(X_train_pca, y_train)
y_pred_svm_pca = svm_pca.predict(X_test_pca)
acc_svm_pca = accuracy_score(y_test, y_pred_svm_pca)
results["SVM (with PCA, scratch)"] = acc_svm_pca

print("\n=== SVM from scratch (with PCA) ===")
print("Accuracy:", acc_svm_pca)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm_pca))
print("Classification Report:\n", classification_report(y_test, y_pred_svm_pca))

# ==================
# 5. Final summary
# ==================

print("\n========== Summary of Accuracies ==========")
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}")


Shape of X: (891, 8)
Shape of y: (891,)
Train shape: (712, 8)
Test shape: (179, 8)
PCA train shape: (712, 7)
PCA test shape: (179, 7)
Explained variance (PCA): 0.9548692220455597

=== Logistic Regression (no PCA) ===
Accuracy: 0.8044692737430168
Confusion Matrix:
 [[98 12]
 [23 46]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179


=== Logistic Regression (with PCA) ===
Accuracy: 0.8100558659217877
Confusion Matrix:
 [[98 12]
 [22 47]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.89      0.85       110
           1       0.80      0.68      0.73        69

    accuracy                           0.81       179
   macro avg    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== SVM from scratch (with PCA) ===
Accuracy: 0.3854748603351955
Confusion Matrix:
 [[  0 110]
 [  0  69]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       110
           1       0.39      1.00      0.56        69

    accuracy                           0.39       179
   macro avg       0.19      0.50      0.28       179
weighted avg       0.15      0.39      0.21       179


LogReg (no PCA): 0.8045
LogReg (with PCA): 0.8101
SVM (no PCA, scratch): 0.3855
SVM (with PCA, scratch): 0.3855


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
