In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# For reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [2]:
# ===============================
# 1. Load and inspect dataset
# ===============================

file_path = "titanic.csv"  # Adjust path if needed
df = pd.read_csv(file_path)

print("First 5 rows:")
display(df.head())

print("\nMissing values per column:")
print(df.isna().sum())


First 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



Missing values per column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [3]:
# =======================================
# 2. Preprocess: clean, encode, scale
# =======================================

# Drop non-numeric / ID-heavy columns
df_model = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Handle missing values
df_model["Age"] = df_model["Age"].fillna(df_model["Age"].median())
df_model["Embarked"] = df_model["Embarked"].fillna(df_model["Embarked"].mode()[0])

# One-hot encode categorical variables
df_model = pd.get_dummies(df_model, columns=["Sex", "Embarked"], drop_first=True)

# Separate features and target
X = df_model.drop(columns=["Survived"]).values
y = df_model["Survived"].values

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Feature scaling (VERY important for SVM & PCA)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

print("Train shape (std):", X_train_std.shape)
print("Test shape (std):", X_test_std.shape)


Shape of X: (891, 8)
Shape of y: (891,)
Train shape (std): (712, 8)
Test shape (std): (179, 8)


In [4]:
# =========
# 3. PCA
# =========

# Keep 95% of the variance
pca = PCA(n_components=0.95, random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

print("PCA train shape:", X_train_pca.shape)
print("PCA test shape:", X_test_pca.shape)
print("Explained variance (PCA):", pca.explained_variance_ratio_.sum())
print("Explained variance per component:", pca.explained_variance_ratio_)


PCA train shape: (712, 7)
PCA test shape: (179, 7)
Explained variance (PCA): 0.9548692220455597
Explained variance per component: [0.22969511 0.21202173 0.19070433 0.10363124 0.09594764 0.07040417
 0.05246499]


In [5]:
# ==================================
# 4. Linear SVM from scratch (GD)
# ==================================

class LinearSVM:
    """
    Simple linear SVM with hinge loss and L2 regularization,
    optimized via (vectorized) batch gradient descent.
    """
    def __init__(self, C=1.0, lr=0.001, n_iters=1000, verbose=False):
        self.C = C
        self.lr = lr
        self.n_iters = n_iters
        self.verbose = verbose
        self.w = None
        self.b = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Map labels from {0,1} to {-1, +1}
        y_ = np.where(y <= 0, -1, 1).astype(float)

        self.w = np.zeros(n_features)
        self.b = 0.0

        for it in range(self.n_iters):
            # Decision scores
            scores = X @ self.w - self.b
            margins = y_ * scores

            # Points that violate margin (hinge loss active)
            misclassified = margins < 1

            if np.any(misclassified):
                # Gradient for w: w - C * (sum(y_i * x_i) over misclassified) / n_samples
                dw = self.w - self.C * (X[misclassified].T @ y_[misclassified]) / n_samples
                # Gradient for b: C * mean(y_i) over misclassified
                db = -self.C * np.mean(y_[misclassified])
            else:
                # Only regularization if nothing misclassified
                dw = self.w
                db = 0.0

            # Update parameters
            self.w -= self.lr * dw
            self.b -= self.lr * db

            # Optional: monitor loss every few iterations
            if self.verbose and (it + 1) % 100 == 0:
                loss = self._compute_loss(X, y_)
                print(f"Iter {it+1}/{self.n_iters} - loss: {loss:.4f}")

    def _compute_loss(self, X, y_):
        # Hinge loss + regularization (for monitoring only)
        scores = X @ self.w - self.b
        margins = 1 - y_ * scores
        hinge_loss = np.maximum(0, margins).mean()
        reg_term = 0.5 * np.dot(self.w, self.w)
        return reg_term + self.C * hinge_loss

    def decision_function(self, X):
        return X @ self.w - self.b

    def predict(self, X):
        scores = self.decision_function(X)
        return (scores >= 0).astype(int)  # Map back to {0,1}


In [6]:
# ==========================================
# 5. Helper function: evaluate a classifier
# ==========================================

def evaluate_model(name, model, X_test, y_test):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    print(f"\n=== {name} ===")
    print("Accuracy:", f"{acc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n",
          classification_report(y_test, y_pred, digits=4, zero_division=0))
    
    return acc


In [7]:
# ===========================================
# 6. Train & evaluate all models (4 variants)
# ===========================================

results = {}

# (a) Logistic Regression without PCA
log_reg = LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)
log_reg.fit(X_train_std, y_train)
results["LogReg (no PCA)"] = evaluate_model(
    "Logistic Regression (no PCA)", log_reg, X_test_std, y_test
)

# (b) Logistic Regression with PCA
log_reg_pca = LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)
log_reg_pca.fit(X_train_pca, y_train)
results["LogReg (with PCA)"] = evaluate_model(
    "Logistic Regression (with PCA)", log_reg_pca, X_test_pca, y_test
)

# (c) SVM from scratch without PCA
svm = LinearSVM(C=1.0, lr=0.001, n_iters=1000, verbose=False)
svm.fit(X_train_std, y_train)
results["SVM (no PCA, scratch)"] = evaluate_model(
    "Linear SVM from scratch (no PCA)", svm, X_test_std, y_test
)

# (d) SVM from scratch with PCA
svm_pca = LinearSVM(C=1.0, lr=0.001, n_iters=1000, verbose=False)
svm_pca.fit(X_train_pca, y_train)
results["SVM (with PCA, scratch)"] = evaluate_model(
    "Linear SVM from scratch (with PCA)", svm_pca, X_test_pca, y_test
)



=== Logistic Regression (no PCA) ===
Accuracy: 0.8045
Confusion Matrix:
 [[98 12]
 [23 46]]
Classification Report:
               precision    recall  f1-score   support

           0     0.8099    0.8909    0.8485       110
           1     0.7931    0.6667    0.7244        69

    accuracy                         0.8045       179
   macro avg     0.8015    0.7788    0.7864       179
weighted avg     0.8034    0.8045    0.8007       179


=== Logistic Regression (with PCA) ===
Accuracy: 0.8101
Confusion Matrix:
 [[98 12]
 [22 47]]
Classification Report:
               precision    recall  f1-score   support

           0     0.8167    0.8909    0.8522       110
           1     0.7966    0.6812    0.7344        69

    accuracy                         0.8101       179
   macro avg     0.8066    0.7860    0.7933       179
weighted avg     0.8089    0.8101    0.8068       179


=== Linear SVM from scratch (no PCA) ===
Accuracy: 0.6313
Confusion Matrix:
 [[55 55]
 [11 58]]
Classificatio

In [8]:
# ==================
# 7. Final summary
# ==================

print("\n========== Summary of Accuracies ==========")
for model_name, acc in results.items():
    print(f"{model_name:30s}: {acc:.4f}")

# Optional: show as DataFrame
summary_df = pd.DataFrame.from_dict(results, orient="index", columns=["Accuracy"])
display(summary_df)



LogReg (no PCA)               : 0.8045
LogReg (with PCA)             : 0.8101
SVM (no PCA, scratch)         : 0.6313
SVM (with PCA, scratch)       : 0.6480


Unnamed: 0,Accuracy
LogReg (no PCA),0.804469
LogReg (with PCA),0.810056
"SVM (no PCA, scratch)",0.631285
"SVM (with PCA, scratch)",0.648045
