In [None]:
import pandas as pd

# Load the Titanic dataset to inspect its structure and contents
file_path = "/Users/aadya.mukherjee/Documents/git my/mrm_pca/titanic.csv"
titanic_data = pd.read_csv(file_path)

# Display basic information about the dataset
titanic_data.info(), titanic_data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


(None,
    PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450  

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define target and features
target = 'Survived'
features = titanic_data.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])

# Separate numerical and categorical columns
num_cols = features.select_dtypes(include=['float64', 'int64']).columns
cat_cols = features.select_dtypes(include=['object']).columns

# Preprocessing pipelines
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ]
)

# Apply preprocessing
X = features
y = titanic_data[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and transform the training data, transform the test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

X_train_preprocessed.shape, X_test_preprocessed.shape

((712, 10), (179, 10))

In [8]:
from sklearn.decomposition import PCA

# Perform PCA to reduce dimensionality
pca = PCA(n_components=5)  # Choosing 5 principal components for simplicity
X_train_pca = pca.fit_transform(X_train_preprocessed)
X_test_pca = pca.transform(X_test_preprocessed)

# Check the explained variance ratio to ensure sufficient information is retained
explained_variance = pca.explained_variance_ratio_
explained_variance.cumsum()

array([0.29122822, 0.57890099, 0.70515895, 0.8033786 , 0.87244666])

In [9]:
import numpy as np

class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        # Map y to {-1, 1} for SVM
        y_ = np.where(y <= 0, -1, 1)
        n_samples, n_features = X.shape

        # Initialize weights and bias
        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.learning_rate * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.learning_rate * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.learning_rate * y_[idx]

    def predict(self, X):
        approx = np.dot(X, self.w) - self.b
        return np.sign(approx)

# Train SVM on original data
svm = SVM(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
svm.fit(X_train_preprocessed, y_train.values)
y_pred_svm_original = svm.predict(X_test_preprocessed)

# Train SVM on PCA-transformed data
svm_pca = SVM(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
svm_pca.fit(X_train_pca, y_train.values)
y_pred_svm_pca = svm_pca.predict(X_test_pca)

# Evaluate performance
from sklearn.metrics import accuracy_score
accuracy_svm_original = accuracy_score(y_test, y_pred_svm_original)
accuracy_svm_pca = accuracy_score(y_test, y_pred_svm_pca)

accuracy_svm_original, accuracy_svm_pca

(0.2905027932960894, 0.3128491620111732)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic Regression without PCA
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_preprocessed, y_train)
y_pred_log_reg_original = log_reg.predict(X_test_preprocessed)

# Logistic Regression with PCA
log_reg_pca = LogisticRegression(max_iter=1000)
log_reg_pca.fit(X_train_pca, y_train)
y_pred_log_reg_pca = log_reg_pca.predict(X_test_pca)

# Evaluate models
accuracy_log_reg_original = accuracy_score(y_test, y_pred_log_reg_original)
accuracy_log_reg_pca = accuracy_score(y_test, y_pred_log_reg_pca)

# Display results
print("=== Model Performance ===")
print(f"SVM (Original Data): Accuracy = {accuracy_svm_original:.2f}")
print(f"SVM (PCA Data): Accuracy = {accuracy_svm_pca:.2f}")
print(f"Logistic Regression (Original Data): Accuracy = {accuracy_log_reg_original:.2f}")
print(f"Logistic Regression (PCA Data): Accuracy = {accuracy_log_reg_pca:.2f}")

# Classification Reports
print("\n=== Classification Report: SVM (Original Data) ===")
print(classification_report(y_test, y_pred_svm_original))
print("\n=== Classification Report: SVM (PCA Data) ===")
print(classification_report(y_test, y_pred_svm_pca))
print("\n=== Classification Report: Logistic Regression (Original Data) ===")
print(classification_report(y_test, y_pred_log_reg_original))
print("\n=== Classification Report: Logistic Regression (PCA Data) ===")
print(classification_report(y_test, y_pred_log_reg_pca))

=== Model Performance ===
SVM (Original Data): Accuracy = 0.29
SVM (PCA Data): Accuracy = 0.31
Logistic Regression (Original Data): Accuracy = 0.81
Logistic Regression (PCA Data): Accuracy = 0.84

=== Classification Report: SVM (Original Data) ===
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         0.0       0.00      0.00      0.00       105
         1.0       0.75      0.70      0.73        74

    accuracy                           0.29       179
   macro avg       0.25      0.23      0.24       179
weighted avg       0.31      0.29      0.30       179


=== Classification Report: SVM (PCA Data) ===
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         0.0       0.00      0.00      0.00       105
         1.0       0.85      0.76      0.80        74

    accuracy                           0.31       179
   macro avg       0.28      0.25      0.27       17

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
