## Implementing SVM

### Imports

In [1]:
import os, sys

import numpy as np
import pandas
import pandas as pd
import sklearn

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Single seed reused by the train/test split and both estimators below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Record the environment next to the results so the run can be reproduced.
# (`sklearn` and `pandas` are imported with the other imports at the top of
# this cell rather than in the middle of it.)
print('Python:', sys.version.split()[0])
print('pandas:', pandas.__version__)
print('scikit-learn:', sklearn.__version__)

Python: 3.13.5
pandas: 2.2.3
scikit-learn: 1.6.1


### Loading The Titanic Dataset

In [3]:
# Candidate locations for the Titanic CSV; the first one that exists is loaded.
paths = ['./titanic.csv']
df = None
for p in paths:
    if os.path.exists(p):
        df = pd.read_csv(p)
        print('Loaded:', p)
        break

# Fail here with a clear message; otherwise the next statement would raise an
# opaque AttributeError on `None`.
if df is None:
    raise FileNotFoundError(f'Titanic CSV not found; looked in: {paths}')

# Keep it simple: choose a few useful columns.
# Strip stray whitespace from the header names without mutating in place.
df = df.rename(columns=str.strip)
target = 'Survived'
feat_num = ['Age','SibSp','Parch','Fare']
feat_cat = ['Pclass','Sex','Embarked']

# Minimal model table (drop rows with missing target)
dfm = df[[target]+feat_num+feat_cat].dropna(subset=[target]).copy()

# Feature matrix and integer-coded label vector.
X = dfm.drop(columns=[target])
y = dfm[target].astype(int)

print('Data shape:', X.shape, ' Pos rate:', y.mean().round(3))
X.head()

Loaded: ./titanic.csv
Data shape: (891, 7)  Pos rate: 0.384


Unnamed: 0,Age,SibSp,Parch,Fare,Pclass,Sex,Embarked
0,22.0,1,0,7.25,3,male,S
1,38.0,1,0,71.2833,1,female,C
2,26.0,0,0,7.925,3,female,S
3,35.0,1,0,53.1,1,female,S
4,35.0,0,0,8.05,3,male,S


### Pre-Processing

In [4]:
# Numeric features: fill gaps with the column median, then standardize.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Only route columns that are actually present in X.
num_cols = [c for c in feat_num if c in X.columns]
cat_cols = [c for c in feat_cat if c in X.columns]

preprocess = ColumnTransformer(transformers=[
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
])

# Stratify on the label only when it is binary; otherwise do a plain split.
stratify_on = y if len(np.unique(y)) == 2 else None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=stratify_on,
)
X_train.shape, X_test.shape

((712, 7), (179, 7))

### Training SVM

In [10]:
# RBF-kernel SVM on top of the shared preprocessing recipe.
# `clone` gives this pipeline its own unfitted copy of `preprocess`: Pipeline
# does not copy its steps, so passing the object directly would let any other
# pipeline built from it refit the transformer this model depends on.
svm_model = Pipeline([
    ('prep', clone(preprocess)),
    ('clf', SVC(kernel='rbf', random_state=RANDOM_STATE))
])

# Fit on the training split and score on the held-out split.
svm_model.fit(X_train, y_train)
svm_acc = accuracy_score(y_test, svm_model.predict(X_test))
print('SVM Test Accuracy:', round(svm_acc, 4))

SVM Test Accuracy: 0.8156


### Training Logistic Regression (LR)

In [11]:
# Logistic-regression baseline using the same preprocessing recipe.
# `clone` gives this pipeline its own unfitted copy of `preprocess`: Pipeline
# does not copy its steps, so passing the object directly would refit a
# transformer that may already be in use inside another fitted pipeline.
lr_model = Pipeline([
    ('prep', clone(preprocess)),
    ('clf', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
])

# Fit on the training split and score on the held-out split.
lr_model.fit(X_train, y_train)
lr_acc = accuracy_score(y_test, lr_model.predict(X_test))
print('Logistic Regression Test Accuracy:', round(lr_acc, 4))

Logistic Regression Test Accuracy: 0.8045


### Comparing them both

In [12]:
# Accuracy gap within which the two models are treated as "close".
CLOSE_MARGIN = 0.02

print('--- Simple Comparison ---')
print(f'SVM accuracy:              {svm_acc:.4f}')
print(f'Logistic Regression:        {lr_acc:.4f}')

# Pick the verdict first, then print it once.
if svm_acc > lr_acc:
    verdict = '✅ SVM won on this split.'
elif lr_acc > svm_acc:
    verdict = '✅ Logistic Regression won on this split.'
else:
    verdict = '⚖️ They tied on this split.'
print(verdict)

# Optional: quick tip, shown whenever SVM does not beat LR by more than the margin.
if svm_acc <= lr_acc + CLOSE_MARGIN:
    print('\nTip: LR is simpler + faster; if accuracy is close, LR may be preferable.')

--- Simple Comparison ---
SVM accuracy:              0.8156
Logistic Regression:        0.8045
✅ SVM won on this split.

Tip: LR is simpler + faster; if accuracy is close, LR may be preferable.
