In [19]:
# Cell 1 — imports & data load
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV

# Models
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

DATA_DIR = Path("../data")
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")

print(train.shape, test.shape)
train.head()

(891, 12) (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [21]:
# Cell 2 — feature engineering
def engineer(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # Title from Name
    out["Title"] = (
        out["Name"].str.extract(r",\s*([^\.]+)\.").fillna("U")
        .replace(
            ['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],
            'Rare'
        )
        .replace({'Mlle':'Miss','Ms':'Miss','Mme':'Mrs'})
    )

    # Family features
    out["FamilySize"] = out["SibSp"].fillna(0) + out["Parch"].fillna(0) + 1
    out["IsAlone"] = (out["FamilySize"] == 1).astype(int)

    # Deck from Cabin (first char; unknown -> 'U')
    deck = out["Cabin"].astype(str).str[0]
    deck = deck.where(deck.isin(list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")), "U")
    out["Deck"] = deck

    # Fare per person (avoid divide-by-zero)
    out["FarePer"] = out["Fare"] / out["FamilySize"].replace(0, 1)

    # Mark categoricals explicitly
    for col in ["Sex", "Embarked", "Pclass", "Title", "Deck", "IsAlone"]:
        out[col] = out[col].astype("category")

    return out

train_fe = engineer(train)
test_fe  = engineer(test)

y = train_fe["Survived"].astype(int)
X = train_fe.drop(columns=["Survived","Name","Ticket","Cabin","PassengerId"])
X_test = test_fe.drop(columns=["Name","Ticket","Cabin","PassengerId"])

X.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,Deck,FarePer
0,3,male,22.0,1,0,7.25,S,Mr,2,0,U,3.625
1,1,female,38.0,1,0,71.2833,C,Mrs,2,0,C,35.64165
2,3,female,26.0,0,0,7.925,S,Miss,1,1,U,7.925
3,1,female,35.0,1,0,53.1,S,Mrs,2,0,C,26.55
4,3,male,35.0,0,0,8.05,S,Mr,1,1,U,8.05


In [22]:
# Cell 3 — preprocessing pipeline
from sklearn.compose import make_column_selector

# Select numeric vs categorical columns from X
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()

print("Numeric:", num_cols)
print("Categorical:", cat_cols)

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", IterativeImputer(random_state=42)),
            ("sc", StandardScaler())
        ]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols),
    ],
    remainder="drop"
)

Numeric: ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePer']
Categorical: ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone', 'Deck']


In [25]:
# Cell 4 — 9 models: CV accuracy (mean ± std) and CSV export
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, random_state=42),
    "SVC": SVC(random_state=42),
    "LinearSVC": LinearSVC(random_state=42),
    "KNN": KNeighborsClassifier(),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GaussianNB": GaussianNB(),
    "Perceptron": Perceptron(random_state=42),
    "SGD": SGDClassifier(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rows = []
for name, clf in models.items():
    pipe = Pipeline([("pre", preprocess), ("clf", clf)])
    scores = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
    rows.append({"Model": name, "MeanAcc": scores.mean(), "Std": scores.std()})

df_q2 = pd.DataFrame(rows).sort_values("MeanAcc", ascending=False).reset_index(drop=True)
display(df_q2)

# Save for report
out_path = Path("../results/q2_cv_scores.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df_q2.to_csv(out_path, index=False)
print("Saved:", out_path.resolve())



Unnamed: 0,Model,MeanAcc,Std
0,LinearSVC,0.831636,0.017159
1,LogisticRegression,0.830513,0.017678
2,SVC,0.829389,0.010651
3,KNN,0.818172,0.022148
4,RandomForest,0.817017,0.027192
5,DecisionTree,0.806942,0.022201
6,SGD,0.791225,0.024622
7,GaussianNB,0.769889,0.033712
8,Perceptron,0.741824,0.038328


Saved: /Users/abheeshtroy/Documents/University/1_DM/Homeworks/homework_1/results/q2_cv_scores.csv


In [26]:
# Cell 5 — light tuning (LogReg, RandomForest, SVC)
from sklearn.model_selection import GridSearchCV

grids = {
    "LogisticRegression": (
        Pipeline([("pre", preprocess), ("clf", LogisticRegression(max_iter=4000, random_state=42))]),
        {
            "clf__C": [0.25, 0.5, 1, 2, 4],
            "clf__penalty": ["l2"],
            "clf__class_weight": [None, "balanced"],
            "clf__solver": ["lbfgs", "liblinear"],
        },
    ),
    "RandomForest": (
        Pipeline([("pre", preprocess), ("clf", RandomForestClassifier(random_state=42))]),
        {
            "clf__n_estimators": [300, 600, 900],
            "clf__max_depth": [None, 6, 10, 14],
            "clf__min_samples_leaf": [1, 2, 3],
            "clf__class_weight": [None, "balanced_subsample"],
        },
    ),
    "SVC": (
        Pipeline([("pre", preprocess), ("clf", SVC(random_state=42))]),
        {
            "clf__C": [0.5, 1, 2, 4],
            "clf__gamma": ["scale", "auto"],
            "clf__class_weight": [None, "balanced"],
            "clf__kernel": ["rbf"],
        },
    ),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_rows = []

for name, (pipe, param_grid) in grids.items():
    gs = GridSearchCV(pipe, param_grid, cv=cv, scoring="accuracy", n_jobs=-1)
    gs.fit(X, y)
    best_rows.append({
        "Model": name,
        "BestMeanCV": gs.best_score_,
        "BestParams": gs.best_params_,
    })
    print(f"{name} best mean CV: {gs.best_score_:.4f}")
    print(gs.best_params_, "\n")

df_tuned = pd.DataFrame(best_rows).sort_values("BestMeanCV", ascending=False).reset_index(drop=True)
display(df_tuned)

# Save for report
out_path = Path("../results/q2_tuned_cv_scores.csv")
df_tuned.to_csv(out_path, index=False)
print("Saved:", out_path.resolve())

LogisticRegression best mean CV: 0.8305
{'clf__C': 1, 'clf__class_weight': None, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'} 

RandomForest best mean CV: 0.8406
{'clf__class_weight': None, 'clf__max_depth': 6, 'clf__min_samples_leaf': 2, 'clf__n_estimators': 300} 

SVC best mean CV: 0.8350
{'clf__C': 4, 'clf__class_weight': None, 'clf__gamma': 'auto', 'clf__kernel': 'rbf'} 



Unnamed: 0,Model,BestMeanCV,BestParams
0,RandomForest,0.840613,"{'clf__class_weight': None, 'clf__max_depth': ..."
1,SVC,0.835007,"{'clf__C': 4, 'clf__class_weight': None, 'clf_..."
2,LogisticRegression,0.830513,"{'clf__C': 1, 'clf__class_weight': None, 'clf_..."


Saved: /Users/abheeshtroy/Documents/University/1_DM/Homeworks/homework_1/results/q2_tuned_cv_scores.csv


In [32]:
# Cell 6B — schema-agnostic comparison of Q1 vs Q2 vs Q2_tuned
import pandas as pd
from pathlib import Path

q1 = pd.read_csv("../results/q1_cv_scores.csv")
q2 = pd.read_csv("../results/q2_cv_scores.csv")

# Normalize names
q1 = q1.rename(columns={"Score":"MeanAcc","mean":"MeanAcc","std":"Std"})
q2 = q2.rename(columns={"Score":"MeanAcc","mean":"MeanAcc","std":"Std"})

# Start comparison
compare = q1.rename(columns={"MeanAcc":"Q1_Baseline","Std":"Q1_Std"}) \
            .merge(
                q2.rename(columns={"MeanAcc":"Q2_Improved","Std":"Q2_Std"}),
                on="Model", how="outer"
            )

# Try to add tuned if available
tuned_path = Path("../results/q2_tuned_cv_scores.csv")
if tuned_path.exists():
    q2_tuned = pd.read_csv(tuned_path)
    q2_tuned = q2_tuned.rename(columns={"BestMeanCV":"Q2_Tuned"})
    compare = compare.merge(q2_tuned[["Model","Q2_Tuned","BestParams"]] if "BestParams" in q2_tuned.columns else q2_tuned[["Model","Q2_Tuned"]],
                            on="Model", how="outer")
else:
    compare["Q2_Tuned"] = pd.NA
    if "BestParams" in compare.columns:
        compare.drop(columns=["BestParams"], inplace=True, errors="ignore")

# Compute deltas where possible
for col in ["Q2_Improved","Q2_Tuned"]:
    if col in compare.columns:
        compare[f"{col}_delta_pp"] = (compare[col] - compare["Q1_Baseline"]) * 100

# Order columns nicely
ordered = ["Model","Q1_Baseline","Q2_Improved","Q2_Improved_delta_pp","Q2_Tuned","Q2_Tuned_delta_pp","Q1_Std","Q2_Std"]
compare = compare[[c for c in ordered if c in compare.columns]].sort_values("Model").reset_index(drop=True)

display(compare)

out_path = Path("../results/q_compare_all.csv")
compare.to_csv(out_path, index=False)
print("Saved comparison table to:", out_path.resolve())


Unnamed: 0,Model,Q1_Baseline,Q2_Improved,Q2_Improved_delta_pp,Q2_Tuned,Q2_Tuned_delta_pp,Q1_Std,Q2_Std
0,Decision Tree,86.76,,,,,,
1,DecisionTree,,0.806942,,,,,0.022201
2,GaussianNB,,0.769889,,,,,0.033712
3,KNN,84.85,0.818172,-8403.182788,,,,0.022148
4,Linear SVC,79.12,,,,,,
5,LinearSVC,,0.831636,,,,,0.017159
6,Logistic Regression,80.36,,,,,,
7,LogisticRegression,,0.830513,,0.830513,,,0.017678
8,Naive Bayes,72.28,,,,,,
9,Perceptron,78.34,0.741824,-7759.817588,,,,0.038328


Saved comparison table to: /Users/abheeshtroy/Documents/University/1_DM/Homeworks/homework_1/results/q_compare_all.csv


In [31]:
import pandas as pd
print(pd.read_csv("../results/q1_cv_scores.csv").head())
print(pd.read_csv("../results/q2_cv_scores.csv").head())


                     Model  MeanAcc  Std
0  Support Vector Machines    78.23  NaN
1                      KNN    84.85  NaN
2      Logistic Regression    80.36  NaN
3            Random Forest    86.76  NaN
4              Naive Bayes    72.28  NaN
                Model   MeanAcc       Std
0           LinearSVC  0.831636  0.017159
1  LogisticRegression  0.830513  0.017678
2                 SVC  0.829389  0.010651
3                 KNN  0.818172  0.022148
4        RandomForest  0.817017  0.027192


In [36]:
# Cell 7 — normalize scales and create a clean comparison table
import pandas as pd
from pathlib import Path

cmp_path = Path("../results/q_compare_all.csv")
compare = pd.read_csv(cmp_path)

# 1) Normalize baselines if entered as percentages (e.g., 84.85 -> 0.8485)
def _to01(s):
    if s.isna().all():
        return s
    # If any values are >1, assume percentage scale
    if (s.dropna() > 1).any():
        return s / 100.0
    return s

for col in ["Q1_Baseline", "Q2_Improved", "Q2_Tuned"]:
    if col in compare.columns:
        compare[col] = _to01(compare[col])

# 2) Recompute deltas in percentage points
if "Q2_Improved" in compare.columns and "Q1_Baseline" in compare.columns:
    compare["Δ(Q2–Q1) pp"] = (compare["Q2_Improved"] - compare["Q1_Baseline"]) * 100

if "Q2_Tuned" in compare.columns and "Q1_Baseline" in compare.columns:
    compare["Δ(Tuned–Q1) pp"] = (compare["Q2_Tuned"] - compare["Q1_Baseline"]) * 100

# 3) Pretty rounding
round_cols = ["Q1_Baseline","Q2_Improved","Q2_Tuned","Δ(Q2–Q1) pp","Δ(Tuned–Q1) pp","Q1_Std","Q2_Std"]
for c in round_cols:
    if c in compare.columns:
        compare[c] = compare[c].astype(float).round(4 if "Std" in c else 3)

# 4) Order columns nicely
order = ["Model","Q1_Baseline","Q2_Improved","Q2_Tuned","Δ(Q2–Q1) pp","Δ(Tuned–Q1) pp","Q1_Std","Q2_Std"]
compare = compare[[c for c in order if c in compare.columns]].sort_values("Model").reset_index(drop=True)

display(compare)

# 5) Save a final, clean CSV for your report
final_path = Path("../results/q_compare_all_clean.csv")
compare.to_csv(final_path, index=False)
print("Saved:", final_path.resolve())


Unnamed: 0,Model,Q1_Baseline,Q2_Improved,Q2_Tuned,Δ(Q2–Q1) pp,Δ(Tuned–Q1) pp,Q1_Std,Q2_Std
0,Decision Tree,0.868,,,,,,
1,DecisionTree,,0.807,,,,,0.0222
2,GaussianNB,,0.77,,,,,0.0337
3,KNN,0.848,0.818,,-3.033,,,0.0221
4,Linear SVC,0.791,,,,,,
5,LinearSVC,,0.832,,,,,0.0172
6,Logistic Regression,0.804,,,,,,
7,LogisticRegression,,0.831,0.831,,,,0.0177
8,Naive Bayes,0.723,,,,,,
9,Perceptron,0.783,0.742,,-4.158,,,0.0383


Saved: /Users/abheeshtroy/Documents/University/1_DM/Homeworks/homework_1/results/q_compare_all_clean.csv
