# Titanic Survival Prediction (No-CLI Notebook)

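Run all cells from top to bottom. Make sure `../data/train.csv` and `../data/test.csv` (paths relative to the notebook's working directory) are present; the submission file and the fitted model are written to `../models/`.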

In [None]:
import joblib, pandas as pd, numpy as np
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Input data and output artefacts both live one level above the working
# directory.
DATA_DIR = Path("..", "data")
MODELS_DIR = Path("..", "models")
MODELS_DIR.mkdir(exist_ok=True)  # no error when the folder already exists
print("Project paths:", DATA_DIR, MODELS_DIR)

In [None]:
import pandas as pd

def feature_engineer(df: pd.DataFrame) -> pd.DataFrame:
    """Derive model features from a raw passenger frame.

    The work is done on a copy, so the frame passed in is left untouched.

    Args:
        df: Frame containing at least the columns ``Name``, ``SibSp``,
            ``Parch``, ``Ticket``, ``Embarked``, ``Pclass``, ``Fare``,
            ``Age`` and ``Cabin``.

    Returns:
        A new frame with ``Title``, ``FamilySize``, ``IsAlone`` and
        ``TicketLen`` added, missing ``Embarked`` / ``Fare`` / ``Age``
        values imputed from the statistics of ``df`` itself, and the
        ``Cabin``, ``Name`` and ``Ticket`` columns removed.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Never write the helper columns into the caller's frame.
    df = df.copy()

    # The title is the text between the comma and the first period of the
    # name. Strip it so stray whitespace cannot defeat the mapping below.
    titles = df["Name"].str.extract(r",\s*([^\.]*)\s*\.", expand=False).str.strip()
    map_title = {
        "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
        "Lady": "Royalty", "Countess": "Royalty", "Dona": "Royalty",
        # The pattern captures the article too ("..., the Countess. of ...").
        "the Countess": "Royalty",
        "Sir": "Royalty", "Don": "Royalty", "Jonkheer": "Royalty",
        "Capt": "Officer", "Col": "Officer", "Major": "Officer",
        "Dr": "Officer", "Rev": "Officer",
    }
    df["Title"] = titles.replace(map_title)

    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)
    df["TicketLen"] = df["Ticket"].astype(str).str.len()

    # mode() is empty when no port is recorded at all; leave the column
    # as-is in that case instead of failing on the lookup.
    embarked_mode = df["Embarked"].mode()
    if not embarked_mode.empty:
        df["Embarked"] = df["Embarked"].fillna(embarked_mode.iloc[0])

    # fillna only touches missing cells, so observed values are never
    # overwritten by a group statistic.
    fare_by_class = df.groupby("Pclass")["Fare"].transform("median")
    df["Fare"] = df["Fare"].fillna(fare_by_class)

    age_by_title_class = df.groupby(["Title", "Pclass"])["Age"].transform("median")
    df["Age"] = df["Age"].fillna(age_by_title_class)
    # Fallback for (Title, Pclass) groups that have no observed age.
    df["Age"] = df["Age"].fillna(df["Age"].median())

    return df.drop(columns=["Cabin", "Name", "Ticket"])

def make_preprocessor():
    """Build the column-wise preprocessing step used by the model pipelines.

    Returns:
        An unfitted ``ColumnTransformer`` with two branches: ``"num"``
        standard-scales the numeric columns, and ``"cat"`` one-hot encodes
        the categorical columns, ignoring categories not seen during fit.
    """
    scaled_columns = ["Age", "Fare", "FamilySize", "TicketLen", "Parch", "SibSp"]
    encoded_columns = ["Sex", "Embarked", "Pclass", "Title", "IsAlone"]

    scaling_branch = ("num", Pipeline(steps=[("scaler", StandardScaler())]), scaled_columns)
    encoding_branch = ("cat", OneHotEncoder(handle_unknown="ignore"), encoded_columns)

    return ColumnTransformer(transformers=[scaling_branch, encoding_branch])

In [None]:
import pandas as pd

# Load both splits and run each one through the feature-engineering step.
train = feature_engineer(pd.read_csv(DATA_DIR / "train.csv"))
test = feature_engineer(pd.read_csv(DATA_DIR / "test.csv"))

# Separate the label column from the predictors.
X = train.drop(columns=["Survived"])
y = train["Survived"]

print("Train shape:", X.shape, "Test shape:", test.shape)
train.head()

In [None]:
# Baseline: logistic regression on top of the shared preprocessing, scored
# with shuffled, stratified 5-fold cross-validation (seeded so the folds are
# the same on every run).
pre = make_preprocessor()
pipe = Pipeline(steps=[("pre", pre), ("model", LogisticRegression(max_iter=1000))])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
base = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy")
# \u00b1 is the plus-minus sign; the escape keeps the source ASCII-only so the
# character cannot be mangled by a wrong file encoding again.
print(f"Baseline LogisticRegression CV: {base.mean():.4f} \u00b1 {base.std():.4f}")

In [None]:
# Candidate models. Each dict swaps the pipeline's "model" step and lists the
# hyper-parameters to try for that estimator.
param_grid = [
    {"model": [LogisticRegression(max_iter=1000)], "model__C": [0.1, 1.0, 3.0]},
    {
        "model": [RandomForestClassifier(random_state=42)],
        "model__n_estimators": [200, 400],
        "model__max_depth": [None, 5, 8],
    },
]

grid = GridSearchCV(
    Pipeline(steps=[("pre", pre), ("model", LogisticRegression(max_iter=1000))]),
    param_grid=param_grid,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
)
grid.fit(X, y)
print("Best CV accuracy:", grid.best_score_)
print("Best params:", grid.best_params_)

# GridSearchCV is left at its default refit=True, so best_estimator_ has
# already been refit on all of X, y; fitting it again would only repeat work.
best_model = grid.best_estimator_

# NOTE: these metrics are computed on the training data itself, so they are
# optimistic; the cross-validated score above is the generalisation estimate.
preds = best_model.predict(X)
print("Training accuracy:", accuracy_score(y, preds))
print("Confusion matrix:\n", confusion_matrix(y, preds))
print("Classification report:\n", classification_report(y, preds))

In [None]:
import joblib

# Output locations for the submission file and the persisted model.
submission_path = MODELS_DIR / "submission.csv"
model_path = MODELS_DIR / "best_model.joblib"

# Predict on the test split without its identifier column, then pair each
# prediction with the identifier again for the submission file.
test_pred = best_model.predict(test.drop(columns=["PassengerId"]))
submission = pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Survived": test_pred.astype(int),
    }
)

submission.to_csv(submission_path, index=False)
joblib.dump(best_model, model_path)

# Notebook display: a preview of the submission plus the two output paths.
submission.head(), submission_path, model_path