# In [289]:
# Imports come first so that `np` is bound before it is used to seed the RNG
# (seeding ahead of the import raises NameError in a fresh interpreter).
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so anything drawing from it is repeatable.
np.random.seed(42)

# Raw competition tables, read from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# In [None]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


# Stack train and test into a single frame so every feature below is derived
# identically for both. `_is_train` records the origin; test rows get a NaN target.
labelled_rows = train.assign(_is_train=1)
unlabelled_rows = test.assign(Survived=np.nan, _is_train=0)
df = pd.concat([labelled_rows, unlabelled_rows], ignore_index=True)

# --- Title / Surname ---------------------------------------------------------
# Title is the text between the comma and the first period of Name.
title_aliases = {
    "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
    "Lady": "Rare", "Countess": "Rare", "Dona": "Rare", "Dr": "Rare",
    "Rev": "Rare", "Col": "Rare", "Major": "Rare", "Sir": "Rare",
    "Jonkheer": "Rare", "Capt": "Rare", "Don": "Rare",
}
extracted_title = df["Name"].str.extract(r",\s*([^\.]+)\.", expand=False)
df["Title"] = extracted_title.replace(title_aliases)
# Whatever is still outside the four common titles (including a failed
# extraction, which is NaN) is bucketed as "Rare".
is_common_title = df["Title"].isin(["Mr", "Miss", "Mrs", "Master"])
df["Title"] = df["Title"].where(is_common_title, "Rare")
# Surname is everything before the first comma, trimmed.
df["Surname"] = df["Name"].str.extract(r"^([^,]+)", expand=False).str.strip()

# --- Family ------------------------------------------------------------------
relatives_aboard = df["SibSp"].fillna(0) + df["Parch"].fillna(0)
df["FamilySize"] = relatives_aboard + 1  # +1 counts the passenger themselves
df["IsAlone"] = df["FamilySize"].eq(1).astype(int)
family_size_label = df["FamilySize"].astype(int).astype(str)
df["FamilyID"] = df["Surname"].fillna("NA") + "_" + family_size_label

# --- Ticket ------------------------------------------------------------------
# Strip dots, slashes and whitespace; the non-digit remainder is the prefix
# ("NONE" when the ticket is purely numeric).
tp = (
    df["Ticket"]
    .astype(str)
    .str.replace(r"[./]", "", regex=True)
    .str.replace(r"\s+", "", regex=True)
)
df["TicketPrefix"] = tp.str.replace(r"\d+", "", regex=True).replace("", "NONE")
# Number of rows (train and test combined) that share the same ticket string.
df["TicketGroupSize"] = df["Ticket"].map(df["Ticket"].value_counts()).astype(int)

# --- Cabin -------------------------------------------------------------------
cabin = df["Cabin"]
df["CabinDeck"] = cabin.fillna("U").astype(str).str.get(0)  # "U" = unknown deck
df["CabinCount"] = cabin.fillna("").str.split().str.len().astype(int)
df["HasCabin"] = cabin.notna().astype(int)

# --- Embarked ----------------------------------------------------------------
# Missing ports take the most frequent value of the combined frame.
most_frequent_port = df["Embarked"].mode().iloc[0]
df["Embarked"] = df["Embarked"].fillna(most_frequent_port)

# --- Fare --------------------------------------------------------------------
# Fill missing fares with the median fare of the passenger's (Pclass, Embarked)
# group. `transform` broadcasts each group median back onto its rows, so the
# fill is one vectorised `fillna` instead of a Python-level row-by-row lookup.
fare_group_median = df.groupby(["Pclass", "Embarked"])["Fare"].transform("median")
df["Fare"] = df["Fare"].fillna(fare_group_median)
# Cap fares at the 99th percentile of the *training* rows and floor them at 3,
# then add a log-scaled copy.
q_hi = df.loc[df["_is_train"] == 1, "Fare"].quantile(0.99)
df["Fare"] = df["Fare"].clip(lower=3, upper=q_hi)
df["FareLog"] = np.log1p(df["Fare"])

# --- Age ---------------------------------------------------------------------
# Same idea: median age of the (Title, Pclass, Sex) group first ...
age_group_median = df.groupby(["Title", "Pclass", "Sex"])["Age"].transform("median")
df["Age"] = df["Age"].fillna(age_group_median)
# ... and the overall median for groups that have no known age at all.
df["Age"] = df["Age"].fillna(df["Age"].median())
# Bins are right-inclusive (pandas default), e.g. "0-12" covers (0, 12].
# The result is stored as plain strings.
df["AgeBin"] = pd.cut(
    df["Age"],
    bins=[0, 12, 18, 30, 45, 60, 120],
    labels=["0-12", "13-18", "19-30", "31-45", "46-60", "60+"],
).astype(str)

# --- Numeric derived features --------------------------------------------------
# FamilySize is guarded against 0 before it is used as a divisor.
party_size = df["FamilySize"].replace(0, 1)
df["FarePerPerson"] = df["Fare"].div(party_size)
df["FarePerPersonLog"] = np.log1p(df["FarePerPerson"])
df["AgeClass"] = df["Age"].mul(df["Pclass"])  # Pclass is still numeric here
df["IsChild"] = df["Age"].lt(16).astype(int)

# Adult woman titled "Mrs" travelling with at least one parent/child.
is_female = df["Sex"].eq("female")
has_parch = df["Parch"].gt(0)
is_adult = df["Age"].gt(18)
is_mrs = df["Title"].eq("Mrs")
df["IsMother"] = (is_female & has_parch & is_adult & is_mrs).astype(int)
df["IsMale"] = df["Sex"].eq("male").astype(int)

# --- Categorical interaction features -------------------------------------------
# From here on Pclass is a string so it can be concatenated into combined keys.
df["Pclass"] = df["Pclass"].astype(str)
pclass = df["Pclass"]
df["PclassSex"] = pclass + "_" + df["Sex"]
df["TitlePclass"] = df["Title"].astype(str) + "_" + pclass
df["DeckPclass"] = df["CabinDeck"].astype(str) + "_" + pclass
df["EmbarkedPclass"] = df["Embarked"].astype(str) + "_" + pclass
df["SexEmbarked"] = df["Sex"].astype(str) + "_" + df["Embarked"].astype(str)
surname_filled = df["Surname"].fillna("NA")
ticket_prefix_filled = df["TicketPrefix"].fillna("NONE")
df["FamTicket"] = surname_filled + "_" + ticket_prefix_filled

# --- Split the combined frame back into train / test ----------------------------
# Both parts get a fresh 0..n-1 index so positional and label indexing agree.
trn = df.loc[df["_is_train"].eq(1)].copy().reset_index(drop=True)
tst = df.loc[df["_is_train"].eq(0)].copy().reset_index(drop=True)
y = trn["Survived"].astype(int).reset_index(drop=True)

def _smoothed_target_means(keys, targets, prior, alpha):
    """Return per-category target means shrunk toward ``prior``.

    For a category with ``n`` rows and mean target ``m`` the encoding is
    ``(m * n + prior * alpha) / (n + alpha)``, so sparsely populated
    categories stay close to ``prior``. Rows whose key is NaN are ignored
    (pandas ``groupby`` drops them by default).
    """
    stats = (
        pd.DataFrame({"key": keys, "y": targets})
        .groupby("key")["y"]
        .agg(["size", "mean"])
    )
    return (stats["mean"] * stats["size"] + prior * alpha) / (stats["size"] + alpha)


def kfold_target_mean(train_df, test_df, y, col, n_splits=7, alpha=20, seed=42):
    """Smoothed target-mean encoding of ``col``, out-of-fold for the train rows.

    Args:
        train_df: Training frame containing ``col``.
        test_df: Test frame containing ``col``.
        y: Target values aligned positionally with ``train_df`` (Series or
            any array-like).
        col: Name of the categorical column to encode.
        n_splits: Number of stratified folds used for the out-of-fold pass.
        alpha: Smoothing strength; larger values pull encodings harder
            toward the prior.
        seed: Random state for the fold shuffling.

    Returns:
        ``(oof, test_enc)``: two 1-D arrays. ``oof[i]`` is computed only from
        folds that do not contain row ``i``; ``test_enc`` is computed from the
        full training set.
    """
    keys = train_df[col].to_numpy()
    targets = np.asarray(y, dtype=float)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    oof = np.zeros(len(train_df))
    for tr_idx, va_idx in skf.split(train_df, y):
        # The prior is taken from the fold's training rows only, so the labels
        # of the rows being encoded never enter their own encoding.
        fold_prior = targets[tr_idx].mean()
        fold_enc = _smoothed_target_means(keys[tr_idx], targets[tr_idx], fold_prior, alpha)
        # Categories absent from the fold's training rows fall back to the prior.
        oof[va_idx] = train_df[col].iloc[va_idx].map(fold_enc).fillna(fold_prior).to_numpy()

    # Test rows may use every training label.
    global_mean = targets.mean()
    full_enc = _smoothed_target_means(keys, targets, global_mean, alpha)
    test_enc = test_df[col].map(full_enc).fillna(global_mean).to_numpy()
    return oof, test_enc

# Add a smoothed survival-rate feature "<column>_surv" for each of these
# categorical columns: out-of-fold values on train, full-train values on test.
target_encoded_cols = [
    "Surname", "FamilyID", "Ticket", "TicketPrefix", "FamTicket", "CabinDeck", "Title",
]
for enc_col in target_encoded_cols:
    trn[f"{enc_col}_surv"], tst[f"{enc_col}_surv"] = kfold_target_mean(
        trn, tst, y, enc_col, n_splits=7, alpha=20, seed=42
    )

# Identifier / raw-text / bookkeeping columns that are never fed to a model.
drop_cols = ["PassengerId", "Name", "Ticket", "Cabin", "Survived", "_is_train"]
base_cols = [name for name in df.columns if name not in drop_cols]

# Columns treated as numeric, including the "*_surv" target encodings that
# exist on trn/tst only.
num_cols = [
    "Age", "Fare", "FareLog", "FamilySize", "IsAlone", "TicketGroupSize",
    "CabinCount", "HasCabin", "FarePerPerson", "FarePerPersonLog", "AgeClass",
    "IsChild", "IsMother", "IsMale",
    "Surname_surv", "FamilyID_surv", "Ticket_surv", "TicketPrefix_surv",
    "FamTicket_surv", "CabinDeck_surv", "Title_surv",
]
# Every remaining column of df is treated as categorical.
# NOTE(review): that includes SibSp and Parch (absent from num_cols) and the
# high-cardinality Surname / FamilyID / FamTicket keys; confirm this is intended.
cat_cols = [name for name in base_cols if name not in num_cols]

# CatBoost design matrices: categorical columns first, cast to pandas
# "category" dtype (train and test are cast independently of each other).
feature_cols = cat_cols + num_cols
as_category = {name: "category" for name in cat_cols}
X_tr_cb = trn[feature_cols].astype(as_category)
X_te_cb = tst[feature_cols].astype(as_category)
# Positional indices of the categorical columns, passed to CatBoost at fit time.
cat_idx = [feature_cols.index(name) for name in cat_cols]

# LightGBM works on independent copies of the same matrices.
X_tr_lgb = X_tr_cb.copy()
X_te_lgb = X_te_cb.copy()

# XGBoost gets a purely numeric matrix: one-hot encoded low-cardinality
# categoricals followed by the numeric columns.
ohe_cols = [
    "Sex", "Embarked", "Title", "CabinDeck", "TicketPrefix", "Pclass", "AgeBin",
    "PclassSex", "TitlePclass", "DeckPclass", "EmbarkedPclass", "SexEmbarked",
]
train_dummies = pd.get_dummies(trn[ohe_cols], drop_first=False)
test_dummies = pd.get_dummies(tst[ohe_cols], drop_first=False)
# The training columns define the layout: levels missing from test become
# all-zero columns, levels seen only in test are dropped.
test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)

X_tr_xgb = pd.concat(
    [train_dummies.reset_index(drop=True), trn[num_cols].reset_index(drop=True)],
    axis=1,
).astype(float)
X_te_xgb = pd.concat(
    [test_dummies.reset_index(drop=True), tst[num_cols].reset_index(drop=True)],
    axis=1,
).astype(float)

# Repeated stratified CV: every (seed, fold) pair trains one CatBoost, one
# LightGBM and one XGBoost model.
seeds = [42, 7, 2021]
skf_n = 7

# Out-of-fold train predictions (averaged over seeds) and test predictions
# (averaged over every seed x fold model), one pair per base learner.
oof_cb = np.zeros(len(X_tr_cb))
oof_lgb = np.zeros(len(X_tr_lgb))
oof_xgb = np.zeros(len(X_tr_xgb))
pred_cb = np.zeros(len(X_te_cb))
pred_lgb = np.zeros(len(X_te_lgb))
pred_xgb = np.zeros(len(X_te_xgb))

for seed in seeds:
    # The fold indices depend only on y, the row count and the seed, and all
    # three matrices have the same rows, so one split serves all three learners.
    skf = StratifiedKFold(n_splits=skf_n, shuffle=True, random_state=seed)
    for tr_idx, va_idx in skf.split(X_tr_cb, y):
        ytr, yva = y.iloc[tr_idx], y.iloc[va_idx]

        # ---- CatBoost ----
        Xtr_cb, Xva_cb = X_tr_cb.iloc[tr_idx], X_tr_cb.iloc[va_idx]
        m_cb = CatBoostClassifier(
            loss_function="Logloss", eval_metric="AUC", iterations=3000, learning_rate=0.03, depth=6,
            l2_leaf_reg=3.0, random_strength=1.0, bagging_temperature=1.0, random_seed=seed, verbose=False
        )
        # NOTE(review): use_best_model picks the iteration count on the same
        # fold that is then scored, so CatBoost's OOF predictions are mildly
        # optimistic compared with the other two learners.
        m_cb.fit(Xtr_cb, ytr, eval_set=(Xva_cb, yva), cat_features=cat_idx, use_best_model=True)
        # Each row is in exactly one validation fold per seed, so dividing by
        # len(seeds) yields the seed-average.
        oof_cb[va_idx] += m_cb.predict_proba(Xva_cb)[:, 1] / len(seeds)
        pred_cb += m_cb.predict_proba(X_te_cb)[:, 1] / (skf_n * len(seeds))

        # ---- LightGBM ----
        Xtr_lgb, Xva_lgb = X_tr_lgb.iloc[tr_idx], X_tr_lgb.iloc[va_idx]
        # Logging is silenced through the estimator (`verbosity=-1`): the
        # `verbose` keyword of `fit` was removed in LightGBM 4.0 and passing
        # it raises TypeError there.
        m_lgb = LGBMClassifier(
            n_estimators=3000, learning_rate=0.02, num_leaves=16, max_depth=-1, subsample=0.85, colsample_bytree=0.85,
            reg_alpha=0.1, reg_lambda=2.0, objective="binary", random_state=seed, n_jobs=-1, verbosity=-1
        )
        # No early-stopping callback is registered, so all n_estimators trees
        # are built; the eval set is only tracked, not used to stop training.
        m_lgb.fit(Xtr_lgb, ytr, eval_set=[(Xva_lgb, yva)], eval_metric="binary_logloss")
        oof_lgb[va_idx] += m_lgb.predict_proba(Xva_lgb)[:, 1] / len(seeds)
        pred_lgb += m_lgb.predict_proba(X_te_lgb)[:, 1] / (skf_n * len(seeds))

        # ---- XGBoost ----
        Xtr_xgb, Xva_xgb = X_tr_xgb.iloc[tr_idx], X_tr_xgb.iloc[va_idx]
        m_xgb = XGBClassifier(
            n_estimators=3000, learning_rate=0.02, max_depth=3, min_child_weight=1, subsample=0.85, colsample_bytree=0.85,
            gamma=0.0, reg_alpha=0.1, reg_lambda=2.0, objective="binary:logistic", eval_metric="logloss",
            tree_method="hist", random_state=seed, n_jobs=-1
        )
        m_xgb.fit(Xtr_xgb, ytr)
        oof_xgb[va_idx] += m_xgb.predict_proba(Xva_xgb)[:, 1] / len(seeds)
        pred_xgb += m_xgb.predict_proba(X_te_xgb)[:, 1] / (skf_n * len(seeds))

# Level-2 inputs: one column per base learner (CatBoost, LightGBM, XGBoost),
# using out-of-fold predictions for train and averaged predictions for test.
meta_X = np.column_stack([oof_cb, oof_lgb, oof_xgb])
meta_te = np.column_stack([pred_cb, pred_lgb, pred_xgb])

# A logistic regression learns how to blend the three probability columns.
meta = LogisticRegression(max_iter=1500, C=1.0, solver="lbfgs")
meta.fit(meta_X, y)
final_pred = meta.predict_proba(meta_te)[:, 1]

# Threshold the blended probability at 0.5 and write the submission file.
survived_label = np.where(final_pred >= 0.5, 1, 0)
sub = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": survived_label})
sub.to_csv("submission.csv", index=False)
print("done")
