In [3]:
# =====================
# Cell 1: Imports & global config
# =====================

import os
import warnings
import logging
from typing import Dict, Any, List, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import KBinsDiscretizer

import xgboost as xgb
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

CONF = {
    "seed": 42,
    "target": "loan_paid_back",
    "train_path": "/kaggle/input/playground-series-s5e11/train.csv",
    "test_path": "/kaggle/input/playground-series-s5e11/test.csv",
    "orig_path": "/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv",
    "n_folds": 8,
    "num_boost_round": 12000,
    "early_stopping_rounds": 350,
}

TARGET = CONF["target"]
np.random.seed(CONF["seed"])

# we’ll fill these after feature engineering
CATS_BASE: List[str] = []
NUMS_BASE: List[str] = []
NEW_FEATURES: List[str] = []


In [4]:
# =====================
# Cell 2: Load data & advanced feature engineering
# =====================

def load_data():
    train = pd.read_csv(CONF["train_path"])
    test = pd.read_csv(CONF["test_path"])
    orig = pd.read_csv(CONF["orig_path"])

    logger.info(f"Train: {train.shape}")
    logger.info(f"Test : {test.shape}")
    logger.info(f"Orig : {orig.shape}")
    return train, test, orig


def create_advanced_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Core affordability
    df["income_loan_ratio"] = df["annual_income"] / (df["loan_amount"] + 1)
    df["loan_to_income"] = df["loan_amount"] / (df["annual_income"] + 1)

    # Debt metrics
    df["total_debt"] = df["debt_to_income_ratio"] * df["annual_income"]
    df["available_income"] = df["annual_income"] * (1 - df["debt_to_income_ratio"])
    df["debt_burden"] = df["debt_to_income_ratio"] * df["loan_amount"]

    # Payment analysis (simple approximation)
    df["monthly_payment"] = df["loan_amount"] * df["interest_rate"] / 1200
    df["payment_to_income"] = df["monthly_payment"] / (df["annual_income"] / 12 + 1)
    df["affordability"] = df["available_income"] / (df["loan_amount"] + 1)

    # Risk scoring
    df["default_risk"] = (
        df["debt_to_income_ratio"] * 0.40
        + (850 - df["credit_score"]) / 850 * 0.35
        + df["interest_rate"] / 100 * 0.25
    )

    # Credit analysis
    df["credit_utilization"] = df["credit_score"] * (1 - df["debt_to_income_ratio"])
    df["credit_interest_product"] = df["credit_score"] * df["interest_rate"] / 100

    # Log transforms
    for col in ["annual_income", "loan_amount"]:
        df[f"{col}_log"] = np.log1p(df[col])

    # Grade parsing
    df["grade_subgrade"] = df["grade_subgrade"].astype(str)
    df["grade_letter"] = df["grade_subgrade"].str[0]
    df["grade_number"] = (
        df["grade_subgrade"].str[1:]
        .str.extract(r"(\d+)", expand=False)
        .fillna("0")
        .astype(int)
    )
    grade_map = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
    df["grade_rank"] = df["grade_letter"].map(grade_map).fillna(4).astype(int)

    return df


# ---- run once ----
train, test, orig = load_data()
test[TARGET] = -1  # dummy target to concat

logger.info("\n[STEP 1] Creating Enhanced Financial Features...")
combine = pd.concat([train, test, orig], axis=0, ignore_index=True)
combine_fe = create_advanced_features(combine)

CATS_BASE = [
    "gender",
    "marital_status",
    "education_level",
    "employment_status",
    "loan_purpose",
    "grade_subgrade",
]

NUMS_BASE = [
    "annual_income",
    "debt_to_income_ratio",
    "credit_score",
    "loan_amount",
    "interest_rate",
]

NEW_FEATURES = [
    "income_loan_ratio",
    "loan_to_income",
    "total_debt",
    "available_income",
    "debt_burden",
    "monthly_payment",
    "payment_to_income",
    "affordability",
    "default_risk",
    "credit_utilization",
    "credit_interest_product",
    "annual_income_log",
    "loan_amount_log",
    "grade_letter",
    "grade_number",
    "grade_rank",
]

logger.info(f"Created {len(NEW_FEATURES)} new features")
n_train = len(train)
n_test = len(test)
n_orig = len(orig)


Train: (593994, 13)
Test : (254569, 12)
Orig : (20000, 22)

[STEP 1] Creating Enhanced Financial Features...
Created 16 new features


In [5]:
# =====================
# Cell 3: Target encoding + build base tables (cats, interactions, CE)
# =====================

def kfold_target_encode_train_valid_test(
    Xy_train: pd.DataFrame,
    X_valid: pd.DataFrame,
    X_test: pd.DataFrame,
    col: str,
    target_col: str,
    n_splits: int = 10,
    seed: int = 42,
):
    """
    Fold-wise target encoding:
      - For Xy_train: OOF-style encoding using StratifiedKFold.
      - For X_valid & X_test: map from full Xy_train means.
    """
    from sklearn.model_selection import StratifiedKFold as SKF_TE

    kf = SKF_TE(n_splits=n_splits, shuffle=True, random_state=seed)
    tr_encoded = pd.Series(index=Xy_train.index, dtype="float32")
    y = Xy_train[target_col].values

    for tr_idx, val_idx in kf.split(Xy_train, y):
        tr_slice = Xy_train.iloc[tr_idx]
        means = tr_slice.groupby(col)[target_col].mean()
        tr_encoded.iloc[val_idx] = Xy_train.iloc[val_idx][col].map(means).astype(
            "float32"
        )

    global_means = Xy_train.groupby(col)[target_col].mean()
    val_encoded = X_valid[col].map(global_means).astype("float32")
    test_encoded = X_test[col].map(global_means).astype("float32")

    global_avg = Xy_train[target_col].mean().astype("float32")
    tr_encoded.fillna(global_avg, inplace=True)
    val_encoded.fillna(global_avg, inplace=True)
    test_encoded.fillna(global_avg, inplace=True)

    return tr_encoded.values, val_encoded.values, test_encoded.values


def build_base_tables(
    combine_fe: pd.DataFrame,
    n_train: int,
    n_test: int,
    n_orig: int,
    use_quantile_bins: bool = False,
):
    """
    One-time construction of:
      - numeric_cat features
      - interaction features
      - count-encoding
    Shared across all XGBoost experiments that use factorize() (your best configs).
    """
    combine = combine_fe.copy()

    CATS = CATS_BASE.copy()
    NUMS = NUMS_BASE + [f for f in NEW_FEATURES if f not in ["grade_letter"]]
    CATS.append("grade_letter")

    # numeric -> categorical
    CATS_NUM: List[str] = []
    if use_quantile_bins:
        logger.info("Using quantile bins for numeric _cat features...")
        for c in NUMS:
            n = f"{c}_cat"
            est = KBinsDiscretizer(
                n_bins=50,
                encode="ordinal",
                strategy="quantile",
            )
            combine[n] = est.fit_transform(combine[[c]]).astype("int32")
            CATS_NUM.append(n)
    else:
        logger.info("Using factorize() for numeric _cat features...")
        for c in NUMS:
            n = f"{c}_cat"
            combine[n], _ = combine[c].factorize()
            combine[n] = combine[n].astype("int32")
            CATS_NUM.append(n)
    logger.info(f"Created {len(CATS_NUM)} numeric _cat features")

    # interactions
    important_pairs = [
        ("employment_status", "grade_subgrade"),
        ("employment_status", "education_level"),
        ("employment_status", "loan_purpose"),
        ("grade_subgrade", "loan_purpose"),
        ("grade_subgrade", "education_level"),
        ("marital_status", "employment_status"),
    ]
    for num_cat in ["credit_score_cat", "debt_to_income_ratio_cat", "interest_rate_cat"]:
        for cat in ["employment_status", "grade_subgrade"]:
            important_pairs.append((num_cat, cat))

    CATS_INTER: List[str] = []
    for c1, c2 in important_pairs:
        name = f"{c1}_{c2}"
        if c1 in combine.columns and c2 in combine.columns:
            combine[name] = combine[c1].astype(str) + "_" + combine[c2].astype(str)
            CATS_INTER.append(name)
    logger.info(f"Created {len(CATS_INTER)} strategic interactions")

    # count-encoding
    CE: List[str] = []
    ALL_CATS = CATS + CATS_NUM + CATS_INTER
    logger.info(f"\nCreating count encoding for {len(ALL_CATS)} categorical features...")
    for c in tqdm(ALL_CATS, desc="Count encoding"):
        tmp = combine.groupby(c)[TARGET].count()
        tmp.name = f"CE_{c}"
        CE.append(tmp.name)
        combine = combine.merge(tmp, on=c, how="left")
    logger.info(f"Created {len(CE)} count encodings")

    train_full = combine.iloc[:n_train].copy()
    test_full = combine.iloc[n_train:n_train + n_test].copy()
    orig_full = combine.iloc[-n_orig:].copy()

    FEATURES = NUMS + CATS + CATS_NUM + CATS_INTER + CE
    logger.info(
        f"\nTrain_full: {train_full.shape}, Test_full: {test_full.shape}, Orig_full: {orig_full.shape}"
    )
    logger.info(f"Total FEATURES: {len(FEATURES)}")

    return train_full, test_full, orig_full, FEATURES, CATS, CATS_NUM, CATS_INTER


# ---- build shared tables once (factorize, no quantile bins, since A & F both use that) ----
train_full, test_full, orig_full, FEATURES, CATS, CATS_NUM, CATS_INTER = build_base_tables(
    combine_fe, n_train, n_test, n_orig, use_quantile_bins=False
)


Using factorize() for numeric _cat features...
Created 20 numeric _cat features
Created 12 strategic interactions

Creating count encoding for 39 categorical features...


Count encoding:   0%|          | 0/39 [00:00<?, ?it/s]

Created 39 count encodings

Train_full: (593994, 110), Test_full: (254569, 110), Orig_full: (20000, 110)
Total FEATURES: 98


In [6]:
# =====================
# Cell 4: XGBoost params + single-model CV runner
# =====================

logger.info("\n[STEP 2] Defining base XGBoost parameters...")

BASE_PARAMS: Dict[str, Any] = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.0095,
    "max_depth": 0,
    "subsample": 0.82,
    "colsample_bytree": 0.72,
    "seed": CONF["seed"],
    "grow_policy": "lossguide",
    "max_leaves": 36,
    "lambda": 4.5,
    "alpha": 2.2,
    "max_bin": 256,
    "tree_method": "gpu_hist",      # change to "hist" if no GPU
    "predictor": "gpu_predictor",   # change to "auto" if no GPU
    "verbosity": 0,
}
for k, v in BASE_PARAMS.items():
    logger.info(f"  {k}: {v}")


def run_cv_model(
    name: str,
    params_updates: Dict[str, Any],
    cv_type: str = "kfold",
    use_orig_aug: bool = True,
    num_boost_round: int = CONF["num_boost_round"],
    early_stopping_rounds: int = CONF["early_stopping_rounds"],
    cv_seed: int = CONF["seed"],
) -> Tuple[float, np.ndarray, np.ndarray]:
    """
    Run CV for one model config on the pre-built tables.
    Returns: (oof_auc, oof_preds, test_preds)
    """
    logger.info("\n" + "#" * 80)
    logger.info(f"MODEL: {name}")
    logger.info("#" * 80)

    y_all = train_full[TARGET].astype(int).values
    pos = (y_all == 1).sum()
    neg = (y_all == 0).sum()
    scale_pos_weight = neg / pos
    logger.info(f"Auto scale_pos_weight: {scale_pos_weight:.3f}")

    params = BASE_PARAMS.copy()
    params["scale_pos_weight"] = scale_pos_weight
    for k, v in params_updates.items():
        params[k] = v

    logger.info("Final params:")
    for k in sorted(params.keys()):
        logger.info(f"  {k}: {params[k]}")

    # CV splitter
    if cv_type.lower() == "kfold":
        logger.info("Using plain KFold")
        kf = KFold(n_splits=CONF["n_folds"], shuffle=True, random_state=cv_seed)
        split_iter = kf.split(train_full[FEATURES])
    else:
        logger.info("Using StratifiedKFold")
        kf = StratifiedKFold(n_splits=CONF["n_folds"], shuffle=True, random_state=cv_seed)
        split_iter = kf.split(train_full[FEATURES], y_all)

    oof_preds = np.zeros(n_train, dtype="float32")
    test_preds = np.zeros(n_test, dtype="float32")
    fold_scores: List[float] = []
    best_iters: List[int] = []

    X_test_base = test_full[FEATURES].copy()
    X_orig = orig_full[FEATURES + [TARGET]].copy()
    TARGET_ENCODE_CATS = CATS_NUM + CATS_INTER

    for fold, (tr_idx, val_idx) in enumerate(
        tqdm(list(split_iter), total=CONF["n_folds"], desc=f"CV folds ({name})"), 1
    ):
        logger.info(f"\n{'=' * 25}")
        logger.info(f"{name} - Fold {fold}/{CONF['n_folds']}")
        logger.info(f"{'=' * 25}")

        X_tr = train_full.iloc[tr_idx][FEATURES + [TARGET]].copy()
        X_val = train_full.iloc[val_idx][FEATURES + [TARGET]].copy()

        if use_orig_aug:
            Xy_train = pd.concat([X_tr, X_orig], axis=0, ignore_index=True)
        else:
            Xy_train = X_tr.copy()

        X_valid = X_val[FEATURES].copy()
        y_valid = X_val[TARGET].astype(int).values
        X_test = X_test_base.copy()

        logger.info(f"Target encoding {len(TARGET_ENCODE_CATS)} features...")
        for c in tqdm(TARGET_ENCODE_CATS, desc="Target encoding", leave=False):
            if c not in Xy_train.columns:
                continue
            tr_te, val_te, te_te = kfold_target_encode_train_valid_test(
                Xy_train[[c, TARGET]],
                X_valid[[c]],
                X_test[[c]],
                col=c,
                target_col=TARGET,
                n_splits=10,
                seed=CONF["seed"],
            )
            Xy_train[c] = tr_te
            X_valid[c] = val_te
            X_test[c] = te_te

        # cast categoricals
        for col in CATS_BASE + ["grade_letter"]:
            if col in Xy_train.columns:
                Xy_train[col] = Xy_train[col].astype("category")
                X_valid[col] = X_valid[col].astype("category")
                X_test[col] = X_test[col].astype("category")

        dtrain = xgb.DMatrix(
            Xy_train[FEATURES],
            label=Xy_train[TARGET].astype(int).values,
            enable_categorical=True,
        )
        dvalid = xgb.DMatrix(
            X_valid[FEATURES],
            label=y_valid,
            enable_categorical=True,
        )
        dtest = xgb.DMatrix(
            X_test[FEATURES],
            enable_categorical=True,
        )

        evals = [(dtrain, "train"), (dvalid, "valid")]
        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )

        best_iter = model.best_iteration if model.best_iteration is not None else num_boost_round
        best_iters.append(best_iter)

        oof_fold = model.predict(dvalid, iteration_range=(0, best_iter + 1))
        oof_preds[val_idx] = oof_fold
        test_preds += model.predict(dtest, iteration_range=(0, best_iter + 1)) / CONF["n_folds"]

        fold_auc = roc_auc_score(y_valid, oof_fold)
        fold_scores.append(fold_auc)
        logger.info(f"{name} - Fold {fold} AUC: {fold_auc:.5f}")

    overall_auc = roc_auc_score(train_full[TARGET].astype(int).values, oof_preds)
    logger.info("\n" + "=" * 80)
    logger.info(f"{name} - OOF AUC: {overall_auc:.6f}")
    logger.info(f"Fold AUCs: {[f'{s:.5f}' for s in fold_scores]}")
    logger.info(f"Avg best_iter: {np.mean(best_iters):.1f}")

    return overall_auc, oof_preds, test_preds



[STEP 2] Defining base XGBoost parameters...
  objective: binary:logistic
  eval_metric: auc
  learning_rate: 0.0095
  max_depth: 0
  subsample: 0.82
  colsample_bytree: 0.72
  seed: 42
  grow_policy: lossguide
  max_leaves: 36
  lambda: 4.5
  alpha: 2.2
  max_bin: 256
  tree_method: gpu_hist
  predictor: gpu_predictor
  verbosity: 0


In [7]:
# =====================
# Cell 5: Train final A & F, blend 50/50 -> submission_model.csv
# =====================

y_true = train[TARGET].astype(int).values

# A: factorize + KFold + orig_aug (your best single)
auc_A, oof_A, test_A = run_cv_model(
    name="A_factorize_kfold_base",
    params_updates={},                  # same as BASE_PARAMS
    cv_type="kfold",
    use_orig_aug=True,
    num_boost_round=12000,
    early_stopping_rounds=350,
)

# F: stratified + extra regularization (min_child_weight=8, gamma=0.2)
auc_F, oof_F, test_F = run_cv_model(
    name="F_stratified_factorize_minchild8_gamma02",
    params_updates={
        "min_child_weight": 8,
        "gamma": 0.2,
    },
    cv_type="stratified",
    use_orig_aug=True,
    num_boost_round=12000,
    early_stopping_rounds=350,
)

logger.info(f"\nSingle model OOF AUCs:")
logger.info(f"  A_factorize_kfold_base                    : {auc_A:.6f}")
logger.info(f"  F_stratified_factorize_minchild8_gamma02  : {auc_F:.6f}")

# 50/50 blend of OOF for reference
oof_blend = 0.5 * oof_A + 0.5 * oof_F
auc_blend = roc_auc_score(y_true, oof_blend)
logger.info(f"\nA/F 50-50 blend OOF AUC: {auc_blend:.6f}")

# 50/50 blend of test preds (this is your final “model-based” prediction)
test_blend = 0.5 * test_A + 0.5 * test_F

# Save base model submission
submission_model = pd.DataFrame({
    "id": test_full["id"].values,
    TARGET: test_blend,
})
submission_model_path = "/kaggle/working/submission_model.csv"
submission_model.to_csv(submission_model_path, index=False)
logger.info(f"\n✓ Saved base model submission to {submission_model_path}")
submission_model.head()



################################################################################
MODEL: A_factorize_kfold_base
################################################################################
Auto scale_pos_weight: 0.252
Final params:
  alpha: 2.2
  colsample_bytree: 0.72
  eval_metric: auc
  grow_policy: lossguide
  lambda: 4.5
  learning_rate: 0.0095
  max_bin: 256
  max_depth: 0
  max_leaves: 36
  objective: binary:logistic
  predictor: gpu_predictor
  scale_pos_weight: 0.25184723094496453
  seed: 42
  subsample: 0.82
  tree_method: gpu_hist
  verbosity: 0
Using plain KFold


CV folds (A_factorize_kfold_base):   0%|          | 0/8 [00:00<?, ?it/s]


A_factorize_kfold_base - Fold 1/8
Target encoding 32 features...


Target encoding:   0%|          | 0/32 [00:00<?, ?it/s]

XGBoostError: [10:58:23] /workspace/src/tree/updater_gpu_hist.cu:781: Exception in gpu_hist: [10:58:23] /workspace/src/tree/updater_gpu_hist.cu:787: Check failed: ctx_->gpu_id >= 0 (-1 vs. 0) : Must have at least one device
Stack trace:
  [bt] (0) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0xb27f2a) [0x7fc6e7db3f2a]
  [bt] (1) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0xb3e95a) [0x7fc6e7dca95a]
  [bt] (2) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0xb483cd) [0x7fc6e7dd43cd]
  [bt] (3) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x460c79) [0x7fc6e76ecc79]
  [bt] (4) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x46176c) [0x7fc6e76ed76c]
  [bt] (5) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x4c54f7) [0x7fc6e77514f7]
  [bt] (6) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x70) [0x7fc6e73edef0]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7fc754f41e2e]
  [bt] (8) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7fc754f3e493]



Stack trace:
  [bt] (0) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0xb27f2a) [0x7fc6e7db3f2a]
  [bt] (1) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0xb485c9) [0x7fc6e7dd45c9]
  [bt] (2) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x460c79) [0x7fc6e76ecc79]
  [bt] (3) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x46176c) [0x7fc6e76ed76c]
  [bt] (4) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(+0x4c54f7) [0x7fc6e77514f7]
  [bt] (5) /usr/local/lib/python3.11/dist-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x70) [0x7fc6e73edef0]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7fc754f41e2e]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7fc754f3e493]
  [bt] (8) /usr/lib/python3.11/lib-dynload/_ctypes.cpython-311-x86_64-linux-gnu.so(+0xa4d8) [0x7fc754f514d8]



In [None]:
# =====================
# Cell 6: Ensemble multiple submission CSVs + your new model
# =====================
import glob

base_sub_path = "/kaggle/input/22-november-2025-ps-s5e11"
target_col = TARGET

# 1) Choose which past submissions to include + weights
#    You can adjust these weights based on LB scores.
#
# Example using 015,017,018,019 as in your h_blend snippet
# plus your new model submission.
#
sub_sources = {
    # "name": (path, weight)
    "model": ("/kaggle/working/submission_model.csv", 0.30),   # your new A/F blend
    "015": (os.path.join(base_sub_path, "submission_015.csv"), 0.18),
    "017": (os.path.join(base_sub_path, "submission_017.csv"), 0.10),
    "018": (os.path.join(base_sub_path, "submission_018.csv"), 0.20),
    "019": (os.path.join(base_sub_path, "submission_019.csv"), 0.22),
}

# 2) Normalize weights to sum to 1
weight_sum = sum(w for (_, w) in sub_sources.values())
sub_sources = {k: (p, w / weight_sum) for k, (p, w) in sub_sources.items()}

logger.info("\nEnsembling the following submissions:")
for name, (path, w) in sub_sources.items():
    logger.info(f"  {name}: {path} (weight={w:.4f})")

# 3) Blend predictions
sample = pd.read_csv("/kaggle/input/playground-series-s5e11/sample_submission.csv")
blend_pred = np.zeros(len(sample), dtype="float64")

for name, (path, w) in sub_sources.items():
    sub_df = pd.read_csv(path)
    # assume columns: id, loan_paid_back
    # ensure correct order by merging on id (robust)
    merged = sample[["id"]].merge(sub_df[["id", target_col]], on="id", how="left")
    blend_pred += w * merged[target_col].values

sample[target_col] = blend_pred

final_submission_path = "/kaggle/working/submission_ensemble.csv"
sample.to_csv(final_submission_path, index=False)

logger.info(f"\n✓ Saved ensembled submission to {final_submission_path}")
sample.head()
