In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# 1. Load data: read the competition's train and test CSVs from the Kaggle input directory
train, test = (
    pd.read_csv("/kaggle/input/spaceship-titanic/train.csv"),
    pd.read_csv("/kaggle/input/spaceship-titanic/test.csv"),
)

# 2. Preprocess / feature engineering
def preprocess(df, is_train=True):
    """Derive cabin features and separate the target from the features.

    Works on a copy, so the caller's frame is left untouched.

    Parameters:
        df: frame containing at least "PassengerId", "Cabin" and "Name",
            plus "Transported" when ``is_train`` is true.
        is_train: when true, "Transported" is cast to int, returned as the
            target and removed from the features.

    Returns:
        A ``(features, y)`` tuple; ``y`` is ``None`` when ``is_train`` is false.
    """

    def _cabin_piece(cabin, position):
        # Missing cabins get a placeholder category instead of NaN.
        if pd.isnull(cabin):
            return "Unknown"
        return str(cabin).split("/")[position]

    frame = df.copy()

    # Cabin → Deck (first "/"-separated piece) and Side (last piece)
    frame["Deck"] = frame["Cabin"].apply(_cabin_piece, args=(0,))
    frame["Side"] = frame["Cabin"].apply(_cabin_piece, args=(-1,))

    # Columns that are not used as model features
    unused = ["PassengerId", "Cabin", "Name"]

    y = None
    if is_train:
        # Pull the label out before it is dropped from the features.
        y = frame["Transported"].astype(int)
        unused = unused + ["Transported"]

    return frame.drop(columns=unused), y

# Features plus target for the training table; features only for the test table
X_train_raw, y_train = preprocess(train, is_train=True)
X_test_raw, _ = preprocess(test, is_train=False)

# Group the training columns by dtype so each group gets its own preprocessing
categorical_cols = list(
    X_train_raw.select_dtypes(include=["object", "category"]).columns
)
numeric_cols = list(
    X_train_raw.select_dtypes(include=["number"]).columns
)

# Imputation + encoding
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric columns: fill gaps with the column median.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Columns in neither list are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, categorical_cols),
        ("num", num_pipeline, numeric_cols),
    ],
    remainder="passthrough",
)

# 3. Build a pipeline with RandomForest: preprocessing followed by the classifier
clf = Pipeline(steps=[
    ("pre", preprocessor),
    ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
])

# 4. Validation split: hold out 20% of the training rows, with a fixed seed
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_raw,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit on the larger part and report the score on the held-out part
clf.fit(X_tr, y_tr)
val_accuracy = clf.score(X_val, y_val)
print("Validation accuracy:", val_accuracy)

# 5. Train on all training data: refit the same pipeline, then predict the test set
clf.fit(X_train_raw, y_train)
pred = clf.predict(X_test_raw)

# 6. Submission file: passenger ids alongside the predictions cast to booleans
submission = test[["PassengerId"]].assign(Transported=pred.astype(bool))

submission.to_csv("submission.csv", index=False)
print("submission.csv saved.")

Validation accuracy: 0.7872340425531915
submission.csv saved.
