In [None]:
import sys
from pathlib import Path
import os
import pandas as pd

# Make the notebook's working directory importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry.
_notebook_dir = os.getcwd()
if _notebook_dir not in sys.path:
    sys.path.append(_notebook_dir)

# notebooks/03_modeling.ipynb -> spaceship-titanic
# The project root is taken to be the parent of the working directory, so the
# kernel is assumed to be started from the notebooks/ folder.
PROJECT_ROOT = Path.cwd().parent

# Put the project root at the front of sys.path so `src` resolves from it;
# skip the insert when it is already first (i.e. on a re-run of this cell).
if sys.path[:1] != [str(PROJECT_ROOT)]:
    sys.path.insert(0, str(PROJECT_ROOT))

# Sanity output: the `src` check should print True before importing from it.
print("PROJECT_ROOT =", PROJECT_ROOT)
print("SRC EXISTS =", (PROJECT_ROOT / "src").exists())


from src.preprocessing import preprocess_data


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier


In [None]:

# pandas, preprocess_data and train_test_split are already imported by the
# setup cell at the top of the notebook, so they are not re-imported here.

# Load the raw train/test CSVs (paths are relative to the working directory).
train_df = pd.read_csv("../data/raw/train.csv")
test_df = pd.read_csv("../data/raw/test.csv")

# Feature engineering only
train_processed = preprocess_data(train_df)

# Split the processed frame into the feature matrix and the target column.
X = train_processed.drop(columns=["Transported"])
y = train_processed["Transported"]

# Hold out 20% of the rows for validation; the split is seeded and stratified
# on the target so both parts keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Preview the first five rows of the engineered feature matrix.
X.head(5)

PREPROCESSING PIPELINE

In [None]:
# ColumnTransformer, Pipeline, OneHotEncoder and SimpleImputer are already
# imported by the setup cell at the top of the notebook.

# Route columns by dtype: 64-bit numeric columns vs. object/bool columns.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object", "bool"]).columns

# ColumnTransformer drops every column that is not listed in a transformer
# (its default is remainder="drop"), so make any such column visible instead
# of letting it disappear silently.
unrouted_cols = [
    col for col in X.columns if col not in num_cols and col not in cat_cols
]
if unrouted_cols:
    print("WARNING: columns ignored by the preprocessor:", unrouted_cols)

# Numeric features: fill missing values with the column median.
numeric_transformer = SimpleImputer(strategy="median")

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

In [None]:
# Do not re-create `preprocessor` / `xgb_pipeline` with literal `...`
# placeholders here: that would replace the configured ColumnTransformer with
# an unusable object (or fail outright), and every later `.fit()` that uses
# `preprocessor` would break on a top-to-bottom run.
# Show the configured preprocessor instead.
preprocessor

In [None]:
# List the raw training columns (before feature engineering).
print(list(train_df.columns))

BASELINE MODEL (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression

# Pipeline and accuracy_score are already imported by the setup cell.

# Baseline: logistic regression on top of the shared preprocessing step.
baseline_model = LogisticRegression(max_iter=1000)
baseline_pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", baseline_model),
    ]
)

# Fit on the training split, then predict the held-out validation split.
baseline_preds = baseline_pipe.fit(X_train, y_train).predict(X_val)

baseline_accuracy = accuracy_score(y_val, baseline_preds)
print("Baseline Accuracy:", baseline_accuracy)

Random Forest

In [None]:
# RandomForestClassifier, Pipeline and accuracy_score are already imported by
# the setup cell at the top of the notebook.

# Seeded random forest; n_jobs=-1 lets it use all available cores.
rf_params = {
    "n_estimators": 500,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
rf_model = RandomForestClassifier(**rf_params)

rf_pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", rf_model),
    ]
)

# Train on the training split and evaluate on the validation split.
rf_preds = rf_pipe.fit(X_train, y_train).predict(X_val)

rf_accuracy = accuracy_score(y_val, rf_preds)
print("Random Forest Accuracy:", rf_accuracy)

XGBoost (manually tuned hyperparameters)

In [None]:
# XGBClassifier, Pipeline and accuracy_score are already imported by the
# setup cell at the top of the notebook.

# Fixed (hand-picked) hyperparameters for the gradient-boosted model.
xgb_params = {
    "n_estimators": 800,
    "max_depth": 5,
    "learning_rate": 0.03,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "min_child_weight": 1,
    "gamma": 0.1,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
}
xgb_model = XGBClassifier(**xgb_params)

xgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", xgb_model),
    ]
)

# Train on the training split and evaluate on the validation split.
xgb_preds = xgb_pipeline.fit(X_train, y_train).predict(X_val)

xgb_accuracy = accuracy_score(y_val, xgb_preds)
print("Tuned XGBoost Accuracy:", xgb_accuracy)

In [None]:
# Refit the XGBoost pipeline on the training split and report its validation
# accuracy under the "with GroupSize" label.
# NOTE(review): GroupSize is presumably a feature produced by preprocess_data - confirm.
xgb_preds = xgb_pipeline.fit(X_train, y_train).predict(X_val)

groupsize_accuracy = accuracy_score(y_val, xgb_preds)
print("XGBoost Accuracy (with GroupSize):", groupsize_accuracy)

LightGBM

In [None]:
from lightgbm import LGBMClassifier

# Fixed hyperparameters for the LightGBM model (seeded for reproducibility).
lgb_params = {
    "n_estimators": 1500,
    "learning_rate": 0.02,
    "num_leaves": 31,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
lgb_model = LGBMClassifier(**lgb_params)

lgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", lgb_model),
    ]
)

# Train on the training split and evaluate on the validation split.
lgb_preds = lgb_pipeline.fit(X_train, y_train).predict(X_val)

lgb_accuracy = accuracy_score(y_val, lgb_preds)
print("LightGBM Accuracy:", lgb_accuracy)

In [None]:
# Debug aid: compare the training feature names with the column lists that
# the preprocessor was configured with.
train_feature_names = list(X_train.columns)
print(train_feature_names)
print(preprocessor.transformers)

In [None]:
# preprocess_data is already imported by the setup cell at the top.

# Preprocess full training data
train_processed = preprocess_data(train_df)

# Rebuild the target vector and the feature matrix from the processed frame.
target_col = "Transported"
y = train_processed[target_col]
X = train_processed.drop(columns=[target_col])

# Fit on full data
xgb_pipeline.fit(X, y)

In [None]:
# Run the test set through the same feature-engineering function that was
# applied to the training data.
test_processed = test_df.pipe(preprocess_data)

# Predict labels for the test set with the fitted XGBoost pipeline.
test_preds = xgb_pipeline.predict(test_processed)

In [None]:
# Recompute the test-set predictions from the already-processed test frame.
test_preds = xgb_pipeline.predict(X=test_processed)

In [None]:
# `os` and `pd` are already imported by the setup cell at the top.

# Make sure the output folder exists (relative to the working directory).
os.makedirs("submission", exist_ok=True)

# Cast the predictions to bool so `Transported` is written as True/False
# (the same conversion the final export cell of this notebook applies)
# instead of whatever raw label type the classifier returned.
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Transported": test_preds.astype(bool)
})

submission.to_csv("submission/submission.csv", index=False)
print("Updated submission saved!")


In [None]:
# Show the kernel's working directory; all relative paths in this notebook
# are resolved against it. (`os` is imported by the setup cell.)
current_dir = os.getcwd()
print(current_dir)


In [None]:
# List the entries of the current working directory.
os.listdir(".")


In [None]:
# Read the saved submission back from disk and preview its first five rows.
# (`pd` is imported by the setup cell.)
df = pd.read_csv("submission/submission.csv")
df.head(n=5)


In [None]:
# Convert the predicted labels to booleans (non-zero becomes True).
test_preds_bool = test_preds.astype("bool")


In [None]:
# Booleans via a 0.5 threshold: values strictly above 0.5 become True.
test_preds_bool = test_preds > 0.5


In [None]:
# Build the two-column submission frame: passenger ids from the raw test set
# plus the boolean predictions.
submission = test_df[["PassengerId"]].assign(Transported=test_preds_bool)

# Write it to the working directory without the index column.
submission.to_csv("submission.csv", index=False, encoding="utf-8")
print("submission.csv updated correctly!")


In [None]:
# Read the final export back from disk to verify it.
# (`pd` is imported by the setup cell.)
df = pd.read_csv("submission.csv")

# Only the last expression of a cell is rendered, so a bare `df.head()`
# followed by `df.dtypes` would show the dtypes only. display() renders both
# the preview and the column dtypes.
display(df.head(), df.dtypes)
