## **Spaceship Titanic Classification with Decision Tree / Random Forest**

# **1. Preparation**

In [1]:
import json
import os

kaggle_token = {"username":"thinhvan","key":"a7b33f792a3ea40881d8be4db4014871"}

!mkdir -p ~/.kaggle
with open("/root/.kaggle/kaggle.json", "w") as f:
    json.dump(kaggle_token, f)

!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c spaceship-titanic
!unzip spaceship-titanic.zip -d dataset/

Downloading spaceship-titanic.zip to /content
  0% 0.00/299k [00:00<?, ?B/s]
100% 299k/299k [00:00<00:00, 53.7MB/s]
Archive:  spaceship-titanic.zip
  inflating: dataset/sample_submission.csv  
  inflating: dataset/test.csv        
  inflating: dataset/train.csv       


# **2. Preprocessing**

In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
try:
  from xgboost import XGBClassifier
except:
  !pip install xgboost
  from xgboost import XGBClassifier

# ----------------------------
# 1. Load dataset
# ----------------------------
# Training split, read from the dataset/ folder with a path relative to the
# notebook's working directory.
train_csv_path = "dataset/train.csv"
train_df = pd.read_csv(train_csv_path)

In [14]:
# ============================
# 2. Preprocessing
# ============================

# Feature engineering runs only while the raw identifier columns are still
# present. This cell rebinds `train_df` to a frame without them, so without the
# guard a second execution would fail with a KeyError on "PassengerId".
if {"PassengerId", "Cabin", "Name"}.issubset(train_df.columns):
    # Split <PassengerId> on "_" into <Group> and <Member>
    passenger_parts = train_df["PassengerId"].str.split("_", expand=True)
    # Split <Cabin> on "/" into <Deck>, <Num> and <Side>
    cabin_parts = train_df["Cabin"].str.split("/", expand=True)

    train_df = train_df.assign(
        # <Group> and <Num> are converted to float so they can be used as
        # numeric features (float also accommodates missing values).
        Group=passenger_parts[0].astype("float"),
        Member=passenger_parts[1],
        Deck=cabin_parts[0],
        Num=cabin_parts[1].astype("float"),
        Side=cabin_parts[2],
    ).drop(
        # <Name> is not used as a feature; <Cabin> and <PassengerId> are
        # redundant once they have been split into their parts.
        columns=["Name", "Cabin", "PassengerId"]
    )

# Missing data: median for numeric columns, most frequent value for categorical
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', "Num", "Group"]
categorical_features = ['Member', 'HomePlanet', 'CryoSleep', 'Deck', 'Side', 'Destination', 'VIP']

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Categories unseen during fit are ignored instead of raising at transform time
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ],
    # Always return a dense array. With the default threshold the output type
    # depends on how sparse the one-hot block happens to be, and the features
    # produced here are meant to be fed to a PCA stage.
    sparse_threshold=0,
)

# Split dataset to train/test = 0.8/0.2, stratified on the target
X = train_df.drop("Transported", axis=1)
y = train_df["Transported"].astype("int")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Fit the preprocessing on the training split only, then apply it to both
# splits, so no statistics from the hold-out split leak into the transformers.
preprocessor.fit(X_train)

X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

# **3. Training**

In [15]:
# ----------------------
# Candidate models and PCA settings
# ----------------------

# Classifiers under comparison, keyed by the label used when reporting results.
# Every estimator is created with the same seed (42).
models = dict([
  ("Logistic Regression", LogisticRegression(random_state=42, max_iter=1000)),
  ("SVM (RBF Kernel)", SVC(random_state=42, probability=False, kernel="rbf")),
  ("Decision Tree", DecisionTreeClassifier(random_state=42)),
  ("Random Forest", RandomForestClassifier(random_state=42)),
  ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
  ("XGBoost", XGBClassifier(random_state=42, eval_metric="logloss")),
])

# n_components values to try for PCA: None plus two fractional settings
pca_components = [None, 0.95, 0.90]

In [16]:
# ======================
# Fit and tracking results
# ======================
# Maps "<model name> - PCA@<rate>" to a dict of hold-out metrics.
results = {}

for name, model in models.items():
  for rate in pca_components:
    # Label the PCA setting with two decimals in every case. The previous
    # fallback, str(1.00), evaluates to "1.0", which made the None run
    # inconsistent with the "0.95" / "0.90" labels.
    nc = "{:.2f}".format(1.0 if rate is None else rate)
    run_label = f"{name} - PCA@{nc}"

    # NOTE(review): `model` is the same estimator object for every PCA setting,
    # so each fit replaces the state left by the previous one. That is fine
    # here because predictions are taken immediately after fitting.
    pipe = Pipeline(steps=[
        ("pca", PCA(n_components=rate, random_state=42)),
        ("model", model)
    ])
    print(f"[INFO] Training {run_label}")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # NOTE(review): support-weighted recall is arithmetically equal to
    # accuracy, so those two columns will always match. Consider
    # average="binary" or "macro" if a distinct recall figure is wanted.
    results[run_label] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average="weighted"),
        "recall": recall_score(y_test, y_pred, average="weighted"),
        "f1": f1_score(y_test, y_pred, average="weighted")
    }

[INFO] Training Logistic Regression - PCA@1.0
[INFO] Training Logistic Regression - PCA@0.95
[INFO] Training Logistic Regression - PCA@0.90
[INFO] Training SVM (RBF Kernel) - PCA@1.0
[INFO] Training SVM (RBF Kernel) - PCA@0.95
[INFO] Training SVM (RBF Kernel) - PCA@0.90
[INFO] Training Decision Tree - PCA@1.0
[INFO] Training Decision Tree - PCA@0.95
[INFO] Training Decision Tree - PCA@0.90
[INFO] Training Random Forest - PCA@1.0
[INFO] Training Random Forest - PCA@0.95
[INFO] Training Random Forest - PCA@0.90
[INFO] Training Gradient Boosting - PCA@1.0
[INFO] Training Gradient Boosting - PCA@0.95
[INFO] Training Gradient Boosting - PCA@0.90
[INFO] Training XGBoost - PCA@1.0
[INFO] Training XGBoost - PCA@0.95
[INFO] Training XGBoost - PCA@0.90


# **4. Experiment**

In [17]:
# ------------------------------
# 4. Log results
# ------------------------------
# `results` is keyed by run label; transposing puts one run on each row and
# one metric in each column.
results_df = pd.DataFrame(results).transpose()
print(results_df)

                                accuracy  precision    recall        f1
Logistic Regression - PCA@1.0   0.787234   0.787324  0.787234  0.787195
Logistic Regression - PCA@0.95  0.785509   0.785549  0.785509  0.785485
Logistic Regression - PCA@0.90  0.781484   0.781549  0.781484  0.781450
SVM (RBF Kernel) - PCA@1.0      0.797010   0.797124  0.797010  0.797008
SVM (RBF Kernel) - PCA@0.95     0.791259   0.791291  0.791259  0.791262
SVM (RBF Kernel) - PCA@0.90     0.789534   0.789539  0.789534  0.789536
Decision Tree - PCA@1.0         0.736055   0.736065  0.736055  0.736058
Decision Tree - PCA@0.95        0.733180   0.733498  0.733180  0.733024
Decision Tree - PCA@0.90        0.722829   0.722867  0.722829  0.722787
Random Forest - PCA@1.0         0.797010   0.797676  0.797010  0.796942
Random Forest - PCA@0.95        0.798735   0.799506  0.798735  0.798654
Random Forest - PCA@0.90        0.798735   0.799437  0.798735  0.798663
Gradient Boosting - PCA@1.0     0.796435   0.797055  0.796435  0