In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, f1_score
from scipy.stats import chi2_contingency, uniform, randint
import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier 

# Load the raw training data from train.csv in the working directory.
train_df = pd.read_csv("train.csv")


In [2]:
def preprocessing(df, state=None):
    """Turn a raw passenger frame into the numeric feature table used for modelling.

    The steps are: impute missing values with each column's mode, split
    ``Cabin`` and ``PassengerId`` into their parts, bin ``Age`` into groups,
    one-hot encode the categorical columns, cast the boolean columns to 0/1,
    aggregate and standardize the spending columns, and flag passengers whose
    last name is among the 30 most frequent ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing ``PassengerId``, ``HomePlanet``, ``CryoSleep``,
        ``Cabin``, ``Destination``, ``Age``, ``VIP``, ``RoomService``,
        ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``, ``Name`` and,
        optionally, ``Transported``. The caller's frame is not modified.
    state : dict, optional
        Holder for the fitted artefacts so that a second frame can be
        transformed with the statistics of the first one. Recognised keys are
        ``"scaler"`` (an object with a ``transform`` method) and
        ``"common_lastnames"`` (an iterable of last names). Keys that are
        present are reused as-is; keys that are missing are fitted on ``df``
        and written back into the dict. With ``state=None`` (the default)
        everything is fitted on ``df`` and nothing is kept, which is the
        original behaviour.

        Typical use::

            state = {}
            train = preprocessing(train_raw, state)   # fits and fills state
            test = preprocessing(test_raw, state)     # reuses train statistics

    Returns
    -------
    pandas.DataFrame
        The engineered features. The one-hot columns depend on which
        categories occur in ``df``, so two frames can yield different columns.
    """
    # Impute every column with its most frequent value (first mode on ties).
    df = df.fillna(df.mode().iloc[0])

    # Cabin is split on "/" into three parts, PassengerId on "_" into two.
    df[["Deck", "Num", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df[["Id1", "Id2"]] = df["PassengerId"].str.split("_", expand=True).astype(int)
    df["Num"] = df["Num"].astype(int)

    # Everything after the first space is treated as the last name. partition()
    # always yields three parts, so names with one token or with more than two
    # tokens no longer break a fixed two-column assignment.
    last_name = df["Name"].str.partition(" ")[2]

    # Bin ages. right=False makes every bin left-closed / right-open, so the
    # upper bound in each label is exclusive (e.g. age 18 lands in "19-30").
    # The labels are left unchanged because they become output column names.
    age_bins = [0, 18, 30, 45, 60, float("inf")]
    age_labels = ["0-18", "19-30", "31-45", "46-60", "61+"]
    df["AgeGroup"] = pd.cut(df["Age"], bins=age_bins, labels=age_labels, right=False)

    # One-hot encode the categorical columns, one at a time so the dummy
    # columns are appended in this exact order.
    for categorical in ["HomePlanet", "Destination", "Side", "Deck", "Id2", "AgeGroup"]:
        df = pd.get_dummies(df, columns=[categorical])

    # Map True/False columns to 1/0 (the label is absent from unlabeled data).
    df["CryoSleep"] = df["CryoSleep"].astype(int)
    df["VIP"] = df["VIP"].astype(int)
    if "Transported" in df.columns:
        df["Transported"] = df["Transported"].astype(int)

    # Collapse the five spending columns into two totals.
    service_cols = ["RoomService", "Spa", "VRDeck"]
    shopping_cols = ["FoodCourt", "ShoppingMall"]
    df["ServiceSpending"] = df[service_cols].sum(axis=1)
    df["ShoppingSpending"] = df[shopping_cols].sum(axis=1)
    df = df.drop(columns=service_cols + shopping_cols)

    # Standardize the continuous columns, reusing a previously fitted scaler
    # when the caller supplied one through `state`.
    scaled_cols = ["Num", "ServiceSpending", "ShoppingSpending"]
    scaler = None if state is None else state.get("scaler")
    if scaler is None:
        scaler = StandardScaler()
        df[scaled_cols] = scaler.fit_transform(df[scaled_cols])
        if state is not None:
            state["scaler"] = scaler
    else:
        df[scaled_cols] = scaler.transform(df[scaled_cols])

    # Flag passengers carrying one of the 30 most frequent last names.
    common_lastnames = None if state is None else state.get("common_lastnames")
    if common_lastnames is None:
        common_lastnames = last_name.value_counts().head(30).index
        if state is not None:
            state["common_lastnames"] = common_lastnames
    df["UseCommonLastname"] = last_name.isin(common_lastnames).astype(int)

    # Drop the raw columns that have been replaced by engineered features.
    df = df.drop(["PassengerId", "Cabin", "Name", "Age", "Id1"], axis=1)

    return df

In [3]:
# Build the feature table straight from the raw CSV so this cell can be re-run
# safely: feeding an already-processed train_df back into preprocessing()
# fails because the raw columns it reads (e.g. Cabin) have been dropped.
train_df = preprocessing(pd.read_csv("train.csv"))

In [4]:
# Display the column names of the preprocessed training frame.
train_df.columns

Index(['CryoSleep', 'VIP', 'Transported', 'Num', 'HomePlanet_Earth',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Side_P',
       'Side_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F',
       'Deck_G', 'Deck_T', 'Id2_1', 'Id2_2', 'Id2_3', 'Id2_4', 'Id2_5',
       'Id2_6', 'Id2_7', 'Id2_8', 'AgeGroup_0-18', 'AgeGroup_19-30',
       'AgeGroup_31-45', 'AgeGroup_46-60', 'AgeGroup_61+', 'ServiceSpending',
       'ShoppingSpending', 'UseCommonLastname'],
      dtype='object')

In [5]:
# Subset of the preprocessed columns fed to the model, grouped by origin.
selected_features = [
    # passenger / cabin attributes
    "CryoSleep",
    "Num",
    # home planet
    "HomePlanet_Earth",
    "HomePlanet_Europa",
    # destination
    "Destination_55 Cancri e",
    "Destination_PSO J318.5-22",
    "Destination_TRAPPIST-1e",
    # cabin side and deck
    "Side_P",
    "Deck_B",
    "Deck_C",
    "Deck_E",
    "Deck_F",
    "Deck_G",
    # second part of PassengerId
    "Id2_3",
    "Id2_4",
    "Id2_5",
    # age groups
    "AgeGroup_0-18",
    "AgeGroup_19-30",
    "AgeGroup_31-45",
    "AgeGroup_46-60",
    # spending totals and last-name flag
    "ServiceSpending",
    "ShoppingSpending",
    "UseCommonLastname",
]

In [6]:
# Carve the data into train / validation / test parts: 20% is held out first,
# then that hold-out is halved, giving an 80/10/10 split overall.
X = train_df.loc[:, selected_features]
y = train_df.loc[:, "Transported"]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Validation pair in the list-of-tuples form that fit(eval_set=...) takes.
eval_set = [(X_val, y_val)]

# Hyperparameter distributions sampled by the randomized search.
# scipy's uniform(loc, scale) covers [loc, loc + scale] and randint(low, high)
# yields integers from low up to high - 1, so the effective ranges are the ones
# stated next to each entry. (uniform / randint are imported in the first cell.)
param_dist = {
    'learning_rate': uniform(0.01, 0.3),         # 0.01 to 0.31
    'max_depth': randint(3, 10),                 # integers 3 to 9
    'n_estimators': randint(200, 1000),          # integers 200 to 999
    'subsample': uniform(0.6, 0.4),              # 0.6 to 1.0
    'min_child_weight': randint(1, 40),          # integers 1 to 39
    'gamma': uniform(0, 1),                      # 0 to 1
    'colsample_bytree': uniform(0.6, 0.4),       # 0.6 to 1.0
}


# Base estimator whose hyperparameters the search varies
xgb_classifier = xgb.XGBClassifier(random_state=42)

# Candidates are ranked by binary F1
f1_scorer = make_scorer(f1_score, average='binary')

# 100 random draws from param_dist, each scored with 5-fold cross-validation
random_search = RandomizedSearchCV(
    xgb_classifier,
    param_dist,
    n_iter=100,
    scoring=f1_scorer,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Keep the winning configuration for the final model
best_params = random_search.best_params_
print("Best hyperparameters: ", best_params)

# Final model: the tuned hyperparameters plus fixed training settings.
# best_params is unpacked directly so that every parameter tuned by the search
# is carried over without having to be copied key by key.
# n_jobs / random_state are the estimator's own parameters (matching the
# random_state used for the search estimator); the previous nthread / seed
# spellings were only forwarded through XGBoost's **kwargs.
# early_stopping_rounds needs validation data, supplied via eval_set at fit time.
xgb_classifier = xgb.XGBClassifier(
    **best_params,
    objective="binary:logistic",
    n_jobs=4,
    scale_pos_weight=1,
    random_state=42,
    early_stopping_rounds=10,
    eval_metric="logloss",
)

In [None]:
# Train the tuned model; the validation pair in eval_set is evaluated after
# every boosting round and its logloss is printed because verbose=True.
xgb_classifier.fit(X_train, y_train, eval_set=eval_set, verbose=True)

[0]	validation_0-logloss:0.68554
[1]	validation_0-logloss:0.67589
[2]	validation_0-logloss:0.66821
[3]	validation_0-logloss:0.66629
[4]	validation_0-logloss:0.66098
[5]	validation_0-logloss:0.65436
[6]	validation_0-logloss:0.64608
[7]	validation_0-logloss:0.63822
[8]	validation_0-logloss:0.63395
[9]	validation_0-logloss:0.62778
[10]	validation_0-logloss:0.62068
[11]	validation_0-logloss:0.61375
[12]	validation_0-logloss:0.60733
[13]	validation_0-logloss:0.60089
[14]	validation_0-logloss:0.59566
[15]	validation_0-logloss:0.59423
[16]	validation_0-logloss:0.58821
[17]	validation_0-logloss:0.58461
[18]	validation_0-logloss:0.57905
[19]	validation_0-logloss:0.57376
[20]	validation_0-logloss:0.56970
[21]	validation_0-logloss:0.56623
[22]	validation_0-logloss:0.56136
[23]	validation_0-logloss:0.55872
[24]	validation_0-logloss:0.55609
[25]	validation_0-logloss:0.55541
[26]	validation_0-logloss:0.55151
[27]	validation_0-logloss:0.54791
[28]	validation_0-logloss:0.54349
[29]	validation_0-loglos

In [None]:
# Predict labels for the held-out test split.
# NOTE(review): the next cell recomputes the same y_pred, so this cell is redundant.
y_pred = xgb_classifier.predict(X_test)

In [None]:
# Score the model on the held-out test split
y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
# Each remaining metric is printed as a heading line followed by its body
for heading, body in (
    ("Confusion Matrix:", conf_matrix),
    ("Classification Report:", class_report),
):
    print(heading)
    print(body)

Accuracy: 0.8057471264367816
Confusion Matrix:
[[317  97]
 [ 72 384]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.77      0.79       414
           1       0.80      0.84      0.82       456

    accuracy                           0.81       870
   macro avg       0.81      0.80      0.80       870
weighted avg       0.81      0.81      0.81       870



In [None]:
# Load the test data and run it through the same feature pipeline.
testing = pd.read_csv("test.csv")

# reindex() instead of plain [] selection: it keeps the training column order
# and creates, filled with 0, any selected one-hot column whose category does
# not occur in the test file, where [] would raise KeyError.
# NOTE(review): called this way, preprocessing() fits a fresh StandardScaler
# and recomputes the common-last-name list on the test frame, so these features
# are not on exactly the same scale as the training ones.
test_input = preprocessing(testing).reindex(columns=selected_features, fill_value=0)


In [None]:
# Predict on the processed test features and show the raw 0/1 labels
predictions = xgb_classifier.predict(test_input)
print(predictions)

# Submission table: original passenger ids next to boolean predictions
result = pd.DataFrame({
    "PassengerId": testing["PassengerId"],
    "Transported": predictions.astype(bool),
})


[1 0 1 ... 1 1 1]


In [None]:
# Compare against the previously saved submission (submission.csv must already exist)
prev = pd.read_csv("submission.csv")

# XOR marks the rows where the old and new predictions disagree; summing the
# 0/1 flags counts them.
changed = prev["Transported"] ^ result["Transported"]
changed.astype(int).sum()

181

In [None]:
# Write the new predictions to submission.csv, replacing the file read in the
# previous cell; index=False leaves the row index out of the file.
result.to_csv("submission.csv", index=False)