In [134]:
import pandas as pd
import numpy as np
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2 
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import chi2_contingency, uniform, randint
import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier 

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [135]:
train_df[["id1", "id2"]] = train_df["PassengerId"].str.split("_", expand=True)

In [136]:
train_df[["firstName", "LastName"]] = train_df["Name"].str.split(" ", expand=True)
test_df[["firstName", "LastName"]] = test_df["Name"].str.split(" ", expand=True)

In [138]:
common_lastname = train_df["LastName"].value_counts().head(20)
mask = test_df["LastName"].isin(common_lastname.index)

mask.value_counts()

False    4245
True       32
Name: LastName, dtype: int64

In [131]:
train_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,id1,id2,firstName,lastName,LastName
183,0205_01,Earth,False,G/34/S,TRAPPIST-1e,29.0,False,0.0,2.0,66.0,0.0,705.0,Carma Oneiles,False,0205,01,Carma,Oneiles,Oneiles
193,0216_01,Earth,False,F/40/S,55 Cancri e,21.0,False,0.0,56.0,0.0,0.0,1918.0,Searla Garnes,False,0216,01,Searla,Garnes,Garnes
398,0437_01,Earth,False,F/78/S,TRAPPIST-1e,39.0,False,382.0,0.0,22.0,467.0,0.0,Lynnee Oneiles,False,0437,01,Lynnee,Oneiles,Oneiles
399,0437_02,Earth,False,F/78/S,PSO J318.5-22,40.0,False,7.0,0.0,840.0,26.0,0.0,Debora Oneiles,False,0437,02,Debora,Oneiles,Oneiles
645,0678_01,Earth,True,G/103/S,TRAPPIST-1e,37.0,False,0.0,0.0,0.0,0.0,0.0,Bettie Moodman,False,0678,01,Bettie,Moodman,Moodman
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8647,9227_01,Earth,True,G/1498/P,TRAPPIST-1e,7.0,False,0.0,0.0,0.0,0.0,0.0,Glendy Hinglendez,False,9227,01,Glendy,Hinglendez,Hinglendez
8648,9227_02,Earth,True,G/1498/P,PSO J318.5-22,11.0,False,0.0,0.0,0.0,0.0,0.0,Jorgie Hinglendez,True,9227,02,Jorgie,Hinglendez,Hinglendez
8649,9227_03,Earth,True,G/1498/P,PSO J318.5-22,1.0,False,0.0,0.0,0.0,0.0,0.0,Paulas Hinglendez,True,9227,03,Paulas,Hinglendez,Hinglendez
8650,9227_04,Earth,True,G/1498/P,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0,Robyny Hinglendez,True,9227,04,Robyny,Hinglendez,Hinglendez


In [41]:
def preprocessing(df):
    # Replace null values with mode
    df = df.fillna(df.mode().iloc[0])

    # Split Cabin String to 3 Parts
    df[["Deck", "Num", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df["Num"] = df["Num"].astype(int)

    # One Hot Encode Catergorical Columns
    df = pd.get_dummies(df, columns=["HomePlanet"])
    df = pd.get_dummies(df, columns=["Destination"])
    df = pd.get_dummies(df, columns=["Side"])
    df = pd.get_dummies(df, columns=["Deck"])

    # Map True/False Columns to 1/0
    df["CryoSleep"] = df["CryoSleep"].astype(int)
    df["VIP"] = df["VIP"].astype(int)
    if "Transported" in df.columns:
        df["Transported"] = df["Transported"].astype(int)

    # # Log Scaling on Numerical Columns
    constant = 1
    df[["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Num"]] = np.log(df[["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Num"]] + constant)

    # Drop Irrelevant Columns
    df = df.drop(["PassengerId", "Cabin", "Name"], axis=1)

    return df

In [42]:
train_df = preprocessing(train_df)



In [83]:
selected_features = ['CryoSleep', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'HomePlanet_Earth',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Side_P',
       'Side_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F',
       'Deck_G', 'Deck_T']

In [84]:
X = train_df[selected_features]
y = train_df["Transported"]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define the neural network architecture
model = keras.Sequential([
    keras.layers.Input(shape=(len(selected_features),)),  # Replace num_features with the actual number of features
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
])

early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    patience=5,           # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored metric
)

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
epoch_size = [10, 15, 20, 25, 30]
batch_size = [4, 8, 16]
min_loss = 1
max_accuracy = 0
optimal_epochs = 0
optimal_batch_size = 0
for i in range(5):
    for j in range(3):
        e = epoch_size[i]
        b = batch_size[j]
        model.fit(X_train, y_train, epochs=e, batch_size=b, validation_data=(X_val, y_val), callbacks=[early_stopping])

        test_loss, test_accuracy = model.evaluate(X_test, y_test)
        if test_loss < min_loss:
            optimal_epochs = e
            optimal_batch_size = b

model.fit(X_train, y_train, epochs=optimal_epochs, batch_size=optimal_batch_size, validation_data=(X_val, y_val), callbacks=[early_stopping])
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test Accuracy: {test_accuracy}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

In [78]:
testing = pd.read_csv("test.csv")

test_input = preprocessing(testing)

predictions = model.predict(test_input[selected_features])

result = pd.DataFrame()
result["PassengerId"] = testing["PassengerId"]
result["Transported"] = predictions > 0.5

result.to_csv("submission.csv", index=False)



In [85]:
model.evaluate(X_train, y_train)



[0.34597334265708923, 0.8364970088005066]