In [66]:
# Import Libraries 
import os
import pandas as pd
import numpy as np
from IPython.display import clear_output

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# import lazypredict
# from lazypredict.Supervised import LazyClassifier

from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [51]:
# read in data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [52]:
# drop columns that are not needed
train.drop(['PassengerId'], axis=1, inplace=True)
PassengerId = test['PassengerId']
test.drop(['PassengerId'], axis=1, inplace=True)

In [53]:
# split into X and y
y = train['Transported']
train.drop(['Transported'], axis=1, inplace=True)

labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

In [54]:
# Drop columns that are not needed
train.drop(['Name'], axis=1, inplace=True)
test.drop(['Name'], axis=1, inplace=True)

In [55]:
# Impute missing values of Cabin with 0
train.Cabin.fillna(0, inplace=True)
test.Cabin.fillna(0, inplace=True)

# Separate the cabin number into 3 coloumns
train['Cabin_Type'] = train.Cabin.str[0]
train['Cabin_Number'] = train.Cabin.str[2]
train['Cabin_Side'] = train.Cabin.str[4]
train.drop(['Cabin'], axis=1, inplace=True)

test['Cabin_Type'] = test.Cabin.str[0]
test['Cabin_Number'] = test.Cabin.str[2]
test['Cabin_Side'] = test.Cabin.str[4]
test.drop(['Cabin'], axis=1, inplace=True)

# impute missing values with 0
train['Cabin_Type'] = train['Cabin_Type'].fillna('0')
train['Cabin_Number'] = train['Cabin_Number'].fillna('0')
train['Cabin_Side'] = train['Cabin_Side'].fillna('0')

# Replace / with 0
train['Cabin_Side'] = train['Cabin_Side'].str.replace('/', '0')
test['Cabin_Side'] = test['Cabin_Side'].str.replace('/', '0')

test['Cabin_Type'] = test['Cabin_Type'].fillna('0')
test['Cabin_Number'] = test['Cabin_Number'].fillna('0')
test['Cabin_Side'] = test['Cabin_Side'].fillna('0')

In [56]:
# Converting string columns to numeric
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_Type','Cabin_Number', 'Cabin_Side']
labelencoder = LabelEncoder()

for feature in categorical_features:
    train[feature] = train[feature].astype(str)
    test[feature] = test[feature].astype(str)

    labelencoder.fit(train[feature])

    train[feature] = labelencoder.transform(train[feature])
    test[feature] = labelencoder.transform(test[feature])

    train[feature].fillna(train[feature].mean(), inplace = True)
    test[feature].fillna(train[feature].mean(), inplace = True)

# impute missing values
numerical_features = ['Age','RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for feature in numerical_features:
    train[feature].fillna(train[feature].mean(), inplace = True)
    test[feature].fillna(test[feature].mean(), inplace = True)

In [57]:
# pipeline for categorical variables
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
numerical_features = ['Age','RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [58]:
# Use the full pipeline to transform both training and test sets
preprocessor.fit(train)
train = preprocessor.transform(train)
test = preprocessor.transform(test)

In [59]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
train = sc.fit_transform(train)
test = sc.transform(test)

In [60]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

In [61]:
# # Testing (using Lazypredict-LazyClassifier)
# clf = LazyClassifier(verbose=0,
#                      ignore_warnings=True,
#                      custom_metric=None,
#                      predictions=False,
#                      random_state=12,
#                      classifiers='all')

# models, predictions = clf.fit(X_train , X_test , y_train , y_test)
# clear_output()

In [79]:
# Define models
models = [
    LGBMClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    xgb.XGBClassifier(),
    lgb.LGBMClassifier(),
    CatBoostClassifier()
]

In [80]:
# Evaluate models
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Model: %s" % model.__class__.__name__)
    print("Accuracy: %.2f" % accuracy_score(y_test, y_pred))
    print("Precision: %.2f" % precision_score(y_test, y_pred))
    print("Recall: %.2f" % recall_score(y_test, y_pred))
    print("F1: %.2f" % f1_score(y_test, y_pred))
    print("\n")


Model: LGBMClassifier
Accuracy: 0.79
Precision: 0.77
Recall: 0.82
F1: 0.79


Model: RandomForestClassifier
Accuracy: 0.77
Precision: 0.78
Recall: 0.77
F1: 0.77


Model: GradientBoostingClassifier
Accuracy: 0.78
Precision: 0.76
Recall: 0.83
F1: 0.79


Model: KNeighborsClassifier
Accuracy: 0.76
Precision: 0.76
Recall: 0.78
F1: 0.77


Model: DecisionTreeClassifier
Accuracy: 0.72
Precision: 0.71
Recall: 0.76
F1: 0.73


Model: XGBClassifier
Accuracy: 0.78
Precision: 0.77
Recall: 0.80
F1: 0.79


Model: LGBMClassifier
Accuracy: 0.79
Precision: 0.77
Recall: 0.82
F1: 0.79


Learning rate set to 0.023581
0:	learn: 0.6815697	total: 4.31ms	remaining: 4.3s
1:	learn: 0.6730431	total: 9.58ms	remaining: 4.78s
2:	learn: 0.6618443	total: 15.2ms	remaining: 5.06s
3:	learn: 0.6531285	total: 19.4ms	remaining: 4.82s
4:	learn: 0.6424625	total: 23.9ms	remaining: 4.75s
5:	learn: 0.6351428	total: 28.1ms	remaining: 4.65s
6:	learn: 0.6261172	total: 33.4ms	remaining: 4.74s
7:	learn: 0.6175793	total: 38.5ms	remainin

In [87]:
# Final model
final_model = models[-1]

In [None]:
# make final predictions
final_model.fit(train, y)
final_predictions = final_model.predict(test)

# convert predictions to boolean
final_predictions = np.array(final_predictions, dtype = bool)

In [90]:
# save results to file
results = pd.DataFrame({'PassengerId': PassengerId, 'Transported': final_predictions})

filename = "submission.csv"

if os.path.exists(filename):
  os.remove(filename)
results.to_csv(filename, index=False,header=True, mode='w')