In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from category_encoders import BinaryEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
# Read the competition's training and test CSVs from the shared input folder.
DATA_DIR = '/kaggle/input/spaceship-titanic'
df = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [3]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Summarise each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
# Number of missing values in each column.
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
# Distinct-value counts of the text columns (a missing value counts as one value).
raw_counts = {
    col: len(df[col].unique())
    for col in ('HomePlanet', 'Cabin', 'Destination', 'Name')
}
print(
    f"HomePlanet: {raw_counts['HomePlanet']} \n"
    f"Cabin: {raw_counts['Cabin']}\n"
    f"Destination:{raw_counts['Destination']}\n"
    f"Name:{raw_counts['Name']}"
)

HomePlanet: 4 
Cabin: 6561
Destination:4
Name:8474


## Strategy

- Split cabin on '/'
- Split destination on '-'
- Split name into first, last
- Encode and Impute accordingly

In [7]:
# Break the composite text columns into their parts so each part can be
# encoded on its own. `n` caps the number of splits: without it, a value with
# an extra delimiter (for example a name with three words) would produce more
# pieces than there are target columns and the assignment would raise.
df[['Cabin1', 'Cabin2', 'Cabin3']] = df['Cabin'].str.split('/', n=2, expand=True)
df[['Destination1', 'Destination2']] = df['Destination'].str.split('-', n=1, expand=True)
df[['FirstName', 'LastName']] = df['Name'].str.split(' ', n=1, expand=True)

# The source columns are no longer needed once their parts exist.
df = df.drop(columns=['Cabin', 'Destination', 'Name'])

df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin1,Cabin2,Cabin3,Destination1,Destination2,FirstName,LastName
0,0001_01,Europa,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P,TRAPPIST,1e,Maham,Ofracculy
1,0002_01,Earth,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S,TRAPPIST,1e,Juanna,Vines
2,0003_01,Europa,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S,TRAPPIST,1e,Altark,Susent
3,0003_02,Europa,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S,TRAPPIST,1e,Solam,Susent
4,0004_01,Earth,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S,TRAPPIST,1e,Willy,Santantines


In [8]:
# Distinct-value counts of the columns produced by the splits above.
part_counts = {
    col: len(df[col].unique())
    for col in ('Cabin1', 'Cabin2', 'Cabin3', 'Destination1', 'Destination2', 'FirstName', 'LastName')
}
print(
    f"Cabin1: {part_counts['Cabin1']} \n"
    f"Cabin2: {part_counts['Cabin2']}\n"
    f"Cabin3: {part_counts['Cabin3']}\n"
    f"Destination1: {part_counts['Destination1']}\n"
    f"Destination2: {part_counts['Destination2']}\n"
    f"FirstName: {part_counts['FirstName']}\n"
    f"LastName: {part_counts['LastName']}"
)

Cabin1: 9 
Cabin2: 1818
Cabin3: 3
Destination1: 4
Destination2: 4
FirstName: 2707
LastName: 2218


In [9]:
# Binary-encode the categorical columns; each listed column is replaced by a
# group of `<name>_<i>` columns (see the column listing printed below).
binary_cols = [
    'FirstName', 'LastName',
    'Cabin1', 'Cabin3',
    'Destination1', 'Destination2',
    'VIP', 'HomePlanet',
]
encoder = BinaryEncoder(cols=binary_cols, return_df=True)
df = encoder.fit_transform(df)

# Only the first VIP bit is kept as a feature.
df = df.drop(columns=['VIP_1'])

df.columns

Index(['PassengerId', 'HomePlanet_0', 'HomePlanet_1', 'HomePlanet_2',
       'CryoSleep', 'Age', 'VIP_0', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Transported', 'Cabin1_0', 'Cabin1_1', 'Cabin1_2',
       'Cabin1_3', 'Cabin2', 'Cabin3_0', 'Cabin3_1', 'Destination1_0',
       'Destination1_1', 'Destination1_2', 'Destination2_0', 'Destination2_1',
       'FirstName_0', 'FirstName_1', 'FirstName_2', 'FirstName_3',
       'FirstName_4', 'FirstName_5', 'FirstName_6', 'FirstName_7',
       'FirstName_8', 'FirstName_9', 'FirstName_10', 'FirstName_11',
       'LastName_0', 'LastName_1', 'LastName_2', 'LastName_3', 'LastName_4',
       'LastName_5', 'LastName_6', 'LastName_7', 'LastName_8', 'LastName_9',
       'LastName_10', 'LastName_11'],
      dtype='object')

In [10]:
# Columns intended for standard scaling.
scaled_cols = ['Age', 'RoomService', 'FoodCourt']

# Target as 0/1 integers; every remaining column is a feature.
y = df['Transported'].astype(int)
X = df.drop(columns=['Transported'])

# Hold out 30% of the rows for evaluation, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [11]:
# Unused alternative: search space for a pipeline whose final step is a
# LogisticRegression named 'logreg'.
# params = {
#     'impute__n_neighbors': [3, 5, 7], 
#     'impute__weights': ['uniform', 'distance'],
#     'logreg__C': [0.01, 0.1, 1, 10, 100],
#     'logreg__solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],
#     'logreg__max_iter': [100, 200, 300, 400, 500, 600],
#     'logreg__penalty': ['l1', 'l2', 'none'],
# }

# Search space for the randomized search; keys are '<pipeline step>__<parameter>'.
params = {
    'impute__n_neighbors': [3, 5, 7],
    'impute__weights': ['uniform', 'distance'],
    'xgb__n_estimators': [100, 200, 300],
    'xgb__max_depth': [3, 4, 5, 6],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__subsample': [0.7, 0.8, 0.9],
    'xgb__colsample_bytree': [0.7, 0.8, 0.9],
}

# The scaler runs after the imputer and therefore addresses columns by
# position. Derive those positions from `scaled_cols` instead of hard-coding
# them, so they stay correct if the feature layout changes.
scaled_idx = [X_train.columns.get_loc(col) for col in scaled_cols]

col_transformer = ColumnTransformer(
    [('scaler', StandardScaler(), scaled_idx)],
    remainder='passthrough',
)

# Impute -> scale the selected columns -> gradient-boosted classifier.
pipeline = Pipeline([
    ('impute', KNNImputer()),
    ('ct', col_transformer),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# Seed the search so the same candidates are sampled on every run; without it
# the selected model and the metrics below change between executions.
clf = RandomizedSearchCV(pipeline, params, cv=5, verbose=0, random_state=42)

# fit() returns the fitted search object itself, so `best_model` is `clf`.
best_model = clf.fit(X_train, Y_train)

# Hard predictions and positive-class probabilities on the held-out split.
Y_pred = best_model.predict(X_test)

Y_proba = best_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

precision = precision_score(Y_test, Y_pred)
print(f"Precision: {precision}")

recall = recall_score(Y_test, Y_pred)
print(f"Recall: {recall}")

f1 = f1_score(Y_test, Y_pred)
print(f"F1 Score: {f1}")

# ROC-AUC is computed from probabilities rather than hard labels.
roc_auc = roc_auc_score(Y_test, Y_proba)
print(f"ROC-AUC Score: {roc_auc}")


Accuracy: 0.7960122699386503
Precision: 0.7904059040590405
Recall: 0.8119787717968158
F1 Score: 0.8010471204188481
ROC-AUC Score: 0.8897506221359835


In [12]:
# Apply the same column splits that were applied to the training frame.
# `n` caps the number of pieces so a value with an extra delimiter cannot
# produce more columns than are being assigned.
df_test[['Cabin1', 'Cabin2', 'Cabin3']] = df_test['Cabin'].str.split('/', n=2, expand=True)
df_test[['Destination1', 'Destination2']] = df_test['Destination'].str.split('-', n=1, expand=True)
df_test[['FirstName', 'LastName']] = df_test['Name'].str.split(' ', n=1, expand=True)
df_test = df_test.drop(columns=['Cabin', 'Destination', 'Name'])

# Reuse the encoder that was fitted on the training frame. Fitting a fresh
# encoder on the test data would assign different codes to the same category,
# so the model would receive features that do not mean what they meant during
# training (and the number of encoded columns could differ as well).
#
# That encoder was fitted on a frame that still contained 'Transported', and
# it validates the number of input columns, so a placeholder is inserted at
# the position the target had in the training frame and removed afterwards.
df_test.insert(df_test.columns.get_loc('Cabin1'), 'Transported', False)
df_test = encoder.transform(df_test).drop(columns=['VIP_1', 'Transported'])

# Enforce exactly the feature columns, in the order, the model was trained on;
# this raises a KeyError instead of silently predicting on misaligned data.
df_test = df_test[X.columns]

preds = best_model.predict(df_test)

In [13]:
# The model predicts 0/1; the submission uses booleans.
preds = preds.astype(bool)
df_test['Transported'] = preds

# Keep only the identifier and the prediction, then write the submission file.
submission_df = df_test.loc[:, ['PassengerId', 'Transported']]
submission_df.to_csv('submission.csv', index=False)