In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import sklearn
import numpy as np
import seaborn as sns
from fastai.tabular.all import *

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
#hide
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
df_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [4]:
df.head().T

In [5]:
df.shape

In [6]:
[print(f"{col:25}\t{df[col].dtype}\t{df[col].nunique()}") for col in df.columns];

In [7]:
# Data Cleaning/Feature engineering

In [8]:
# Passengerid
df[['Group', 'GroupNo']] = df['PassengerId'].str.split('_', 1, expand=True)
df_test[['Group', 'GroupNo']] = df_test['PassengerId'].str.split('_', 1, expand=True)
df[['Group', 'GroupNo', 'PassengerId']]

In [9]:
# HomePlanet
print(df['HomePlanet'].unique())
print(df['HomePlanet'].value_counts())

In [10]:
# Cabin
df[['deck', 'num', 'side']] = df['Cabin'].str.split('/', expand=True)
df_test[['deck', 'num', 'side']] = df_test['Cabin'].str.split('/', expand=True)
df[['deck', 'num', 'side', 'Cabin']]

In [11]:
# side
print(df['side'].unique())
print(df['side'].value_counts())

In [12]:
# Name
df[['FirstName','LastName']] = df['Name'].str.split(' ', expand=True)
df_test[['FirstName','LastName']] = df_test['Name'].str.split(' ', expand=True)
df[['FirstName','LastName', 'Name']]

In [13]:
dep_var = 'Transported'

In [14]:
cont,cat = cont_cat_split(df, 5, dep_var=dep_var)

In [15]:
procs = [Categorify, FillMissing, Normalize]

In [16]:
splits = RandomSplitter(valid_pct=0.2, seed=0)(range_of(df))

In [17]:
to = TabularPandas(df, procs, cat, cont, dep_var, splits=splits)
to_test = TabularPandas(df_test, procs, cat, cont, y_names=None)

In [18]:
to.all_cols.dtypes

In [19]:
xs,y = to.train.xs,to.train.y
valid_xs,valid_y = to.valid.xs,to.valid.y
len(to.train),len(to.valid)

In [20]:
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

In [21]:
def m_acc(m, xs, y): return sklearn.metrics.accuracy_score(y, m.predict(xs))

In [22]:
xgbc = XGBClassifier(eval_metric='mlogloss').fit(xs, y)
m_acc(xgbc, xs, y), m_acc(xgbc, valid_xs, valid_y) 

In [23]:
# Feature importance

fig, ax = plt.subplots(figsize=(5, 10))
plot_importance(xgbc, ax=ax, max_num_features=30).name

In [24]:
d = xgbc.get_booster().get_score()
sorted(d.items(), key=lambda x: x[1], reverse=True)

In [25]:
to_keep = ['Name', 'PassengerId', 'LastName', 'Cabin', 'num', 'Age', 'FoodCourt', 'VRDeck',
          'Spa', 'ShoppingMall', 'RoomService', 'side', 'Destination', 'GroupNo', 'HomePlanet',
          'CryoSleep', 'VRDeck_na']

xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]

In [26]:
xgbc = XGBClassifier(eval_metric='mlogloss').fit(xs_imp, y)
m_acc(xgbc, xs_imp, y), m_acc(xgbc, valid_xs_imp, valid_y) 

In [27]:
xs_test = to_test.train.xs[to_keep]
preds = xgbc.predict(xs_test)
my_submission = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Transported': preds.astype(bool)})
my_submission.to_csv('submission_xgb.csv', index=False)

In [28]:
!pip install -Uqq fastbook
import fastbook
from fastbook import *

In [29]:
cluster_columns(xs_imp)

In [30]:
# Neural Network

dls = to.dataloaders(bs=64)

In [31]:
dls.show_batch()

In [32]:
learn = tabular_learner(dls, layers=[300,100], metrics=accuracy)

In [33]:
learn.lr_find()

In [34]:
learn.fit(1, 1e-2)

In [35]:
learn.show_results()

In [36]:
dl = learn.dls.test_dl(df_test)

In [37]:
x, y, z = learn.get_preds(dl=dl, with_decoded=True)

In [38]:
my_submission = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Transported': z.numpy().astype(bool)})
my_submission.to_csv('submission_nn.csv', index=False)

# Results

| Model | Train Accuracy | Valid Accuracy | Test Accuracy |
|--|--|--|--|
| XGBoost | 0.97 | 0.79 | 0.77601|
| NN | 0.767549 |  | 0.77227|