In [None]:
!pip install -q kaggle 
import os
from pathlib import Path

# Locate the competition data: on Kaggle use the mounted input directory,
# otherwise use a local folder, downloading and unpacking it if it is missing.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/spaceship-titanic')
else:
    path = Path('spaceship-titanic')
    if not path.exists():
        # Imported lazily: only needed for the one-off local download.
        import zipfile,kaggle
        kaggle.api.competition_download_cli(str(path))
        # The context manager closes the archive handle once extraction is done.
        with zipfile.ZipFile(f'{path}.zip') as archive:
            archive.extractall(path)

In [None]:
!pip install -q fastai
from fastai.tabular.all import *

# Show floats with two decimals in DataFrame output.
# NOTE(review): `pd` is used here before the explicit `import pandas as pd` in a later cell,
# so it presumably comes from the fastai star import above — confirm.
pd.options.display.float_format = '{:.2f}'.format
# Fix the random seed so runs are repeatable.
set_seed(42)

In [None]:
import pandas as pd

In [None]:
# Load the training set and preview the first rows.
df = pd.read_csv(path/"train.csv")
df.head()

In [None]:
import numpy as np

In [None]:
def add_features(df):
    """Add engineered feature columns to `df` in place.

    New columns:
      * ``GroupId`` / ``GroupMember``: the two parts of ``PassengerId`` split on "_".
      * ``GroupSize``: number of rows sharing the same ``GroupId``.
      * ``Log<name>``: ``log1p`` of each of the five spending columns.
      * ``Deck`` / ``Room`` / ``Side``: the three parts of ``Cabin`` split on "/",
        with ``Room`` converted to nullable ``Int64``.
      * ``MoneySpent`` / ``LogMoneySpent``: sum of the five spending columns and
        its ``log1p``.

    The frame is modified in place; nothing is returned.
    """
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

    # Group information encoded in the passenger id.
    id_parts = df["PassengerId"].str.split("_")
    df["GroupId"] = id_parts.str[0]
    df["GroupMember"] = id_parts.str[1]
    df["GroupSize"] = df.groupby("GroupId")["GroupId"].transform("size")

    # Log-scaled copy of every spending column.
    for col in spend_cols:
        df[f"Log{col}"] = np.log1p(df[col])

    # Cabin is "<deck>/<room>/<side>"; split once and pick out the pieces.
    cabin_parts = df["Cabin"].str.split("/")
    df["Deck"] = cabin_parts.str[0]
    df["Room"] = cabin_parts.str[1].astype('Int64')
    df["Side"] = cabin_parts.str[2]

    # Plain left-to-right addition: a missing value in any spending column
    # makes the total missing as well.
    total = df[spend_cols[0]]
    for col in spend_cols[1:]:
        total = total + df[col]
    df["MoneySpent"] = total
    df["LogMoneySpent"] = np.log1p(df["MoneySpent"])
    
# Add the engineered columns to the training frame (modified in place), then preview it.
add_features(df)
df.head()

In [None]:
# Random train/validation split of the training frame, with a fixed seed.
splits = RandomSplitter(seed=42)(df)

In [None]:
# Build the tabular DataLoaders from the training frame: FillMissing, Categorify and
# Normalize are the preprocessing steps, and "Transported" is a categorical target.
dls = TabularPandas(
    df, splits=splits,
    procs = [ FillMissing, Categorify, Normalize],
    cat_names=["CryoSleep", "Deck", "Side", "VIP"],
    # Disabled alternative: wider categorical set.
#    cat_names=["HomePlanet","CryoSleep", "Deck", "Side", "Destination", "VIP"],
    # Disabled alternative: a single combined spending feature.
#    cont_names=["Age", "LogMoneySpent"],
    cont_names=["Age", "LogRoomService","LogFoodCourt","LogShoppingMall","LogSpa","LogVRDeck"],
    # Disabled alternative: same as the active set plus the cabin room number.
#    cont_names=["Room","Age","LogRoomService","LogFoodCourt","LogShoppingMall","LogSpa","LogVRDeck"],
    y_names="Transported", y_block = CategoryBlock(),
).dataloaders(path=".")

In [None]:
# Show the last rows of the feature frame held by the training DataLoader.
dls.train.xs.tail()

In [None]:
# Tabular learner with layers=[12,12], reporting accuracy.
learn = tabular_learner(dls, metrics=accuracy, layers=[12,12])

In [None]:
# Run the learning-rate finder, asking for the `slide` and `valley` suggestions.
learn.lr_find(suggest_funcs=(slide, valley)) 

In [None]:
# Train for 16 epochs with lr=0.04.
learn.fit(16, lr=0.04) 

In [None]:
# Load the test set and give it the same engineered columns as the training frame.
tst_df = pd.read_csv(path/'test.csv')
add_features(tst_df)

In [None]:
# Build a test DataLoader for the test frame from the learner's DataLoaders.
tst_dl = learn.dls.test_dl(tst_df)

In [None]:
# Predict on the test DataLoader.
# NOTE(review): `targs` is presumably empty for the unlabeled test set — confirm.
preds,targs = learn.get_preds(dl=tst_dl)

In [None]:
# Inspect the raw predictions.
preds

In [None]:
# Write the submission file. Spaceship Titanic expects the columns
# `PassengerId` and `Transported` (True/False); the previous `Survived`
# integer column does not match that format.
# NOTE(review): assumes column 1 of `preds` is the probability of the True
# class — confirm against `learn.dls.vocab`.
sub_df = tst_df[['PassengerId']].assign(Transported=(preds[:,1]>0.5).numpy())
sub_df.to_csv('sub.csv', index=False)

In [None]:
!head sub.csv

In [None]:
def ensemble():
    """Train one fresh tabular learner and return its predictions for `tst_dl`.

    A new learner is built on the notebook-level `dls` with layers=[10,10],
    fitted for 16 epochs at lr=0.04 with the progress bar and logging
    suppressed, and the first element returned by `get_preds` is handed back.
    """
    member = tabular_learner(dls, metrics=accuracy, layers=[10,10])
    with member.no_bar():
        with member.no_logging():
            member.fit(16, lr=0.04)
    member_output = member.get_preds(dl=tst_dl)
    return member_output[0]

In [None]:
# Collect test-set predictions from 5 separately trained models.
# Despite the name, `learns` holds what ensemble() returns (predictions), not Learner objects.
learns = [ensemble() for _ in range(5)]

In [None]:
# Sanity check: number of collected prediction sets.
len(learns)

In [None]:
# Stack the per-model predictions and average them over the new leading (model) axis.
ens_preds = torch.stack(learns).mean(0)

In [None]:
# Inspect the averaged predictions.
ens_preds

In [None]:
# Write the ensemble submission file. Spaceship Titanic expects the columns
# `PassengerId` and `Transported` (True/False); the previous `Survived`
# integer column does not match that format.
# NOTE(review): assumes column 1 of `ens_preds` is the probability of the True
# class — confirm against `learn.dls.vocab`.
sub_df = tst_df[['PassengerId']].assign(Transported=(ens_preds[:,1]>0.5).numpy())
sub_df.to_csv('ens_sub.csv', index=False)

# Binary Splits


In [None]:
# Column groups: categorical inputs, continuous inputs, and the dependent variable.
cats=["CryoSleep", "Deck", "Side", "VIP"]
conts=["Age", "LogRoomService","LogFoodCourt","LogShoppingMall","LogSpa","LogVRDeck"]
# NOTE(review): `dep` is a one-element list, not a bare column name.
dep=["Transported"]

In [None]:
# Store CryoSleep as a pandas Categorical.
df["CryoSleep"] = pd.Categorical(df.CryoSleep)
# TODO: this cell previously ended with an unfinished `df["Transported"] = ` assignment,
# a SyntaxError that stopped the whole cell (including the line above) from running.
# Decide which conversion of `Transported` was intended (possibly an integer cast) and add it here.

In [None]:
# Inspect the CryoSleep column.
df.CryoSleep

In [None]:
import seaborn as sns 

# seaborn needs a single column name for `y`. `dep` is defined as a one-element
# list in this notebook, and passing the list itself is read as vector data of
# the wrong length, so unwrap it (a plain string is accepted too).
target_col = dep[0] if isinstance(dep, (list, tuple)) else dep

# Left: mean of the target per CryoSleep value. Right: row count per CryoSleep value.
fig,axs = plt.subplots(1,2, figsize=(11,5))
sns.barplot(data=df, y=target_col, x="CryoSleep", ax=axs[0]).set(title="Transported rate")
sns.countplot(data=df, x="CryoSleep", ax=axs[1]).set(title="Histogram");