In [2]:
import pandas as pd

### Load Raw

In [3]:
# Read the training set; `raw_df` stays as the untouched reference frame.
raw_df = pd.read_csv('./data/train.csv')
# Work on an independent copy so that in-place edits to `data_df` (e.g. adding
# columns) can never leak back into `raw_df`.
data_df = raw_df.copy()

### Rename Columns

In [4]:
# Prefix each amenity spending column with "spend_" so they group together.
data_df = data_df.rename(
    columns={
        amenity: f"spend_{amenity}"
        for amenity in ("RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck")
    }
)

In [5]:
# Preview the first rows to check the renamed spend_* columns.
data_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,spend_RoomService,spend_FoodCourt,spend_ShoppingMall,spend_Spa,spend_VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Cleaning

## Break-ups

There are a few compound features that would be better served broken down:

**PassengerId** 


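`gggg_pp` - `gggg` is the group number and `pp` is the passenger's number within that group (stored below as `group_id` and `group_num`)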

**Cabin** 

`deck/num/side` - `side` is either `P` (port) or `S` (starboard)




In [6]:
# Split `PassengerId` once with the vectorised string accessor instead of
# calling `x.split('_')` twice per row. The piece before the underscore goes
# to `group_id`, the piece after it to `group_num`.
# Unlike the per-row lambda, `.str` yields NaN (rather than raising) for a
# missing id or an id that has no underscore.
passenger_id_parts = data_df['PassengerId'].str.split('_')
data_df['group_id'] = passenger_id_parts.str[0]
data_df['group_num'] = passenger_id_parts.str[1]

In [7]:
# Preview the first rows, now including the group_id / group_num columns.
data_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,spend_RoomService,spend_FoodCourt,spend_ShoppingMall,spend_Spa,spend_VRDeck,Name,Transported,group_id,group_num
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,1
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,1


### Cabin is tricky

The `Cabin` column holds many distinct values and also contains nulls. Let's explore.

In [8]:
# Frequency of each distinct Cabin value.
data_df["Cabin"].value_counts()

G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: Cabin, Length: 6560, dtype: int64

## LEARN: How to tell string values

There should be an easier way to find which values in a feature are strings than writing a type check by hand.

In [9]:
def is_str(value):
    """Return True when `value` is a string, False for anything else.

    None and float NaN are not strings, so both yield False. `isinstance` is
    used instead of an exact type comparison so that `str` subclasses are
    recognised as strings too.
    """
    return isinstance(value, str)

# Flag every Cabin entry as string / non-string, then tally both groups.
cabin_is_string_ser = data_df["Cabin"].apply(is_str)
cabin_is_string_ser.value_counts()

True     8494
False     199
Name: Cabin, dtype: int64

In [10]:
# Rows whose Cabin value is not a string. `~` negates the boolean mask, which
# is the idiomatic form of comparing the mask with `== False`.
data_df[~cabin_is_string_ser]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,spend_RoomService,spend_FoodCourt,spend_ShoppingMall,spend_Spa,spend_VRDeck,Name,Transported,group_id,group_num
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False,0012,01
93,0101_01,Mars,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Book Trad,True,0101,01
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False,0110,01
222,0239_01,Mars,False,,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty,False,0239,01
227,0244_01,Mars,True,,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad,True,0244,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,8772_02,Europa,False,,55 Cancri e,53.0,False,0.0,1127.0,0.0,3939.0,,Naosura Motled,False,8772,02
8475,9057_01,Europa,False,,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,Coxan Statch,False,9057,01
8485,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Bath Brakeng,True,9069,03
8509,9081_03,Earth,True,,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Beula Clemondsey,False,9081,03


In [11]:
# Rows where Cabin is null, selected with pandas' own missing-value test.
data_df.loc[data_df["Cabin"].isna()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,spend_RoomService,spend_FoodCourt,spend_ShoppingMall,spend_Spa,spend_VRDeck,Name,Transported,group_id,group_num
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False,0012,01
93,0101_01,Mars,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Book Trad,True,0101,01
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False,0110,01
222,0239_01,Mars,False,,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty,False,0239,01
227,0244_01,Mars,True,,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad,True,0244,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,8772_02,Europa,False,,55 Cancri e,53.0,False,0.0,1127.0,0.0,3939.0,,Naosura Motled,False,8772,02
8475,9057_01,Europa,False,,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,Coxan Statch,False,9057,01
8485,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Bath Brakeng,True,9069,03
8509,9081_03,Earth,True,,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Beula Clemondsey,False,9081,03


In [12]:
def cabin_parts(cabin):
    """Split a cabin label on "/" into its pieces.

    Args:
        cabin: A cabin label such as "B/0/P", or a non-string value
            (e.g. None or NaN).

    Returns:
        The list produced by ``cabin.split("/")``, or None when `cabin` is
        not a string.
    """
    # One isinstance check covers None and NaN, and also accepts str subclasses.
    if not isinstance(cabin, str):
        return None
    return cabin.split("/")
def get_part(cabin, index):
    """Return the piece at position `index` of a three-part cabin label.

    Splitting is delegated to `cabin_parts`. The result is None when that
    helper returns None or when the label does not split into exactly three
    pieces.
    """
    pieces = cabin_parts(cabin)
    if pieces is not None and len(pieces) == 3:
        return pieces[index]
    return None

# Break Cabin ("deck/num/side") into one column per piece; `args` supplies the
# piece position that `get_part` receives after each Cabin value.
data_df["cabin_deck"] = data_df["Cabin"].apply(get_part, args=(0,))
data_df["cabin_num"] = data_df["Cabin"].apply(get_part, args=(1,))
data_df["cabin_side"] = data_df["Cabin"].apply(get_part, args=(2,))

## Spending Totals

Let's combine the individual spending columns into totals to see whether there are "types" of spending that correlate.

In [13]:
# Column groups: every spend column, plus a "high" and a "low" subset.
spend_cols = [
    "spend_FoodCourt",
    "spend_RoomService",
    "spend_ShoppingMall",
    "spend_Spa",
    "spend_VRDeck",
]
spend_high_cols = ["spend_RoomService", "spend_Spa", "spend_VRDeck"]
spend_low_cols = ["spend_FoodCourt", "spend_ShoppingMall"]

# Row-wise totals per group. `sum` skips NaN by default, so a missing amount
# contributes nothing to its row's total.
data_df["spend_total"] = data_df.loc[:, spend_cols].sum(axis="columns")
data_df["spend_high_total"] = data_df.loc[:, spend_high_cols].sum(axis="columns")
data_df["spend_low_total"] = data_df.loc[:, spend_low_cols].sum(axis="columns")

In [14]:
# Preview the first rows with the new cabin_* and spend_*_total columns.
data_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,spend_RoomService,spend_FoodCourt,spend_ShoppingMall,...,Name,Transported,group_id,group_num,cabin_deck,cabin_num,cabin_side,spend_total,spend_high_total,spend_low_total
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,Maham Ofracculy,False,1,1,B,0,P,0.0,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,Juanna Vines,True,2,1,F,0,S,736.0,702.0,34.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,Altark Susent,False,3,1,A,0,S,10383.0,6807.0,3576.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,Solam Susent,False,3,2,A,0,S,5176.0,3522.0,1654.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,Willy Santantines,True,4,1,F,1,S,1091.0,870.0,221.0
