# Spaceship Titanic 

In [1]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import LabelEncoder


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/sample_submission.csv')

In [3]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
train_data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [5]:
def rename_columns(df):
    df.rename(columns={
        'PassengerId': 'passenger_id',
        'HomePlanet': 'home_planet',
        'CryoSleep':'cryo_sleep',
        'Cabin': 'cabin',
        'Destination': 'destination',
        'Age': 'age',
        'VIP': 'vip',
        'RoomService': 'room_service',
        'FoodCourt': 'food_court',
        'ShoppingMall':'shopping_mall',
        'Spa':'spa',
        'VRDeck': 'vr_deck',
        'Name': 'name',
        'Transported': 'transported'
    }, inplace=True)
    
    df.drop(columns=['name', 'passenger_id'], inplace=True)
    
    return df    

In [6]:
train_data = rename_columns(train_data)
test_data = rename_columns(test_data)

In [7]:
train_data.head()

Unnamed: 0,home_planet,cryo_sleep,cabin,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [8]:
train_data.shape

(8693, 12)

In [9]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   home_planet    8492 non-null   object 
 1   cryo_sleep     8476 non-null   object 
 2   cabin          8494 non-null   object 
 3   destination    8511 non-null   object 
 4   age            8514 non-null   float64
 5   vip            8490 non-null   object 
 6   room_service   8512 non-null   float64
 7   food_court     8510 non-null   float64
 8   shopping_mall  8485 non-null   float64
 9   spa            8510 non-null   float64
 10  vr_deck        8505 non-null   float64
 11  transported    8693 non-null   bool   
dtypes: bool(1), float64(6), object(5)
memory usage: 755.7+ KB


In [10]:
train_data.isna().sum()

home_planet      201
cryo_sleep       217
cabin            199
destination      182
age              179
vip              203
room_service     181
food_court       183
shopping_mall    208
spa              183
vr_deck          188
transported        0
dtype: int64

In [11]:
test_data.isna().sum()

home_planet       87
cryo_sleep        93
cabin            100
destination       92
age               91
vip               93
room_service      82
food_court       106
shopping_mall     98
spa              101
vr_deck           80
dtype: int64

In [12]:
def clean_data(train, test):
    for df in [train, test]:
        df['home_planet'].fillna(df['home_planet'].mode()[0], inplace=True)
        df['cryo_sleep'].fillna(df['cryo_sleep'].mode()[0], inplace=True)
        df['cabin'].fillna(df['cabin'].mode()[0], inplace=True)
        df['destination'].fillna(df['destination'].mode()[0], inplace=True)
        df['vip'].fillna(df['vip'].mode()[0], inplace=True)
        
        df['age'].fillna(df['age'].median(), inplace=True)
        df['room_service'].fillna(df['room_service'].median(), inplace=True)
        df['food_court'].fillna(df['food_court'].median(), inplace=True)
        df['shopping_mall'].fillna(df['shopping_mall'].median(), inplace=True)
        df['spa'].fillna(df['spa'].median(), inplace=True)
        df['vr_deck'].fillna(df['vr_deck'].median(), inplace=True)
        
        df[['deck', 'num', 'side']] = df['cabin'].str.split('/', expand=True)
        df.drop(columns=['cabin'], inplace=True)
        
        df['home_planet'] = le.fit_transform(df['home_planet'].values)
        df['cryo_sleep'] = le.fit_transform(df['cryo_sleep'].values)
        df['destination'] = le.fit_transform(df['destination'].values)
        df['vip'] = le.fit_transform(df['vip'].values)
        df['deck'] = le.fit_transform(df['deck'].values)
        df['num'] = le.fit_transform(df['num'].values)
        df['side'] = le.fit_transform(df['side'].values)
        
        df['age'] = df['age'].astype(int)
        df['room_service'] = df['room_service'].astype(int)
        df['food_court'] = df['food_court'].astype(int)
        df['shopping_mall'] = df['shopping_mall'].astype(int)
        df['spa'] = df['spa'].astype(int)
        df['vr_deck'] = df['vr_deck'].astype(int)
    
    train = pd.get_dummies(train, columns=["home_planet"])
    test = pd.get_dummies(test, columns=["home_planet"])
    
    return train, test
        

In [13]:
le = LabelEncoder()
train_df, test_df = clean_data(train_data, test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['home_planet'].fillna(df['home_planet'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cryo_sleep'].fillna(df['cryo_sleep'].mode()[0], inplace=True)
  df['cryo_sleep'].fillna(df['cryo_sleep'].mode()[0], inplace=True)
The behavior will change in pa