# Data preparation

## Setup

In [1]:
# Record the operating system and interpreter this notebook was executed with.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
# Load the raw train/test CSV files (paths are relative to the notebook's working directory).
train_raw = pd.read_csv('../data/raw/train.csv')
test_raw = pd.read_csv('../data/raw/test.csv')

print(f'Raw train dataset has {train_raw.shape[0]} observations of {train_raw.shape[1]} variables.')
print(f'Raw test dataset has {test_raw.shape[0]} observations of {test_raw.shape[1]} variables.')

# Fixed seed so the previewed rows are the same on every Restart & Run All.
train_raw.sample(5, random_state=42)

Raw train dataset has 8693 observations of 14 variables.
Raw test dataset has 4277 observations of 13 variables.


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
5792,6137_01,Earth,False,G/994/S,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,0.0,0.0,Camily Serry,True
2662,2854_01,Earth,False,F/591/P,55 Cancri e,14.0,False,0.0,,14.0,692.0,809.0,Phia Buckentry,False
1757,1865_06,,False,G/292/S,TRAPPIST-1e,6.0,False,0.0,0.0,0.0,0.0,0.0,Racyle Roses,False
1504,1596_01,Europa,True,B/54/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Izarfik Replipent,True
1047,1115_01,Earth,True,G/173/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Arl Oneidson,True


Variables:

* PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
* HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
* CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
* Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
* Destination - The planet where the passenger will disembark.
* Age - The age of the passenger.
* VIP - Whether the passenger has paid for special VIP service during the voyage.
* RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
* Name - The first and last names of the passenger.
* Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

## Quick exploration

In [4]:
# Column dtypes and non-null counts: a first look at which columns have missing values.
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
# Target column and candidate features; 'Name' and the target itself are not features.
TARGET = 'Transported'
FEATURES = [col for col in train_raw.columns if col != 'Name' and col != TARGET]

# Partition the features by dtype: numeric columns vs. everything else.
numerical = train_raw.loc[:, FEATURES].select_dtypes(include=[np.number]).columns
categorical = train_raw.loc[:, FEATURES].select_dtypes(exclude=[np.number]).columns

print(f'Target: {TARGET}')
print(f'Features:\n\tnumerical: {numerical.to_list()}\n\tcategorical:{categorical.to_list()}')
print(f'Shapes:\n\ttrain: {train_raw.shape}\n\ttest: {test_raw.shape}')

Target: Transported
Features:
	numerical: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
	categorical:['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
Shapes:
	train: (8693, 14)
	test: (4277, 13)


In [6]:
# Number of fully duplicated rows in each raw dataset, as (train, test).
train_raw.duplicated().sum(), test_raw.duplicated().sum()

(0, 0)

In [7]:
# Missing values per column in the training data, most affected columns first.
train_raw.isna().sum().sort_values(ascending=False)

CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Name            200
Cabin           199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
PassengerId       0
Transported       0
dtype: int64

## Feature engineering

In [8]:
def preprocess(d):
    """Engineer features from a raw passenger table and return a new DataFrame.

    Steps:
      * ``Group_count`` - number of passengers sharing the group part
        (text before ``_``) of ``PassengerId``.
      * ``Cabin_deck`` / ``Cabin_side`` - first and third ``/``-separated parts
        of ``Cabin``.
      * ``PassengerId``, ``Cabin`` and ``Name`` are dropped.
      * Missing values in non-numeric, non-boolean columns are replaced with
        the string ``'missing'``. Numeric columns keep their NaNs.

    Parameters
    ----------
    d : pandas.DataFrame
        Raw data containing at least ``PassengerId``, ``Cabin`` and ``Name``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by ``Group_count``,
        ``Cabin_deck`` and ``Cabin_side``.
    """
    # Work on a copy so the caller's raw frame stays exactly as loaded.
    out = d.copy()

    # Group size: how many passengers share the same group prefix of PassengerId.
    group_id = out['PassengerId'].str.split('_').str[0]
    out['Group_count'] = group_id.map(group_id.value_counts())

    # Cabin has the form deck/num/side; only deck and side are kept as features.
    cabin_parts = out['Cabin'].str.split('/')
    out['Cabin_deck'] = cabin_parts.str[0]
    out['Cabin_side'] = cabin_parts.str[2]

    out = out.drop(columns=['PassengerId', 'Cabin', 'Name'])

    # Fill categorical gaps last, so the derived cabin columns are covered too.
    # fillna returns a new object, so the result has to be assigned back.
    cat_cols = out.select_dtypes(exclude=['number', 'bool']).columns
    out[cat_cols] = out[cat_cols].fillna('missing')

    return out

# apply feature engineering to copies, so the raw frames stay exactly as loaded
# even if preprocess() modifies the frame it is given
train = preprocess(train_raw.copy())
test = preprocess(test_raw.copy())

In [9]:
# Recompute the feature lists on the engineered training frame.
TARGET = 'Transported'
FEATURES = [col for col in train.columns if col != TARGET]

numerical = train.loc[:, FEATURES].select_dtypes(include=[np.number]).columns
categorical = train.loc[:, FEATURES].select_dtypes(exclude=[np.number]).columns

# Apply the same casts to both frames: numeric features -> float, the rest -> str.
# The column lists come from train and are reused for test.
for frame in (train, test):
    frame[numerical] = frame[numerical].astype(float)
    frame[categorical] = frame[categorical].astype(str)

print(f'Target: {TARGET}')
print(f'Features:\n\tnumerical: {numerical.to_list()}\n\tcategorical:{categorical.to_list()}')
print(f'Shapes:\n\ttrain: {train.shape}\n\ttest: {test.shape}')

train.head()

Target: Transported
Features:
	numerical: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group_count']
	categorical:['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_deck', 'Cabin_side']
Shapes:
	train: (8693, 14)
	test: (4277, 13)


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_count,Cabin_deck,Cabin_side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1.0,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,1.0,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,2.0,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,2.0,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,1.0,F,S


In [10]:
# Save the preprocessed data.
# The output directory is created if needed; exist_ok=True makes the cell safe to re-run.
os.makedirs('../data/final', exist_ok=True)
# index=False keeps the row index out of the CSV files.
train.to_csv('../data/final/train.csv', index=False)
test.to_csv('../data/final/test.csv', index=False)