In [55]:
#Import packages
import pandas as pd

In [56]:
#Import the data
train = pd.read_csv("./Input Data/train.csv")
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Preprocessing

In [57]:
#Remove useless information
del train["PassengerId"]
del train["Name"]

#### Missing Values

In [58]:
#Evaluate missing values per row
train.isna().sum(axis=1).value_counts()

0    6764
1    1746
2     171
3      12
dtype: int64

In [59]:
#Removes Rows with Multiple Missing Values
total_rows_before = train.shape[0]

n_cols_keep = train.shape[1] - 1
train.dropna(inplace=True, thresh=n_cols_keep)

print("{} rows removed with more than 1 missing value".format(total_rows_before - train.shape[0]))

183 rows removed with more than 1 missing value


In [60]:
#Evaluate Missing Values per Column
total_missing = train.isna().sum()
percent_missing = (total_missing / train.count()).map("{:.2%}".format)
missing = pd.concat([total_missing, percent_missing], axis=1, keys=["Total Missing", "Percent Missing"])
missing

Unnamed: 0,Total Missing,Percent Missing
HomePlanet,168,2.01%
CryoSleep,176,2.11%
Cabin,159,1.90%
Destination,149,1.78%
Age,153,1.83%
VIP,166,1.99%
RoomService,150,1.79%
FoodCourt,147,1.76%
ShoppingMall,166,1.99%
Spa,158,1.89%


In [64]:
#Fill in missing categorical values with most frequent entry
categorical_columns = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]

for col in categorical_columns:
    most_freq = train[col].mode()[0] #Most frequent non-nan entry
    train[col].fillna(most_freq, inplace=True)

In [65]:
#Explore numerican column distributions
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8357.0,8360.0,8363.0,8344.0,8352.0,8356.0
mean,28.837741,223.925239,460.320579,173.707215,312.743894,303.938487
std,14.479248,664.284658,1616.61012,603.450272,1143.274561,1145.283286
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,77.0,27.25,59.25,45.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [66]:
#Fill in missing numerical values with median
numerical_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

for col in numerical_columns:
    col_median = train[col].median()
    train[col].fillna(col_median, inplace=True)

In [69]:
#Confirm that all missing values are filled in
assert train.isna().sum().sum() == 0