## Data Preprocessing

Now, we are done with exploratory data analysis of both training and testing datasets. Now, we should get into preprocessing for both the datasets as some of the features are not numerical.

### Importing all packages

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import *
from sklearn.linear_model import *
from math import *
from sklearn.ensemble import *
from sklearn.feature_selection import *
from sklearn.feature_extraction import *
from sklearn.naive_bayes import *
from sklearn.discriminant_analysis import *
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.neighbors import *
from sklearn.cluster import *

### Importing all datasets

In [4]:
df_train = pd.read_csv("train_eda.csv")
df_test = pd.read_csv("test_eda.csv")

### Displaying first 5 elements of training dataset

In [5]:
df_train.head()

Unnamed: 0,index,PassengerId,HomePlanet,Cabin Deck,Cabin Number,Cabin Side,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0,0001_01,Europa,B,0,P,False,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,False
1,1,0002_01,Earth,F,0,S,False,TRAPPIST-1e,27.0,False,109.0,9.0,25.0,549.0,44.0,True
2,2,0003_01,Europa,A,0,S,False,TRAPPIST-1e,27.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,3,0003_02,Europa,A,0,S,False,TRAPPIST-1e,27.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,4,0004_01,Earth,F,1,S,False,TRAPPIST-1e,27.0,False,303.0,70.0,151.0,565.0,2.0,True


### Displaying first 5 elements of testing dataset

In [6]:
df_test.head()

Unnamed: 0,index,PassengerId,HomePlanet,Cabin Deck,Cabin Number,Cabin Side,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0,0013_01,Earth,G,3,S,True,TRAPPIST-1e,26.0,False,0.0,0.0,0.0,0.0,0.0
1,1,0018_01,Earth,F,4,S,False,TRAPPIST-1e,26.0,False,0.0,9.0,0.0,2823.0,0.0
2,2,0019_01,Europa,C,0,S,True,55 Cancri e,26.0,False,0.0,0.0,0.0,0.0,0.0
3,3,0021_01,Europa,C,1,S,False,TRAPPIST-1e,26.0,False,0.0,6652.0,0.0,181.0,585.0
4,4,0023_01,Earth,F,5,S,False,TRAPPIST-1e,26.0,False,10.0,0.0,635.0,0.0,0.0


### Removal of dummy column "index" in both the datasets

In [8]:
train_1 = df_train.drop("index",axis=1,inplace=False)
test_1 = df_test.drop("index",axis=1,inplace=False)

In [9]:
train_1.head()

Unnamed: 0,PassengerId,HomePlanet,Cabin Deck,Cabin Number,Cabin Side,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,B,0,P,False,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,F,0,S,False,TRAPPIST-1e,27.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,A,0,S,False,TRAPPIST-1e,27.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,A,0,S,False,TRAPPIST-1e,27.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,F,1,S,False,TRAPPIST-1e,27.0,False,303.0,70.0,151.0,565.0,2.0,True


In [10]:
test_1.head()

Unnamed: 0,PassengerId,HomePlanet,Cabin Deck,Cabin Number,Cabin Side,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0013_01,Earth,G,3,S,True,TRAPPIST-1e,26.0,False,0.0,0.0,0.0,0.0,0.0
1,0018_01,Earth,F,4,S,False,TRAPPIST-1e,26.0,False,0.0,9.0,0.0,2823.0,0.0
2,0019_01,Europa,C,0,S,True,55 Cancri e,26.0,False,0.0,0.0,0.0,0.0,0.0
3,0021_01,Europa,C,1,S,False,TRAPPIST-1e,26.0,False,0.0,6652.0,0.0,181.0,585.0
4,0023_01,Earth,F,5,S,False,TRAPPIST-1e,26.0,False,10.0,0.0,635.0,0.0,0.0


### Checking for unique values in "HomePlanet" feature

In [97]:
hp_train = train_1["HomePlanet"].unique()
hp_test = test_1["HomePlanet"].unique()
hp_train.sort()
hp_test.sort()
print("Training : ",hp_train)
print("Testing  : ",hp_test)

Training :  ['Earth' 'Europa' 'Mars']
Testing  :  ['Earth' 'Europa' 'Mars']


### Performing one-hot encoding for "HomePlanet" feature

In [98]:
ohe = OneHotEncoder(drop=[["Earth"]])
train_ohe = ohe.fit_transform(train_1["HomePlanet"].to_numpy().reshape(-1,1)).toarray()
test_ohe = ohe.fit_transform(test_1["HomePlanet"].to_numpy().reshape(-1,1)).toarray()
home_planet_train = pd.DataFrame(train_ohe,columns=["HomePlanet_Europa","HomePlanet_Mars"])
home_planet_test = pd.DataFrame(test_ohe,columns=["HomePlanet_Europa","HomePlanet_Mars"])

In [99]:
train_2 = train_1.copy()
train_2.drop(columns=["HomePlanet"],axis=1,inplace=True)
ctr = 1
for i in home_planet_train:
    train_2.insert(loc=ctr,column=i,value=home_planet_train[i])
    ctr += 1
train_2.head()

Unnamed: 0,PassengerId,HomePlanet_Europa,HomePlanet_Mars,Cabin Deck,Cabin Number,Cabin Side,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,1.0,0.0,B,0,P,False,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,0.0,0.0,F,0,S,False,TRAPPIST-1e,27.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,1.0,0.0,A,0,S,False,TRAPPIST-1e,27.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,1.0,0.0,A,0,S,False,TRAPPIST-1e,27.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,0.0,0.0,F,1,S,False,TRAPPIST-1e,27.0,False,303.0,70.0,151.0,565.0,2.0,True


In [100]:
test_2 = test_1.copy()
test_2.drop(columns=["HomePlanet"],axis=1,inplace=True)
ctr = 1
for i in home_planet_test:
    test_2.insert(loc=ctr,column=i,value=home_planet_test[i])
    ctr += 1
test_2.head()

Unnamed: 0,PassengerId,HomePlanet_Europa,HomePlanet_Mars,Cabin Deck,Cabin Number,Cabin Side,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0013_01,0.0,0.0,G,3,S,True,TRAPPIST-1e,26.0,False,0.0,0.0,0.0,0.0,0.0
1,0018_01,0.0,0.0,F,4,S,False,TRAPPIST-1e,26.0,False,0.0,9.0,0.0,2823.0,0.0
2,0019_01,1.0,0.0,C,0,S,True,55 Cancri e,26.0,False,0.0,0.0,0.0,0.0,0.0
3,0021_01,1.0,0.0,C,1,S,False,TRAPPIST-1e,26.0,False,0.0,6652.0,0.0,181.0,585.0
4,0023_01,0.0,0.0,F,5,S,False,TRAPPIST-1e,26.0,False,10.0,0.0,635.0,0.0,0.0


### Checking for unique values in "Cabin Deck" feature

In [101]:
cd_train = train_2["Cabin Deck"].unique()
cd_test = test_2["Cabin Deck"].unique()
cd_train.sort()
cd_test.sort()
print("Training : ",cd_train)
print("Testing  : ",cd_test)

Training :  ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T']
Testing  :  ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T']


### Decimal to binary converter function for binary encoding

In [227]:
def bin_to_dec(num,num_ele):
    size = int(np.floor(np.log2(num_ele)))
    n = num
    s = []
    ctr = 0
    while n>0:
        r = int(n % 2)
        n = int(n / 2)
        s.append(r)
        ctr += 1
    for i in range(ctr,size+1):
        s.append(0)
    bin = s[::-1]
    return bin

In [228]:
bin_to_dec(1,16)

[0, 0, 0, 0, 1]

### Performing Binary Encoding for "Cabin Deck" feature

In [117]:
ctr = 0
d = dict()
for i in cd_train:
    d[i] = ctr
    ctr += 1
train_2["Cabin Deck"] = train_2["Cabin Deck"].map(d)
test_2["Cabin Deck"] = test_2["Cabin Deck"].map(d)

In [124]:
np.binary_repr(train_2["Cabin Deck"].to_numpy())

TypeError: only integer scalar arrays can be converted to a scalar index