Preparing data for sklearn classifier 

In [2]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

from sklearn import metrics

In [3]:
## Load data
# Read the raw training CSV into `df` and preview the first five rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
train_csv = 'C:/Users/anebe/VSCode Projects/titanic_comp/data/raw/train.csv'
df = pd.read_csv(train_csv)
df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
## Drop redundant features
# The passenger id and name columns are removed before any modelling;
# `info()` then lists the remaining columns with their non-null counts.
redundant_cols = ['PassengerId', 'Name']
df = df.drop(columns=redundant_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(5)
memory usage: 755.7+ KB


In [5]:
# Break the "deck/num/side" cabin string into three separate columns,
# then discard the original combined column.
cabin_parts = df["Cabin"].str.split("/", expand=True)
df = df.drop(columns='Cabin')
df["Deck"], df["Cabin_num"], df["Side"] = cabin_parts[0], cabin_parts[1], cabin_parts[2]
df.head(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Cabin_num,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [6]:
## Encode categorical values
# `type_encode` stays defined because a later cell refers to it by name.
type_encode = LabelEncoder()

# One fitted encoder per column: re-fitting a single shared encoder would
# overwrite each column's label -> code mapping as soon as the next column is
# processed, leaving no way to inspect, invert or re-apply it afterwards.
encoders = {}

# The loop variable is deliberately not called `object`, which would shadow
# the Python builtin for the rest of the kernel session.
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Cabin_num', 'Side']:
    df[col] = df[col].astype('category')
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

In [7]:
# Summarise the column dtypes, then flag which columns still hold missing values.
df.info()
df.isna().any()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   int32  
 1   CryoSleep     8693 non-null   int64  
 2   Destination   8693 non-null   int32  
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   object 
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   bool   
 11  Deck          8693 non-null   int32  
 12  Cabin_num     8693 non-null   int32  
 13  Side          8693 non-null   int32  
dtypes: bool(1), float64(6), int32(5), int64(1), object(1)
memory usage: 721.7+ KB


HomePlanet      False
CryoSleep       False
Destination     False
Age              True
VIP              True
RoomService      True
FoodCourt        True
ShoppingMall     True
Spa              True
VRDeck           True
Transported     False
Deck            False
Cabin_num       False
Side            False
dtype: bool

In [8]:
## Replace null values in numerical features with 0's
# All numeric columns are filled in one vectorised assignment.
numeric_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age']
df[numeric_cols] = df[numeric_cols].fillna(0)

# Re-check which columns still contain nulls.
df.isnull().any()

HomePlanet      False
CryoSleep       False
Destination     False
Age             False
VIP              True
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Transported     False
Deck            False
Cabin_num       False
Side            False
dtype: bool

In [9]:
# Passengers with an unknown VIP flag are treated as non-VIP.
df = df.assign(VIP=df['VIP'].fillna(False))
# Confirm that no column has missing values left.
df.isnull().any()

HomePlanet      False
CryoSleep       False
Destination     False
Age             False
VIP             False
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Transported     False
Deck            False
Cabin_num       False
Side            False
dtype: bool

In [10]:
## Split datasets
# Features are every column except the target; the target is 'Transported'.
# (Nothing is written to disk in this cell.)
target_col = 'Transported'
X = df.drop(columns=[target_col]).values
y = df[target_col].values
# Fixed random_state so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


Select and train models

In [11]:
## Load in different classifier models
# Import from the public packages: the underscore-prefixed submodules
# (`_multilayer_perceptron`, `_passive_aggressive`) are private implementation
# details of scikit-learn and may move between releases.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neural_network import MLPClassifier


In [12]:
## Training
# Random forest with 200 trees. The seed is fixed because tree construction is
# randomised; without it the fitted model and its accuracy differ on every run.
clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(X_train, y_train)

In [13]:
## NN
# Multi-layer perceptron with a single hidden layer of 200 units. The seed is
# fixed because weight initialisation and batch shuffling are random; without
# it the reported accuracy is not reproducible.
nn = MLPClassifier(hidden_layer_sizes=(200,), random_state=0)
nn.fit(X_train, y_train)

In [14]:
## PAC
# Passive-aggressive linear classifier with default settings. The seed is fixed
# because the training data is shuffled between epochs; without it results
# change from run to run.
pac = PassiveAggressiveClassifier(random_state=0)
pac.fit(X_train, y_train)

In [15]:
## Test Models
# Print the held-out accuracy of each fitted classifier.
# (model, label) pairs are unpacked directly instead of indexing x[0] / x[1].
for model, label in [(clf, 'Random Forest'), (nn, 'Neural Network'), (pac, 'Passive Aggressive')]:
    y_pred = model.predict(X_test)
    print(label + ' accuracy: ', metrics.accuracy_score(y_test, y_pred))

Random Forest accuracy:  0.8031278748850046
Neural Network accuracy:  0.7769089236430543
Passive Agressive accuracy:  0.5961361545538179


Apply processing to test data and generate submission

In [16]:
## Doing all earlier processing to test data
## Leave passenger id in
def _split_cabin(frame):
    """Return a copy of `frame` with 'Cabin' replaced by 'Deck', 'Cabin_num' and 'Side'."""
    frame = frame.copy()
    frame[["Deck", "Cabin_num", "Side"]] = frame["Cabin"].str.split("/", expand=True)
    return frame.drop('Cabin', axis=1)


def _encode_like_training(train_values, values):
    """Encode `values` with the integer codes a LabelEncoder fitted on `train_values` assigns.

    Re-fitting an encoder on the test data would number the categories by the
    test set's own sorted labels, so the same label could get a different code
    than the one the model was trained on. Here the encoder is fitted on the
    training column only and its `classes_` are used as the lookup table.

    Values that are missing, or that never occurred in the training column,
    get the code of the training "missing" class (or `len(classes_)` if the
    training column had no missing values). `LabelEncoder.transform` is not
    used because it raises on labels it has not seen.
    """
    classes = LabelEncoder().fit(train_values.astype('category')).classes_
    is_missing = pd.isna(classes)
    # Code of every non-missing class, i.e. its position inside `classes_`.
    class_codes = np.flatnonzero(~is_missing)
    fallback = np.flatnonzero(is_missing)[0] if is_missing.any() else len(classes)
    # Position within the non-missing classes; -1 for missing / unseen values.
    positions = pd.Categorical(values, categories=classes[~is_missing]).codes
    return np.where(positions >= 0, class_codes[positions], fallback)


# The raw training columns are needed to reproduce the training-time encoding.
train_raw = _split_cabin(pd.read_csv('C:/Users/anebe/VSCode Projects/titanic_comp/data/raw/train.csv'))

df = pd.read_csv('C:/Users/anebe/VSCode Projects/titanic_comp/data/raw/test.csv')
df = df.drop(['Name'], axis=1)
df = _split_cabin(df)

# `col` rather than `object`, which would shadow the Python builtin.
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Cabin_num', 'Side']:
    df[col] = _encode_like_training(train_raw[col], df[col])

# Same null handling as for the training data: 0 for numeric features, False for VIP.
for col in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age']:
    df[col] = df[col].fillna(0)
df['VIP'] = df['VIP'].fillna(False)

In [39]:
## Split data
# Keep the passenger ids aside for the submission file; every remaining
# column becomes the feature matrix passed to the model.
id_col = 'PassengerId'
submissions = df[id_col].values
X = df.drop(columns=[id_col]).values

In [40]:
## Make predictions
# Predict a label for every row of the feature matrix `X` with the fitted `clf`.
test_pred = clf.predict(X)

In [50]:
## Combine Id's and results
# Build the frame column-by-column so each column keeps its own dtype.
# Stacking the two arrays with np.dstack first would coerce both to `object`
# and require slicing a 3-D array back apart.
columns = ['PassengerId', 'Transported']
submissions_df = pd.DataFrame(
    {'PassengerId': submissions, 'Transported': test_pred},
    columns=columns,
)
submissions_df.head(5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [53]:
## Save to file for submission
# Write the id/prediction table as CSV without the DataFrame index column.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
submission_csv = 'C:/Users/anebe/VSCode Projects/titanic_comp/data/processed/submission.csv'
submissions_df.to_csv(submission_csv, index=False)