In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

In [15]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [16]:
train_data.dropna(inplace=True)

In [17]:
X = train_data.drop(['PassengerId', 'Name', 'Transported'], axis=1)
y = train_data['Transported']

In [18]:
X.rename(columns={
    'HomePlanet': 'home_planet', 
    'CryoSleep' : 'cryo_sleep', 
    'Cabin' : 'cabin', 
    'Destination' : 'destination', 
    'Age' : 'age', 
    'VIP' : 'vip',
    'RoomService' : 'room_service',
    'FoodCourt' : 'food_court',
    'ShoppingMall' : 'shopping_mall',
    'Spa' : 'spa',
    'VRDeck' : 'vr_deck'
    }, inplace=True)

In [19]:
X.dropna(subset=['cabin'], inplace=True)
X['cabin_deck'] = X.cabin.str[0]
X['cabin_num'] = X.cabin.str[2:-2].astype(int)
X['cabin_side'] = X.cabin.str[-1]
X.drop('cabin', axis=1, inplace=True)

# Dumb Model

In [20]:
# make a dummy model

from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X, y)
dummy_clf.score(X, y)


0.5036330608537694

# Model - SVM

In [21]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# scale the numeric features
scaler = StandardScaler()
X[['age', 'cabin_num']] = scaler.fit_transform(X[['age', 'cabin_num']])


# split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

X_train = X_train.drop(['home_planet', 'destination'], axis=1)
X_val = X_val.drop(['home_planet', 'destination'], axis=1)

X_train = pd.get_dummies(X_train, columns=['cabin_deck', 'cabin_side'])
X_val = pd.get_dummies(X_val, columns=['cabin_deck', 'cabin_side'])

model = svm.SVC(kernel='rbf', C=1, gamma=0.1)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
accuracy_score(y_val, y_pred)

  X_train = pd.get_dummies(X_train, columns=['cabin_deck', 'cabin_side'])
  X_val = pd.get_dummies(X_val, columns=['cabin_deck', 'cabin_side'])


0.7344931921331316

In [24]:
# make a test submission

test_data.isna().sum()


PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64