In [69]:
# Import library

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer, SimpleImputer
from category_encoders import HashingEncoder
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB   
from sklearn.tree import DecisionTreeClassifier

# Training data preparation

In [70]:
# Load the training set and separate the target column from the features

train_data = pd.read_csv('train.csv')

Y_train_data = train_data['Transported']
X_train_data = train_data.drop(columns=['Transported'])

train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [71]:
def impute_data(df):
    """Return a copy of ``df`` with missing feature values filled in.

    Categorical columns ('HomePlanet', 'CryoSleep', 'Destination', 'VIP') are
    filled with the most frequent value of each column; numerical columns
    ('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck') are
    filled by a 7-nearest-neighbours imputer.

    Both imputers are fitted on the frame that is passed in, so every call
    re-fits them on that call's data. Columns outside the two lists (for
    example 'Cabin' and 'Name') are returned unchanged, missing values
    included.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ten columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    cat_col = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
    num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Work on a copy so the imputed values are not written back into the
    # frame owned by the caller.
    df = df.copy()

    # Categorical variables: replace missing entries with the column mode
    df[cat_col] = SimpleImputer(strategy="most_frequent").fit_transform(df[cat_col])

    # Numerical variables: estimate missing entries from the 7 nearest rows
    df[num_col] = KNNImputer(n_neighbors=7).fit_transform(df[num_col])

    return df

X_train_data = impute_data(X_train_data)

In [72]:
def split_column(df):
    """Derive group, last-name and cabin features from the raw id columns.

    New columns, appended in this order:

    * 'PassengerGroup' - the part of 'PassengerId' before the underscore
    * 'LastName'       - the second space-separated token of 'Name'
    * 'CabinDeck'      - the first '/'-separated part of 'Cabin'
    * 'CabinSide'      - the third '/'-separated part of 'Cabin'

    The source columns 'PassengerId', 'Name' and 'Cabin' are removed from the
    result. The middle part of 'Cabin' (the cabin number) is not kept. Rows
    whose 'Name' or 'Cabin' is missing get missing values in the derived
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'PassengerId', 'Name' and 'Cabin'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    cabin_parts = df['Cabin'].str.split('/')

    # assign() builds a new frame, so no helper columns leak into the input
    derived = df.assign(
        PassengerGroup=df['PassengerId'].str.split('_').str[0],
        LastName=df['Name'].str.split(' ').str[1],
        CabinDeck=cabin_parts.str[0],
        CabinSide=cabin_parts.str[2],
    )

    return derived.drop(['PassengerId', 'Name', 'Cabin'], axis='columns')

# Derive the group, last-name and cabin features for the training set;
# X_train_data is rebound to the frame returned by split_column
X_train_data = split_column(X_train_data)

In [73]:
# Convert boolean to integer: 0 = False, 1 = True

def bool_to_int(df):
    """Return a copy of ``df`` with 'CryoSleep' and 'VIP' stored as integers.

    False becomes 0 and True becomes 1. All other columns, and the column
    order, are carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'CryoSleep' and 'VIP' columns hold boolean values. The
        conversion raises if a column still holds missing values, so run
        imputation first.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    # Vectorised cast instead of a Python-level int() per element; assign()
    # returns a new frame so the input keeps its original dtypes.
    return df.assign(
        CryoSleep=df['CryoSleep'].astype('int64'),
        VIP=df['VIP'].astype('int64'),
    )

X_train_data = bool_to_int(X_train_data)

# Cast the target labels to integers too; this builds a fresh Series from a plain list
Y_train_data = pd.Series([int(label) for label in Y_train_data])

In [74]:
def dummy_variable(df):
    """Replace the categorical columns with dummy (indicator) columns.

    'HomePlanet', 'Destination', 'CabinDeck' and 'CabinSide' are each expanded
    with ``pd.get_dummies`` using the column name as prefix. The indicator
    columns are appended to the frame in that order and the four original
    columns are dropped.
    """
    categorical = ['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide']

    # One indicator frame per categorical column, named '<column>_<value>'
    indicator_frames = [
        pd.get_dummies(df[name], prefix=name) for name in categorical
    ]

    widened = pd.concat([df, *indicator_frames], axis='columns')
    return widened.drop(categorical, axis='columns')

# Swap the categorical training columns for their dummy (indicator) columns
X_train_data = dummy_variable(X_train_data)

In [75]:
def hashing_encode(df):
    """Feature-hash 'PassengerGroup' and 'LastName' into 5 components each.

    For each of the two columns a new ``HashingEncoder(n_components=5)`` is
    fitted on that column; its output columns are prefixed with
    '<column>_' and appended to the frame. The two original columns are then
    dropped.
    """
    hashed_columns = ['PassengerGroup', 'LastName']

    hashed_frames = []
    for column in hashed_columns:
        encoder = HashingEncoder(cols=column, n_components=5)
        hashed = pd.DataFrame(encoder.fit_transform(df[column]))
        hashed_frames.append(hashed.add_prefix(column + '_'))

    combined = pd.concat([df, *hashed_frames], axis='columns')
    return combined.drop(hashed_columns, axis='columns')

X_train_data = hashing_encode(X_train_data)

In [76]:
# Standardization: fit the scaler on the training features and scale them.
# The fitted scaler is kept for the test set.

scaler = StandardScaler()
X_train_data = pd.DataFrame(scaler.fit_transform(X_train_data))

# Testing data preparation

In [77]:
# Read the test set (it is used directly as the feature frame)

X_test_data = pd.read_csv('test.csv')

# For writing to submission file: keep the ids now, because split_column
# removes the 'PassengerId' column from the feature frame
PassengerIdTest = X_test_data['PassengerId']

In [78]:
# impute_data fits its imputers on the frame it receives, so the test set is
# imputed from the test rows themselves
X_test_data = impute_data(X_test_data)

In [79]:
# Same feature derivation as applied to the training set
X_test_data = split_column(X_test_data)

In [80]:
# Same boolean-to-integer conversion as applied to the training set
X_test_data = bool_to_int(X_test_data)

In [81]:
# NOTE(review): the dummy columns are derived from this frame alone; presumably
# they line up with the training columns (same category values) - confirm
X_test_data = dummy_variable(X_test_data)

In [82]:
# hashing_encode creates new HashingEncoder instances and calls fit_transform on this frame
X_test_data = hashing_encode(X_test_data)

In [83]:
# Standardization

# The dummy columns of the test frame are derived from the test rows alone, so
# they can be missing, extra or ordered differently compared with the columns
# the scaler was fitted on. Align the test frame to the fitted columns first:
# absent columns become all-zero, columns unknown to the scaler are dropped.
train_columns = getattr(scaler, 'feature_names_in_', None)  # not set by older scikit-learn releases
if train_columns is not None:
    # Guard against running this cell twice: after the first run the frame has
    # positional column labels and would otherwise be silently zero-filled.
    if not X_test_data.columns.isin(train_columns).any():
        raise ValueError(
            'X_test_data shares no columns with the fitted scaler; '
            're-run the test data preparation cells first'
        )
    X_test_data = X_test_data.reindex(columns=train_columns, fill_value=0)

X_test_data = pd.DataFrame(scaler.transform(X_test_data))

# Training and testing

## LogisticRegression

In [84]:
# clf = LogisticRegression().fit(X_train_data, Y_train_data)
# prediction = clf.predict(X_test_data)

## SVC

In [85]:
# # Cross validation, testing different values of C parameter

# C_list = [0.01, 0.1, 0.5, 1, 1.5, 2, 3, 5]
# C_score = []

# for c in C_list:
#     print("C =", c)
#     clf = LinearSVC(C=c)
#     cv_res = cross_validate(clf, X_train_data, Y_train_data, cv=5)
#     C_score.append(cv_res['test_score'].mean())

# C_score

In [86]:
# # K fold cross validation

# C_list = [2,2.5,3,3.5]
# C_score = []

# kf = KFold(n_splits=5, shuffle=True)

# # for each parameter
# for c in C_list:
#     print("-", c)
#     fold_score = []

#     # for each fold of a parameter
#     for i, (train_index, validate_index) in enumerate(kf.split(X_train_data)):
#         X_fold_train_data = X_train_data.iloc[train_index]
#         Y_fold_train_data = Y_train_data.iloc[train_index]

#         X_fold_validate_data = X_train_data.iloc[validate_index]
#         Y_fold_validate_data = Y_train_data.iloc[validate_index]

#         clf = SVC(C=c)
#         clf.fit(X_fold_train_data, Y_fold_train_data)
#         fold_score.append(
#             clf.score(X_fold_validate_data, Y_fold_validate_data)
#         )
    
#     C_score.append(np.mean(fold_score))

# C_score

Result:
- SVC(C=2.5): 0.79471
- LogisticRegression(): 0.79214
- GaussianNB(): 0.72901
- DecisionTreeClassifier(): 0.72854
- LinearSVC(): 0.71475

In [87]:
# Train with best value of C and predict.
# SVC(C=2.5) has the highest score in the result list above; the previous
# DecisionTreeClassifier() has no C parameter and scored lower.

clf = SVC(C=2.5).fit(X_train_data, Y_train_data)
prediction = clf.predict(X_test_data)

## Prediction

In [88]:
# Build the submission table: one row per test passenger, with the 0/1
# prediction turned back into a boolean, then write it without the index
res = pd.DataFrame({
    'PassengerId': PassengerIdTest.tolist(),
    'Transported': (prediction == 1).tolist(),
})
res.to_csv('submission.csv', index=False)