In [None]:
# Import libraries

import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from category_encoders import HashingEncoder
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Training data preparation

In [None]:
# Read the training set; the path is relative to the notebook's working directory.

train_data = pd.read_csv('train.csv')

In [None]:
def impute_data(df):
    """Return a copy of ``df`` with missing categorical and numeric values filled.

    Categorical columns (HomePlanet, CryoSleep, Destination, VIP) are filled
    with each column's most frequent value. Numeric columns (Age and the five
    spending columns) are filled by a KNN imputer with 7 neighbours. Both
    imputers are fit on ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ten columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is not modified. Columns other than
        the ten above are not imputed, so rows with a missing Cabin or Name
        are kept and still hold NaN in those columns.
    """
    cat_col = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
    num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Work on a copy so the raw frame survives and the cell can be re-run safely.
    imputed = df.copy()

    # Most-frequent value for the categorical columns.
    imputed[cat_col] = SimpleImputer(strategy="most_frequent").fit_transform(imputed[cat_col])

    # K-nearest-neighbours estimate for the numeric columns.
    imputed[num_col] = KNNImputer(n_neighbors=7).fit_transform(imputed[num_col])

    return imputed

# Fill the missing categorical and numeric values of the training frame.
train_data = impute_data(train_data)

In [None]:
def split_column(df):
    """Derive group, surname and cabin features and drop their source columns.

    New columns:
      * ``PassengerGroup`` - the part of ``PassengerId`` before the first '_'.
      * ``LastName``       - the second space-separated token of ``Name``.
      * ``CabinDeck``      - the first '/'-separated part of ``Cabin``.
      * ``CabinSide``      - the third '/'-separated part of ``Cabin``.

    The middle part of ``Cabin`` (the cabin number) is not kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``PassengerId``, ``Name`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``PassengerId``, ``Name`` and ``Cabin`` and with
        the four derived columns appended. The caller's ``df`` is not modified.
    """
    # Copy first: assigning new columns directly would alter the caller's frame.
    out = df.copy()

    out['PassengerGroup'] = out['PassengerId'].str.split('_').str[0]
    out['LastName'] = out['Name'].str.split(' ').str[1]

    # Force exactly three part-columns so a frame whose cabins split into fewer
    # (or more) pieces yields NaN / ignores extras instead of raising.
    cabin_parts = out['Cabin'].str.split('/', expand=True).reindex(columns=range(3))
    out['CabinDeck'] = cabin_parts[0]
    out['CabinSide'] = cabin_parts[2]

    return out.drop(['PassengerId', 'Name', 'Cabin'], axis='columns')

# Derive PassengerGroup, LastName, CabinDeck and CabinSide for the training frame.
train_data = split_column(train_data)

In [None]:
# Convert boolean to integer: 0 = False, 1 = True (for training data: with Transported column)

def bool_to_int_train(df):
    """Return a copy of ``df`` with its boolean columns cast to 0/1 integers.

    Converts ``CryoSleep``, ``VIP`` and the target ``Transported``
    (False -> 0, True -> 1). Intended for the training frame, which carries
    the ``Transported`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the three columns above with no missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column order; the caller's ``df`` is not
        modified.

    Raises
    ------
    ValueError
        If a value cannot be converted to an integer (e.g. NaN).
    """
    # ``assign`` builds a new frame, so the input keeps its boolean columns.
    return df.assign(
        CryoSleep=df['CryoSleep'].astype(int),
        VIP=df['VIP'].astype(int),
        Transported=df['Transported'].astype(int),
    )

# Cast CryoSleep, VIP and Transported in the training frame to 0/1 integers.
train_data = bool_to_int_train(train_data)

In [None]:
def dummy_variable(df):
    """One-hot encode the low-cardinality categorical columns.

    ``HomePlanet``, ``Destination``, ``CabinDeck`` and ``CabinSide`` are each
    replaced by indicator columns named ``<column>_<value>``. The indicator
    columns are appended after the remaining columns, grouped in the order
    listed here. Missing values get no indicator column of their own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. The set of indicator
        columns depends on the values present in ``df``.
    """
    encoded_cols = ['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide']

    # With ``columns=`` given, get_dummies prefixes each indicator with its
    # source column name and drops the source columns in the same call.
    return pd.get_dummies(df, columns=encoded_cols)

# One-hot encode HomePlanet, Destination, CabinDeck and CabinSide in the training frame.
train_data = dummy_variable(train_data)

In [None]:
def hashing_encode(df):
    """Replace the high-cardinality text columns with hashed features.

    ``PassengerGroup`` and ``LastName`` are each passed through a
    ``HashingEncoder`` configured with ``n_components=5``. The encoder output
    is prefixed with ``<column>_`` and appended to the frame, after which the
    two source columns are dropped. Only these two columns are hashed;
    ``CabinDeck`` is not touched by this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``PassengerGroup`` and ``LastName``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the hashed feature columns in place of the two
        source columns.
    """
    hashed_cols = ['PassengerGroup', 'LastName']

    for col in hashed_cols:
        # A fresh encoder per column, fit on the column it transforms.
        hasher = HashingEncoder(cols=col, n_components=5)
        hashed = pd.DataFrame(hasher.fit_transform(df[col])).add_prefix(f'{col}_')
        # concat aligns on the index - assumes the encoder output keeps df's
        # index; TODO confirm if rows are ever filtered before this step.
        df = pd.concat([df, hashed], axis='columns')

    return df.drop(hashed_cols, axis='columns')

# Hash-encode PassengerGroup and LastName in the training frame.
train_data = hashing_encode(train_data)

# Testing data preparation

In [None]:
# Read data

test_data = pd.read_csv('test.csv')

# Keep the PassengerId values for the submission file: split_column drops
# that column from test_data further down.
PassengerIdTest = test_data['PassengerId']

In [None]:
# NOTE(review): impute_data fits its imputers on the frame it receives, so the
# test set is filled from its own statistics, not from the training set's.
test_data = impute_data(test_data)

In [None]:
# Derive PassengerGroup, LastName, CabinDeck and CabinSide for the test frame.
test_data = split_column(test_data)

In [None]:
# Convert boolean to integer: 0 = False, 1 = True (for testing data: without Transported column)

def bool_to_int_test(df):
    """Return a copy of ``df`` with its boolean columns cast to 0/1 integers.

    Converts ``CryoSleep`` and ``VIP`` (False -> 0, True -> 1). Intended for
    the test frame, which has no ``Transported`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the two columns above with no missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column order; the caller's ``df`` is not
        modified.

    Raises
    ------
    ValueError
        If a value cannot be converted to an integer (e.g. NaN).
    """
    # ``assign`` builds a new frame, so the input keeps its boolean columns.
    return df.assign(
        CryoSleep=df['CryoSleep'].astype(int),
        VIP=df['VIP'].astype(int),
    )

# Cast CryoSleep and VIP in the test frame to 0/1 integers.
test_data = bool_to_int_test(test_data)

In [None]:
# NOTE(review): the dummy columns are derived from the test data alone, so they
# match the training frame's columns only if both hold the same category values.
test_data = dummy_variable(test_data)

In [None]:
# Hash-encode PassengerGroup and LastName in the test frame.
test_data = hashing_encode(test_data)

# Training and testing

## LogisticRegression

In [None]:
# Separate the predictors from the target once so both can be inspected.
X_train = train_data.drop(['Transported'], axis='columns')
y_train = train_data['Transported']

# The dummy columns were generated independently for train and test, so a
# category present in only one of them leaves the two frames with different
# columns and makes predict() fail. Align the test frame to the training
# columns: dummies missing from test become 0, and columns the model never
# saw are dropped.
X_test = test_data.reindex(columns=X_train.columns, fill_value=0)

clf = LogisticRegression().fit(X_train, y_train)
prediction = clf.predict(X_test)

## LinearSVC

In [None]:
# # Cross validation, testing different values of C parameter

# C_list = [0.01, 0.1, 0.5, 1, 1.5, 2, 3, 5]
# C_score = []

# for c in C_list:
#     print("C =", c)
#     clf = LinearSVC(C=c)
#     cv_res = cross_validate(clf, train_data.drop(['Transported'], axis='columns'), train_data['Transported'], cv=5)
#     C_score.append(cv_res['test_score'].mean())

# C_score

In [None]:
# # Train with best value of C and predict

# clf = LinearSVC().fit(train_data.drop(['Transported'], axis='columns'), train_data['Transported'])
# prediction = clf.predict(test_data)

## Prediction

In [None]:
# Build the submission table: one row per test passenger, with the 0/1
# prediction expressed as a boolean, then write it without the index column.
submission_columns = {
    'PassengerId': list(PassengerIdTest),
    'Transported': [label == 1 for label in prediction],
}
res = pd.DataFrame(submission_columns)
res.to_csv('submission.csv', index=False)