In [424]:
import numpy as np
import pandas as pd
import xgboost as xgb
from copy import deepcopy
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Activation
from tensorflow.keras.losses import BinaryCrossentropy

In [2]:
train_file = "data/train.csv"

### Load data and split it to tran and test

In [622]:
df = pd.read_csv(train_file, sep=',')

In [623]:
y = df.Transported.astype("int")
X = df.drop(columns=['Transported', 'Name', 'PassengerId', 'VIP'])

In [653]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(len(X_train), len(y_train), len(X_test), len(y_test))

6954 6954 1739 1739


### Features selection

In [625]:
cols_NaN_to_mode = ["HomePlanet", "CryoSleep", "Cabin", "Destination"]
cols_NaN_to_mean = ["Age"]
cols_NaN_to_zero = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

In [776]:
# Implement classes for transformation of some colomns for using in Pipeline  

In [757]:
class ReplaceNaNToMode:
    
    def __init__(self, columns):
        self.columns = columns
        self.modes = []
        
    def fit(self, X, y=None):
        for i, col in enumerate(self.columns):
            self.modes.append(X[col].mode()[0])
        return self
    
    def transform(self, X, y=None):
        X_copy = deepcopy(X)
        for i, col in enumerate(self.columns):
            X_copy[col].fillna(self.modes[i], inplace=True)
        return X_copy
    

class ReplaceNaNToMean:
    
    def __init__(self, columns):
        self.columns = columns
        self.means = []
        
    def fit(self, X, y=None):
        for i, col in enumerate(self.columns):
            self.means.append(X[col].mean())
        return self
    
    def transform(self, X, y=None):
        X_copy = deepcopy(X)
        for i, col in enumerate(self.columns):
            X_copy[col].fillna(self.means[i], inplace=True)
        return X_copy

    
class ReplaceNaNToZero:
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = deepcopy(X)
        for i, col in enumerate(self.columns):
            X_copy[col].fillna(0, inplace=True)
        return X_copy

    
class OwnStandardScaler:
    
    def __init__(self, columns):
        self.columns = columns
        self.scaler = StandardScaler()
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns])
        return self
    
    def transform(self, X, y=None):
        X_copy = deepcopy(X)
        X_copy[self.columns] = self.scaler.transform(X_copy[self.columns])
        return X_copy


class SplitCabin:
    
    def __init__(self, column):
        self.column = column
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = deepcopy(X)
        df_cabin = X_copy[self.column].str.split("/", expand=True)
#         X[self.column+"_Deck"], X[self.column+"_Num"], X[self.column+"_Side"]  = df_cabin[0], df_cabin[1], df_cabin[2]
        X_copy[self.column+"_Deck"], X_copy[self.column+"_Side"]  = df_cabin[0], df_cabin[2]
        X_copy.drop(columns=['Cabin'], inplace=True)
        return X_copy
    

class OwnOneHotEncoder:
    
    def __init__(self, columns):
        self.columns = columns
        self.encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        
    def fit(self, X, y=None):
        self.encoder.fit(X[self.columns])
        return self
    
    def transform(self, X, y=None):
        X_copy = deepcopy(X)
        data = self.encoder.transform(X_copy[self.columns])
        new_columns = []
        for i, col in enumerate(self.columns):
            new_columns += [f"{col}_{(str(c).replace(' ','_'))}" for c in self.encoder.categories_[i]]
        new_df = pd.DataFrame(data, columns=new_columns)
        X_copy = X_copy.drop(columns=self.columns)
        X_copy.reset_index(inplace=True, drop=True)
        X_copy = pd.concat((X_copy, new_df), axis=1)
        return X_copy
    
    
class CastType:
    
    def __init__(self, column_type_dict):
        self.column_type_dict = column_type_dict
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = deepcopy(X)        
        X_copy = X_copy.astype(self.column_type_dict)
        return X_copy

    
class AddNewFeatures:
    
    def __init__(self):
        pass
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = deepcopy(X)
        X_copy["IsAmenities"] = (X_copy["RoomService"] + X_copy["FoodCourt"] + X_copy["ShoppingMall"] + \
        X_copy["Spa"] +X_copy["VRDeck"]) > 0
        return X_copy 

In [758]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6954 entries, 3559 to 8384
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    6792 non-null   object 
 1   CryoSleep     6777 non-null   object 
 2   Cabin         6802 non-null   object 
 3   Destination   6808 non-null   object 
 4   Age           6815 non-null   float64
 5   RoomService   6811 non-null   float64
 6   FoodCourt     6803 non-null   float64
 7   ShoppingMall  6786 non-null   float64
 8   Spa           6807 non-null   float64
 9   VRDeck        6795 non-null   float64
dtypes: float64(6), object(4)
memory usage: 597.6+ KB


### Make pipelines

In [764]:
preprocessing_w_ohe = Pipeline([('replace_NaN_to_mode', ReplaceNaNToMode(cols_NaN_to_mode)), 
                                ('replace_NaN_to_mean', ReplaceNaNToMean(cols_NaN_to_mean)),
                                ('replace_NaN_to_zero', ReplaceNaNToZero(cols_NaN_to_zero)),
                                ('add_new_features', AddNewFeatures()),                                
                                ('standard_scaler', OwnStandardScaler(['Age', 'RoomService', 'FoodCourt', 
                                                                       'ShoppingMall', 'Spa', 'VRDeck'])),
                                ('split_cabin', SplitCabin("Cabin")),
                                ('one_hot_encoder', OwnOneHotEncoder(['HomePlanet', 'Destination', 'Cabin_Deck', 'Cabin_Side'])),
                                ('cast_type', CastType({"CryoSleep": "int", "IsAmenities": "int"}))
                               ])

### SVM

In [750]:
svm_pipeline = deepcopy(preprocessing_w_ohe)
svm_pipeline.steps.append(('svc', SVC()))
param_grid =  {
    'svc__C': [1, 5, 10, 15, 20], 
    'svc__gamma': ['scale', 'auto'], 
    'svc__kernel': ['rbf', 'sigmoid']
}
gs = GridSearchCV(svm_pipeline, param_grid, n_jobs=8, return_train_score=True, error_score='raise', verbose=3)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [751]:
gs.best_estimator_

In [766]:
gs.score(X_test, y_test)

0.8102357676825762

### XGBoost

In [720]:
xgb_pipeline = deepcopy(preprocessing_w_ohe)
xgb_pipeline.steps.append(('xgb', xgb.XGBClassifier()))

In [721]:
xgb_pipeline.fit(X_train, y_train)

In [722]:
xgb_pipeline.score(X_test, y_test)

0.8090856814261069

### NN

In [777]:
# Implement function for transformation dataset

In [734]:
def preprocessing(X_train, X_test=None):
    t = ReplaceNaNToMode(cols_NaN_to_mode)
    t.fit(X_train)
    X_tmp = t.transform(X_test if X_test is not None else X_train)

    t = ReplaceNaNToMean(cols_NaN_to_mean)
    t.fit(X_train)
    X_tmp = t.transform(X_tmp)

    t = ReplaceNaNToZero(cols_NaN_to_zero)
    t.fit(X_train)
    X_tmp = t.transform(X_tmp)

    t = AddNewFeatures()
    t.fit(X_tmp)
    X_tmp = t.transform(X_tmp)

    t = OwnStandardScaler(['Age', 'RoomService', 'FoodCourt',
                           'ShoppingMall', 'Spa', 'VRDeck'])
    t.fit(X_train)
    X_tmp = t.transform(X_tmp)

    sc = SplitCabin("Cabin")
    sc.fit(X_tmp)
    X_tmp = sc.transform(X_tmp)

    ohe = OwnOneHotEncoder(['HomePlanet', 'Destination', 'Cabin_Deck', 'Cabin_Side'])
    ohe.fit(X_tmp)
    X_tmp = ohe.transform(X_tmp)

    ct = CastType({"CryoSleep": "int", "IsAmenities": "int"})
    ct.fit(X_tmp)
    X_tmp = ct.transform(X_tmp)
    
    return X_tmp

In [735]:
X_train_preproc = preprocessing(X_train)
X_test_preproc = preprocessing(X_train, X_test)
X_train_preproc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6954 entries, 0 to 6953
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  6954 non-null   int64  
 1   Age                        6954 non-null   float64
 2   RoomService                6954 non-null   float64
 3   FoodCourt                  6954 non-null   float64
 4   ShoppingMall               6954 non-null   float64
 5   Spa                        6954 non-null   float64
 6   VRDeck                     6954 non-null   float64
 7   IsAmenities                6954 non-null   int64  
 8   HomePlanet_Earth           6954 non-null   float64
 9   HomePlanet_Europa          6954 non-null   float64
 10  HomePlanet_Mars            6954 non-null   float64
 11  Destination_55_Cancri_e    6954 non-null   float64
 12  Destination_PSO_J318.5-22  6954 non-null   float64
 13  Destination_TRAPPIST-1e    6954 non-null   float

In [778]:
# Define NN

In [723]:
def SpaceShipTitanicNN(input_shape, name):
    inputs = Input(input_shape)
    X = Dense(units=128, activation='relu')(inputs)
    X = Dense(units=1)(X)
    X = Activation('sigmoid')(X)
    model = Model(inputs=inputs, outputs=X, name=name)
    return model

In [724]:
feature_counts = X_train_preproc.shape[1]

In [732]:
model = SpaceShipTitanicNN(input_shape=(feature_counts,), name="SpaceShipTitanicNN")
model.summary()

Model: "SpaceShipTitanicNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        [(None, 24)]              0         
_________________________________________________________________
dense_55 (Dense)             (None, 128)               3200      
_________________________________________________________________
dense_56 (Dense)             (None, 1)                 129       
_________________________________________________________________
activation_21 (Activation)   (None, 1)                 0         
Total params: 3,329
Trainable params: 3,329
Non-trainable params: 0
_________________________________________________________________


In [733]:
model.compile(loss=BinaryCrossentropy(from_logits=True), optimizer="adam", metrics=["accuracy"])

In [736]:
model.fit(X_train_preproc.to_numpy(), y_train, epochs=50, batch_size=64, 
          validation_data=(X_test_preproc.to_numpy(), y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f65ac343580>

In [737]:
model.evaluate(X_test_preproc.to_numpy(), y_test)



[0.39219486713409424, 0.8079355955123901]

### Prediction

In [782]:
# SVM is the best estimator

In [770]:
final_model = gs.best_estimator_

In [771]:
test_file = "data/test.csv"
submission_file = "data/submission.csv"
df_test_sub = pd.read_csv(test_file, sep=',')
df_submission = df_test_sub[["PassengerId"]]
X_test_sub = df_test_sub.drop(columns=['Name', 'PassengerId', 'VIP'])

In [774]:
y_predict = final_model.predict(X_test_sub)
df_submission = df_submission.assign(Transported=y_predict.astype('bool'))
df_submission.to_csv(submission_file, index=False)

In [None]:
# https://www.kaggle.com/competitions/spaceship-titanic/leaderboard# - Alexander Akhterov

In [None]:
# Predict for NN

In [549]:
X_test_sub_preproc = preprocessing(X_train, X_test_sub)

In [560]:
y_predict = model.predict(X_test_sub_preproc)
y_predict = (y_predict>0.5).squeeze()

In [562]:
df_submission = df_submission.assign(Transported=y_predict)
df_submission.to_csv(submission_file, index=False)