In [None]:
#make sure to add shared folder as a shortcut in drive
#https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab
from google.colab import drive 

# Mount Google Drive into the Colab VM; force_remount=True re-mounts even if a mount already exists.
drive.mount('/content/drive', force_remount=True)
# Base folder that later cells prepend to data and output file names (note the trailing slash).
path = "/content/drive/My Drive/Data Analytics Project Folder/"

Mounted at /content/drive


In [None]:
#only for kaggle oauth token
from google.colab import files

# Bind the result to a name instead of leaving the call as the cell's last
# expression: the returned dict holds the raw bytes of kaggle.json, and echoing
# it would store the API key in the notebook's saved output.
uploaded = files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle.json': b'{"username":"aidanhorn","key":"<REDACTED - revoke this token and generate a new one>"}'}

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [None]:
#Import statements here


import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import train_test_split, cross_validate

from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import KNNImputer
from sklearn.impute import IterativeImputer

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.cluster import KMeans

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn import metrics

from sklearn.metrics import confusion_matrix

# LDA/QDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# SVM
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import svm

# MLP
from sklearn.neural_network import MLPClassifier


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

Checklist:


*   Data preprocessing
  * normalize distances - use StandardScaler in pipeline
  * encode categorical data - done
  * Create new predictors - done
  * Missing Values
      *  one-hot encoding - imply missing data (none of the above, 0 in all columns)
      *  Drop rows with missing values (bad)
      *  Mean/mode imputation (MM)
      *  KNN imputation
*   EDA
  * Predictor variance
  * principal components
* Methods
  * Logistic Regression - Done MM & KNN
  * LDA/QDA
  * SVM
    * Try kernelization (Sklearn LinearSVM)
  



# Load Data, Imputation, and Feature Engineering

In [None]:
# `data`: every labelled row available for training and testing our models.
# `validation`: the competition submission set used to obtain the Kaggle score.
data, validation = (
    pd.read_csv(f"{path}{csv_name}") for csv_name in ("data.csv", "validation.csv")
)


In [None]:
#methods for imputation and feature engineering

def build_df(data):
  """Assemble the base feature table used by the drop-NA and KNN variants.

  Keeps the raw passenger columns, adds an ``InParty`` flag and one-hot
  columns for ``HomePlanet`` and ``Destination``. Missing values in the raw
  columns are left untouched so the caller can pick an imputation strategy.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain ``PassengerId``, ``HomePlanet``, ``Destination`` and the
      eight raw columns selected below. ``Transported`` is optional.

  Returns
  -------
  pandas.DataFrame
      ``From_*`` columns, ``To_*`` columns, the selected raw columns,
      ``InParty`` and, when present in ``data``, ``Transported`` as the last
      column.
  """
  df = data.loc[:, ['CryoSleep','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].copy()

  # A passenger is "in a party" when at least one other row shares the first
  # four characters of PassengerId. Counting group sizes once replaces the
  # previous all-pairs substring scan (quadratic in the number of rows) and
  # gives the same flags for ids shaped like "0003_01".
  group_key = data['PassengerId'].str[:4]
  # .to_numpy() keeps the assignment positional, as the original list was.
  df['InParty'] = (group_key.map(group_key.value_counts()) > 1).astype('int64').to_numpy()

  hp_dummy = pd.get_dummies(data['HomePlanet'], drop_first=False, prefix = "From", prefix_sep='_')

  dest_dummy = pd.get_dummies(data['Destination'], drop_first=False, prefix = "To", prefix_sep='_')

  # Final column order: HomePlanet dummies, Destination dummies, raw columns.
  df = pd.concat([hp_dummy, dest_dummy, df], axis=1)

  if 'Transported' in data.columns:
    df['Transported'] = data['Transported']

  return df

def build_df_dropna(data):
  """Return the ``build_df`` feature table with every incomplete row removed.

  The result is tagged with ``name = "DropNA"``.
  """
  complete_rows = build_df(data).dropna()
  complete_rows.name = "DropNA"
  return complete_rows

def build_df_MM(data):
  """Build the mean-imputed feature table.

  Drops the identifier-like columns (``PassengerId``, ``Cabin``, ``Name``),
  one-hot encodes ``HomePlanet`` and ``Destination``, and fills every remaining
  missing value with its column mean. The result is tagged with
  ``name = "MeanImpute"``.
  """
  home_onehot = pd.get_dummies(data['HomePlanet'], drop_first=False, prefix = "From", prefix_sep='_')
  dest_onehot = pd.get_dummies(data['Destination'], drop_first=False, prefix = "To", prefix_sep='_')

  # The raw categorical columns are superseded by their one-hot encodings.
  remaining = data.drop(['PassengerId','Cabin','Name','HomePlanet','Destination'], axis = 1)

  combined = pd.concat([home_onehot, dest_onehot, remaining], axis=1)

  imputed = combined.fillna(combined.mean())
  imputed.name = "MeanImpute"

  return imputed

def build_df_KNN(data):
  """Build the KNN-imputed feature table.

  Starts from ``build_df(data)`` and fills missing feature values with a
  ``KNNImputer`` (10 neighbours) fitted on the same frame. ``Transported`` is
  kept out of the imputation and re-attached afterwards. The result is tagged
  with ``name = "KNNImpute"``.

  Parameters
  ----------
  data : pandas.DataFrame
      Raw passenger table accepted by ``build_df``.

  Returns
  -------
  pandas.DataFrame
      Imputed features (all float), plus ``Transported`` when ``data`` has it.
  """
  df = build_df(data)

  imputer = KNNImputer(n_neighbors=10)

  has_target = 'Transported' in data.columns

  # Select the feature columns by name rather than by "everything but the
  # last column", so the target is excluded wherever it sits.
  features = df.drop(columns=['Transported']) if has_target else df

  # Carry the original index over: without it the imputed frame gets a fresh
  # 0..n-1 index and the index-aligned target assignment below would mismatch
  # rows whenever `data` does not use a default RangeIndex.
  df = pd.DataFrame(imputer.fit_transform(features), columns = features.columns, index = features.index)

  if has_target:
    df['Transported'] = data['Transported']

  df.name = "KNNImpute"

  return df

def build_df_II(data):
  """Build the feature table using iterative imputation on ordinal-encoded data.

  Every column of ``data`` except ``Transported`` is ordinal-encoded, imputed
  with ``IterativeImputer``, decoded back, and then feature-engineered
  (``InParty`` flag, one-hot home planet / destination / cabin side, numeric
  cabin number, frequency-encoded cabin deck). The result is tagged with
  ``name = "IterativeImpute"``.
  """

  enc = OrdinalEncoder()

  if 'Transported' in data.columns:

    df_raw = data.drop(['Transported'], axis = 1)
  
  else:

    df_raw = data

  # All columns (numeric ones included) are turned into ordinal codes.
  # NOTE(review): this presumably relies on OrdinalEncoder passing missing values
  # through as NaN so the imputer can see them - confirm for the installed scikit-learn.
  dfOE = pd.DataFrame(enc.fit_transform(df_raw),columns = df_raw.columns)

  imp = IterativeImputer(max_iter=10, random_state=0)

  imp.fit(dfOE)

  df = pd.DataFrame(imp.transform(dfOE), columns = dfOE.columns)

  # Decode the (imputed) codes back to the original category values; anything
  # that is still missing after decoding is replaced with 0.
  df = pd.DataFrame(enc.inverse_transform(df),columns = df.columns).fillna(0)

  df.pop('Name')

  # 1 when the first four characters of this PassengerId occur in more than one
  # PassengerId of the frame (the row itself always counts once), else 0.
  # Every id is compared against every id, so this is quadratic in the row count.
  df['InParty'] = [1 if sum(id[:4] in p_Id for p_Id in df['PassengerId']) > 1 else 0 for id in df['PassengerId']]

  df.pop('PassengerId')

  # drop_first=True: the first category of each variable becomes the all-zeros baseline.
  hp_dummy = pd.get_dummies(df['HomePlanet'], drop_first=True, prefix = "From", prefix_sep='')

  dest_dummy = pd.get_dummies(df['Destination'], drop_first=True, prefix = "To", prefix_sep='')

  # Two front-concats give the column order: HomePlanet dummies, Destination dummies, rest.
  df = pd.concat([dest_dummy, df], axis=1)

  df = pd.concat([hp_dummy, df], axis=1)

  df.pop('HomePlanet')

  df.pop('Destination')

  # Cabin strings are split on "/" into three parts (e.g. "B/0/P" -> deck, number, side).
  df[['CabinDeck','CabinNumber','CabinSide']] = df['Cabin'].str.split("/", n = 2, expand = True)
  df.pop('Cabin')

  side_dummy = pd.get_dummies(df['CabinSide'], prefix = "Side", drop_first=True)
  df = pd.concat([side_dummy, df], axis=1)
  df.pop('CabinSide')

  df['CabinNumber'] = df['CabinNumber'].astype(float)

  # Frequency encoding: each deck label is replaced by the share of rows on that deck.
  nom = (df.groupby('CabinDeck').size()) / len(df)
  df['CabinDeck'] = df['CabinDeck'].apply(lambda x : nom[x])

  if 'Transported' in data.columns:

      # Index-aligned assignment; df carries a fresh default index at this point.
      df['Transported'] = data['Transported']

  df.name = "IterativeImpute"

  return df

def build_df_custom(data):
  """Build the hand-imputed ("Custom") feature table.

  Imputation rules applied to the given frame:

  * ``CryoSleep`` / ``VIP``: encoded as 1/0, missing treated as 0.
  * ``HomePlanet`` / ``Destination``: missing replaced by the most frequent
    value, then one-hot encoded with the first category dropped.
  * ``Cabin``: split into deck / number / side; missing deck becomes ``'F'``,
    missing side becomes the most frequent side among deck-F rows, missing
    number becomes a fixed constant; the deck is then frequency-encoded.
  * ``Age``: missing replaced by the median.
  * Spending columns: missing replaced by 0.

  Parameters
  ----------
  data : pandas.DataFrame
      Raw passenger table with ``PassengerId``, ``HomePlanet``,
      ``Destination``, ``Cabin`` and the numeric / boolean feature columns.
      ``Transported`` is optional.

  Returns
  -------
  pandas.DataFrame
      Columns in the order: side dummies, ``From*`` dummies, ``To*`` dummies,
      raw features, ``InParty``, ``CabinDeck``, ``CabinNumber`` and (if
      available) ``Transported`` encoded as 1/0. Tagged ``name = "Custom"``.
  """
  spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
  # Fill value for a missing cabin number (kept from the original analysis).
  # NOTE(review): presumably half of the largest observed cabin number - confirm.
  cabin_number_fill = 1796/2

  df_temp = data.loc[:, ['CryoSleep','Age','VIP'] + spend_cols].copy()

  # Party flag: 1 when another row shares the first four characters of
  # PassengerId. Group sizes are counted once instead of comparing every id
  # with every other id.
  group_key = data['PassengerId'].str[:4]
  df_temp['InParty'] = (group_key.map(group_key.value_counts()) > 1).astype('int64').to_numpy()

  # Booleans -> 1/0; values outside the mapping (i.e. missing) become 0.
  for flag_col in ('CryoSleep', 'VIP'):
    df_temp[flag_col] = df_temp[flag_col].map({True: 1, False: 0}).fillna(0).astype('int64')

  # Mode-impute the categorical columns BEFORE one-hot encoding. Previously the
  # dummies were built from the raw columns, so the imputed values were thrown
  # away and missing rows silently fell into the dropped baseline category.
  home_filled = data['HomePlanet'].fillna(data['HomePlanet'].value_counts().index[0])
  dest_filled = data['Destination'].fillna(data['Destination'].value_counts().index[0])

  hp_dummy = pd.get_dummies(home_filled, drop_first=True, prefix = "From", prefix_sep='')
  dest_dummy = pd.get_dummies(dest_filled, drop_first=True, prefix = "To", prefix_sep='')

  # "deck/number/side"; reindex guarantees three parts even if no cabin is known.
  cabin_parts = data['Cabin'].str.split("/", n = 2, expand = True).reindex(columns=range(3))

  cabin_deck = cabin_parts[0].fillna('F')

  # Missing side -> most common side among deck-F rows; fall back to the most
  # common side overall when deck F has no known side (was an IndexError).
  cabin_side = cabin_parts[2]
  side_counts = cabin_side[cabin_deck == 'F'].value_counts()
  if side_counts.empty:
    side_counts = cabin_side.value_counts()
  if not side_counts.empty:
    cabin_side = cabin_side.fillna(side_counts.index[0])
  side_dummy = pd.get_dummies(cabin_side, prefix = "Side", drop_first=True)

  # Frequency encoding: each deck label becomes the share of rows on that deck.
  df_temp['CabinDeck'] = cabin_deck.map(cabin_deck.value_counts(normalize=True))
  df_temp['CabinNumber'] = cabin_parts[1].astype(float).fillna(cabin_number_fill)

  # .median() replaces the positional lookup describe()[5] (same value, no
  # dependence on the layout of describe()).
  df_temp['Age'] = df_temp['Age'].fillna(df_temp['Age'].median())

  df_temp[spend_cols] = df_temp[spend_cols].fillna(0.0)

  df_temp = pd.concat([side_dummy, hp_dummy, dest_dummy, df_temp], axis=1)

  if 'Transported' in data.columns:
    df_temp['Transported'] = data['Transported'].map({True: 1, False: 0})

  df_temp.name = "Custom"

  return df_temp

Exploratory Analysis

In [None]:
data.head()

# displaying the data 

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

# Building Datasets

In [None]:
# One training frame per feature-engineering / imputation strategy, all derived
# from the same raw `data` frame.

# Base features, missing values left in place.
df = build_df(data)

# Base features with incomplete rows removed.
df_dropna = build_df_dropna(data)

# Column-mean imputation.
df_MM = build_df_MM(data)

# KNN imputation.
df_KNN = build_df_KNN(data)

# Iterative imputation.
df_II = build_df_II(data)

# Hand-written imputation rules; later cells refer to this frame as `df_temp`.
df_temp = build_df_custom(data)

In [None]:
#Construct list for comparison of FE/Imputation techniques for train/test

df_methods = [df_MM, df_KNN, df_II, df_temp]

In [None]:
# Same builders applied to the competition `validation` frame.
# NOTE(review): each builder derives its fill values / imputer / dummy columns
# from the frame it is given, so these are computed from `validation` itself,
# not from the training data - confirm the columns line up with the training frames.

valid = build_df(validation)

valid_dropna = build_df_dropna(validation)

valid_MM = build_df_MM(validation)

valid_KNN = build_df_KNN(validation)

valid_II = build_df_II(validation)

valid_temp = build_df_custom(validation)

In [None]:
#Construct list for comparison of FE/Imputation techniques for validation

valid_methods = [valid_MM, valid_KNN, valid_II, valid_temp]

# Reports

In [None]:
ProfileReport(data)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
ProfileReport(df_temp)

Output hidden; open in https://colab.research.google.com to view.

# Random Plots

In [None]:
%matplotlib notebook
sns.pairplot(df_temp).savefig(path+'pairplot.png')

<IPython.core.display.Javascript object>

# Logistic Regression


In [None]:
#Logistic Regression with train/test set
# Fits a scaled logistic regression on a 70/30 split of each engineered frame
# and prints the hold-out accuracy and confusion table per imputation method.

for df_ in df_methods:

  X = df_.drop(['Transported'], axis = 1)
  y = df_['Transported']

  # Seeded split: without random_state every run scores a different partition,
  # so differences between imputation methods could just be split noise.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  clf = make_pipeline(StandardScaler(),LogisticRegression(random_state=0, max_iter = 1000, solver = 'liblinear')).fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  cf = pd.crosstab(y_pred, y_test, colnames = ['Actual'], rownames = ['Predicted'])

  print("Method:", df_.name)

  print("Score:", clf.score(X_test, y_test))

  print(cf, end = "\n\n")

Method: MeanImpute
Score: 0.781441717791411
Actual     False  True
Predicted             
False       1006   286
True         284  1032

Method: KNNImpute
Score: 0.7967791411042945
Actual     False  True
Predicted             
False       1000   248
True         282  1078

Method: IterativeImpute
Score: 0.7668711656441718
Actual     False  True
Predicted             
False       1078   396
True         212   922

Method: Custom
Score: 0.7906441717791411
Actual     False  True
Predicted             
False        984   252
True         294  1078



Confusion Matrix

In [None]:
# Confusion matrix for Logistic Regression
# NOTE: `y_test` and `y_pred` are whatever the most recently executed cell left
# behind, so run this directly after the train/test cell; it then shows the
# last method of that loop.

cm = confusion_matrix(y_test, y_pred)

# pyplot (already imported as plt) replaces the discouraged pylab interface.
fig, ax = plt.subplots()
im = ax.matshow(cm)
ax.set_title('Confusion matrix of the classifier')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.colorbar(im, ax=ax)
plt.show()

In [None]:
#Logistic Regression with complete data and validation set
#Kaggle Score by Impute Method: 
#Mean: 0.78559
#KNN: 0.78676
#Iterative: 0.78536

# Train on every labelled row of each engineered frame, predict the matching
# engineered validation frame and write one submission CSV per method.
for df_, valid_ in zip(df_methods, valid_methods):

  y_train = df_['Transported']
  X_train = df_.drop(columns=['Transported'])

  X_test = valid_

  clf = make_pipeline(
      StandardScaler(),
      LogisticRegression(random_state=0, max_iter = 1000, solver = 'liblinear'),
  )
  clf.fit(X_train, y_train)

  # Cast the predicted labels to booleans for the submission file.
  y_pred = clf.predict(X_test).astype('bool')

  submission = pd.DataFrame({ 'PassengerId': validation.PassengerId.values, 'Transported': y_pred })

  fname = f"my_submission_LR_{df_.name}.csv"

  submission.to_csv(fname, index=False)

In [None]:
!kaggle competitions submit -c spaceship-titanic -f my_submission_LR_IterativeImpute.csv -m "IterativeImputeLogisticRegression"

# Random Forest

In [None]:
#Random Forest with train/test split
# Fits a 400-tree entropy random forest on a 70/30 split of each engineered
# frame and prints the hold-out accuracy and confusion table per method.

for df_ in df_methods:

  X = df_.drop(['Transported'], axis = 1)
  y = df_['Transported']

  # Both the split and the forest are seeded so the comparison between
  # imputation methods is reproducible from run to run.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  clf=make_pipeline(StandardScaler(),RandomForestClassifier(n_estimators=400, criterion = 'entropy', bootstrap = True, random_state=0))
  
  clf.fit(X_train,y_train)

  y_pred=clf.predict(X_test)

  cf = pd.crosstab(y_pred, y_test, colnames = ['Actual'], rownames = ['Predicted'])

  print("Method:", df_.name)

  print("Score:", clf.score(X_test, y_test))

  print(cf, end = "\n\n")

Method: MeanImpute
Score: 0.7868098159509203
Actual     False  True
Predicted             
False       1050   317
True         239  1002

Method: KNNImpute
Score: 0.781441717791411
Actual     False  True
Predicted             
False        997   300
True         270  1041

Method: IterativeImpute
Score: 0.7925613496932515
Actual     False  True
Predicted             
False       1066   299
True         242  1001

Method: Custom
Score: 0.7975460122699386
Actual     False  True
Predicted             
False       1084   321
True         207   996



In [None]:
#Random Forest with complete data and validation set
#Kaggle Score by Impute Method: 
#Mean: 0.78512
#KNN: 0.77834
#Iterative: 0.77133

# Train on every labelled row of each engineered frame, predict the matching
# engineered validation frame and write one submission CSV per method.
for df_, valid_ in zip(df_methods, valid_methods):

  y_train = df_['Transported']
  X_train = df_.drop(columns=['Transported'])

  X_test = valid_

  clf = make_pipeline(
      StandardScaler(),
      RandomForestClassifier(n_estimators=400, criterion = 'entropy', bootstrap = True),
  )
  clf.fit(X_train, y_train)

  # Cast the predicted labels to booleans for the submission file.
  y_pred = clf.predict(X_test).astype('bool')

  submission = pd.DataFrame({ 'PassengerId': validation.PassengerId.values, 'Transported': y_pred })

  fname = f"my_submission_RF_{df_.name}.csv"

  submission.to_csv(fname, index=False)

In [None]:
!kaggle competitions submit -c spaceship-titanic -f my_submission_RF_Custom.csv -m "RF Custom 400 entropy bootstrap"

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.7/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 166, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


# KNN Classifier

In [None]:
#KNN with train/test
# Fits a scaled 5-nearest-neighbour classifier on a 70/30 split of each
# engineered frame and prints the hold-out accuracy and confusion table.

for df_ in df_methods:

  X = df_.drop(['Transported'], axis = 1)
  y = df_['Transported']

  # Seeded split so reruns score the same partition for every method.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  clf = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)).fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  cf = pd.crosstab(y_pred, y_test, colnames = ['Actual'], rownames = ['Predicted'])

  print("Method:", df_.name)

  print("Score:", clf.score(X_test, y_test))

  print(cf, end = "\n\n")

Method: MeanImpute
Score: 0.75920245398773
Actual     False  True
Predicted             
False        971   272
True         356  1009

Method: KNNImpute
Score: 0.7584355828220859
Actual     False  True
Predicted             
False       1020   334
True         296   958

Method: IterativeImpute
Score: 0.7649539877300614
Actual     False  True
Predicted             
False        999   321
True         292   996

Method: Custom
Score: 0.7756901840490797
Actual     False  True
Predicted             
False       1015   293
True         292  1008



In [None]:
#KNN with complete data and validation set
#Kaggle Score by Impute Method: 
#Mean: 
#KNN: 
#Iterative

# Train on every labelled row of each engineered frame, predict the matching
# engineered validation frame and write one submission CSV per method.
for df_, valid_ in zip(df_methods, valid_methods):

  y_train = df_['Transported']
  X_train = df_.drop(columns=['Transported'])

  X_test = valid_

  clf = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))
  clf.fit(X_train, y_train)

  # Cast the predicted labels to booleans for the submission file.
  y_pred = clf.predict(X_test).astype('bool')

  submission = pd.DataFrame({ 'PassengerId': validation.PassengerId.values, 'Transported': y_pred })

  fname = f"my_submission_KNN_{df_.name}.csv"

  submission.to_csv(fname, index=False)

In [None]:
!kaggle competitions submit -c spaceship-titanic -f my_submission_KNN_Custom.csv -m "KNN Custom 10"

100% 56.6k/56.6k [00:01<00:00, 37.4kB/s]
Successfully submitted to Spaceship Titanic

# LDA/QDA

In [None]:
# LDA 
# Fits a scaled linear discriminant analysis on a 70/30 split of each
# engineered frame and prints the hold-out accuracy and confusion table.

for df_ in df_methods:

  X = df_.drop(['Transported'], axis = 1)
  y = df_['Transported']

  # Seeded split so reruns score the same partition for every method.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  clf = make_pipeline(StandardScaler(), LinearDiscriminantAnalysis(solver = 'svd')) # use singular value decomp to solve 

  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  cf = pd.crosstab(y_pred, y_test, colnames = ['Actual'], rownames = ['Predicted'])

  print("Method:", df_.name)

  print("Score:", clf.score(X_test, y_test))

  print(cf, end = "\n\n")



Method: MeanImpute
Score: 0.7668711656441718
Actual     False  True
Predicted             
False       1087   392
True         216   913

Method: KNNImpute
Score: 0.758819018404908
Actual     False  True
Predicted             
False       1063   393
True         236   916

Method: IterativeImpute
Score: 0.7565184049079755
Actual     False  True
Predicted             
False       1077   416
True         219   896

Method: Custom
Score: 0.7672546012269938
Actual     False  True
Predicted             
False       1076   372
True         235   925



In [None]:
#LDA with complete data and validation set
#Kaggle Score by Impute Method: 
#Mean: 
#KNN: 
#Iterative: 

# Train on every labelled row of each engineered frame, predict the matching
# engineered validation frame and write one submission CSV per method.
for df_, valid_ in zip(df_methods, valid_methods):

  X_train = df_.drop(['Transported'], axis = 1)
  y_train = df_['Transported']

  X_test = valid_

  clf = make_pipeline(StandardScaler(), LinearDiscriminantAnalysis(solver = 'svd')) # use singular value decomp to solve 

  clf.fit(X_train, y_train)

  # Cast to bool like the LR / RF / KNN submission cells: the Custom frame
  # encodes Transported as 1/0, which would otherwise be written as integers.
  y_pred = clf.predict(X_test).astype('bool')

  submission = pd.DataFrame({ 'PassengerId': validation.PassengerId.values, 'Transported': y_pred })
  
  fname = "my_submission_LDA_" + df_.name + ".csv"

  submission.to_csv(fname, index=False)



In [None]:
# Kaggle call

!kaggle competitions submit -c spaceship-titanic -f my_submission_LDA_Custom.csv -m "LDA Custom"

100% 56.6k/56.6k [00:01<00:00, 31.6kB/s]
Successfully submitted to Spaceship Titanic

In [None]:
# QDA 
# Fits a scaled quadratic discriminant analysis on a 70/30 split of each
# engineered frame and prints the hold-out accuracy and confusion table.
for df_ in df_methods:

  X = df_.drop(['Transported'], axis = 1)
  y = df_['Transported']

  # Seeded split so reruns score the same partition for every method.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  clf = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis())

  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  cf = pd.crosstab(y_pred, y_test, colnames = ['Actual'], rownames = ['Predicted'])

  print("Method:", df_.name)

  print("Score:", clf.score(X_test, y_test))

  print(cf, end = "\n\n")



Method: MeanImpute
Score: 0.7085889570552147
Actual     False  True
Predicted             
False        657    97
True         663  1191

Method: KNNImpute
Score: 0.6924846625766872
Actual     False  True
Predicted             
False        644   123
True         679  1162

Method: IterativeImpute
Score: 0.7611196319018405
Actual     False  True
Predicted             
False        994   319
True         304   991

Method: Custom
Score: 0.718558282208589
Actual     False  True
Predicted             
False        649   106
True         628  1225



In [None]:
#QDA with complete data and validation set
#Kaggle Score by Impute Method: .7311
#Mean: 
#KNN: 
#Iterative: 

for df_, valid_ in zip(df_methods, valid_methods):

  X_train = df_.drop(['Transported'], axis = 1)
  y_train = df_['Transported']

  X_test = valid_

  # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

  clf = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis()) # use singular value decomp to solve 

  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  # cf = pd.crosstab(y_pred, y_test, colnames = ['Actual'], rownames = ['Predicted'])

  submission = pd.DataFrame({ 'PassengerId': validation.PassengerId.values, 'Transported': y_pred })
  
  fname = "my_submission_RF_" + df_.name + ".csv"

  submission.to_csv(fname, index=False)

In [None]:
# Kaggle call

!kaggle competitions submit -c spaceship-titanic -f my_submission_RF_Custom.csv -m "IterativeImputeQDA"


100% 55.6k/55.6k [00:00<00:00, 98.1kB/s]
Successfully submitted to Spaceship Titanic


# SVM

In [None]:
# attempting SVM with kernelization through scikit-learn
# Fits a scaled linear-kernel SVC on a 70/30 split of each engineered frame and
# prints the hold-out accuracy and confusion table per imputation method.

for df_ in df_methods:

  X = df_.drop(['Transported'], axis = 1)
  y = df_['Transported']

  # Seeded split so reruns score the same partition for every method.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  clf = make_pipeline(StandardScaler(), SVC(kernel = 'linear'))

  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  cf = pd.crosstab(y_pred, y_test, colnames = ['Actual'], rownames = ['Predicted'])

  print("Method:", df_.name)

  print("Score:", clf.score(X_test, y_test))

  print(cf, end = "\n\n")

Method: MeanImpute
Score: 0.7791411042944786
Actual     False  True
Predicted             
False       1009   268
True         308  1023

Method: KNNImpute
Score: 0.7871932515337423
Actual     False  True
Predicted             
False       1020   294
True         261  1033

Method: IterativeImpute
Score: 0.7553680981595092
Actual     False  True
Predicted             
False       1096   456
True         182   874

Method: Custom
Score: 0.7649539877300614
Actual     False  True
Predicted             
False       1053   376
True         237   942



In [None]:
#SVM with complete data and validation set
#Kaggle Score by Impute Method: 
#Mean: 
#KNN: 
#Iterative: 


# Train on every labelled row of each engineered frame, predict the matching
# engineered validation frame and write one submission CSV per method.
for df_, valid_ in zip(df_methods, valid_methods):

  X_train = df_.drop(['Transported'], axis = 1)
  y_train = df_['Transported']

  X_test = valid_

  clf = make_pipeline(StandardScaler(), SVC(kernel = 'linear'))

  clf.fit(X_train, y_train)

  # Cast to bool like the LR / RF / KNN submission cells: the Custom frame
  # encodes Transported as 1/0, which would otherwise be written as integers.
  y_pred = clf.predict(X_test).astype('bool')

  submission = pd.DataFrame({ 'PassengerId': validation.PassengerId.values, 'Transported': y_pred })
  
  # Was "my_submission_RF_", which overwrote the Random Forest submission files.
  fname = "my_submission_SVM_" + df_.name + ".csv"

  submission.to_csv(fname, index=False)

In [None]:
# Kaggle call

# Multi-Layer Perceptron

In [None]:
# MLP with train/test split
# Fits a scaled single-hidden-layer (4 units) perceptron on a 70/30 split of
# each engineered frame and prints the hold-out accuracy and confusion table.
for df_ in df_methods:

  X = df_.drop(['Transported'], axis = 1)
  y = df_['Transported']

  # Seeded split so reruns score the same partition for every method.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  clf = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, 
                                                          hidden_layer_sizes = (4,),
                                                          activation = 'relu',
                                                          solver = 'adam',
                                                          max_iter=10000))
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  cf = pd.crosstab(y_pred, y_test, colnames = ['Actual'], rownames = ['Predicted'])

  print("Method:", df_.name)

  print("Score:", clf.score(X_test, y_test))

  print(cf, end = "\n\n")

Method: MeanImpute
Score: 0.7929447852760736
Actual     False  True
Predicted             
False       1013   244
True         296  1055

Method: KNNImpute
Score: 0.7879601226993865
Actual     False  True
Predicted             
False        958   217
True         336  1097

Method: IterativeImpute
Score: 0.7975460122699386
Actual     False  True
Predicted             
False       1038   246
True         282  1042

Method: Custom
Score: 0.781441717791411
Actual     False  True
Predicted             
False        973   251
True         319  1065



In [None]:
#MLP with complete data and validation set
#Kaggle Score by Impute Method: 
#Mean: 
#KNN: 
#Iterative: 
#Custom:


# Train on every labelled row of each engineered frame, predict the matching
# engineered validation frame and write one submission CSV per method.
for df_, valid_ in zip(df_methods, valid_methods):

  X_train = df_.drop(['Transported'], axis = 1)
  y_train = df_['Transported']

  X_test = valid_

  clf = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, 
                                                          hidden_layer_sizes = (4,),
                                                          activation = 'relu',
                                                          solver = 'adam',
                                                          max_iter=10000))
  clf.fit(X_train, y_train)

  # Cast to bool like the LR / RF / KNN submission cells: the Custom frame
  # encodes Transported as 1/0, which would otherwise be written as integers.
  y_pred = clf.predict(X_test).astype('bool')

  submission = pd.DataFrame({ 'PassengerId': validation.PassengerId.values, 'Transported': y_pred })
  
  fname = "my_submission_MLP_" + df_.name + ".csv"

  submission.to_csv(fname, index=False)

In [None]:
!kaggle competitions submit -c spaceship-titanic -f my_submission_MLP_Custom.csv -m "MLP Custom 4"

In [None]:
#Try out different mlp configurations
# Grid over hidden-layer layouts, activations and solvers on the Custom frame;
# macro precision / recall / F1 on a fixed hold-out split are collected in
# `mlp_performance` (one row per configuration).

from sklearn.metrics import precision_score, recall_score, f1_score 

X = df_temp.drop(['Transported'], axis = 1)
y = df_temp['Transported']

# Seeded split: every configuration is compared on the same partition, and a
# rerun reproduces the table.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

mlp_columns = ["Hidden Layer Size", 
               "Activation", 
               "Solver", 
               "Precision", 
               "Recall",
               "F1-Score"]

# Rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame one .loc assignment at a time.
mlp_records = []

for hls in [(1,),(4,),(8,),(16,),(24,),(2,1,),(4,1,),(8,1,),(4,4,),(8,8,),(8,4,4,),(16,8,4,)]:
    for activation in ['identity','logistic','tanh','relu']:
        for solver in ['lbfgs', 'sgd', 'adam']:
        
            clf = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, 
                                                        hidden_layer_sizes = hls,
                                                        activation = activation,
                                                        solver = solver,
                                                        max_iter=10000))
            clf.fit(X_train, y_train)
            
            y_pred = clf.predict(X_test)

            # zero_division=0 gives the same value as the default for a
            # configuration that never predicts one class, but without
            # emitting an undefined-metric warning for each such case.
            mlp_records.append({
                "Hidden Layer Size": hls,
                "Activation": activation,
                "Solver": solver,
                "Precision": precision_score(y_test, y_pred, average = 'macro', zero_division = 0),
                "Recall": recall_score(y_test, y_pred, average = 'macro', zero_division = 0),
                "F1-Score": f1_score(y_test, y_pred, average = 'macro', zero_division = 0),
            })

mlp_performance = pd.DataFrame(mlp_records, columns = mlp_columns)

  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the nu

# K-Means

In [None]:
# K-Means (2 clusters) used as a classifier on a 70/30 split of each frame.
# Cluster ids are arbitrary, so each cluster is first mapped to the majority
# training label it contains; accuracy is then computed on those labels.
for df_ in df_methods:

  X = df_.drop(['Transported'], axis = 1)
  y = df_['Transported']

  # Seeded split so reruns score the same partition for every method.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  clf = make_pipeline(StandardScaler(), KMeans(n_clusters=2, random_state=0)) # one cluster per class
  
  # y_train is accepted by the pipeline but ignored by KMeans (unsupervised).
  clf.fit(X_train, y_train)

  # cluster id -> most frequent Transported value among its training rows
  train_clusters = clf.predict(X_train)
  cluster_to_label = pd.Series(np.asarray(y_train)).groupby(train_clusters).agg(lambda labels: labels.mode().iloc[0])

  y_pred = pd.Series(clf.predict(X_test)).map(cluster_to_label).to_numpy()

  cf = pd.crosstab(y_pred, y_test, colnames = ['Actual'], rownames = ['Predicted'])

  print("Method:", df_.name)

  # Pipeline.score delegates to KMeans.score, which is the negative inertia
  # (a clustering objective), not an accuracy - report both, clearly labelled.
  print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

  print("Negative inertia:", clf.score(X_test, y_test))

  print(cf, end = "\n\n")

Method: MeanImpute
Score: -31823.06460894126
Actual     False  True
Predicted             
0           1076   877
1            225   430

Method: KNNImpute
Score: -33066.31946153259
Actual     False  True
Predicted             
0            232   399
1           1084   893

Method: IterativeImpute
Score: -36073.17470277496
Actual     False  True
Predicted             
0           1058   875
1            238   437

Method: Custom
Score: -36551.56520482966
Actual     False  True
Predicted             
0           1067   909
1            225   407



# Feature Forest

In [None]:
#assumes that target feature has no nulls
class feature_forest:
  """One decision tree per missing-value pattern, so rows never need imputing.

  A "feature subset" is the tuple of column names that are non-null in a row.
  For every subset seen in the training data a ``DecisionTreeClassifier`` is
  trained on those columns only, using all training rows that are complete on
  them. At prediction time a row is routed to the tree for its own subset; a
  tree for a subset that has not been seen yet is trained lazily and cached.

  Attributes
  ----------
  forest : dict
      Maps a feature-subset tuple to its fitted classifier.
  X, y :
      Training features / target stored by ``generate_forest`` so that trees
      for new subsets can be trained during ``predict``.
  """

  def __init__(self) -> None:
      # Per-instance state. These used to be class attributes, so the mutable
      # `forest` dict was shared (and mutated) across all instances.
      self.forest = {}
      self.X = pd.DataFrame()
      self.y = []

  def get_feature_subset(self, df, i):
    """Return the tuple of columns that are non-null in the i-th row (by position) of ``df``."""
    # Positional lookup: works for any index, not only a default 0..n-1 one.
    return tuple(df.columns[df.iloc[i].notna().to_numpy()])

  def get_feature_subsets(self, df_X):
    """Return the set of all feature subsets present in ``df_X``, using NaN as the criterion."""
    # De-duplicate the boolean "is present" patterns first, so only the
    # distinct patterns are converted into column tuples.
    present_patterns = df_X.notna().drop_duplicates().to_numpy()
    return {tuple(df_X.columns[pattern]) for pattern in present_patterns}

  def train_tree(self, feature_subset, X, y):
    """Fit a tree on ``feature_subset`` using the rows of ``X`` that are complete on it."""
    columns = list(feature_subset)

    # Boolean row mask applied positionally to both X and y keeps them aligned
    # regardless of the index labels.
    complete = X[columns].notna().all(axis = 1).to_numpy()

    X_subset = X.loc[complete, columns]
    y_subset = y[complete]

    clf = DecisionTreeClassifier(random_state=0).fit(X_subset,y_subset)

    return clf

  def generate_forest(self, df, target: str):
    """Train one tree per feature subset found in ``df`` (features = all columns but ``target``)."""
    self.X = df.drop([target], axis = 1)
    self.y = df[target]

    self.forest = {
        feature_subset: self.train_tree(feature_subset, X = self.X, y = self.y)
        for feature_subset in self.get_feature_subsets(self.X)
    }

  def predict(self, df_X):
    """Predict one label per row of ``df_X``; returns a list in row order."""
    y_pred = []

    for i in range(len(df_X)):

      key = self.get_feature_subset(df_X, i)

      clf = self.forest.get(key)

      if clf is None:
        # Subset not seen during generate_forest: train and cache it now.
        clf = self.train_tree(key, self.X, self.y)
        self.forest[key] = clf

      # Single-row frame restricted to the columns this row actually has.
      row = df_X.iloc[[i]][list(key)]

      y_pred.append(clf.predict(row)[0])

    return y_pred

In [None]:
def build_df_nan(data):
  """Build the feature table for the feature forest, keeping missing values.

  Applies the feature engineering (one-hot home planet / destination / cabin
  side, ``InParty`` flag, numeric cabin number, frequency-encoded cabin deck)
  without imputing anything, so NaN stays NaN in the numeric columns.

  Parameters
  ----------
  data : pandas.DataFrame
      Raw passenger table with ``PassengerId``, ``HomePlanet``,
      ``Destination``, ``Cabin`` and the feature columns selected below.
      ``Transported`` is optional.

  Returns
  -------
  pandas.DataFrame
      Side dummies, ``From*`` dummies, ``To*`` dummies, the raw features,
      ``InParty``, ``CabinDeck``, ``CabinNumber`` and (if available)
      ``Transported``. Tagged ``name = "FeatureForest"``.
  """
  df = data.loc[:, ['CryoSleep','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].copy()

  hp_dummy = pd.get_dummies(data['HomePlanet'], drop_first=False, prefix = "From", prefix_sep='')

  dest_dummy = pd.get_dummies(data['Destination'], drop_first=False, prefix = "To", prefix_sep='')

  # Party flag: 1 when another row shares the first four characters of
  # PassengerId. Group sizes are counted once instead of comparing every id
  # with every other id.
  group_key = data['PassengerId'].str[:4]
  df['InParty'] = (group_key.map(group_key.value_counts()) > 1).astype('int64').to_numpy()

  # "deck/number/side"; reindex guarantees three parts even if no cabin is known.
  cabin_parts = data['Cabin'].str.split("/", n = 2, expand = True).reindex(columns=range(3))

  # Missing sides get 0 in every side dummy (get_dummies does not emit NaN).
  side_dummy = pd.get_dummies(cabin_parts[2], prefix = "Side", drop_first=True)

  # Frequency encoding: share of ALL rows on each deck. Series.map leaves a
  # missing deck as NaN, replacing the previous `x is np.nan` identity test,
  # which only recognised one particular missing-value object.
  deck_share = cabin_parts[0].value_counts() / len(df)
  df['CabinDeck'] = cabin_parts[0].map(deck_share)

  df['CabinNumber'] = cabin_parts[1].astype(float)

  df = pd.concat([side_dummy, hp_dummy, dest_dummy, df], axis=1)

  if 'Transported' in data.columns:

    df['Transported'] = data['Transported']
  
  df.name = "FeatureForest"
  
  return df

In [None]:
df = build_df_nan(data)

ff = feature_forest()

In [None]:
!kaggle competitions submit -c spaceship-titanic -f my_submission_ff.csv -m "Feature Forest"

100% 56.4k/56.4k [00:00<00:00, 276kB/s]
Successfully submitted to Spaceship Titanic

# Subset Selection

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

# Forward sequential feature selection on the Custom frame: for each model and
# each target size 1..7, pick the features, refit, and report the mean 5-fold
# cross-validated score on the selected columns.
X = df_temp.drop(['Transported'], axis = 1)
y = df_temp['Transported']

lr = make_pipeline(StandardScaler(),LogisticRegression(random_state=0, max_iter = 1000, solver = 'liblinear'))
knn = make_pipeline(StandardScaler(),KNeighborsClassifier(n_neighbors=5))
rf  = make_pipeline(StandardScaler(),RandomForestClassifier(n_estimators=400, criterion = 'entropy', bootstrap = True))

models = [lr, knn, rf]
model_titles = ["Logsitic Regression", "KNN K=5", "Random Forest"]

for e,model in enumerate(models):
  print(model_titles[e])
  for i in range(1,8):
    sfs = SequentialFeatureSelector(model, n_features_to_select=i, direction = 'forward')
    sfs.fit(X, y)

    # Look the selected names up once and reuse them for fit, print and CV.
    selected = sfs.get_feature_names_out()
    X_selected = X.loc[:, selected]

    model.fit(X_selected,y)
    print(i,selected)
    cv_results = cross_validate(model, X_selected, y, cv=5)  
    print("Average Score (K = 5):", np.mean(cv_results['test_score']))

Logsitic Regression
1 ['CryoSleep']
Average Score (K = 5): 0.7182813423319752
2 ['CryoSleep' 'FoodCourt']
Average Score (K = 5): 0.7304739109748535
3 ['CryoSleep' 'FoodCourt' 'Spa']
Average Score (K = 5): 0.743818286371478
4 ['CryoSleep' 'FoodCourt' 'Spa' 'VRDeck']
Average Score (K = 5): 0.7554375985563706
5 ['CryoSleep' 'RoomService' 'FoodCourt' 'Spa' 'VRDeck']
Average Score (K = 5): 0.7690091457664849
6 ['ToPSO J318.5-22' 'CryoSleep' 'RoomService' 'FoodCourt' 'Spa' 'VRDeck']
Average Score (K = 5): 0.781895538022659
7 ['ToPSO J318.5-22' 'CryoSleep' 'VIP' 'RoomService' 'FoodCourt' 'Spa'
 'VRDeck']
Average Score (K = 5): 0.7820105466483058
KNN K=5
1 ['FoodCourt']
Average Score (K = 5): 0.58749092603119
2 ['RoomService' 'FoodCourt']
Average Score (K = 5): 0.6577817099228358
3 ['RoomService' 'FoodCourt' 'Spa']
Average Score (K = 5): 0.6889589734189788
4 ['RoomService' 'FoodCourt' 'ShoppingMall' 'Spa']
Average Score (K = 5): 0.7600390685227744
5 ['RoomService' 'FoodCourt' 'ShoppingMall' 'S