In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import *
from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.model_selection import *
import seaborn as sns
from sklearn.impute import KNNImputer
import missingno as msno
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Read the competition CSVs: df1 holds train.csv, df2 holds test.csv.
train_csv = '../input/spaceship-titanic/train.csv'
test_csv = '../input/spaceship-titanic/test.csv'
df1, df2 = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Show the first rows of the training frame as the cell output.
df1.head()

In [None]:
# Show the first rows of df2 (the frame read from test.csv).
df2.head()

In [None]:
# Column dtypes and non-null counts of df1 (the frame read from train.csv).
df1.info()

In [None]:
# Column dtypes and non-null counts of df2 (the frame read from test.csv).
df2.info()

In [None]:
# Summary statistics for df1 using pandas' default column selection.
df1.describe()

In [None]:
# Summary statistics for df2 using pandas' default column selection.
df2.describe()

In [None]:
# Summary of the object-dtype columns of df1, transposed so each feature is a row.
# NOTE(review): .round() presumably has no effect on these non-numeric summaries - confirm.
df1.describe(include='object').round().T

In [None]:
# Summary of the object-dtype columns of df2, transposed so each feature is a row.
# NOTE(review): .round() presumably has no effect on these non-numeric summaries - confirm.
df2.describe(include='object').round().T

In [None]:
# One horizontal bar chart per frame showing the fraction of missing values in
# each column. Each figure is titled with the frame it describes, because the
# two charts are otherwise indistinguishable when skimming the notebook.
for label, df in [('train', df1), ('test', df2)]:
    ax = df.isna().mean().plot(
        kind='barh',
        figsize=(10, 5),
        title=f'Share of missing values per feature ({label})',
    )
    ax.set_xlabel('Fraction of rows missing')
    plt.show()
    print('')

In [None]:
print('Percentage of missing data per feature:\n')
# isna().mean() is the fraction of missing rows per column; scale by 100 so the
# displayed numbers really are percentages, as the printed heading says.
(df1.isna().mean() * 100).round(2)

In [None]:
# missingno matrix plot of df1 (presumably one row per record, showing where values are missing).
msno.matrix(df1)
plt.show()

In [None]:
# missingno heatmap of df1 (presumably how missingness in one column correlates with another).
msno.heatmap(df1)
plt.show()

In [None]:
print('Statistical Distribution of Passengers in CryoSleep\n')
# Keep only the rows whose CryoSleep flag compares equal to True, then summarise
# them and round the summary to three decimals.
in_cryosleep = df1['CryoSleep'] == True
df1[in_cryosleep].describe().round(3)

In [None]:
print('Statistical Distribution of Passengers NOT in CryoSleep\n')
# Keep only the rows whose CryoSleep flag compares equal to False, then summarise
# them and round the summary to three decimals.
not_in_cryosleep = df1['CryoSleep'] == False
df1[not_in_cryosleep].describe().round(3)


In [None]:
# Give every passenger flagged as being in CryoSleep a fixed 0.1 in each spending
# column (this also overwrites missing spending values on those rows).
#
# The previous form, df1[mask][cols] = 0.1, used chained indexing: the assignment
# landed on a temporary copy and df1 itself was never modified. It also spelled
# the first column 'Room Service', which is not the column name used elsewhere in
# this notebook ('RoomService'). A single .loc call writes into the frame itself.
#
# The same rule is applied to the test frame so that train and test features are
# encoded consistently.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (df1, df2):
    frame.loc[frame['CryoSleep'] == True, spending_columns] = 0.1

In [None]:
# Cast the target column to bool.
df1['Transported'] = df1['Transported'].astype(bool)

quant_imputer = KNNImputer(n_neighbors=3)

quant_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
imputed_columns = quant_features + ['Age']

for df in [df1, df2]:
    # Fill missing categorical / boolean values with fixed defaults.
    # Assign the result back instead of calling fillna(..., inplace=True) on a
    # column selection: that is chained assignment, which pandas warns about and
    # which leaves `df` untouched once Copy-on-Write is enabled.
    df['HomePlanet'] = df['HomePlanet'].fillna('Earth')
    df['CryoSleep'] = df['CryoSleep'].fillna(False)
    df['Cabin'] = df['Cabin'].fillna('Z/99999/Z')  # placeholder keeps the deck/num/side shape
    df['Destination'] = df['Destination'].fillna('TRAPPIST-1e')
    df['VIP'] = df['VIP'].fillna(False)

    # KNN-impute the spending columns and Age (the imputer is re-fitted on each
    # frame, as before). The ndarray is assigned directly so values are written
    # positionally rather than being aligned on the frame's index labels.
    df[imputed_columns] = quant_imputer.fit_transform(df[imputed_columns])

    df['TotalSpending'] = df[quant_features].sum(axis=1)

    # Cabin is encoded as 'deck/num/side'.
    cabin_parts = df['Cabin'].str.split('/', expand=True)
    df['Deck'] = cabin_parts[0].astype(str)
    df['Num'] = cabin_parts[1].astype(str)
    df['Side'] = cabin_parts[2].astype(str)

    # PassengerId is encoded as 'group_number'.
    id_parts = df['PassengerId'].str.split('_', expand=True)
    df['Passenger_Group'] = id_parts[0].astype(str)
    df['Passenger_Num'] = id_parts[1].astype(str)

    df['Passenger_Group'] = df['Passenger_Group'].astype('category')
    df['HomePlanet'] = df['HomePlanet'].astype('category')
    df['CryoSleep'] = df['CryoSleep'].astype(bool)
    # Explicit cast so the dtype of VIP does not depend on whether the installed
    # pandas version downcasts object columns inside fillna.
    df['VIP'] = df['VIP'].astype(bool)
    df['Deck'] = df['Deck'].astype('category')
    df['Side'] = df['Side'].astype('category')
    df['Destination'] = df['Destination'].astype('category')

# Group size = number of passengers sharing a group id across train AND test.
# The counts are computed once here instead of being rebuilt (concat +
# value_counts) inside the mapping function for every looked-up group.
group_sizes = (
    pd.concat([df1['Passenger_Group'], df2['Passenger_Group']])
    .astype(str)
    .value_counts()
)

for df in [df1, df2]:
    df['Group_Size'] = df['Passenger_Group'].astype(str).map(group_sizes)

In [None]:
# Re-check dtypes and non-null counts of df1 at this point in the notebook.
df1.info()

In [None]:
# Show the first rows of df1 at this point in the notebook.
df1.head()

In [None]:
# Re-check dtypes and non-null counts of df2 at this point in the notebook.
df2.info()

In [None]:
# Show the first rows of df2 at this point in the notebook.
df2.head()

In [None]:
# Target vector and model input columns.
y_true_df1 = df1['Transported']
features = ['Group_Size', 'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side', 'Age', 'TotalSpending'] + quant_features

# One-hot encode train and test TOGETHER. Encoding each frame on its own can
# yield different dummy columns (or a different dropped "first" level) whenever
# a category value occurs in only one of the two frames, which would make the
# test matrix incompatible with the fitted model.
combined_dummies = pd.get_dummies(
    pd.concat([df1[features], df2[features]], keys=['train', 'test']),
    drop_first=True,
)

# Split back by the outer key; each part keeps its frame's original row index.
x_df1 = combined_dummies.loc['train']
x_df2 = combined_dummies.loc['test']

In [None]:
# Hard-voting ensemble of three gradient-boosting classifiers.
lgbm_clf = LGBMClassifier(n_estimators=3000,
                          random_state=0,
                          learning_rate=0.0015,
                          objective="binary")
xgb_clf = XGBClassifier(random_state=0)
# verbose=0 silences CatBoost's per-iteration training log.
catboost_clf = CatBoostClassifier(random_seed=0, verbose=0)
clf = VotingClassifier([('lgbm', lgbm_clf), ('xgm', xgb_clf), ('catboost', catboost_clf)], voting='hard')

# Accuracy measured on the rows the model was fitted on is optimistically
# biased, so first estimate generalisation on a stratified 20% hold-out.
x_fit, x_holdout, y_fit, y_holdout = train_test_split(
    x_df1, y_true_df1, test_size=0.2, random_state=0, stratify=y_true_df1
)
clf.fit(x_fit, y_fit)
holdout_accuracy = accuracy_score(y_holdout, clf.predict(x_holdout))

# Refit on the full training set; this is the model used for later predictions.
clf = clf.fit(x_df1, y_true_df1)
y_predicted = clf.predict(x_df1)
train_accuracy = accuracy_score(y_true_df1, y_predicted)

# Show both numbers; the hold-out value is the one to trust.
pd.Series({'train_accuracy': train_accuracy, 'holdout_accuracy': holdout_accuracy})

In [None]:
# Predict the target for the test matrix and store the predictions on df2.
y_df2 = clf.predict(x_df2)
df2['Transported'] = y_df2

# Write the two submission columns to submission.csv without the row index.
submission_columns = ['PassengerId', 'Transported']
df2[submission_columns].to_csv('submission.csv', index=False)