In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.utils import resample
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train_data=pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test_data=pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
print(train_data)
print(train_data.dtypes)
#print(test_data)
PassengerID=test_data['PassengerId']

     PassengerId HomePlanet CryoSleep     Cabin    Destination   Age    VIP  \
0        0001_01     Europa     False     B/0/P    TRAPPIST-1e  39.0  False   
1        0002_01      Earth     False     F/0/S    TRAPPIST-1e  24.0  False   
2        0003_01     Europa     False     A/0/S    TRAPPIST-1e  58.0   True   
3        0003_02     Europa     False     A/0/S    TRAPPIST-1e  33.0  False   
4        0004_01      Earth     False     F/1/S    TRAPPIST-1e  16.0  False   
...          ...        ...       ...       ...            ...   ...    ...   
8688     9276_01     Europa     False    A/98/P    55 Cancri e  41.0   True   
8689     9278_01      Earth      True  G/1499/S  PSO J318.5-22  18.0  False   
8690     9279_01      Earth     False  G/1500/S    TRAPPIST-1e  26.0  False   
8691     9280_01     Europa     False   E/608/S    55 Cancri e  32.0  False   
8692     9280_02     Europa     False   E/608/S    TRAPPIST-1e  44.0  False   

      RoomService  FoodCourt  ShoppingMall     Spa 

In [3]:
train_data[['deck','num', 'side']] = train_data['Cabin'].str.split('/', expand=True)
test_data[['deck','num', 'side']] = test_data['Cabin'].str.split('/', expand=True)

In [4]:
#PassengerId
train_data = train_data.drop(['PassengerId'], axis=1)
test_data = test_data.drop(['PassengerId'], axis=1)
#HomePlanet
train_data['HomePlanet'] = train_data['HomePlanet'].fillna(train_data['HomePlanet'].mode()[0])
test_data['HomePlanet'] = test_data['HomePlanet'].fillna(test_data['HomePlanet'].mode()[0])
#CryoSleep
train_data['CryoSleep'] = train_data['CryoSleep'].fillna(train_data['CryoSleep'].mode()[0])
test_data['CryoSleep'] = test_data['CryoSleep'].fillna(test_data['CryoSleep'].mode()[0])
#Cabin
train_data = train_data.drop(['Cabin'], axis=1)
test_data = test_data.drop(['Cabin'], axis=1)
#Deck
train_data['deck'] = train_data['deck'].fillna(train_data['deck'].mode()[0])
test_data['deck'] = test_data['deck'].fillna(test_data['deck'].mode()[0])
#Num
train_data['num'] = train_data['num'].fillna(train_data['num'].mode()[0])
test_data['num'] = test_data['num'].fillna(test_data['num'].mode()[0])
train_data['num'] = train_data['num'].astype('int32')
test_data['num'] = test_data['num'].astype('int32')
#Side
train_data['side'] = train_data['side'].fillna(train_data['side'].mode()[0])
test_data['side'] = test_data['side'].fillna(test_data['side'].mode()[0])
#Destination
train_data['Destination'] = train_data['Destination'].fillna(train_data['Destination'].mode()[0])
test_data['Destination'] = test_data['Destination'].fillna(test_data['Destination'].mode()[0])
#Age
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
#VIP
train_data['VIP'] = train_data['VIP'].fillna(train_data['VIP'].mode()[0])
test_data['VIP'] = test_data['VIP'].fillna(test_data['VIP'].mode()[0])
#Luxuries
train_data['Luxuries'] = train_data[['RoomService', 'Spa', 'VRDeck']].sum(axis=1)
test_data['Luxuries'] = test_data[['RoomService', 'Spa', 'VRDeck']].sum(axis=1)
#Basic
train_data['Basic'] = train_data[['FoodCourt', 'ShoppingMall']].sum(axis=1)
test_data['Basic'] = test_data[['FoodCourt', 'ShoppingMall']].sum(axis=1)
#RoomService
train_data = train_data.drop(['RoomService'], axis=1)
test_data = test_data.drop(['RoomService'], axis=1)
#FoodCourt
train_data = train_data.drop(['FoodCourt'], axis=1)
test_data = test_data.drop(['FoodCourt'], axis=1)
#ShoppingMall
train_data = train_data.drop(['ShoppingMall'], axis=1)
test_data = test_data.drop(['ShoppingMall'], axis=1)
#Spa
train_data = train_data.drop(['Spa'], axis=1)
test_data = test_data.drop(['Spa'], axis=1)
#VRDeck
train_data = train_data.drop(['VRDeck'], axis=1)
test_data = test_data.drop(['VRDeck'], axis=1)
#Name
train_data = train_data.drop(['Name'], axis=1)
test_data = test_data.drop(['Name'], axis=1)

In [5]:
def type_cleaner(df):
    for x in df.columns:
        if 'object' in str(df[x].dtype):
            uniq = df[x].unique()
            uniq.sort()
            counter = 0
            for y in uniq:
                df[x].mask(df[x] == y, counter, inplace=True)
            df[x] = df[x].astype('category')
    return df

train_data = type_cleaner(train_data)
test_data = type_cleaner(test_data)

print(train_data.dtypes)
print(train_data.head())
print(test_data.dtypes)
print(test_data.head())

HomePlanet     category
CryoSleep          bool
Destination    category
Age             float64
VIP                bool
Transported        bool
deck           category
num               int32
side           category
Luxuries        float64
Basic           float64
dtype: object
  HomePlanet  CryoSleep Destination   Age    VIP  Transported deck  num side  \
0          0      False           0  39.0  False        False    0    0    0   
1          0      False           0  24.0  False         True    0    0    0   
2          0      False           0  58.0   True        False    0    0    0   
3          0      False           0  33.0  False        False    0    0    0   
4          0      False           0  16.0  False         True    0    1    0   

   Luxuries   Basic  
0       0.0     0.0  
1     702.0    34.0  
2    6807.0  3576.0  
3    3522.0  1654.0  
4     870.0   221.0  
HomePlanet     category
CryoSleep          bool
Destination    category
Age             float64
VIP          

In [6]:
x = train_data[['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'deck', 'num', 'side', 'Luxuries', 'Basic']]
y = train_data[['Transported']]
scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15)
#_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
#_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)
X_train, y_train = SMOTE().fit_resample(X_train, y_train)

  estimator=estimator,


In [7]:
#pick one
#X_train, y_train = resample(X_train, y_train)
######### RF
rf = RandomForestClassifier(n_estimators=101, criterion='gini', min_impurity_decrease=0.0007, max_depth=4)
rf.fit(X_train, y_train)

######### ADA
ada = AdaBoostClassifier(base_estimator=rf)
ada.fit(X_train, y_train)

######### KNN
knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
knn.fit(X_train, y_train)

######### GAUS
gnb = GaussianNB()
gnb.fit(X_train, y_train)

######### GradientBoost
gbc = GradientBoostingClassifier(n_estimators=101, max_depth=7, min_impurity_decrease=0.0007)
gbc.fit(X_train, y_train)

######### VOTER
voter = VotingClassifier(estimators=[('rf', rf), ('ada', ada), ('knn', knn), ('gnb', gnb), ('gbc', gbc)],
                        voting='soft')
voter.fit(X_train, y_train)

chosen = voter
y_pred_voter = chosen.predict(X_test)
print(classification_report(y_test, y_pred_voter))

  """
  y = column_or_1d(y, warn=True)
  estimator=estimator,
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  estimator=estimator,
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  estimator=estimator,
  estimator=estimator,


              precision    recall  f1-score   support

       False       0.83      0.74      0.79       649
        True       0.77      0.85      0.81       655

    accuracy                           0.80      1304
   macro avg       0.80      0.80      0.80      1304
weighted avg       0.80      0.80      0.80      1304



  X = check_array(X, **check_params)
  X = check_array(X, **check_params)


In [8]:
pred=chosen.predict(test_data)
predictions=pd.DataFrame(pred.astype(bool),columns=['Transported'])
submit=pd.concat([PassengerID,predictions],axis=1)
submit.to_csv("submission.csv",index=False)

  X = check_array(X, **check_params)
  X = check_array(X, **check_params)
