<a href="https://www.kaggle.com/code/behnambaloochy/spaceship-titanic?scriptVersionId=144836551" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Data Preprocessing ----------------------------------------------------------------------------------
# Loads the train/test CSVs and turns CSV columns 1..11 into float feature
# matrices (X_train, X_test) using one shared encoding routine, so both
# matrices are guaranteed to be encoded the same way.
df_train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv', encoding='utf-8')
df_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv', encoding='utf-8')
df_train["Transported"] = df_train["Transported"].astype(int)

y_train = df_train.iloc[:, 13].values

# CSV column positions that make up the feature matrix.
FEATURE_POSITIONS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
# Feature-matrix columns that hold True/False flags
# (presumably CryoSleep and VIP -- confirm against the CSV header).
BOOLEAN_FEATURES = (1, 5)


def _build_label_map(series):
    """Map each distinct value of *series* to a 1-based integer code.

    Codes follow first-appearance order. ``unique()`` is evaluated once here
    instead of on every loop iteration.
    """
    return {value: code for code, value in enumerate(series.unique(), start=1)}


def _encode_categorical(features, column, mapping):
    """Replace ``features[:, column]`` in place with the codes from *mapping*.

    Missing entries receive the code that the training data assigned to
    "missing" (if it had any). Categories unseen in training, and missing
    entries without such a code, become NaN so the imputer in the model
    pipeline fills them.
    """
    # A NaN dict key is only found by lookup when the very same object is
    # used, so missing values are matched explicitly rather than via .get().
    missing_code = next(
        (code for key, code in mapping.items() if pd.isna(key)), np.nan
    )
    features[:, column] = [
        missing_code if pd.isna(value) else mapping.get(value, np.nan)
        for value in features[:, column]
    ]


def _encode_boolean(features, column):
    """Replace ``features[:, column]`` in place with 1 for true values, else 0.

    read_csv yields Python bools (not the string 'True') for True/False
    columns, so comparing against 'True' never matches. Going through str()
    accepts bool, numpy bool and string representations alike. Missing
    entries map to 0.
    """
    features[:, column] = [
        1 if str(value) == 'True' else 0 for value in features[:, column]
    ]


# Categorical Feature Encoding**********
# The code tables are built from the training data only and reused for test.
options_HomePlanet = _build_label_map(df_train["HomePlanet"])
options_Cabin = _build_label_map(df_train["Cabin"])
options_Destination = _build_label_map(df_train["Destination"])

# Feature-matrix column -> code table (0: HomePlanet, 2: Cabin, 3: Destination).
CATEGORICAL_FEATURES = {
    0: options_HomePlanet,
    2: options_Cabin,
    3: options_Destination,
}


def _encode_features(frame):
    """Return the float feature matrix for *frame* (a train or test DataFrame)."""
    # An explicit object copy guarantees a writable array of mixed values.
    features = frame.iloc[:, FEATURE_POSITIONS].to_numpy(dtype=object, copy=True)
    for column, mapping in CATEGORICAL_FEATURES.items():
        _encode_categorical(features, column, mapping)
    for column in BOOLEAN_FEATURES:
        _encode_boolean(features, column)
    # np.float_ was removed in NumPy 2.0; astype(np.float64) is the equivalent.
    return features.astype(np.float64)
# **********


X_train = _encode_features(df_train)
X_test = _encode_features(df_test)

# Define Data Pipeline----------------------------------------------------------------------------------
# Step 1 fills NaN feature values with the column mean; step 2 is the
# gradient-boosted classifier.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
booster = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.02,
    max_depth=2,
    random_state=1,
)
pipe_rf = make_pipeline(mean_imputer, booster)

# XGboost Training------------------------------------------------------------------------
pipe_rf.fit(X_train, y_train)

# Accuracy is measured on the same rows the model was fitted on, so it is
# not a held-out estimate.
y_train_pred = pipe_rf.predict(X_train)
pipe_train = accuracy_score(y_train, y_train_pred)
print(f'XGboost train accuracies: {pipe_train:.3f}')

# XGboost Prediction----------------------------------------------------------------------
y_pred = pipe_rf.predict(X_test)

# Submission--------------------------------------------------------------------------------------------
# Convert the 0/1 class labels to booleans and pair them with the passenger ids.
trans = list(y_pred == 1)
submission = np.column_stack((df_test["PassengerId"], trans))
print(submission)
print(submission.shape)

# Write the two-column result file without the DataFrame index.
df_result = pd.DataFrame(submission)
df_result.to_csv(
    "/kaggle/working/submission.csv",
    header=['PassengerId', 'Transported'],
    index=False,
)


In [None]:
# Report how many training rows belong to each target class.
for label in (0, 1):
    print(f'Number of class {label} examples:', X_train[y_train == label].shape[0])
print('The classes are balanced.')

In [None]:
# List the pipeline's parameters, including those of its nested steps
# (deep=True); as the last expression of the cell the result is displayed.
# The nested keys (e.g. 'xgbclassifier__max_depth') are the names a
# parameter grid has to use.
pipe_rf.get_params(deep=True)

# Grid Search for Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Rebuild the pipeline with an untuned classifier; the grid below supplies
# n_estimators, learning_rate and max_depth.
grid_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
pipe_rf = make_pipeline(grid_imputer, xgb.XGBClassifier(random_state=1))

# Candidate values, centred on the settings used for the first model.
param_range1 = [190, 200, 210]       # n_estimators
param_range2 = [0.015, 0.02, 0.025]  # learning_rate
param_range3 = [2, 3]                # max_depth
param_grid = [
    {
        'xgbclassifier__n_estimators': param_range1,
        'xgbclassifier__learning_rate': param_range2,
        'xgbclassifier__max_depth': param_range3,
    }
]

# 10-fold cross-validated accuracy for every combination, on all cores.
# refit=False: only the scores are wanted, no final model is refitted.
gs = GridSearchCV(
    pipe_rf,
    param_grid,
    scoring='accuracy',
    refit=False,
    cv=10,
    n_jobs=-1,
)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)