<a href="https://www.kaggle.com/code/ocanaydin/spaceship-titanic-feature-ext?scriptVersionId=113934223" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**GET TRAIN AND TEST DATA**

In [None]:
# Read both competition splits; the raw frames are kept untouched for later reference.
train_csv, test_csv = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
    )
)
print(train_csv.shape)
train_csv.head()

**Separate the "PassengerId" column from the dataset.**

In [None]:
"""Seperate PassengerId."""
# Keep the identifiers aside (the test ones are needed for the submission file)
# and build the feature frames without them.
train_ID, test_ID = train_csv["PassengerId"], test_csv["PassengerId"]

train_x = train_csv.drop("PassengerId", axis=1)
test_x = test_csv.drop("PassengerId", axis=1)

**VISUALIZATION OF NUMERICAL DATA AS HISTOGRAM**

In [None]:
# One histogram per column that DataFrame.hist can plot; the result is bound to
# a name so the array of axes is not echoed as cell output.
_fig = train_x.hist(
    figsize=(30, 24),
    bins=50,
    color="blue",
    edgecolor="black",
    xlabelsize=15,
    ylabelsize=15,
)

**PREPROCESSING DATA**

In [None]:
"""Check data if it contains any nan keys."""
def check_nan_keys(data):
    """Return the keys (column names) of ``data`` that hold at least one missing value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column names, in the frame's column order, for which ``isna()`` is
        true on at least one row. Empty when nothing is missing.
    """
    # ``isna().any()`` is evaluated once per column and does not depend on the
    # index labels, so it also works on frames whose index is not 0..n-1
    # (e.g. after filtering rows).
    return [key for key in data.keys() if data[key].isna().any()]

In [None]:
"""Check data if it containts any categorical data."""
def check_categorical_keys(data):
    """Return the keys (column names) of ``data`` whose dtype is NumPy ``object``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column names, in the frame's column order, stored with object dtype.
    """
    object_dtype = np.dtype("O")
    return [column for column in data.keys() if data[column].dtype == object_dtype]

In [None]:
"""Remove 'Name' column from both train and test datas.It is unique so i think that it has no effect on transported."""
# Author's rationale: "Name" is treated as unique per passenger (assumed to carry
# no signal), and "Cabin" is dropped because it has too many distinct values.
train_x = train_x.drop(columns=["Name", "Cabin"])
test_x = test_x.drop(columns=["Name", "Cabin"])

In [None]:
"""Check both nan and categorical for train data."""
# Collect the columns with missing values and the object-dtype columns of the
# training frame, then report both.
train_nan_keys, train_categorical_keys = (
    check_nan_keys(train_x),
    check_categorical_keys(train_x),
)
print(f"Train nan keys : {train_nan_keys}\n Train categorical keys : {train_categorical_keys}")

In [None]:
"""Check both nan and categorical for test data."""
# Same report as for the training frame, this time for the test frame.
test_nan_keys, test_categorical_keys = (
    check_nan_keys(test_x),
    check_categorical_keys(test_x),
)
print(f"Test nan keys : {test_nan_keys}\nTest categorical keys : {test_categorical_keys}")

**VISUALIZATION OF CATEGORICAL DATA**

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
# Draw one count plot per categorical column on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(10, 5))
# ``axes.ravel()`` walks the grid row by row, i.e. in the order (0,0), (0,1),
# (1,0), (1,1). ``zip`` stops at the shorter sequence, so fewer than four
# categorical columns simply leave the remaining panels empty instead of
# raising an IndexError.
for key, ax in zip(train_categorical_keys, axes.ravel()):
    # The column must be passed as ``x=``: recent seaborn releases take ``data``
    # as the first positional argument and make ``x`` keyword-only.
    sns.countplot(x=key, data=train_x, alpha=0.7, ax=ax)
fig.tight_layout()

In [None]:
"""We see 'VIP' column not distributed so we can drop it."""
# Author's rationale: "VIP" is heavily skewed towards one value, so it is
# removed from both frames.
train_x = train_x.drop("VIP", axis=1)
test_x = test_x.drop("VIP", axis=1)
train_x

**FILL NA VALUES**

In [None]:
"""For train x."""
# Impute missing values column by column in the training frame:
#   * object-dtype columns -> the most frequent value of the column
#   * every other column   -> the column mean
for key in train_x.keys():
    column = train_x[key]
    print(f"{key} : {column.dtype}")
    if column.dtype == "object":
        # value_counts() sorts by frequency (descending), so the first index
        # entry is the most frequent value.
        fill_value = column.value_counts().index[0]
    else:
        fill_value = column.mean()
        print("Numerical")
    # Assign the filled column back instead of calling
    # ``train_x[key].fillna(..., inplace=True)``: the chained in-place form acts
    # on a temporary Series and does not update the frame when pandas
    # Copy-on-Write is active (it already warns on pandas 2.x).
    train_x[key] = column.fillna(fill_value)
        

In [None]:
"""For test_x."""
# Impute missing values column by column in the test frame:
#   * object-dtype columns -> the most frequent value of the column
#   * every other column   -> the column mean
for key in test_x.keys():
    column = test_x[key]
    print(f"{key} : {column.dtype}")
    if column.dtype == "object":
        # value_counts() sorts by frequency (descending), so the first index
        # entry is the most frequent value.
        fill_value = column.value_counts().index[0]
    else:
        fill_value = column.mean()
        print("Numerical")
    # Assign the filled column back instead of calling
    # ``test_x[key].fillna(..., inplace=True)``: the chained in-place form acts
    # on a temporary Series and does not update the frame when pandas
    # Copy-on-Write is active (it already warns on pandas 2.x).
    test_x[key] = column.fillna(fill_value)

**CONVERT CATEGORICAL DATA TO NUMERIC DATA**

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_categorical_keys = check_categorical_keys(train_x)
# Replace every object-dtype column of the training frame by its integer codes.
# The encoded column is appended at the END of the frame, so the column order
# changes; the test frame is processed the same way.
for key in train_categorical_keys:
    encoded_column = pd.DataFrame(
        label_encoder.fit_transform(train_x[key].values.astype("str").ravel()),
        columns=[key],
    )
    train_x = pd.concat([train_x.drop(columns=[key]), encoded_column], axis=1)

# Should print an empty list once every categorical column has been encoded.
print(check_categorical_keys(train_x))

In [None]:
test_categorical_keys = check_categorical_keys(test_x)
# Encode the object-dtype columns of the test frame with the mapping learned
# from the TRAINING categories. Fitting the encoder on the test column itself
# would only produce the same integer codes if both splits happen to contain
# exactly the same set of values.
for key in test_categorical_keys:
    # The training categories are taken from the untouched raw frame
    # (``train_x`` no longer holds the original strings at this point).
    # Missing values are dropped because imputation with the most frequent
    # value cannot introduce a new category.
    label_encoder.fit(train_csv[key].dropna().values.astype("str").ravel())
    # ``transform`` raises ValueError if the test column contains a value that
    # never occurs in the training data, instead of silently mis-encoding it.
    transformed_data = label_encoder.transform(test_x[key].values.astype("str").ravel())
    test_x = test_x.drop(columns=[key])
    # Append the encoded column at the end, mirroring the training frame layout.
    test_x = pd.concat([test_x, pd.DataFrame(transformed_data, columns=[key])], axis=1)

# Should print an empty list once every categorical column has been encoded.
print(check_categorical_keys(test_x))

In [None]:
"""Seperate 'Transformed column' from train x."""
# Split the "Transported" target off the training features. Both right-hand
# expressions are evaluated against the frame that still contains the column.
train_y, train_x = train_x["Transported"], train_x.drop("Transported", axis=1)

In [None]:
# Wrap the target in a single-column DataFrame named "Transported".
train_y = pd.DataFrame(train_y,columns = ["Transported"])

**STANDARDIZATION OF DATA**

In [None]:
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()
# Learn the per-feature mean and standard deviation from the training data only.
scaled_train_x = pd.DataFrame(SS.fit_transform(train_x), columns=train_x.columns)
# Apply the SAME statistics to the test data with ``transform``. Re-fitting the
# scaler on the test frame would scale the two splits differently, so the
# model would see shifted features at prediction time.
# Selecting ``train_x.columns`` guarantees the test features are in the order
# the scaler was fitted on (and raises KeyError if one is missing).
scaled_test_x = pd.DataFrame(
    SS.transform(test_x[train_x.columns]),
    columns=train_x.columns,
)

**FIT AND PREDICT**

In [None]:
"""Random Forest Classifier."""
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Disabled alternative model (random forest), kept for reference:
#RFC = RandomForestClassifier(max_features = 7,n_estimators = 200,criterion = "entropy",min_samples_split = 20,
        #random_state = 42)
#RFC.fit(scaled_train_x,train_y.values.ravel())

# Gradient-boosted tree classifier that is actually trained and used below.
xgb_classifier = xgb.XGBClassifier(
    booster="gbtree",
    eta=0.025,
    gamma=0.5,
    max_depth=10,
    reg_lambda=1.2,
    alpha=1.2,
)
# The target frame is flattened to 1-D; multiplying by 1 presumably turns the
# boolean labels into integers -- confirm against the dtype of "Transported".
xgb_classifier.fit(scaled_train_x, train_y.values.ravel() * 1)

In [None]:
# Predict the class label of every row of the standardized test features.
preds = xgb_classifier.predict(scaled_test_x)

In [None]:
# Pair each test passenger id with its prediction, cast to boolean.
transported_flags = np.array(preds, dtype="bool")
result = pd.DataFrame({"PassengerId": test_ID, "Transported": transported_flags})
result

In [None]:
# Write the predictions to a CSV file in the working directory, without the index column.
result.to_csv("spaceship_titanic_preds.csv",index = False)