# Spaceship Titanic

The goal of this project is to build a model that predicts which passengers were transported from the Spaceship Titanic with the greatest possible accuracy.

## 1. Imports and Data Cleaning

In [12]:
import matplotlib.pyplot as plt

import pandas as pd

import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from xgboost import XGBClassifier

In [2]:
train_df = pd.read_csv('train.csv')

In [3]:
submission_df = pd.read_csv('test.csv')

In [4]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## 2. Preparing for Modeling

In [6]:
X = train_df.drop('Transported',axis=1)
y = train_df['Transported']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = .2, random_state = 42)

In [8]:
ohecols = ['HomePlanet','CryoSleep','Cabin','Destination','VIP']

In [9]:
numcols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

In [14]:
ohe_sub = Pipeline([
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [15]:
num_sub = Pipeline([
    ('num_impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [16]:
CT = ColumnTransformer(transformers=[
    ('ohe', ohe_sub, ohecols),
    ('num', num_sub, numcols),   
])

## 3. Modeling

### 3a. Dummy

In [17]:
dum_pipe = Pipeline([
    ('ct', CT),
    ('dummy', DummyClassifier(strategy='most_frequent'))
])

In [18]:
dum_pipe.fit(X_train,y_train)



In [19]:
dum_pipe.score(X_test,y_test)

0.5048878665899943

### 3b. Logistic Regression

In [20]:
lr_pipe = Pipeline([
    ('ct', CT),
    ('lr', LogisticRegression())
])

In [21]:
lr_pipe.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
lr_pipe.score(X_test,y_test)

0.7745830937320299

### 3c. Random Forest

In [23]:
rf_pipe = Pipeline([
    ('ct', CT),
    ('rf', RandomForestClassifier(max_depth = 5))
])

In [24]:
rf_pipe.fit(X_train,y_train)



In [25]:
rf_pipe.score(X_test,y_test)

0.7412305922944221

### 3d. XGBoost

In [28]:
xgb_pipe = Pipeline([
    ('ct',CT),
    ('xgb',XGBClassifier(max_depth = 5))
])

In [29]:
xgb_pipe.fit(X_train,y_train)



In [30]:
xgb_pipe.score(X_test,y_test)

0.7878090856814262

### 4. Submission

In [32]:
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [33]:
passenger_ids = submission_df['PassengerId']

In [35]:
predictions = xgb_pipe.predict(submission_df).astype(bool)

In [36]:
result_df = pd.DataFrame({'PassengerId':passenger_ids,'Transported':predictions})

In [38]:
result_df.to_csv('submission_1.csv',index=False)