<a href="https://www.kaggle.com/code/yusufglcan/titanic-spaceship-competition-notebook?scriptVersionId=103344546" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder,StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas_profiling import ProfileReport
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
import random


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
df = pd.read_csv(r'/kaggle/input/spaceship-titanic/train.csv')
df1 = pd.read_csv(r'/kaggle/input/spaceship-titanic/test.csv')
df2 = pd.read_csv(r'/kaggle/input/spaceship-titanic/test.csv')

# Notes

# Cabin
Cabin partitions may be the hidden variable with high causation. I will keep an eye on it.
# Passenger ID
The Group part of the passender IDs can reveal hidden correlation with the target column. 
# Name 
Name column is important since it inludes last name which may give me hints about the families.


In [3]:
# ProfileReport(df)

## Data Wrangling

I divided the preprocessing stage into 3 parts, deriving columns, filling missing values and transformation of the data. I cover the code of each part with a function so that I can
apply the same process to the test data later. 

In [4]:
scl = MinMaxScaler()
scl1 = StandardScaler()
le = LabelEncoder()
def derive_columns(df):
    df.loc[df.Cabin.isna()==True,'Cabin'] = 'Missing/Missing/Missing'
    df.Name.fillna('Missing Missing')
    #Splitting the Cabin values in order to possible reveal  the correlation between the locations of passengers.
    df['Deck'] = df.Cabin.apply(lambda x : x.split('/')[0]) 
    df['Deck_number'] = df.Cabin.apply(lambda x : x.split('/')[1])
    df['Port'] = df.Cabin.apply(lambda x : x.split('/')[2])

    # Create a new column named Surname because some family members travel next to each other. It might be better for the model to add more correlation to the features of family members.
    df['Surname']=df.Name.apply(lambda x : str(x).split(' ')[-1])

    # Groups part of the passenger id reveals the relationships between the passengers.
    df['Groups'] = df.PassengerId.apply(lambda x :str(x).split('_')[0])

    #Drop the processed columns to avoid noise
    df.drop(columns= ['Name','Cabin','PassengerId'],inplace=True)
    
    return df
derive_columns(df)

   

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Deck_number,Port,Surname,Groups
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P,Ofracculy,0001
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S,Vines,0002
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S,Susent,0003
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S,Susent,0003
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S,Santantines,0004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,98,P,Noxnuther,9276
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,1499,S,Mondalley,9278
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,1500,S,Connon,9279
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,608,S,Hontichre,9280


I use fill the missing values here. 
The last part is a little experimental, I use the proportions of the values in columns to fill the missing data. I know it can make the results worse but to me it is worth trying.

In [5]:
def fill_columns(df):
    # When we check the distribution of the luxury spending columns, big majority of passengers had never spent money on these activities. I just assign zero for the missing values.
    #Spenders are not statistically significant in the distribution
    df.loc[df.Spa.isna()==True,'Spa'] = 0
    df.loc[df.ShoppingMall.isna()==True,'ShoppingMall'] = 0
    df.loc[df.VRDeck.isna()==True,'VRDeck'] = 0
    df.loc[df.FoodCourt.isna()==True,'FoodCourt'] = 0
    df.loc[df.RoomService.isna()==True,'RoomService'] = 0

    # The distribution shows a big majority is non-VIP passengers so assigning 'False' to the columns with missig values is the most accurate thing to do statistically.
    df.VIP.fillna('False',inplace=True)

    # Correlation matrix shows that there is a negative small correlation between the target variable and the 'Age' column so I do not think Age matters in the calculation at all.
    #However, at the same time I do not want to lose the columns entirely, therefore I just fill the NAN's with the mean value. 
    df.Age.fillna(df.Age.mean(),inplace=True)

    ## I do not want to lose columns especially in the CryoSleep column. So, I wanted to keep the distribution percentage by filling the nulls with randomized values.  

    weight_HP = [df.HomePlanet.value_counts().values[0]/len(df.HomePlanet),df.HomePlanet.value_counts().values[1]/len(df.HomePlanet),df.HomePlanet.value_counts().values[2]/len(df.HomePlanet)]
    fillings_HP = random.choices(df.HomePlanet.unique()[:-1],weights=weight_HP,  k=df[df.HomePlanet.isna()==True].shape[0])
    df.loc[df.HomePlanet.isna()==True,'HomePlanet'] = fillings_HP

    weight_CR = [df.CryoSleep.value_counts().values[0]/len(df.CryoSleep),df.CryoSleep.value_counts().values[1]/len(df.CryoSleep)]
    fillings_CR = random.choices(df.CryoSleep.unique()[:-1],weights =weight_CR, k=df[df.CryoSleep.isna()==True].shape[0])
    df.loc[df.CryoSleep.isna()==True,'CryoSleep'] = fillings_CR

    weight_dest = [df.Destination.value_counts().values[0]/len(df.Destination),df.Destination.value_counts().values[1]/len(df.Destination),df.Destination.value_counts().values[2]/len(df.Destination)]
    fillings_dest = random.choices(df.Destination.unique()[:-1],weights = weight_dest,k=df[df.Destination.isna()==True].shape[0])
    df.loc[df.Destination.isna()==True,'Destination'] = fillings_dest
    df.loc[df.Deck_number=='Missing','Deck_number'] = random.choices(np.arange(1,900,1),k=df[df.Deck_number=='Missing'].shape[0])
    df.drop(columns=['Deck_number'],inplace=True)
    
    
    return df
fill_columns(df)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Port,Surname,Groups
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P,Ofracculy,0001
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S,Vines,0002
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S,Susent,0003
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S,Susent,0003
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S,Santantines,0004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,P,Noxnuther,9276
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,S,Mondalley,9278
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,S,Connon,9279
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,S,Hontichre,9280


I simply transform the data using label encoder. The guaranteed approach is to use one hot encoder to transform categorical data but I do not think it matters much in this case.

In [6]:
def transform_columns(df):
    
    df.Destination = le.fit_transform(df.Destination)
    df.CryoSleep = le.fit_transform(df.CryoSleep)
    df.HomePlanet = le.fit_transform(df.HomePlanet)
    df.Deck = le.fit_transform(df.Deck)
    df.Port = le.fit_transform(df.Port)
    df.Surname = le.fit_transform(df.Surname)
    df.VIP = df.VIP.apply(lambda x : 1 if x ==True else 0 )
    return df
transform_columns(df)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Port,Surname,Groups
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1,1,1431,0001
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,True,5,2,2109,0002
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,0,2,1990,0003
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,0,2,1990,0003
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,True,5,2,1778,0004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,False,0,1,1416,9276
8689,0,1,1,18.0,0,0.0,0.0,0.0,0.0,0.0,False,6,2,1341,9278
8690,0,0,2,26.0,0,0.0,0.0,1872.0,1.0,0.0,True,6,2,470,9279
8691,1,0,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,False,4,2,996,9280


Defining the features and label columns. Test and train data is defined using sklearn split tool. I also dropped 'HomePlanet' column because it is highly correlated with
'Destination'.Two columns with high correlation could impair the accuracy of the model. 

In [7]:
x = df.drop(columns=['Transported','HomePlanet'])
y = df.Transported
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=1234)

I made a small experiment which hints about the results of different combinations of models and scaling techniques. Created pipelines and grids designed to run different models.

In [8]:
pipelines = {'rf':make_pipeline(StandardScaler(),RandomForestClassifier()),
            'adaboost':make_pipeline(StandardScaler(),AdaBoostClassifier()),
            'rf1':make_pipeline(MinMaxScaler(),RandomForestClassifier()),
             'rf2':make_pipeline(MinMaxScaler(),StandardScaler(),RandomForestClassifier()),
            'adaboost1':make_pipeline(MinMaxScaler(),AdaBoostClassifier()),
            'svc': make_pipeline(MinMaxScaler(),StandardScaler(),SVC())}
                                      
                                      
gridx = {'rf':{'randomforestclassifier__n_estimators':[300,400,450]},
         'rf1':{'randomforestclassifier__n_estimators':[300,400,450]},
         'rf2':{'randomforestclassifier__n_estimators':[350,400,450],
                'randomforestclassifier__criterion':['entropy'],
                'randomforestclassifier__max_depth':[12,14,16]},     
               
        'adaboost':{'adaboostclassifier__n_estimators':[80,100,120]},
        'adaboost1':{'adaboostclassifier__n_estimators':[80,100,120]},
        'svc' : {'svc__degree':[3,5,2],
                'svc__C':[10,5,1,1.2]}}
                                                      

Calling each pipeline and corresponding parameter set to make see the accuracy scores. 

In [9]:
for model in pipelines.keys():

    model = GridSearchCV(pipelines[model],param_grid=gridx[model],n_jobs=-1,cv=5,scoring='accuracy')
    model.fit(x_train,y_train)
    print(accuracy_score(model.predict(x_test),y_test))
    print(model.best_params_)

0.803680981595092
{'randomforestclassifier__n_estimators': 300}
0.7940950920245399
{'adaboostclassifier__n_estimators': 80}
0.8052147239263804
{'randomforestclassifier__n_estimators': 400}
0.803680981595092
{'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 12, 'randomforestclassifier__n_estimators': 400}
0.7940950920245399
{'adaboostclassifier__n_estimators': 80}
0.7963957055214724
{'svc__C': 1, 'svc__degree': 3}


The best one is the random forest classifier so I dig deeper with hyperparameter tuning on this algorithm.

In [10]:
pipes = {'rf':make_pipeline(MinMaxScaler(),RandomForestClassifier())}

params = {'rf':{'randomforestclassifier__max_depth': [15,12],
          'randomforestclassifier__bootstrap':[True],
          'randomforestclassifier__n_estimators':[500,450],
          'randomforestclassifier__criterion':['entropy'],
          'randomforestclassifier__min_samples_leaf':[5,7],
          'randomforestclassifier__min_samples_split':[7]
         }}
grid = GridSearchCV(pipes['rf'],param_grid=params['rf'],cv=5,verbose=5,scoring='accuracy')
print(grid.fit(x_train,y_train))
print(grid.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END randomforestclassifier__bootstrap=True, randomforestclassifier__criterion=entropy, randomforestclassifier__max_depth=15, randomforestclassifier__min_samples_leaf=5, randomforestclassifier__min_samples_split=7, randomforestclassifier__n_estimators=500;, score=0.791 total time=   4.0s
[CV 2/5] END randomforestclassifier__bootstrap=True, randomforestclassifier__criterion=entropy, randomforestclassifier__max_depth=15, randomforestclassifier__min_samples_leaf=5, randomforestclassifier__min_samples_split=7, randomforestclassifier__n_estimators=500;, score=0.813 total time=   3.9s
[CV 3/5] END randomforestclassifier__bootstrap=True, randomforestclassifier__criterion=entropy, randomforestclassifier__max_depth=15, randomforestclassifier__min_samples_leaf=5, randomforestclassifier__min_samples_split=7, randomforestclassifier__n_estimators=500;, score=0.801 total time=   3.9s
[CV 4/5] END randomforestclassifier__bootstrap=Tr

But I failed to find a better score with hyperparameter tuning. Therefore I will use the plain model.

In [11]:
accuracy_score(grid.predict(x_test),y_test)

0.8040644171779141

In [12]:
model = GridSearchCV(pipelines['rf1'],param_grid=gridx['rf1'],n_jobs=-1,cv=5,scoring='accuracy')
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
accuracy_score(y_test,y_pred)

0.8052147239263804

I process the test data before feeding it to the model I built.

In [13]:
derive_columns(df1)
fill_columns(df1)
transform_columns(df1)
df1.head(3)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Port,Surname,Groups
0,0,1,2,27.0,0,0.0,0.0,0.0,0.0,0.0,6,2,275,13
1,0,0,2,19.0,0,0.0,9.0,0.0,2823.0,0.0,5,2,1190,18
2,1,1,0,31.0,0,0.0,0.0,0.0,0.0,0.0,2,2,1604,19


I make predictions and create a data frame for submission.

In [14]:
y_pred=model.predict(df1.drop(columns=['HomePlanet']))

In [15]:
submission = pd.DataFrame({'PassengerId':df2.PassengerId,
             'Transported':y_pred})
submission.Transported=submission.Transported.astype('boolean')
submission.to_csv('submission.csv',index=False)

In [16]:
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
