In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load data

In [67]:
source = 'https://www.kaggle.com/code/smorika/beginner-with-score-0-77990-improves-step-by-step/data'
train_data = pd.read_csv('titanic_train.csv')
train_data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C


In [68]:
test_data = pd.read_csv('titanic_test.csv')
test_data.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [69]:
# Need to drop survived dataset from train dataset
# survived is an output; test dataset will not jhave this
survived_data = train_data['Survived']
train_data = train_data.drop('Survived', axis=1)

In [70]:
# concat train_data and test_data
df = pd.concat([train_data,test_data], sort=False, ignore_index=True)
df2 = df.copy()

In [71]:
df.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1304,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


## Exploratory Data Analysis

In [72]:
df.shape

(1309, 11)

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


In [74]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,1309.0,655.0,378.020061,1.0,328.0,655.0,982.0,1309.0
Pclass,1309.0,2.294882,0.837836,1.0,2.0,3.0,3.0,3.0
Age,1046.0,29.881138,14.413493,0.17,21.0,28.0,39.0,80.0
SibSp,1309.0,0.498854,1.041658,0.0,0.0,0.0,1.0,8.0
Parch,1309.0,0.385027,0.86556,0.0,0.0,0.0,0.0,9.0
Fare,1308.0,33.295479,51.758668,0.0,7.8958,14.4542,31.275,512.3292


# Data PreProcessing

### Drop irrelevant features

In [75]:
# omit features that are not improtant
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin','Embarked']

df = df.drop(columns_to_drop,axis=1)
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  1309 non-null   int64  
 1   Sex     1309 non-null   object 
 2   Age     1046 non-null   float64
 3   SibSp   1309 non-null   int64  
 4   Parch   1309 non-null   int64  
 5   Fare    1308 non-null   float64
dtypes: float64(2), int64(3), object(1)
memory usage: 61.5+ KB


### Convert Sex feature to numerical

In [77]:
# Need to covnert 'Sex' into numerical feature
df['Sex'].value_counts(dropna=False)

male      843
female    466
Name: Sex, dtype: int64

In [78]:
df['Sex'] = df['Sex'].map(lambda x: 1 if x == 'male' else 0)
df['Sex'].value_counts()

1    843
0    466
Name: Sex, dtype: int64

In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  1309 non-null   int64  
 1   Sex     1309 non-null   int64  
 2   Age     1046 non-null   float64
 3   SibSp   1309 non-null   int64  
 4   Parch   1309 non-null   int64  
 5   Fare    1308 non-null   float64
dtypes: float64(2), int64(4)
memory usage: 61.5 KB


### fill in missing value for Fare with mean value

In [80]:
# The Fare Feature has 1308 values - need to replace 1
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  1309 non-null   int64  
 1   Sex     1309 non-null   int64  
 2   Age     1046 non-null   float64
 3   SibSp   1309 non-null   int64  
 4   Parch   1309 non-null   int64  
 5   Fare    1309 non-null   float64
dtypes: float64(2), int64(4)
memory usage: 61.5 KB


### use Iterative imputation with random forest for age

In [82]:
df['Age'].isnull().sum()

263

In [83]:
df['Age'].describe()

count    1046.000000
mean       29.881138
std        14.413493
min         0.170000
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: Age, dtype: float64

In [84]:
df_ran_for_reg = df.copy()

In [85]:
df_ran_for_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  1309 non-null   int64  
 1   Sex     1309 non-null   int64  
 2   Age     1046 non-null   float64
 3   SibSp   1309 non-null   int64  
 4   Parch   1309 non-null   int64  
 5   Fare    1309 non-null   float64
dtypes: float64(2), int64(4)
memory usage: 61.5 KB


In [86]:
from sklearn.ensemble import RandomForestRegressor

# define features, to predict the age
age_df = df_ran_for_reg[['Age', 'Pclass', 'Sex', 'SibSp', 'Parch']]


# partition out null values
has_age = age_df[age_df['Age'].notnull()].values
no_age = age_df[age_df['Age'].isnull()].values

# separate train data to X and y
X = has_age[:,1:]
y = has_age[:,0]

In [87]:
## Build age prediciton model with RF
rand_for_reg = RandomForestRegressor(random_state=0,n_estimators=10, n_jobs=-1)
rand_for_reg.fit(X,y)

RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=0)

In [88]:
# use the model to predict the age for test data
age_predicted = rand_for_reg.predict(no_age[:,1:])
age_predicted = np.round(age_predicted,decimals=1)
age_predicted

array([28.5, 33.5, 24.5, 28.5, 24.5, 28.5, 37.9, 24.5, 28.5, 28.5, 28.5,
       25.9, 24.5, 27.1, 42.9, 42.9, 16.5, 28.5, 28.5, 24.5, 28.5, 28.5,
       28.5, 28.5, 26.4, 28.5, 28.5, 16.6, 22.4, 28.5, 28.5, 13.1, 35.5,
       42.9,  8.1, 12.9, 33.5, 42.9, 26.4, 28.5, 24.5, 13.1, 25.9, 28.5,
        5. , 24.5, 26.4, 26.4, 28.5, 35.5, 28.5, 24.5, 42.9, 24.5, 33.5,
       42.9, 42.9, 42.9, 24.5, 27.1, 31. , 28.5, 35.5, 13.1, 22. , 37.9,
       28.5, 26.4, 42.9, 28.5, 24.5, 24.5, 25.9, 24.5, 24.5, 37.9, 28.5,
       28.5,  5. , 28.5, 28.5, 33.5, 24.5, 28.5, 28.5, 28.5, 26.4, 28.5,
       25.9, 28.5, 37.9, 28.5, 28.5, 33.5, 28.5, 28.5, 42.9, 33.5,  5. ,
       25.9, 28.5, 28.5, 24.5, 42.9, 28.5, 28.5, 28.5, 28.5, 42.9, 28.5,
       22.4, 28.5, 33.5, 28.5, 42.9, 28.5, 28.5, 24.5, 28.5, 24.5, 26.4,
       28.5, 28.5, 22.4, 31. , 28.5, 28.5, 42.9, 28.5, 26.4, 28.5, 28.5,
       42.9, 25.9, 28.5, 28.5, 28.5, 24.5, 28.5, 28.5, 37.9, 33.5, 24.5,
       28.5, 24.5, 16.5, 42.9, 28.5, 24.5, 33.5, 28

In [89]:
# Replace missing age values with age_predicted
df_ran_for_reg.loc[(df_ran_for_reg['Age'].isnull()), 'Age'] = age_predicted

In [90]:
df_ran_for_reg['Age'].isnull().sum()

0

In [91]:
# Comapre Age feature beforem imputation, and after
print('Before imputation')
print(df['Age'].describe())
print(df['Age'].value_counts().sort_index())
print(" ")

print('Age feature for df_rdn_for_reg')
print(df_ran_for_reg['Age'].describe())
print(df_ran_for_reg['Age'].value_counts().sort_index())

Before imputation
count    1046.000000
mean       29.881138
std        14.413493
min         0.170000
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: Age, dtype: float64
0.17     1
0.33     1
0.42     1
0.67     1
0.75     3
        ..
70.50    1
71.00    2
74.00    1
76.00    1
80.00    1
Name: Age, Length: 98, dtype: int64
 
Age feature for df_rdn_for_reg
count    1309.000000
mean       29.633667
std        13.285671
min         0.170000
25%        22.000000
50%        28.500000
75%        36.000000
max        80.000000
Name: Age, dtype: float64
0.17     1
0.33     1
0.42     1
0.67     1
0.75     3
        ..
70.50    1
71.00    2
74.00    1
76.00    1
80.00    1
Name: Age, Length: 116, dtype: int64


In [92]:
# check if there are outliers or unkown value
df_ran_for_reg.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pclass,1309.0,2.294882,0.837836,1.0,2.0,3.0,3.0,3.0
Sex,1309.0,0.644003,0.478997,0.0,0.0,1.0,1.0,1.0
Age,1309.0,29.633667,13.285671,0.17,22.0,28.5,36.0,80.0
SibSp,1309.0,0.498854,1.041658,0.0,0.0,0.0,1.0,8.0
Parch,1309.0,0.385027,0.86556,0.0,0.0,0.0,0.0,9.0
Fare,1309.0,33.295479,51.738879,0.0,7.8958,14.4542,31.275,512.3292


In [93]:
for i in df_ran_for_reg.columns:
    print('Unique values for column: ' + i)
    print(df_ran_for_reg[i].value_counts(dropna=False))
    print(" ")

Unique values for column: Pclass
3    709
1    323
2    277
Name: Pclass, dtype: int64
 
Unique values for column: Sex
1    843
0    466
Name: Sex, dtype: int64
 
Unique values for column: Age
28.5    119
24.0     47
22.0     45
21.0     41
30.0     40
       ... 
16.6      1
55.5      1
23.5      1
8.1       1
38.5      1
Name: Age, Length: 116, dtype: int64
 
Unique values for column: SibSp
0    891
1    319
2     42
4     22
3     20
8      9
5      6
Name: SibSp, dtype: int64
 
Unique values for column: Parch
0    1002
1     170
2     113
3       8
5       6
4       6
6       2
9       2
Name: Parch, dtype: int64
 
Unique values for column: Fare
8.0500     60
13.0000    59
7.7500     55
26.0000    50
7.8958     49
           ..
26.2833     1
14.0000     1
15.0000     1
6.2375      1
7.7208      1
Name: Fare, Length: 282, dtype: int64
 


In [94]:
df_ran_for_reg['Fare'].value_counts().sort_index()

0.0000      17
3.1708       1
4.0125       1
5.0000       1
6.2375       1
            ..
227.5250     5
247.5208     3
262.3750     7
263.0000     6
512.3292     4
Name: Fare, Length: 282, dtype: int64

In [95]:
# the .min() value for Fare feature of 0 appears 17 times
# so either the passenger didn't pay; or fare was unknown

fare_unknown = df_ran_for_reg[df_ran_for_reg['Fare']==0]
fare_unknown

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
179,3,1,36.0,0,0,0.0
263,1,1,40.0,0,0,0.0
271,3,1,25.0,0,0,0.0
277,2,1,33.5,0,0,0.0
302,3,1,19.0,0,0,0.0
413,2,1,33.5,0,0,0.0
466,2,1,33.5,0,0,0.0
481,2,1,33.5,0,0,0.0
597,3,1,49.0,0,0,0.0
633,1,1,42.9,0,0,0.0


### use median of fare of Pclass and Embarked to fill in missing Fare values

In [96]:
# some of the passegners with 'unknown' fare are pClass1
# very unlikely they did not pay
# so assume how much fare they paid is unknown

# now use df2
df2[df2['Fare'].isnull()].index

Int64Index([1043], dtype='int64')

In [97]:
df_ran_for_reg_3 = df_ran_for_reg.copy()
df_ran_for_reg_3.loc[1043,:]

Pclass     3.000000
Sex        1.000000
Age       60.500000
SibSp      0.000000
Parch      0.000000
Fare      33.295479
Name: 1043, dtype: float64

In [98]:
df_ran_for_reg_3.loc[1043,'Fare']=0
len(df_ran_for_reg_3[df_ran_for_reg_3['Fare']==0])

18

In [99]:
# Fare is a fucntion of PClass and embarked
# impute values using the median of Pclass and embarked
# need to add back embarked
df_ran_for_reg_3.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,3,1,35.0,0,0,8.05


In [100]:
df_ran_for_reg_3['Embarked'] = df2['Embarked']
df_ran_for_reg_3.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,S
1,1,0,38.0,1,0,71.2833,C
2,3,0,26.0,0,0,7.925,S
3,1,0,35.0,1,0,53.1,S
4,3,1,35.0,0,0,8.05,S


In [101]:
# chcek th median of Fare values by PClass and embarked
data2 = df_ran_for_reg_3.loc[df_ran_for_reg_3['Fare'] !=0,:].groupby(['Pclass','Embarked']).agg(['mean','median','count'])['Fare']
data2

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,count
Pclass,Embarked,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,C,106.84533,76.7292,141
1,Q,90.0,90.0,3
1,S,75.118898,52.2771,170
2,C,23.300593,15.3146,28
2,Q,11.735114,12.35,7
2,S,21.746081,16.0,236
3,C,11.021624,7.8958,101
3,Q,10.39082,7.75,113
3,S,14.553262,8.05,490


In [102]:
for i in df_ran_for_reg_3['Pclass'].unique():
    for location in df_ran_for_reg_3['Embarked'].unique():
        df_ran_for_reg_3.loc[(df_ran_for_reg_3['Fare'] == 0) & (df_ran_for_reg_3['Pclass'] == i) & (df_ran_for_reg_3['Embarked'] == location), 'Fare'] = df_ran_for_reg_3.loc[(df_ran_for_reg_3['Fare'] == 0) & (df_ran_for_reg_3['Pclass'] == i) & (df_ran_for_reg_3['Embarked'] == location), 'Fare'].map(
        lambda x: df_ran_for_reg_3[(df_ran_for_reg_3['Fare'] != 0) & (df_ran_for_reg_3['Pclass'] == i) & (df_ran_for_reg_3['Embarked'] == location)]['Fare'].median()
    )
            
            

In [103]:
# check to see that all fares imputed
df_ran_for_reg_3[df_ran_for_reg_3['Fare']==0]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked


In [104]:
print('Before imputation: ')
print(data2)
print(" ")
print("-"*75)
print(" ")
print('After imputation: ')
print(df_ran_for_reg_3.groupby(['Pclass','Embarked']).agg(['mean','median','count'])['Fare'])

Before imputation: 
                       mean   median  count
Pclass Embarked                            
1      C         106.845330  76.7292    141
       Q          90.000000  90.0000      3
       S          75.118898  52.2771    170
2      C          23.300593  15.3146     28
       Q          11.735114  12.3500      7
       S          21.746081  16.0000    236
3      C          11.021624   7.8958    101
       Q          10.390820   7.7500    113
       S          14.553262   8.0500    490
 
---------------------------------------------------------------------------
 
After imputation: 
                       mean   median  count
Pclass Embarked                            
1      C         106.845330  76.7292    141
       Q          90.000000  90.0000      3
       S          74.215550  52.2771    177
2      C          23.300593  15.3146     28
       Q          11.735114  12.3500      7
       S          21.603616  16.0000    242
3      C          11.021624   7.8958    101
 

In [105]:
df_ran_for_reg_3 = df_ran_for_reg_3.drop(columns='Embarked',axis=1)

## Building the model

In [106]:
train = df_ran_for_reg_3.iloc[:891,:]
test = df_ran_for_reg_3.iloc[891:,:]

# add back the survived feature
train["Survived"] = survived_data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int64  
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    object 
 6   Survived  891 non-null    int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 48.9+ KB


In [107]:
X = train.drop('Survived', axis=1)
y = train['Survived']

### Build model with GridSearchCv

In [108]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
clf = RandomForestClassifier()



In [109]:
# cross validation of all combos iof the parameters
param_grid = [
    {'n_estimators':[i for i in range(10,100,10)],
    'criterion':['gini','entropy'],
    'max_depth':[i for i in range(10, 15, 1)],
    'min_samples_split':[2,4,10,12,16]},
]

# cv = 3
grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy',
                          return_train_score=True)
grid_search.fit(X,y)



GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'max_depth': [10, 11, 12, 13, 14],
                          'min_samples_split': [2, 4, 10, 12, 16],
                          'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80,
                                           90]}],
             return_train_score=True, scoring='accuracy')

In [110]:
# get the combo with the bdst performing score
final_clf = grid_search.best_estimator_
final_clf

RandomForestClassifier(criterion='entropy', max_depth=14, min_samples_split=16,
                       n_estimators=50)

In [111]:
# get the mena score of the mdoel with the bst combo of parameters
scores = []

for rs in np.arange(0, 1000, 50):
    score  = final_clf.score(X_test, y_test)
    scores.append(score)

print(scores)
print(" ")
print('-'*75)
print("MEAN SCORE: ")
print(np.array(scores).mean())

[0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681, 0.8715083798882681]
 
---------------------------------------------------------------------------
MEAN SCORE: 
0.8715083798882682


### check feature importance

In [112]:
final_clf.feature_importances_

array([0.11710225, 0.33072736, 0.21706082, 0.05257652, 0.03845059,
       0.24408246])

In [113]:
index_list = X_train.columns
print(index_list)

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


In [115]:
feature_importance = pd.DataFrame({'score': final_clf.feature_importances_},
                                 index = index_list)
feature_importance.sort_values('score', ascending=False)

Unnamed: 0,score
Sex,0.330727
Fare,0.244082
Age,0.217061
Pclass,0.117102
SibSp,0.052577
Parch,0.038451


## Predict on test dataset for submission

In [116]:
test = df_ran_for_reg_3.iloc[891:,:]
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
891,3,1,34.5,0,0,7.8292
892,3,0,47.0,1,0,7.0
893,2,1,62.0,0,0,9.6875
894,3,1,27.0,0,0,8.6625
895,3,0,22.0,1,1,12.2875


In [117]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 891 to 1308
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    int64  
 1   Sex     418 non-null    int64  
 2   Age     418 non-null    float64
 3   SibSp   418 non-null    int64  
 4   Parch   418 non-null    int64  
 5   Fare    418 non-null    object 
dtypes: float64(1), int64(4), object(1)
memory usage: 19.7+ KB


In [118]:
id_column = test_data['PassengerId']
id_column.head()

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

### Use the test data to make prediction

In [119]:
submission_prediction = final_clf.predict(test)
submission_prediction

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# Prediction submission

In [120]:
final_df = pd.DataFrame(id_column)
final_df['Survived'] = submission_prediction
final_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [121]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [122]:
final_df.to_csv('submission.csv', index=False)