In [1]:
import streamlit as st

import pandas as pd
import numpy as np
import warnings

from sklearn.tree  import DecisionTreeClassifier

2022-05-13 15:10:17.621 INFO    numexpr.utils: NumExpr defaulting to 8 threads.


## DATA LOADING

In [2]:
# Load the raw passenger table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='titanic.csv')

In [3]:
# Work on an independent copy so the loaded frame in `data` stays untouched.
df = data.copy(deep=True)

## Data Preprocessing

In [4]:
# Preview the first five rows (the default row count of head()).
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [5]:
# Summarise column dtypes and non-null counts to see where data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   float64
 1   survived   1309 non-null   float64
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   float64
 6   parch      1309 non-null   float64
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(7), object(7)
memory usage: 143.4+ KB


### Dealing with missing values


In [6]:
# Count the missing values in each column.
df.isna().sum()

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64

In [7]:
# cabin, body and boat are missing in more than half of the rows, and
# home.dest in roughly 43% of them, so all four columns are dropped.
df.drop(columns=['cabin', 'body', 'boat', 'home.dest'], inplace=True)

In [8]:
# Re-count missing values now that the sparse columns are gone.
df.isna().sum()

pclass        1
survived      1
name          1
sex           1
age         264
sibsp         1
parch         1
ticket        1
fare          2
embarked      3
dtype: int64

In [9]:
# About 20% of the rows (264 of 1310) have no age. Dropping them would discard
# too much data, so the gaps are filled with the median age instead.
# The result is assigned back rather than calling fillna(inplace=True) on the
# df['age'] selection: that chained in-place form acts on an intermediate
# object and does not update df when pandas Copy-on-Write is active.
df['age'] = df['age'].fillna(df['age'].median())

In [10]:
# Check the missing values again after imputing age.
df.isna().sum()

pclass      1
survived    1
name        1
sex         1
age         0
sibsp       1
parch       1
ticket      1
fare        2
embarked    3
dtype: int64

In [11]:
# Only a handful of rows still contain a missing value, so drop every row
# that has at least one, then confirm that no nulls remain.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()

pclass      0
survived    0
name        0
sex         0
age         0
sibsp       0
parch       0
ticket      0
fare        0
embarked    0
dtype: int64

In [12]:
# Count fully duplicated rows (the first occurrence is not counted).
df.duplicated(keep='first').sum()

0

### Dealing with categorical features

In [13]:
# Preview the cleaned frame before encoding the categorical columns.
df.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,S
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,S
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,S
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,S
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,S


In [14]:
# Encode sex as a single 0/1 indicator column. drop_first=True discards the
# first dummy column so only one indicator remains (female -> 0, male -> 1 in
# the preview further down).
# dtype is pinned so the column holds numeric 0/1 on every pandas version;
# pandas >= 2.0 would otherwise return True/False booleans.
df['sex'] = pd.get_dummies(df['sex'], drop_first=True, dtype='uint8')


In [15]:
# Embarked port codes: C = Cherbourg, Q = Queenstown, S = Southampton.
# One-hot encode the port. drop_first=True removes the first dummy column,
# leaving the Q and S indicator columns.
# dtype is pinned so the indicators are numeric 0/1 on every pandas version;
# pandas >= 2.0 would otherwise return True/False booleans.
df_embarked = pd.get_dummies(df['embarked'], drop_first=True, dtype='uint8')
# The original text column is no longer needed once it has been encoded.
df.drop(columns='embarked', inplace=True)

In [19]:
# Attach the embarked indicator columns to the rest of the features.
df1 = pd.concat(objs=[df, df_embarked], axis='columns')

In [20]:
# Display the combined frame to check that the Q and S columns were appended.
df1

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,Q,S
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",0,29.0000,0.0,0.0,24160,211.3375,0,1
1,1.0,1.0,"Allison, Master. Hudson Trevor",1,0.9167,1.0,2.0,113781,151.5500,0,1
2,1.0,0.0,"Allison, Miss. Helen Loraine",0,2.0000,1.0,2.0,113781,151.5500,0,1
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",1,30.0000,1.0,2.0,113781,151.5500,0,1
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0000,1.0,2.0,113781,151.5500,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1304,3.0,0.0,"Zabour, Miss. Hileni",0,14.5000,1.0,0.0,2665,14.4542,0,0
1305,3.0,0.0,"Zabour, Miss. Thamine",0,28.0000,1.0,0.0,2665,14.4542,0,0
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",1,26.5000,0.0,0.0,2656,7.2250,0,0
1307,3.0,0.0,"Zakarian, Mr. Ortin",1,27.0000,0.0,0.0,2670,7.2250,0,0


### Dealing with numeric features

In [22]:
# sibsp and parch are summed into a single column (nwith) for more
# user-friendly input: one number instead of two.
# Both inputs are read from df1 itself, so the result does not depend on the
# separate df frame still sharing df1's index.
df1['nwith'] = df1['sibsp'] + df1['parch']
df1.drop(columns=['sibsp', 'parch'], inplace=True)

In [24]:
# Preview the frame with the new nwith column.
df1.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,ticket,fare,Q,S,nwith
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",0,29.0,24160,211.3375,0,1,0.0
1,1.0,1.0,"Allison, Master. Hudson Trevor",1,0.9167,113781,151.55,0,1,3.0
2,1.0,0.0,"Allison, Miss. Helen Loraine",0,2.0,113781,151.55,0,1,3.0
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",1,30.0,113781,151.55,0,1,3.0
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0,113781,151.55,0,1,3.0


In [30]:
# ticket, name and fare are dropped; they are treated as unimportant for the
# model.
df1.drop(columns=['ticket', 'name', 'fare'], inplace=True)

In [35]:
# Summary statistics, transposed so each feature is one row.
df1.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pclass,1306.0,2.296325,0.837037,1.0,2.0,3.0,3.0,3.0
survived,1306.0,0.381317,0.485896,0.0,0.0,0.0,1.0,1.0
sex,1306.0,0.644717,0.478782,0.0,0.0,1.0,1.0,1.0
age,1306.0,29.44806,12.857854,0.1667,22.0,28.0,35.0,80.0
Q,1306.0,0.094181,0.292192,0.0,0.0,0.0,0.0,1.0
S,1306.0,0.699081,0.458833,0.0,0.0,1.0,1.0,1.0
nwith,1306.0,0.885911,1.584891,0.0,0.0,0.0,1.0,10.0


In [36]:
# Final look at the feature table before it is exported.
df1.head(n=5)

Unnamed: 0,pclass,survived,sex,age,Q,S,nwith
0,1.0,1.0,0,29.0,0,1,0.0
1,1.0,1.0,1,0.9167,0,1,3.0
2,1.0,0.0,0,2.0,0,1,3.0
3,1.0,0.0,1,30.0,0,1,3.0
4,1.0,0.0,0,25.0,0,1,3.0


In [31]:
# Write the prepared features as CSV text, without the index column.
# NOTE(review): the target file name has no ".csv" extension; confirm that
# whatever reads this file expects exactly 'df_final'.
df1.to_csv(path_or_buf='df_final', index=False)