# Titanic Dataset

# Data cleaning

In [14]:
import pandas as pd
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

In [15]:
data = datasets.fetch_openml(name='Titanic', version=1, as_frame=True)
titanic = data.frame
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [17]:
titanic.drop(columns=["survived", "ticket", "cabin", "boat", "body", "home.dest"], inplace=True, axis=1)

In [18]:
titanic.tail()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,fare,embarked
1304,3,"Zabour, Miss. Hileni",female,14.5,1,0,14.4542,C
1305,3,"Zabour, Miss. Thamine",female,,1,0,14.4542,C
1306,3,"Zakarian, Mr. Mapriededer",male,26.5,0,0,7.225,C
1307,3,"Zakarian, Mr. Ortin",male,27.0,0,0,7.225,C
1308,3,"Zimmerman, Mr. Leo",male,29.0,0,0,7.875,S


In [19]:
print(titanic.dtypes)

pclass         int64
name          object
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object


In [20]:
titanic.isnull().sum()

pclass        0
name          0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [21]:
titanic["fare"].fillna(titanic["fare"].mean(), inplace=True)

In [32]:
titanic["embarked"].fillna('S', inplace=True)

In [33]:
titanic.isnull().sum()

pclass        0
name          0
sex           0
age         263
sibsp         0
parch         0
fare          0
embarked      0
dtype: int64

In [36]:
print(titanic["parch"].value_counts())
print("--------------------------")
print(titanic["sibsp"].value_counts())
print("--------------------------")
print(titanic["sex"].value_counts())
print("--------------------------")
print(titanic["pclass"].value_counts())

parch
0    1002
1     170
2     113
3       8
4       6
5       6
6       2
9       2
Name: count, dtype: int64
--------------------------
sibsp
0    891
1    319
2     42
4     22
3     20
8      9
5      6
Name: count, dtype: int64
--------------------------
sex
male      843
female    466
Name: count, dtype: int64
--------------------------
pclass
3    709
1    323
2    277
Name: count, dtype: int64


### Some Feature enginnering

I will extract title from name column

In [51]:
def get_title(name):
    ans = name.split(", ")[1].split(".")[0]
    return ans
titanic["title"] = titanic["name"].apply(get_title)

In [52]:
titanic["title"].value_counts()

title
Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Major             2
Mlle              2
Ms                2
Lady              1
Capt              1
Mme               1
Sir               1
Jonkheer          1
Dona              1
Don               1
the Countess      1
Name: count, dtype: int64

In [53]:
rare_val = titanic["title"].value_counts().index[4:]
titanic["title"] = titanic["title"].replace(rare_val, "rare")

In [54]:
titanic["title"].value_counts()

title
Mr        757
Miss      260
Mrs       197
Master     61
rare       34
Name: count, dtype: int64

I will combine sibsp and parch column into one. This column will be total family size of a person.

No. of siblings and spouse + No. of children and parents = Family size

In [None]:
titanic["family_size"] = titanic["sibsp"] + titanic["parch"] + 1

In [56]:
titanic["family_size"].value_counts()

family_size
1     790
2     235
3     159
4      43
6      25
5      22
7      16
11     11
8       8
Name: count, dtype: int64

In [57]:
def family_size(members):
    if members == 1:
        return "Alone"
    elif members > 1 and members < 5:
        return "Small"
    else:
        return "Large"
    
titanic["family_size"] = titanic["family_size"].apply(family_size)

In [58]:
titanic["family_size"].value_counts()

family_size
Alone    790
Small    437
Large     82
Name: count, dtype: int64

In [61]:
titanic.drop(["name", "sibsp", "parch"], axis=1, inplace=True)

In [62]:
titanic.sample(5)

Unnamed: 0,pclass,sex,age,fare,embarked,title,family_size
1208,3,female,9.0,27.9,S,Miss,Large
681,3,female,,15.2458,C,Mrs,Small
1025,3,male,6.0,12.475,S,Master,Small
285,1,male,67.0,221.7792,S,Mr,Small
1121,3,male,,22.3583,C,Master,Small


In [9]:
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import train_test_split

In [10]:
X = titanic.drop("survived", axis=1)
Y = titanic['survived']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2)
X_train

Unnamed: 0,pclass,name,sex,age,sibsp,parch,fare,embarked
772,3,"Dika, Mr. Mirko",male,17.0,0,0,7.8958,S
543,2,"Reeves, Mr. David",male,36.0,0,0,10.5000,S
289,1,"Taussig, Miss. Ruth",female,18.0,0,2,79.6500,S
10,1,"Astor, Col. John Jacob",male,47.0,1,0,227.5250,C
147,1,"Harrington, Mr. Charles H",male,,0,0,42.4000,S
...,...,...,...,...,...,...,...,...
1095,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,7.6292,Q
1130,3,"Pettersson, Miss. Ellen Natalia",female,18.0,0,0,7.7750,S
1294,3,"Williams, Mr. Leslie",male,28.5,0,0,16.1000,S
860,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S


In [11]:
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
X_train['age'] = imp.fit_transform(X_train[['age']])
X_test['age'] = imp.transform(X_test[['age']])

In [12]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import StandardScaler

In [13]:
# column sex, embarked --> nominal category
ohe = OneHotEncoder(drop='first', sparse_output=False)
X_train_Sex_embarked = ohe.fit_transform(X_train[['sex', 'embarked']])
X_test_sex_embarked = ohe.transform(X_test[['sex', 'embarked']])

In [14]:
X_train.drop('name', inplace=True, axis=1)
X_test.drop('name', inplace=True, axis=1)

In [15]:
X_train_Sex_embarked

array([[1., 0., 1.],
       [1., 0., 1.],
       [0., 0., 1.],
       ...,
       [1., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], shape=(1047, 3))

In [16]:
train_stacked = np.hstack((X_train[['pclass','age','sibsp','parch','fare']].values, X_train_Sex_embarked))
test_stacked = np.hstack((X_test[['pclass','age','sibsp','parch','fare']].values, X_test_sex_embarked))
test_stacked

array([[ 3.       , 35.       ,  0.       , ...,  1.       ,  0.       ,
         1.       ],
       [ 3.       , 20.       ,  1.       , ...,  1.       ,  0.       ,
         0.       ],
       [ 3.       , 29.5327381,  0.       , ...,  1.       ,  0.       ,
         1.       ],
       ...,
       [ 3.       , 21.       ,  0.       , ...,  0.       ,  0.       ,
         1.       ],
       [ 1.       , 50.       ,  1.       , ...,  1.       ,  0.       ,
         0.       ],
       [ 1.       , 24.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ]], shape=(262, 8))

In [17]:
ss = StandardScaler()
X_train_transformed = ss.fit_transform(train_stacked)
X_test_transformed = ss.transform(test_stacked)
X_test_transformed

array([[ 8.40358979e-01,  4.28120063e-01, -4.95964298e-01, ...,
         7.39275864e-01, -3.24956337e-01,  6.47069693e-01],
       [ 8.40358979e-01, -7.46471726e-01,  4.56833170e-01, ...,
         7.39275864e-01, -3.24956337e-01, -1.54542859e+00],
       [ 8.40358979e-01,  7.78957819e-15, -4.95964298e-01, ...,
         7.39275864e-01, -3.24956337e-01,  6.47069693e-01],
       ...,
       [ 8.40358979e-01, -6.68165607e-01, -4.95964298e-01, ...,
        -1.35267503e+00, -3.24956337e-01,  6.47069693e-01],
       [-1.55055366e+00,  1.60271185e+00,  4.56833170e-01, ...,
         7.39275864e-01, -3.24956337e-01, -1.54542859e+00],
       [-1.55055366e+00, -4.33247249e-01, -4.95964298e-01, ...,
        -1.35267503e+00, -3.24956337e-01, -1.54542859e+00]],
      shape=(262, 8))

In [18]:
X_test_final = pd.DataFrame(X_test_transformed)
X_test_final.columns = ['pclass', 'age', 'sibp', 'parch','fare', 'male', 'c', 's']
X_train_final = pd.DataFrame(X_train_transformed)
X_train_final.columns = ['pclass', 'age', 'sibp', 'parch','fare', 'male', 'c', 's']
X_train_final

Unnamed: 0,pclass,age,sibp,parch,fare,male,c,s
0,0.840359,-9.813901e-01,-0.495964,-0.442432,-0.495577,0.739276,-0.324956,0.647070
1,-0.355097,5.064262e-01,-0.495964,-0.442432,-0.445264,0.739276,-0.324956,0.647070
2,-1.550554,-9.030840e-01,-0.495964,1.795376,0.890704,-1.352675,-0.324956,0.647070
3,-1.550554,1.367793e+00,0.456833,-0.442432,3.747629,0.739276,-0.324956,-1.545429
4,-1.550554,7.789578e-15,-0.495964,-0.442432,0.171040,0.739276,-0.324956,0.647070
...,...,...,...,...,...,...,...,...
1042,0.840359,7.789578e-15,-0.495964,-0.442432,-0.500728,-1.352675,3.077337,-1.545429
1043,0.840359,-9.030840e-01,-0.495964,-0.442432,-0.497911,-1.352675,-0.324956,0.647070
1044,0.840359,-8.086971e-02,-0.495964,-0.442432,-0.337073,0.739276,-0.324956,0.647070
1045,0.840359,-2.766350e-01,-0.495964,-0.442432,-0.495013,-1.352675,-0.324956,0.647070


In [19]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
model = DecisionTreeClassifier()
model.fit(X_train_final, Y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [25]:
model.score(X_test_final, Y_test)

0.767175572519084

## will use column transformer with pipleines to do the same thing

In [30]:
import pandas as pd
import numpy as np
from sklearn import datasets
data = datasets.fetch_openml(name='Titanic', version=1, as_frame=True)
titanic = data.frame
titanic

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [31]:
titanic.drop(labels=["name", "cabin","ticket", "boat","body", "home.dest"], axis=1, inplace=True)
titanic.value_counts()

pclass  survived  sex   age   sibsp  parch  fare     embarked
2       0         male  30.0  0      0      13.0000  S           5
3       0         male  17.0  0      0      8.6625   S           4
2       0         male  23.0  0      0      13.0000  S           3
3       0         male  21.0  0      0      8.0500   S           3
2       0         male  25.0  0      0      13.0000  S           3
                                                                ..
                        29.0  1      0      26.0000  S           1
                                            27.7208  C           1
                        30.0  0      0      10.5000  S           1
                              1      0      21.0000  S           1
                        25.0  1      2      41.5792  C           1
Name: count, Length: 971, dtype: int64

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(titanic.drop(labels="survived", axis=1), titanic["survived"],random_state=42, test_size=0.2)
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
772,3,male,17.0,0,0,7.8958,S
543,2,male,36.0,0,0,10.5000,S
289,1,female,18.0,0,2,79.6500,S
10,1,male,47.0,1,0,227.5250,C
147,1,male,,0,0,42.4000,S
...,...,...,...,...,...,...,...
1095,3,female,,0,0,7.6292,Q
1130,3,female,18.0,0,0,7.7750,S
1294,3,male,28.5,0,0,16.1000,S
860,3,female,26.0,0,0,7.9250,S


In [39]:
trf1 = ColumnTransformer([
    ("impute_age", SimpleImputer(), [2]),
    ("impute_fare", SimpleImputer(), [5]),
    ("impute_embarked", SimpleImputer(strategy="most_frequent"), [6])
], remainder='passthrough')

In [40]:
trf2 = ColumnTransformer([
    ("encode_sex", OneHotEncoder(drop='first', sparse_output=False), [1]),
    ("encode_embarked", OneHotEncoder(drop='first', sparse_output=False), [6])
], remainder='passthrough')

In [None]:
trf3 = ColumnTransformer([
    ()
])