### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

### Dataset Description

In [2]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [3]:
train.shape

(891, 12)

In [4]:
test.shape

(418, 11)

In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Data Pre-processing

In [6]:
train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1, inplace = True)
test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1, inplace = True)

In [7]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [8]:
train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [9]:
#as only 2 values are null we will drop it
train.dropna(subset = ['Embarked'], inplace = True)

In [10]:
test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [11]:
test.shape

(418, 7)

In [12]:
test.dropna(subset = ['Fare'], inplace = True)

In [13]:
y_train = train['Survived']
X_train = train.drop(['Survived'], axis = 1)

y_test = train['Survived']
X_test = train.drop(['Survived'], axis = 1)

Pre-processing steps  
* One-Hot Encode Sex, Embarked     
* Simple Imputer Age 
* Scaling


### Transformers

In [14]:
trf1 = ColumnTransformer([
    ('age_imputer', SimpleImputer(), [2]),
    ('embarked_sex_encoder', OneHotEncoder(), [1, 6])
], remainder = 'passthrough')

In [15]:
trf2 = ColumnTransformer([
    ('scaler', StandardScaler(), [2, 3, 4, 5])
], remainder = 'passthrough')

In [16]:
trf3 = DecisionTreeClassifier()

### Pipeline

In [17]:
pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3)
])

In [18]:
pipe.fit(X_train, y_train)

In [19]:
#using pipeline to get information
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.6420927])

In [20]:
y_pred = pipe.predict(X_test)

In [21]:
accuracy_score(y_test, y_pred)

0.9820022497187851