In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier


In [2]:
data=pd.read_csv('titanic.csv')

In [3]:
data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
346,347,1,2,"Smith, Miss. Marion Elsie",female,40.0,0,0,31418,13.0,,S
273,274,0,1,"Natsch, Mr. Charles H",male,37.0,0,1,PC 17596,29.7,C118,C
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
811,812,0,3,"Lester, Mr. James",male,39.0,0,0,A/4 48871,24.15,,S
700,701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18.0,1,0,PC 17757,227.525,C62 C64,C


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
data1=data.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)

In [7]:
data1

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [8]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(data1.drop('Survived',axis=1),data1['Survived'])

In [9]:
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
725,3,male,20.0,0,0,8.6625,S
812,2,male,35.0,0,0,10.5000,S
806,1,male,39.0,0,0,0.0000,S
435,1,female,14.0,1,2,120.0000,S
466,2,male,,0,0,0.0000,S
...,...,...,...,...,...,...,...
195,1,female,58.0,0,0,146.5208,C
454,3,male,,0,0,8.0500,S
481,2,male,,0,0,0.0000,S
119,3,female,2.0,4,2,31.2750,S


In [10]:
#Simple Imputer
impute_age=ColumnTransformer([
    ('age',SimpleImputer(),[2]),
    ('embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [11]:
#One Hot Encoder
one_hot_encoder=ColumnTransformer([
    ('ohe',OneHotEncoder(sparse_output=False,drop='first',handle_unknown='ignore'),[1,6])
])

In [12]:
min_max_scaler=ColumnTransformer([
    ('scaler',MinMaxScaler(),slice(0,8))
])

In [13]:
decision_tree=DecisionTreeClassifier()

In [14]:
pipe=Pipeline([
    ('impute_age',impute_age),
    ('one_hot_encoding',one_hot_encoder),
    ('scaler',min_max_scaler),
    ('decision_tree',decision_tree)
])

In [15]:
pipe.fit(x_train,y_train)

In [16]:
y_pred=pipe.predict(x_test)



In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6143497757847534

In [18]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,x_train,y_train,cv=5,scoring='accuracy').mean()



0.6421501514981484