## Scikit Learn Pipeline

1. Pipelines chains together multiple steps so that the output of each step is used as input to the nest srep

2. Pipelines make it easy to apply the same preprocessing to train and test!

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import  train_test_split
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.sample(5)

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
17,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
850,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.0,4,2,347082,31.275,,S
825,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q
863,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S
788,1,3,"Dean, Master. Bertram Vere",male,1.0,1,2,C.A. 2315,20.575,,S


In [5]:
df.drop(columns=['name','ticket','cabin'],inplace=True)

In [6]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [7]:
# Step 2 -> train/test/split
X_train,X_test,y_train,y_test = train_test_split (df.drop(columns=['survived']),
                                                   df['survived'],
                                                   test_size=0.2,
                                                   random_state=0)

In [8]:
X_train.shape

(712, 7)

In [9]:
y_train.shape

(712,)

In [10]:
df.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [11]:
#  applying imputer

si_age = SimpleImputer()
si_emmbaraked = SimpleImputer(strategy="most_frequent")

X_train_age = si_age.fit_transform(X_train[['age']])
X_train_embarkede = si_emmbaraked.fit_transform(X_train[['embarked']])

X_test_age = si_age.fit_transform(X_test[['age']])
X_test_embarked = si_emmbaraked.fit_transform(X_test[['embarked']])




In [12]:
X_train_age.shape

(712, 1)

In [13]:
X_test_age.shape

(179, 1)

In [14]:
# one hot encoding Sex and Embarked
OH_sex = OneHotEncoder(sparse=False,handle_unknown='ignore')
OH_embarked = OneHotEncoder(sparse=False,handle_unknown= "ignore")

X_train_sex = OH_sex.fit_transform(X_train[['sex']])
X_train_embarked = OH_embarked.fit_transform(X_train_embarkede)


X_test_sex = OH_sex.fit_transform(X_test[['sex']])
X_test_embarked = OH_embarked.fit_transform(X_test_embarked)

In [15]:
X_train_embarked.shape

(712, 3)

In [16]:
X_test_embarked.shape

(179, 3)

In [17]:
X_train_rem = X_train.drop(columns=['sex','age','embarked'])

In [18]:
X_test_rem = X_test.drop(columns=['sex','age','embarked'])


In [19]:
X_train_rem

Unnamed: 0,pclass,sibsp,parch,fare
140,3,0,2,15.2458
439,2,0,0,10.5000
817,2,1,1,37.0042
378,3,0,0,4.0125
491,3,0,0,7.2500
...,...,...,...,...
835,1,1,1,83.1583
192,3,1,0,7.8542
629,3,0,0,7.7333
559,3,1,0,17.4000


In [20]:
X_train_transformed = np.concatenate((X_train_rem,X_train_age,X_train_sex,X_train_embarked),axis=1)


In [21]:
X_test_transformed = np.concatenate((X_test_rem,X_test_age,X_test_sex,X_test_embarked),axis=1)

In [22]:
X_train_transformed


array([[3., 0., 2., ..., 1., 0., 0.],
       [2., 0., 0., ..., 0., 0., 1.],
       [2., 1., 1., ..., 1., 0., 0.],
       ...,
       [3., 0., 0., ..., 0., 1., 0.],
       [3., 1., 0., ..., 0., 0., 1.],
       [2., 1., 1., ..., 0., 0., 1.]])

In [23]:
X_test_transformed.shape

(179, 10)

In [24]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

In [25]:
y_pred = clf.predict(X_test_transformed)

In [26]:
y_pred

array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1], dtype=int64)

In [27]:
y_test

495    0
648    0
278    0
31     1
255    1
      ..
780    1
837    0
215    1
833    0
372    0
Name: survived, Length: 179, dtype: int64

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
accuracy_score(y_pred,y_test)

0.770949720670391

In [30]:
import pickle 

In [31]:
pickle.dump(OH_sex,open('models/OH_sex.pkl','wb'))
pickle.dump(OH_embarked,open("models/OH_embarked.pkl",'wb'))
pickle.dump(clf,open('models/clf.pkl','eb'))

ValueError: invalid mode: 'eb'