## Scikit Learn Pipeline

1. Pipelines chains together multiple steps so that the output of each step is used as input to the nest srep

2. Pipelines make it easy to apply the same preprocessing to train and test!

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import  train_test_split
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.sample(5)

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
152,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,,S
675,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18.0,0,0,349912,7.775,,S
539,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22.0,0,2,13568,49.5,B39,C
358,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q
872,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0,B51 B53 B55,S


In [5]:
df.drop(columns=['name','ticket','cabin'],inplace=True)

In [6]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [7]:
# Step 2 -> train/test/split
X_train,X_test,y_train,y_test = train_test_split (df.drop(columns=['survived']),
                                                   df['survived'],
                                                   test_size=0.2,
                                                   random_state=0)

In [8]:
X_train.head(2)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
140,3,female,,0,2,15.2458,C
439,2,male,31.0,0,0,10.5,S


In [9]:
y_train.head(3)

140    0
439    0
817    0
Name: survived, dtype: int64

In [10]:
df.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [11]:
#  applying imputer

si_age = SimpleImputer()
si_emmbaraked = SimpleImputer(strategy="most_frequent")

X_train_age = si_age.fit_transform(X_train[['age']])
X_train_embarked = si_emmbaraked.fit_transform(X_train[['embarked']])

X_test_age = si_age.fit_transform(X_test[['age']])
X_test_embarked = si_emmbaraked.fit_transform(X_test[['embarked']])




In [12]:
X_train_age

array([[29.74518389],
       [31.        ],
       [31.        ],
       [20.        ],
       [21.        ],
       [45.5       ],
       [22.        ],
       [29.74518389],
       [29.74518389],
       [26.        ],
       [25.        ],
       [21.        ],
       [31.        ],
       [15.        ],
       [29.74518389],
       [29.74518389],
       [65.        ],
       [29.74518389],
       [ 1.        ],
       [34.        ],
       [49.        ],
       [18.        ],
       [29.74518389],
       [70.        ],
       [14.        ],
       [19.        ],
       [30.        ],
       [31.        ],
       [32.        ],
       [16.        ],
       [50.        ],
       [24.        ],
       [56.        ],
       [ 7.        ],
       [ 9.        ],
       [33.        ],
       [19.        ],
       [32.5       ],
       [ 1.        ],
       [45.        ],
       [29.74518389],
       [19.        ],
       [21.        ],
       [ 4.        ],
       [28.        ],
       [17

In [13]:
X_train_embarked

array([['C'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
      

In [14]:
# one hot encoding Sex and Embarked
OH_sex = OneHotEncoder(sparse=False,handle_unknown='ignore')
OH_embarked = OneHotEncoder(sparse=False,handle_unknown= "ignore")

X_train_sex = OH_sex.fit_transform(X_train[['sex']])
X_train_embarked = OH_embarked.fit_transform(X_train_embarked)


X_test_sex = OH_sex.fit_transform(X_test[['sex']])
X_test_embarked = OH_embarked.fit_transform(X_test_embarked)

In [15]:
X_train_rem = X_train.drop(columns=['sex','age','embarked'])

In [16]:
X_test_rem = X_test.drop(columns=['sex','age','embarked'])


In [17]:
X_train_transformed = np.concatenate((X_train_rem,X_train_age,X_train_sex,X_train_embarked),axis=1)


In [18]:
X_test_transformed = np.concatenate((X_test_rem,X_test_age,X_test_sex,X_test_embarked),axis=1)

In [19]:
X_train_transformed.shape


(712, 10)

In [20]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

In [21]:
y_pred = clf.predict(X_test_transformed)

In [22]:
y_pred

array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1], dtype=int64)

In [23]:
y_test

495    0
648    0
278    0
31     1
255    1
      ..
780    1
837    0
215    1
833    0
372    0
Name: survived, Length: 179, dtype: int64

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
accuracy_score(y_pred,y_test)

0.7877094972067039

In [26]:
import pickle 

In [27]:
pickle.dump(OH_sex,open('models/OH_sex.pkl','wb'))
pickle.dump(OH_embarked,open("models/OH_embarked.pkl",'wb'))
pickle.dump(clf,open('models/clf.pkl','wb'))