### ML workflow. 

- Transforming data and using a pipeline.
- Handling missing values 
- Switching to the full dataset
- Recap
- Evaluating and tuning a Pipeline

In [21]:
import sklearn
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [22]:
train = pd.read_csv("http://bit.ly/kaggletrain", nrows = 10)
test = pd.read_csv("http://bit.ly/kaggletest", nrows = 10)

In [23]:
cols = ['Parch', 'Fare', 'Embarked', 'Sex']

In [24]:
X_train = train[cols]
y_train = train['Survived']

In [25]:
X_test = test[cols]

In [26]:
X_train

Unnamed: 0,Parch,Fare,Embarked,Sex
0,0,7.25,S,male
1,0,71.2833,C,female
2,0,7.925,S,female
3,0,53.1,S,female
4,0,8.05,S,male
5,0,8.4583,Q,male
6,0,51.8625,S,male
7,1,21.075,S,male
8,2,11.1333,S,female
9,0,30.0708,C,female


In [27]:
ohe = OneHotEncoder()

In [28]:
ct = make_column_transformer(
     (ohe, ['Embarked', 'Sex']),
      remainder = 'passthrough') 

In [29]:
logreg = LogisticRegression(solver= 'liblinear', random_state = 1)

In [30]:
pipe = make_pipeline(ct, logreg)
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Embarked', 'Sex'])])),
                ('logisticregression',
                 LogisticRegression(random_state=1, solver='liblinear'))])

In [31]:
pipe.predict(X_test)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

In [48]:
cols = ['Parch', 'Fare', 'Embarked', 'Sex', 'Age']
X_train = train[cols]
X_test = test[cols]

In [49]:
# it throws an error because of NaNs.
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Embarked', 'Sex']),
                                                 ('simpleimputer',
                                                  SimpleImputer(), ['Age'])])),
                ('logisticregression',
                 LogisticRegression(random_state=1, solver='liblinear'))])

### What options do we have for handling the NaNs? 
- drop rows
- imputation

In [50]:
from sklearn.impute import SimpleImputer

In [51]:
imp = SimpleImputer()

In [52]:
imp.fit_transform(X_train[['Age']])

array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [28.11111111],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ]])

In [53]:
imp.statistics_

array([28.11111111])

In [54]:
ct = make_column_transformer(
     (ohe, ['Embarked', 'Sex']),
     (imp, ['Age']),
     remainder = 'passthrough') 

In [55]:
ct.fit_transform(X_train)

array([[ 0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        22.        ,  0.        ,  7.25      ],
       [ 1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        38.        ,  0.        , 71.2833    ],
       [ 0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
        26.        ,  0.        ,  7.925     ],
       [ 0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
        35.        ,  0.        , 53.1       ],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        35.        ,  0.        ,  8.05      ],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        28.11111111,  0.        ,  8.4583    ],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        54.        ,  0.        , 51.8625    ],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         2.        ,  1.        , 21.075     ],
       [ 0.        ,  0.        

In [56]:
pipe = make_pipeline(ct, logreg)
pipe.fit(X_train, y_train)
pipe.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehotencoder', OneHotEncoder(),
                                  ['Embarked', 'Sex']),
                                 ('simpleimputer', SimpleImputer(), ['Age'])]),
 'logisticregression': LogisticRegression(random_state=1, solver='liblinear')}

In [58]:
pipe.predict(X_test)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

### Part 6  - Use full data

In [59]:
train = pd.read_csv("http://bit.ly/kaggletrain")
test = pd.read_csv("http://bit.ly/kaggletest")

In [62]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [63]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [70]:
cols = ['Parch', 'Fare', 'Embarked', 'Sex', 'Name', 'Age']
X = train[cols]
y = train['Survived']
vect = CountVectorizer()

In [71]:
ct = make_column_transformer(
     (ohe, ['Embarked', 'Sex']),
     (imp, ['Age']),
     (vect, ['Name']),
     remainder = 'passthrough') 

In [72]:
ct.fit_transform(X)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 891 and the array at index 2 has size 1