### Building an Effective Machine Learning Workflow with scikit-learn.

#### Outline

- 1 Review of the basic ML workflow
- 2 Encoding categorical data
- 3 Using ColumnTransformer and Pipeline
- 4 Encoding Text Data
- 5 Handling missing data
- 6 Switching to the full dataset
- 7 Evaluating and tuning a Pipeline



In [54]:
import pandas as pd
import sklearn
# Record the scikit-learn version the outputs in this notebook were produced
# with; some estimator arguments and methods used later are version-sensitive.
sklearn.__version__

'0.24.1'

In [55]:
# Offline alternative (presumably a local copy of the same training file — verify before use):
#data = pd.read_csv("data/titanic_train.csv")
# Read only the first 10 rows so every intermediate result below is small
# enough to inspect by eye; the outline switches to the full dataset in part 6.
df = pd.read_csv("http://bit.ly/kaggletrain", nrows = 10)

In [56]:
# Display all 10 loaded rows. Note that Age and Cabin already contain missing
# values in this small sample.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [57]:
# Target vector: single brackets select the Survived column as a 1-D Series.
y = df['Survived']

In [58]:
 X = df[["Parch","Fare"]]

In [59]:
# Sanity check: y is 1-D with one label per loaded row.
y.shape

(10,)

In [60]:
from sklearn.linear_model import LogisticRegression

In [61]:
# Logistic regression using the liblinear solver; random_state is pinned so
# repeated runs of the notebook use the same seed.
logreg = LogisticRegression(
    solver="liblinear",
    random_state=1,
)

In [62]:
# Train on the two-column X. fit() returns the estimator itself, which is why
# the cell echoes the model's repr.
logreg.fit(X, y)

LogisticRegression(random_state=1, solver='liblinear')

In [63]:
from sklearn.model_selection import cross_val_score

In [64]:
# 3-fold cross-validated accuracy. With only 10 rows each test fold holds 3-4
# samples, so the resulting estimate is very coarse and only illustrates the API.
scores = cross_val_score(
    logreg,
    X,
    y,
    scoring="accuracy",
    cv=3,
)

In [65]:
# Average the per-fold accuracies into a single summary number.
scores.mean()

0.6944444444444443

In [66]:
# Second CSV, again limited to its first 10 rows; it is used below as "new"
# samples to run predictions on.
df_new = pd.read_csv(
    "http://bit.ly/kaggletest",
    nrows=10,
)

In [67]:
# Select exactly the columns (and column order) the model was trained on.
X_new = df_new[['Parch', 'Fare']]

In [68]:
# Predict a class label for each of the 10 new rows using the model fitted on
# Parch and Fare.
logreg.predict(X_new)

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 1])

### Part 2: Encoding categorical data

In [69]:
from sklearn.preprocessing import OneHotEncoder

# Request a dense array (instead of the default sparse matrix) so the encoded
# result prints readably.
# NOTE(review): newer scikit-learn releases (>= 1.2) rename this argument to
# `sparse_output` and later drop `sparse`; it is kept as-is because this
# notebook was run on 0.24.1, where `sparse_output` does not exist.
ohe = OneHotEncoder(sparse=False)

# Double brackets: the encoder expects 2-D input (a one-column DataFrame).
ohe.fit_transform(df[["Embarked"]])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [70]:
# Categories learned during fit; the one-hot output columns follow this order.
ohe.categories_

[array(['C', 'Q', 'S'], dtype=object)]

In [71]:
# Refit the same encoder on two columns at once: the result has the three
# Embarked columns followed by the two Sex columns.
ohe.fit_transform(df[['Embarked', 'Sex']])

array([[0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [0., 0., 1., 1., 0.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.]])

### Part 3: Using ColumnTransformer and Pipeline

In [72]:
# Feature set for this part: two numeric columns plus two categorical columns
# that must be encoded before a model can use them.
cols = ["Parch", "Fare", "Embarked", "Sex"]

# Rebind X to the wider feature set (it previously held only Parch and Fare).
X = df[cols]
X

Unnamed: 0,Parch,Fare,Embarked,Sex
0,0,7.25,S,male
1,0,71.2833,C,female
2,0,7.925,S,female
3,0,53.1,S,female
4,0,8.05,S,male
5,0,8.4583,Q,male
6,0,51.8625,S,male
7,1,21.075,S,male
8,2,11.1333,S,female
9,0,30.0708,C,female


In [73]:
from sklearn.compose import make_column_transformer

# Fresh encoder with default settings; this rebinds `ohe`, replacing the
# dense-output encoder used for the earlier demonstrations.
ohe = OneHotEncoder()

In [75]:
# One-hot encode the two categorical columns. remainder="passthrough" keeps
# every other column of X (Parch, Fare) unchanged and places it after the
# encoded columns in the output.
ct = make_column_transformer(
    (ohe, ["Embarked", "Sex"]),
    remainder="passthrough",
)

In [76]:
# Preview the transformed matrix: 5 one-hot columns (3 Embarked + 2 Sex)
# followed by the passed-through Parch and Fare columns.
ct.fit_transform(X)

array([[ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  0.    , 71.2833],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    ,  0.    ,  7.925 ],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    ,  0.    , 53.1   ],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    ,  8.05  ],
       [ 0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  8.4583],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    , 51.8625],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  1.    , 21.075 ],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    ,  2.    , 11.1333],
       [ 1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  0.    , 30.0708]])

In [77]:
from sklearn.pipeline import make_pipeline

In [79]:
# Chain preprocessing and model so that a single fit()/predict() call runs
# both steps. The pipeline is built from the existing `ct` and `logreg`
# objects, so fitting it refits `logreg` on the 7-column transformed data.
pipe = make_pipeline(ct, logreg)
pipe.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Embarked', 'Sex'])])),
                ('logisticregression',
                 LogisticRegression(random_state=1, solver='liblinear'))])

In [82]:
# Manual equivalent of pipe.fit(X, y): transform first, then fit the model.
# Shown only to illustrate what the pipeline does; it refits the same `ct`
# and `logreg` objects that `pipe` was built from.
logreg.fit(ct.fit_transform(X), y)

LogisticRegression(random_state=1, solver='liblinear')

In [83]:
# Inspect the fitted model inside the pipeline: seven coefficients, one per
# transformed column (3 Embarked, 2 Sex, Parch, Fare).
pipe.named_steps.logisticregression.coef_

array([[ 0.26491287, -0.19848033, -0.22907928,  1.0075062 , -1.17015293,
         0.20056557,  0.01597307]])

In [85]:
# Rebind X_new to the same four raw columns the pipeline was fitted on;
# no manual encoding is needed because the pipeline handles it.
X_new = df_new[cols]
X_new

Unnamed: 0,Parch,Fare,Embarked,Sex
0,0,7.8292,Q,male
1,0,7.0,S,female
2,0,9.6875,Q,male
3,0,8.6625,S,male
4,1,12.2875,S,female
5,0,9.225,S,male
6,0,7.6292,Q,female
7,1,29.0,S,male
8,0,7.2292,C,female
9,0,24.15,S,male


In [86]:
# The pipeline applies the already-fitted column transformer to X_new and then
# predicts with the fitted logistic regression.
pipe.predict(X_new)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

#### Recap 
http://bit.ly/basic-pipelines

### Part 4: Encoding text data

In [87]:
# Re-display the training rows to look at the free-text Name column, which is
# encoded next.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [88]:
from sklearn.feature_extraction.text import CountVectorizer

In [89]:
# Bag-of-words encoding of the passenger names.
vect = CountVectorizer()
# Single brackets on purpose: the vectorizer is given the Name column as a
# 1-D Series of strings, unlike the 2-D input passed to OneHotEncoder earlier.
dtm = vect.fit_transform(df['Name'])
# Document-term matrix: one row per name, one column per learned token.
dtm

<10x40 sparse matrix of type '<class 'numpy.int64'>'
	with 46 stored elements in Compressed Sparse Row format>

In [91]:
# List the learned vocabulary (lower-cased tokens, in column order of dtm).
# NOTE(review): get_feature_names() is deprecated and later removed in newer
# scikit-learn releases in favour of get_feature_names_out(); it is kept here
# because this notebook ran on 0.24.1, where the replacement is unavailable.
vect.get_feature_names()

['achem',
 'adele',
 'allen',
 'berg',
 'bradley',
 'braund',
 'briggs',
 'cumings',
 'elisabeth',
 'florence',
 'futrelle',
 'gosta',
 'harris',
 'heath',
 'heikkinen',
 'henry',
 'jacques',
 'james',
 'john',
 'johnson',
 'laina',
 'leonard',
 'lily',
 'master',
 'may',
 'mccarthy',
 'miss',
 'moran',
 'mr',
 'mrs',
 'nasser',
 'nicholas',
 'oscar',
 'owen',
 'palsson',
 'peel',
 'thayer',
 'timothy',
 'vilhelmina',
 'william']

In [95]:
# Double brackets select a one-column DataFrame, so to_numpy() yields a 2-D
# (10, 1) column array rather than a flat vector.
df[['Survived']].to_numpy()

array([[0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1]])