# Scikit-learn for feature processing and training

### environment

In [1]:
import sys
sys.version

'3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]'

### imports

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

#libraries
######################################################################

import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

#configuration options
######################################################################

# Let pandas display wide/long frames without truncation
# (up to 1000 columns and 400 rows).
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
# Apply seaborn's default plot styling.
sns.set()

# Shared random seed; passed as random_state when the data is split.
seed = 12345

In [3]:
pd.__version__

'1.3.4'

In [4]:
np.__version__

'1.21.0'

In [5]:
sklearn.__version__

'0.24.2'

## dataset

In [6]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [7]:
# Fetch version 1 of the "titanic" dataset from OpenML as pandas objects:
# X holds the feature columns, y the `survived` target.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

In [8]:
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [9]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [11]:
X.shape, y.shape

((1309, 13), (1309,))

In [12]:
X_train.shape, X_test.shape

((1047, 13), (262, 13))

In [13]:
y_train.shape, y_test.shape

((1047,), (262,))

# preprocessing

### na imputation

In [14]:
X_train.isna().sum()

pclass         0
name           0
sex            0
age          203
sibsp          0
parch          0
ticket         0
fare           1
cabin        820
embarked       1
boat         676
body         950
home.dest    438
dtype: int64

In [15]:
X_train.dtypes

pclass        float64
name           object
sex          category
age           float64
sibsp         float64
parch         float64
ticket         object
fare          float64
cabin          object
embarked     category
boat           object
body          float64
home.dest      object
dtype: object

the `pandas` way

you can separate categorical from numeric and impute with different techniques

In [16]:
X_train.select_dtypes('number').head()

Unnamed: 0,pclass,age,sibsp,parch,fare,body
0,1.0,29.0,0.0,0.0,211.3375,
394,2.0,18.0,0.0,0.0,73.5,
547,2.0,23.0,0.0,0.0,15.0458,
1155,3.0,,0.0,0.0,7.775,
1178,3.0,,8.0,2.0,69.55,


In [17]:
X_train.select_dtypes('object').head()

Unnamed: 0,name,ticket,cabin,boat,home.dest
0,"Allen, Miss. Elisabeth Walton",24160,B5,2.0,"St Louis, MO"
394,"Dibden, Mr. William",S.O.C. 14879,,,"New Forest, England"
547,"Richard, Mr. Emile",SC/PARIS 2133,,,"Paris / Montreal, PQ"
1155,"Rommetvedt, Mr. Knud Paust",312993,,,
1178,"Sage, Mr. George John Jr",CA. 2343,,,


In [18]:
X_copy = X_train.copy()

you can use pandas functions to fill null values

In [19]:
# Replace missing ages with the median age of the training copy.
age_median = X_copy['age'].median()
X_copy['age'] = X_copy['age'].fillna(age_median)

In [20]:
X_copy.cabin.value_counts().head()

C23 C25 C27    5
G6             5
F2             4
A34            3
C101           3
Name: cabin, dtype: int64

In [21]:
# Missing cabins get the placeholder label 'Other'.
X_copy['cabin'] = X_copy['cabin'].fillna('Other')

In [22]:
X_copy.isna().sum()

pclass         0
name           0
sex            0
age            0
sibsp          0
parch          0
ticket         0
fare           1
cabin          0
embarked       1
boat         676
body         950
home.dest    438
dtype: int64

the `scikit-learn` way

In [23]:
from sklearn.impute import SimpleImputer

In [24]:
X_train.isna().sum()

pclass         0
name           0
sex            0
age          203
sibsp          0
parch          0
ticket         0
fare           1
cabin        820
embarked       1
boat         676
body         950
home.dest    438
dtype: int64

In [25]:
X_copy = X_train.copy()

In [26]:
imputer = SimpleImputer(strategy = 'median')

In [27]:
X_copy_num = X_copy.select_dtypes('number')
X_copy_num.head(1)

Unnamed: 0,pclass,age,sibsp,parch,fare,body
0,1.0,29.0,0.0,0.0,211.3375,


In [28]:
imputer.fit(X_copy_num)

SimpleImputer(strategy='median')

In [29]:
X_copy_num.columns

Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'body'], dtype='object')

In [30]:
imputer.statistics_

array([  3. ,  28. ,   0. ,   0. ,  14.5, 165. ])

In [31]:
# transform() hands back a plain NumPy array, so the column names and the
# row index of the input frame are no longer attached to the result.
X_copy_num = imputer.transform(X_copy_num)
X_copy_num

array([[  1.    ,  29.    ,   0.    ,   0.    , 211.3375, 165.    ],
       [  2.    ,  18.    ,   0.    ,   0.    ,  73.5   , 165.    ],
       [  2.    ,  23.    ,   0.    ,   0.    ,  15.0458, 165.    ],
       ...,
       [  2.    ,  28.    ,   0.    ,   0.    ,  21.    , 165.    ],
       [  1.    ,  35.    ,   0.    ,   0.    , 211.5   , 165.    ],
       [  2.    ,  17.    ,   0.    ,   0.    ,  12.    , 165.    ]])

In [32]:
# Rebuild a DataFrame from the imputed array. Reuse the numeric column labels
# and the original row index of X_train so the rows still line up with
# X_train / y_train (a default RangeIndex would not match after the shuffle).
X_copy_num = pd.DataFrame(
    data=X_copy_num,
    columns=X_train.select_dtypes('number').columns,
    index=X_train.index,
)
# Sanity check: no numeric column should contain missing values any more.
X_copy_num.isna().sum()

In [33]:
X_train.shape, X_test.shape

((1047, 13), (262, 13))

**summary**

*numeric*

In [34]:
# Keep only the numeric columns of each split.
X_train_num = X_train.select_dtypes(include='number')
X_test_num = X_test.select_dtypes(include='number')

# Fresh imputer that fills missing values with each column's median.
imputer = SimpleImputer(strategy='median')

In [35]:
imputer.fit(X_train_num)

SimpleImputer(strategy='median')

In [36]:
imputer.statistics_

array([  3. ,  28. ,   0. ,   0. ,  14.5, 165. ])

In [37]:
# The imputer was fitted on X_train_num only; the same learned medians are
# applied to both splits so nothing is estimated from the test rows.
X_train_imp = imputer.transform(X_train_num)
X_test_imp = imputer.transform(X_test_num)

*categories*

In [38]:
# Categorical columns to impute. `age` is numeric (handled by the median
# imputer) so it does not belong here; use the same categorical columns the
# rest of the notebook works with. The test frame must come from X_test —
# selecting from X_train would silently "evaluate" on training rows.
X_train_cat = X_train[['sex', 'embarked']]
X_test_cat = X_test[['sex', 'embarked']]

In [39]:
imputer_cat = SimpleImputer(strategy='constant', fill_value = 'Other')

In [40]:
imputer_cat.fit(X_train_cat)

SimpleImputer(fill_value='Other', strategy='constant')

In [41]:
imputer_cat.statistics_

array(['Other', 'Other'], dtype=object)

In [42]:
# Fill missing entries in both frames using the imputer's constant fill value ('Other').
X_train_cat_imp = imputer_cat.transform(X_train_cat)
X_test_cat_imp = imputer_cat.transform(X_test_cat)

### one hot encoding

the `pandas` way

In [43]:
X_train.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
394,2.0,"Dibden, Mr. William",male,18.0,0.0,0.0,S.O.C. 14879,73.5,,S,,,"New Forest, England"
547,2.0,"Richard, Mr. Emile",male,23.0,0.0,0.0,SC/PARIS 2133,15.0458,,C,,,"Paris / Montreal, PQ"
1155,3.0,"Rommetvedt, Mr. Knud Paust",male,,0.0,0.0,312993,7.775,,S,,,
1178,3.0,"Sage, Mr. George John Jr",male,,8.0,2.0,CA. 2343,69.55,,S,,,


In [44]:
X_train.embarked.value_counts()

S    741
C    211
Q     94
Name: embarked, dtype: int64

In [45]:
X_train[['embarked', 'sex']].head()

Unnamed: 0,embarked,sex
0,S,female
394,S,male
547,C,male
1155,S,male
1178,S,male


In [46]:
# One-hot encode the first five rows. Both columns have `category` dtype, so a
# dummy column is produced for every declared category (e.g. embarked_Q),
# even when that category does not occur in these rows.
pd.get_dummies(X_train[['embarked', 'sex']].head())

Unnamed: 0,embarked_C,embarked_Q,embarked_S,sex_female,sex_male
0,0,0,1,1,0
394,0,0,1,0,1
547,1,0,0,0,1
1155,0,0,1,0,1
1178,0,0,1,0,1


the `scikit-learn` way

In [47]:
from sklearn.preprocessing import OneHotEncoder

In [48]:
# Independent copies of the two categorical columns from each split.
X_train_cat = X_train[['sex', 'embarked']].copy()
X_test_cat = X_test[['sex', 'embarked']].copy()

# handle_unknown='ignore': categories not seen during fit are encoded as
# all-zero columns at transform time instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

In [49]:
ohe.fit(X_train_cat)

OneHotEncoder(handle_unknown='ignore')

In [50]:
ohe.transform(X_train_cat)

<1047x6 sparse matrix of type '<class 'numpy.float64'>'
	with 2094 stored elements in Compressed Sparse Row format>

In [51]:
# Names of the generated one-hot columns, prefixed with the source column.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; prefer the new method when it exists and
# fall back to the old one on older releases (e.g. 0.24) so the cell runs on both.
_ohe_feature_namer = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
_ohe_feature_namer(X_train_cat.columns)

array(['sex_female', 'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S',
       'embarked_nan'], dtype=object)

## full dataprep with scikit pipelines

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [53]:
from sklearn import set_config

set_config(display='diagram')

In [54]:
# Numeric preprocessing chain: median imputation followed by standardization.
pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [55]:
pipe

In [56]:
# Independent numeric-only frames for each split.
X_train_num = X_train.select_dtypes(include='number').copy()
X_test_num = X_test.select_dtypes(include='number').copy()

In [57]:
pipe.fit(X_train_num)

In [58]:
# Run both splits through the fitted imputer + scaler; the pipeline was fitted
# on X_train_num, so the test rows are transformed with training statistics.
X_train_num_transformed = pipe.transform(X_train_num)
X_test_num_transformed = pipe.transform(X_test_num)

In [59]:
pipe['imputer'].statistics_

array([  3. ,  28. ,   0. ,   0. ,  14.5, 165. ])

### everything together with ColumnTransformer

numeric dataprep

In [60]:
# Columns treated as numeric, and the steps applied to them:
# median imputation, then standard scaling.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [61]:
numeric_transformer

categorical dataprep

In [62]:
# Columns treated as categorical, and the steps applied to them:
# most-frequent imputation, then one-hot encoding (unknown categories ignored).
categorical_features = ['embarked', 'sex']
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [63]:
categorical_transformer

both together with `ColumnTransformer`

In [64]:
# Route each group of columns to its own sub-pipeline; columns not listed in
# either group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [65]:
preprocessor

In [66]:
X_train.head(1)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"


In [67]:
preprocessor.fit(X_train)

In [68]:
preprocessor

In [69]:
preprocessor.named_transformers_

{'num': Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                 ('scaler', StandardScaler())]),
 'cat': Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                 ('ohe', OneHotEncoder(handle_unknown='ignore'))]),
 'remainder': 'drop'}

access the individual fitted transformers (and the steps inside them) by key

In [70]:
preprocessor.named_transformers_['num']['imputer']

we can also use the fitted preprocessor to transform the test set in one line

In [71]:
# The preprocessor was fitted on X_train; apply the same fitted transformation
# to both the training and the test features.
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [73]:
preprocessor

## pipelines with training

In [74]:
from sklearn.linear_model import LogisticRegression

# End-to-end model: column preprocessing followed by a logistic regression,
# so raw feature frames can be passed straight to fit/predict/score.
classifier = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

classifier

In [75]:
classifier.fit(X_train, y_train)

In [76]:
classifier.score(X_train, y_train)

0.775549188156638

In [77]:
classifier.score(X_test, y_test)

0.7824427480916031

## pipelines with grid search

In [78]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [79]:
# End-to-end model used for the grid search: column preprocessing followed by
# a decision tree. The tree is seeded with the notebook-wide `seed` so that
# tie-breaking between equally good splits — and therefore the grid-search
# scores and selected parameters — is reproducible across runs.
classifier = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', DecisionTreeClassifier(random_state=seed))])

In [80]:
classifier

In [81]:
classifier.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', StandardScaler())]),
                                    ['age', 'fare']),
                                   ('cat',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('ohe',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['embarked', 'sex'])])),
  ('classifier', DecisionTreeClassifier())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('imputer',
                    

In [82]:
# Search space. Keys address nested parameters of the pipeline using the
# `<step>__<sub-step>__<parameter>` naming shown by classifier.get_params().
param_grid = {
    # imputation strategy of the numeric sub-pipeline
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    # decision-tree hyperparameters
    'classifier__max_depth': [2, 4, 6, 8, 10],
    'classifier__min_samples_split': [10, 15, 20],
}

In [83]:
from sklearn.model_selection import GridSearchCV

In [84]:
# Exhaustive search over every combination in param_grid, ranking candidates by ROC AUC.
# NOTE(review): `cv` is left at its default — presumably 5-fold for this sklearn version; confirm.
gs = GridSearchCV(classifier, param_grid=param_grid, scoring='roc_auc')

In [85]:
gs.fit(X_train, y_train)

In [86]:
gs.best_params_

{'classifier__max_depth': 6,
 'classifier__min_samples_split': 20,
 'preprocessor__num__imputer__strategy': 'mean'}

In [87]:
gs.score(X_train, y_train)

0.8636735818604102

In [88]:
gs.score(X_test, y_test)

0.8446916195856874

In [89]:
best_model = gs.best_estimator_

In [90]:
best_model

In [91]:
predictions = best_model.predict(X_train)
predictions

array(['1', '0', '0', ..., '1', '1', '1'], dtype=object)

### save model

In [94]:
# Create the output directory if needed. exist_ok=True makes the cell safe to
# re-run: plain os.mkdir raises FileExistsError once the directory exists.
os.makedirs('output', exist_ok=True)

In [95]:
import pickle

In [96]:
# Persist the tuned pipeline to disk in binary pickle format.
with open('output/best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

### read model

In [97]:
# Reload the pickled pipeline from output/best_model.pkl.
# NOTE: only unpickle files you trust — pickle.load can execute arbitrary
# code embedded in the file.
with open('output/best_model.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

In [98]:
best_model.predict(X_train)

array(['1', '0', '0', ..., '1', '1', '1'], dtype=object)

In [99]:
best_model.predict(X_test)

array(['1', '1', '0', '0', '0', '1', '0', '1', '1', '0', '1', '0', '0',
       '0', '1', '1', '0', '0', '0', '0', '0', '1', '0', '1', '1', '0',
       '0', '1', '0', '0', '1', '1', '0', '0', '1', '0', '0', '1', '0',
       '0', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '1', '0', '0', '0', '1', '0', '0', '0', '1', '1', '0', '1',
       '0', '0', '0', '0', '1', '0', '1', '0', '1', '0', '0', '1', '0',
       '0', '1', '1', '0', '0', '0', '1', '0', '0', '1', '1', '0', '0',
       '1', '0', '0', '0', '0', '0', '0', '1', '1', '1', '0', '1', '0',
       '1', '0', '0', '1', '1', '0', '1', '1', '0', '0', '1', '1', '0',
       '1', '0', '0', '1', '1', '0', '0', '0', '1', '1', '0', '1', '1',
       '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0',
       '0', '0', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '1', '1', '1', '1', '0', '1', '0', '0', '0',
       '0', '0', '0', '0', '1', '0', '0', '1', '1', '0', '0', '1

In [102]:
# Start from a copy of the test features so X_test itself is left untouched,
# and compute the reloaded model's predictions for those same rows.
df_predictions = X_test.copy()
predictions = best_model.predict(X_test)

In [103]:
# Append the true label and the model's prediction as two new columns.
# `.values` drops y_test's index so the labels are attached positionally.
df_predictions = df_predictions.assign(
    target=y_test.values,
    prediction=predictions,
)

df_predictions.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,target,prediction
1097,3.0,"Palsson, Master. Paul Folke",male,6.0,3.0,1.0,349909,21.075,,S,,,,0,1
281,1.0,"Stengel, Mrs. Charles Emil Henry (Annie May Mo...",female,43.0,1.0,0.0,11778,55.4417,C116,C,5.0,,"Newark, NJ",1,1
1032,3.0,"Morrow, Mr. Thomas Rowan",male,,0.0,0.0,372622,7.75,,Q,,,,0,0
942,3.0,"Lahoud, Mr. Sarkis",male,,0.0,0.0,2624,7.225,,C,,,,0,0
1263,3.0,"van Billiard, Master. Walter John",male,11.5,1.0,1.0,A/5. 851,14.5,,S,,1.0,,0,0
