# Scikit-learn for feature processing and training

### environment

In [1]:
import sys
sys.version

'3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]'

### imports

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

#libraries
######################################################################

import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

#configuration options
######################################################################

# let pandas display wide/long frames without truncation (up to 1000 columns, 400 rows)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
# apply seaborn's default plot styling to matplotlib figures
sns.set()

# notebook-wide random seed, reused as random_state (e.g. in train_test_split)
seed = 12345

In [3]:
pd.__version__

'1.3.4'

In [4]:
np.__version__

'1.21.0'

In [5]:
sklearn.__version__

'0.24.2'

## dataset

In [6]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [7]:
# Download the Titanic dataset (OpenML, version 1); with as_frame and return_X_y
# the call yields the features and the target as separate pandas objects.
X, y = fetch_openml(
    name="titanic",
    version=1,
    return_X_y=True,
    as_frame=True,
)

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

## null values

### na imputation

the `pandas` way

you can separate categorical from numeric and impute with different techniques

you can use pandas functions to fill null values

the `scikit-learn` way

### one hot encoding

the `pandas` way

the `scikit-learn` way

In [42]:
# NOTE(review): `ohe` and `X_test_cat` are not defined in any cell shown here —
# presumably a fitted OneHotEncoder and the categorical test columns; confirm they
# are created earlier, otherwise this cell fails on Restart & Run All.
# The result is a SciPy sparse matrix (CSR), as the output below shows.
ohe.transform(X_test_cat)

<262x9 sparse matrix of type '<class 'numpy.float64'>'
	with 786 stored elements in Compressed Sparse Row format>

## full dataprep with scikit pipelines

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [44]:
# Two-step chain: fill missing values with the column median, then standardize.
pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

### everything in a single step with `ColumnTransformer`

numeric dataprep

In [50]:
# Columns handled as numeric, and the chain applied to them:
# median imputation followed by standardization.
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical dataprep

In [51]:
# Columns handled as categorical, and the chain applied to them:
# fill missing values with the most frequent value, then one-hot encode.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising at transform time.
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

both together with `ColumnTransformer`

In [52]:
# Route each column group to its own pipeline. Columns not listed in either
# group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [53]:
# fit on the training split only, so the imputation, scaling and encoding
# statistics are learned without seeing any test rows
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['age', 'fare']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['embarked', 'sex', 'pclass'])])

In [54]:
from sklearn import set_config

# display estimators as HTML diagrams instead of their plain-text repr
set_config(display='diagram')

In [55]:
preprocessor

In [56]:
# fitted transformers keyed by the names given to ColumnTransformer ('num', 'cat'),
# plus an entry for the remainder policy
preprocessor.named_transformers_

{'num': Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                 ('scaler', StandardScaler())]),
 'cat': Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                 ('ohe', OneHotEncoder(handle_unknown='ignore'))]),
 'remainder': 'drop'}

access individual elements with keys

In [57]:
# look up the 'num' pipeline by name, then its 'imputer' step by name
preprocessor.named_transformers_['num']['imputer']

we can also use the pipeline to transform the test set in one line

## pipelines with training

In [59]:
from sklearn.linear_model import LogisticRegression

## pipelines with grid search

In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [64]:
# Preprocessing and model bundled into one estimator, so fitting (and any
# search over it) applies both steps together.
# random_state reuses the notebook-wide `seed`: decision-tree fitting randomly
# permutes features at each split, so without a fixed state tied splits can be
# resolved differently from run to run.
classifier = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", DecisionTreeClassifier(random_state=seed)),
    ]
)

In [66]:
classifier