# Core API

1. Estimador (Modelo)

Para construir e ajustar os modelos.

2. Preditor

Para fazer as predições.

3. Transformador

Para converter os dados.

# Princípios

1. Consistência

Interface composta com métodos limitados.

2. Inspeção

Parâmetros públicos

3. Não Proliferação de classes

Utiliza NumPy, Pandas e SciPy para dados. Strings e números para hiperparâmetros.

4. Composição

Agilizar a construção de pipelines

5. Padrões sensatos

Todo objeto tem valores padrão definidos para que o modelo funcione.

# Representação dos Dados

Usam NumPy e SciPy e preferem rodar os algoritmos em batches.

# Estimators

- É uma interface
- 2 em 1
- Ele tem um estado antes e depois do método `.fit()`
- Os hiperparâmetros são públicos
- Os parâmetros são públicos
- Os hiperparâmetros são passados antes do `.fit()`
- A biblioteca sugere hiperparâmetros padrão
- Estimadores diferentes são facilmente trocados

# Preditores

- Observar a performance do modelo
- Quanto maior melhor `.score(X_test, y_test)`
- Observar a predição do modelo em um conjunto novo `.predict(X_test)`

In [2]:
import pandas as pd

In [3]:
from sklearn.datasets import load_iris

In [4]:
df = load_iris()

In [5]:
y = df.target

In [6]:
df = pd.DataFrame(df['data'])

In [7]:
X = df.values

In [8]:
df.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [9]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
# OneVsOne
# OneVsRest

In [11]:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [132]:
# OneVsRest
#1 -> 0 contra 1..9 {True:0 , False:[1,2,3,4,5,6,7,8,9]}
#2 -> 1 contra 2..0 {True:1 , False:[2,3,4,5,6,7,8,9,0]}
#3 -> 2 contra 3..1 {True:2 , False:[3,4,5,6,7,8,9,0,1]}
#4 -> ...
#10 -> 9 contra 0..8 {True:9 , False:[0..8]}

# OneVsOne
#1 -> 0 contra 1 80%
#2 -> 0 contra 2
#3 -> 0 contra 3
...
#9 -> 0 contra 9
#10 -> 1 contra 2
#11 -> 1 contra 3


Ellipsis

In [13]:
(10*(10-1)) / 2

45.0

In [14]:
import itertools

In [15]:
len(list(itertools.combinations(range(10), 2)))

45

In [16]:
df.shape

(150, 4)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [19]:
y_test

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 2, 2, 1, 2])

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
scaler = StandardScaler()

In [22]:
scaler.fit(X_train)

StandardScaler()

In [23]:
(X_train[:, 0] - X_train[:, 0].mean()) / X_train[:, 0].std()

array([-0.13835603,  2.14752625, -0.25866563, -0.8602136 ,  2.26783585,
       -0.01804644, -0.739904  , -0.98052319, -0.8602136 , -0.98052319,
        0.58350153, -1.22114238, -0.98052319, -0.8602136 , -0.25866563,
       -0.8602136 , -0.13835603,  2.26783585, -1.46176157,  0.46319194,
       -0.13835603, -0.37897522,  0.22257275, -0.01804644,  0.22257275,
       -0.49928482,  0.46319194, -0.37897522, -0.49928482, -0.98052319,
        0.70381112, -0.98052319, -0.98052319, -0.37897522,  1.06473991,
       -1.10083279, -0.01804644, -0.98052319, -0.98052319,  0.10226315,
       -0.8602136 ,  1.30535909,  0.22257275,  0.34288234,  2.26783585,
       -0.37897522, -1.70238076, -1.82269035,  0.22257275,  1.66628788,
       -1.46176157, -0.8602136 , -1.70238076,  0.58350153,  0.58350153,
       -1.46176157,  1.1850495 ,  0.58350153, -1.34145197,  0.34288234,
        0.82412072,  0.46319194,  1.42566869,  0.70381112, -0.8602136 ,
        1.30535909,  0.10226315,  0.82412072, -0.13835603, -0.73

In [24]:
(X_test[:, 0] - X_train[:, 0].mean()) / X_train[:, 0].std()

array([ 0.34288234, -0.13835603,  2.26783585,  0.22257275,  1.1850495 ,
       -0.49928482, -0.25866563,  1.30535909,  0.46319194, -0.01804644,
        0.82412072, -1.22114238, -0.37897522, -1.10083279, -0.8602136 ,
        0.58350153,  0.82412072, -0.25866563, -0.13835603,  0.70381112,
       -1.34145197,  0.34288234, -0.98052319,  0.70381112,  2.50845503,
        1.06473991,  1.06473991,  1.1850495 , -1.22114238, -1.22114238,
       -1.46176157, -0.13835603,  1.06473991, -1.22114238, -1.70238076,
        0.58350153,  0.70381112, -0.739904  , -0.98052319, -0.739904  ,
       -0.01804644,  0.22257275,  1.06473991, -0.49928482, -0.49928482,
       -0.37897522,  0.58350153,  0.70381112,  0.94443031,  1.66628788])

In [25]:
scaler.transform(X_test)

array([[ 0.34288234, -0.50470526,  0.51173177, -0.02555604],
       [-0.13835603,  1.88726279, -1.2249145 , -1.23610527],
       [ 2.26783585, -0.98309887,  1.78527236,  1.45400413],
       [ 0.22257275, -0.26550845,  0.39595535,  0.37796037],
       [ 1.1850495 , -0.50470526,  0.56961997,  0.2434549 ],
       [-0.49928482,  0.93047557, -1.34069092, -1.1015998 ],
       [-0.25866563, -0.26550845, -0.12503853,  0.10894943],
       [ 1.30535909,  0.21288516,  0.7432846 ,  1.45400413],
       [ 0.46319194, -1.93988609,  0.39595535,  0.37796037],
       [-0.01804644, -0.74390206,  0.0486261 , -0.02555604],
       [ 0.82412072,  0.45208196,  0.7432846 ,  1.05048772],
       [-1.22114238, -0.02631165, -1.39857913, -1.50511621],
       [-0.37897522,  1.16967238, -1.45646733, -1.37061074],
       [-1.10083279,  0.21288516, -1.34069092, -1.50511621],
       [-0.8602136 ,  1.88726279, -1.34069092, -1.23610527],
       [ 0.58350153,  0.69127877,  0.51173177,  0.51246584],
       [ 0.82412072, -0.

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

In [31]:
lg1 = LogisticRegression(penalty="l1", solver='liblinear')
lg2 = OneVsOneClassifier(LogisticRegression(penalty="l1", solver='liblinear'))

In [32]:
lg1.fit(X_train, y_train)
lg2.fit(X_train, y_train)

OneVsOneClassifier(estimator=LogisticRegression(penalty='l1',
                                                solver='liblinear'))

In [33]:
lg1.score(X_train, y_train)

0.96

In [34]:
lg2.score(X_train, y_train)

0.96

In [35]:
lg1.score(X_test, y_test)

0.98

In [36]:
lg2.score(X_test, y_test)

0.94

In [37]:
type(df)

pandas.core.frame.DataFrame

In [38]:
from sklearn.cluster import KMeans, AgglomerativeClustering

In [39]:
k = KMeans(n_clusters=3, )
#k = AgglomerativeClustering()

In [40]:
k.fit(df)

KMeans(n_clusters=3)

In [41]:
#k.inertia_

In [42]:
k.get_params()

{'algorithm': 'auto',
 'copy_x': True,
 'init': 'k-means++',
 'max_iter': 300,
 'n_clusters': 3,
 'n_init': 10,
 'n_jobs': 'deprecated',
 'precompute_distances': 'deprecated',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

In [43]:
#k.cluster_centers_

In [44]:
k.inertia_

78.851441426146

In [45]:
k.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0], dtype=int32)

In [46]:
k.n_features_in_

4

In [47]:
k.n_iter_

6

In [48]:
from sklearn.base import BaseEstimator
import numpy as np

In [49]:
class AlgoGene(BaseEstimator):
    """Toy estimator illustrating the minimal fit/predict interface.

    Nothing is learned: ``fit`` only reports that training happened and
    ``predict`` answers with ones regardless of the input values.
    """

    def fit(self, X):
        """Pretend to train on ``X`` and return a status message.

        ``X`` is accepted but never inspected.
        """
        status = 'Modelo sendo treinado'
        return status

    def predict(self, X):
        """Return an array of ones shaped like ``X``."""
        ones = np.ones_like(X)
        return ones

In [50]:
a = AlgoGene()

In [51]:
a.fit(np.ones((3,3)))

'Modelo sendo treinado'

In [52]:
a.predict(np.zeros((3,3)))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [53]:
from sklearn.linear_model import LogisticRegression

In [54]:
df = load_iris()

In [55]:
df.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [56]:
X = df['data']
y = df['target']

In [57]:
l = LogisticRegression(max_iter=400)

In [58]:
l.fit(X, y)

LogisticRegression(max_iter=400)

In [59]:
l.score(X, y)

0.9733333333333334

# Pipelines and feature unions

- Pipelines (sequenciais)
- FeatureUnion (paralelo)



In [61]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import SelectKBest

In [62]:
union = FeatureUnion([('pca', PCA()),
                      ('kpca', KernelPCA(kernel='rbf'))
                     ])
#union = FeatureUnion([('str', StandardScaler())])

In [63]:
lg1 = LogisticRegression(penalty="l1", solver='liblinear')
lg2 = OneVsOneClassifier(LogisticRegression(penalty="l1", solver='liblinear'))

In [64]:
X_l_train = union.fit_transform(X_train)

In [65]:
lg1.fit(X_l_train, y_train)
lg2.fit(X_l_train, y_train)

OneVsOneClassifier(estimator=LogisticRegression(penalty='l1',
                                                solver='liblinear'))

In [66]:
lg1.score(union.transform(X_test), y_test), lg2.score(union.transform(X_test), y_test)

(1.0, 1.0)

In [67]:
X_l_train.shape, X_train.shape

((100, 103), (100, 4))

In [68]:
pca = KernelPCA(kernel='rbf')
pca_f = pca.fit_transform(X_train)

In [69]:
pca_f.shape

(100, 99)

In [70]:
# Build the pipeline, fit it on the training split and predict the test split
# in a single chain. NOTE: `pipe` therefore ends up holding the predicted
# labels for X_test, not the fitted Pipeline object.
# Alternative final steps that were tried and left disabled:
#   ('log_reg0', LogisticRegression(penalty='l2'))
#   ('log_reg1', LogisticRegression(penalty="l1", solver='liblinear'))
pipe = (
    Pipeline([
        ('feat_union', union),
        ('feat_sel', SelectKBest(k=10)),
        ('log_reg2', OneVsOneClassifier(
            LogisticRegression(penalty="l1", solver='liblinear'))),
    ])
    .fit(X_train, y_train)
    .predict(X_test)
)

In [71]:
pipe

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 2, 2, 1, 2])

In [72]:
y_test

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 2, 2, 1, 2])

In [73]:
# Print one line per test sample whose predicted label differs from the truth;
# silence means every prediction matched.
for predicted, expected in zip(pipe, y_test):
    if predicted == expected:
        continue
    print('errou')

# Model selection


In [76]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [113]:
# Base model for the search: an SVC wrapped in a one-vs-rest classifier.
model_to_set = OneVsRestClassifier(SVC())

# Two sub-grids, one per kernel. Keys carry the `estimator__` prefix because
# the SVC hyperparameters are exposed through the wrapper under that name.
para_grid = [
    {
        'estimator__kernel': ['linear'],
        'estimator__C': [1, 10, 100, 1000],
    },
    {
        'estimator__kernel': ['rbf'],
        'estimator__C': [1, 10, 100, 1000],
        'estimator__gamma': [0.001, 0.0001],
    },
]

In [114]:
OneVsRestClassifier(SVC()).get_params().keys()

dict_keys(['estimator__C', 'estimator__break_ties', 'estimator__cache_size', 'estimator__class_weight', 'estimator__coef0', 'estimator__decision_function_shape', 'estimator__degree', 'estimator__gamma', 'estimator__kernel', 'estimator__max_iter', 'estimator__probability', 'estimator__random_state', 'estimator__shrinking', 'estimator__tol', 'estimator__verbose', 'estimator', 'n_jobs'])

In [115]:
clf = GridSearchCV(model_to_set, para_grid, scoring='f1_weighted', cv=10, )

In [105]:
clf.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=OneVsRestClassifier(estimator=SVC(kernel='poly')),
             param_grid=[{'estimator__C': [1, 10, 100, 1000],
                          'estimator__kernel': ['linear']},
                         {'estimator__C': [1, 10, 100, 1000],
                          'estimator__gamma': [0.001, 0.0001],
                          'estimator__kernel': ['rbf']}],
             scoring='f1_weighted')

In [106]:
y_pred = clf.predict(X_test)

In [107]:
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 1, 2, 1, 2])

In [108]:
clf.score(X_test, y_test)

0.98

In [116]:
clf.fit(X_l_train, y_train)

GridSearchCV(cv=10, estimator=OneVsRestClassifier(estimator=SVC()),
             param_grid=[{'estimator__C': [1, 10, 100, 1000],
                          'estimator__kernel': ['linear']},
                         {'estimator__C': [1, 10, 100, 1000],
                          'estimator__gamma': [0.001, 0.0001],
                          'estimator__kernel': ['rbf']}],
             scoring='f1_weighted')

In [117]:
clf.score(union.transform(X_test), y_test)

1.0

In [118]:
clf.get_params()

{'cv': 10,
 'error_score': nan,
 'estimator__estimator__C': 1.0,
 'estimator__estimator__break_ties': False,
 'estimator__estimator__cache_size': 200,
 'estimator__estimator__class_weight': None,
 'estimator__estimator__coef0': 0.0,
 'estimator__estimator__decision_function_shape': 'ovr',
 'estimator__estimator__degree': 3,
 'estimator__estimator__gamma': 'scale',
 'estimator__estimator__kernel': 'rbf',
 'estimator__estimator__max_iter': -1,
 'estimator__estimator__probability': False,
 'estimator__estimator__random_state': None,
 'estimator__estimator__shrinking': True,
 'estimator__estimator__tol': 0.001,
 'estimator__estimator__verbose': False,
 'estimator__estimator': SVC(),
 'estimator__n_jobs': None,
 'estimator': OneVsRestClassifier(estimator=SVC()),
 'iid': 'deprecated',
 'n_jobs': None,
 'param_grid': [{'estimator__kernel': ['linear'],
   'estimator__C': [1, 10, 100, 1000]},
  {'estimator__kernel': ['rbf'],
   'estimator__C': [1, 10, 100, 1000],
   'estimator__gamma': [0.001, 

In [119]:
clf.param_grid

[{'estimator__kernel': ['linear'], 'estimator__C': [1, 10, 100, 1000]},
 {'estimator__kernel': ['rbf'],
  'estimator__C': [1, 10, 100, 1000],
  'estimator__gamma': [0.001, 0.0001]}]

In [120]:
clf.scorer_

make_scorer(f1_score, pos_label=None, average=weighted)

In [121]:
clf.scoring

'f1_weighted'

In [122]:
clf.cv_results_

{'mean_fit_time': array([0.00533643, 0.00641501, 0.00476112, 0.00477781, 0.00584886,
        0.00564344, 0.00501397, 0.00574872, 0.00468869, 0.00558846,
        0.00392904, 0.0043047 ]),
 'std_fit_time': array([0.0021574 , 0.00134145, 0.00098314, 0.00109168, 0.00052891,
        0.00060621, 0.00026003, 0.00053411, 0.00064859, 0.00096818,
        0.00028929, 0.00029167]),
 'mean_score_time': array([0.00146077, 0.00205526, 0.00129755, 0.00133359, 0.00165703,
        0.0014812 , 0.00142438, 0.00149271, 0.00145998, 0.00138338,
        0.00130746, 0.00133953]),
 'std_score_time': array([0.0002941 , 0.00063428, 0.0001391 , 0.00019546, 0.00027696,
        0.00010204, 0.00013331, 0.00020111, 0.0002601 , 0.00013653,
        0.00016768, 0.00010033]),
 'param_estimator__C': masked_array(data=[1, 10, 100, 1000, 1, 1, 10, 10, 100, 100, 1000, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
    

In [123]:
clf.cv_results_['params']

[{'estimator__C': 1, 'estimator__kernel': 'linear'},
 {'estimator__C': 10, 'estimator__kernel': 'linear'},
 {'estimator__C': 100, 'estimator__kernel': 'linear'},
 {'estimator__C': 1000, 'estimator__kernel': 'linear'},
 {'estimator__C': 1, 'estimator__gamma': 0.001, 'estimator__kernel': 'rbf'},
 {'estimator__C': 1, 'estimator__gamma': 0.0001, 'estimator__kernel': 'rbf'},
 {'estimator__C': 10, 'estimator__gamma': 0.001, 'estimator__kernel': 'rbf'},
 {'estimator__C': 10, 'estimator__gamma': 0.0001, 'estimator__kernel': 'rbf'},
 {'estimator__C': 100, 'estimator__gamma': 0.001, 'estimator__kernel': 'rbf'},
 {'estimator__C': 100, 'estimator__gamma': 0.0001, 'estimator__kernel': 'rbf'},
 {'estimator__C': 1000, 'estimator__gamma': 0.001, 'estimator__kernel': 'rbf'},
 {'estimator__C': 1000,
  'estimator__gamma': 0.0001,
  'estimator__kernel': 'rbf'}]

In [124]:
clf.cv_results_['mean_test_score']

array([0.96872222, 0.94555556, 0.96872222, 0.95843651, 0.52239683,
       0.52239683, 0.79563492, 0.52239683, 0.94526984, 0.79563492,
       0.94555556, 0.94526984])

In [125]:
# Pair every evaluated parameter combination with its mean cross-validated
# test score and print them side by side.
scored_params = zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score'])
for param, score in scored_params:
    print(f"{param}, acertou={score}")

{'estimator__C': 1, 'estimator__kernel': 'linear'}, acertou=0.9687222222222222
{'estimator__C': 10, 'estimator__kernel': 'linear'}, acertou=0.9455555555555556
{'estimator__C': 100, 'estimator__kernel': 'linear'}, acertou=0.9687222222222222
{'estimator__C': 1000, 'estimator__kernel': 'linear'}, acertou=0.9584365079365078
{'estimator__C': 1, 'estimator__gamma': 0.001, 'estimator__kernel': 'rbf'}, acertou=0.5223968253968254
{'estimator__C': 1, 'estimator__gamma': 0.0001, 'estimator__kernel': 'rbf'}, acertou=0.5223968253968254
{'estimator__C': 10, 'estimator__gamma': 0.001, 'estimator__kernel': 'rbf'}, acertou=0.7956349206349206
{'estimator__C': 10, 'estimator__gamma': 0.0001, 'estimator__kernel': 'rbf'}, acertou=0.5223968253968254
{'estimator__C': 100, 'estimator__gamma': 0.001, 'estimator__kernel': 'rbf'}, acertou=0.9452698412698413
{'estimator__C': 100, 'estimator__gamma': 0.0001, 'estimator__kernel': 'rbf'}, acertou=0.7956349206349206
{'estimator__C': 1000, 'estimator__gamma': 0.001, '

In [126]:
clf.best_estimator_

OneVsRestClassifier(estimator=SVC(C=1, kernel='linear'))

In [129]:
# Keep the summary statistics, the first three per-fold scores and the params
# dict, ordered from best to worst mean test score.
_df = (
    pd.DataFrame(clf.cv_results_)
    .loc[:, ["mean_test_score", 'split0_test_score', 'split1_test_score', 'split2_test_score', 'std_test_score', 'params']]
    .sort_values(by='mean_test_score', ascending=False)
)

# Replace the params dict column with one column per hyperparameter, placed
# to the right of the score columns.
_df = pd.concat(
    [
        _df[['mean_test_score', 'std_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']],
        _df['params'].apply(pd.Series),
    ],
    axis=1,
)
_df.head(10)

Unnamed: 0,mean_test_score,std_test_score,split0_test_score,split1_test_score,split2_test_score,estimator__C,estimator__kernel,estimator__gamma
0,0.968722,0.06673,1.0,0.895556,1.0,1,linear,
2,0.968722,0.06673,1.0,0.895556,1.0,100,linear,
3,0.958437,0.069004,1.0,0.895556,1.0,1000,linear,
1,0.945556,0.103376,1.0,0.895556,1.0,10,linear,
10,0.945556,0.103376,1.0,0.895556,1.0,1000,rbf,0.001
8,0.94527,0.103506,1.0,0.895556,1.0,100,rbf,0.001
11,0.94527,0.103506,1.0,0.895556,1.0,1000,rbf,0.0001
6,0.795635,0.136353,0.9,0.9,0.780952,10,rbf,0.001
9,0.795635,0.136353,0.9,0.9,0.780952,100,rbf,0.0001
4,0.522397,0.056679,0.457143,0.48,0.457143,1,rbf,0.001
