# Core API

1. Estimador (Modelo)

Para construir e ajustar os modelos.

2. Preditor

Para fazer as predições.

3. Transformador

Para converter os dados.

# Princípios

1. Consistência

Interface composta com métodos limitados.

2. Inspeção

Parâmetros públicos

3. Não Proliferação de classes

Utiliza NumPy, Pandas e SciPy para dados. Strings e números para hiperparâmetros.

4. Composição

Agilizar a construção de pipelines

5. Padrões sensatos

Todo objeto tem definido padrões iniciais para que o modelo funcione.

# Representação dos Dados

Usam numpy e scipy e preferem rodar os algoritmos em batches.

# Estimators

- É uma interface
- 2 em 1
- Ele tem um estado antes e depois do método `.fit()`
- Os hiperparâmetros são públicos
- Os parâmetros são públicos
- Os hiperparâmetros são passados antes do `.fit()`
- A biblioteca sugere hiperparâmetros padrão
- Estimadores diferentes são facilmente trocados

# Preditores

- Observar a performance do modelo
- Quanto maior melhor `.score(X_test, y_test)`
- Observar a predição do modelo em um conjunto novo `.predict(X_test)`

In [1]:
import pandas as pd

In [2]:
from sklearn.datasets import load_iris

In [14]:
df = load_iris()

In [15]:
y = df.target

In [16]:
df = pd.DataFrame(df['data'])

In [17]:
X = df.values

In [18]:
df.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [20]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# OneVsOne
# OneVsRest

In [None]:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [None]:
# OneVsRest
#1 -> 0 contra 1..9 {True:0 , False:[1,2,3,4,5,6,7,8,9]}
#2 -> 1 contra 2..0 {True:1 , False:[2,3,4,5,6,7,8,9,0]}
#3 -> 2 contra 3..1 {True:2 , False:[3,4,5,6,7,8,9,0,1]}
#4 -> ...
#9 -> 9 contra 0..8 {True:9 , False:[0..8]}

# OneVsOne
#1 -> 0 contra 1 80%
#2 -> 0 contra 2
#3 -> 0 contra 3
...
#9 -> 0 contra 9
#10 -> 1 contra 2
#11 -> 1 contra 3


In [59]:
(10*(10-1)) / 2

45.0

In [60]:
import itertools

In [64]:
len(list(itertools.combinations(range(10), 2)))

45

In [19]:
df.shape

(150, 4)

In [9]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [23]:
y_test

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 2, 2, 1, 2])

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
scaler = StandardScaler()

In [24]:
scaler.fit(X_train)

StandardScaler()

In [31]:
(X_train[:, 0] - X_train[:, 0].mean()) / X_train[:, 0].std()

array([-0.13835603,  2.14752625, -0.25866563, -0.8602136 ,  2.26783585,
       -0.01804644, -0.739904  , -0.98052319, -0.8602136 , -0.98052319,
        0.58350153, -1.22114238, -0.98052319, -0.8602136 , -0.25866563,
       -0.8602136 , -0.13835603,  2.26783585, -1.46176157,  0.46319194,
       -0.13835603, -0.37897522,  0.22257275, -0.01804644,  0.22257275,
       -0.49928482,  0.46319194, -0.37897522, -0.49928482, -0.98052319,
        0.70381112, -0.98052319, -0.98052319, -0.37897522,  1.06473991,
       -1.10083279, -0.01804644, -0.98052319, -0.98052319,  0.10226315,
       -0.8602136 ,  1.30535909,  0.22257275,  0.34288234,  2.26783585,
       -0.37897522, -1.70238076, -1.82269035,  0.22257275,  1.66628788,
       -1.46176157, -0.8602136 , -1.70238076,  0.58350153,  0.58350153,
       -1.46176157,  1.1850495 ,  0.58350153, -1.34145197,  0.34288234,
        0.82412072,  0.46319194,  1.42566869,  0.70381112, -0.8602136 ,
        1.30535909,  0.10226315,  0.82412072, -0.13835603, -0.73

In [34]:
(X_test[:, 0] - X_train[:, 0].mean()) / X_train[:, 0].std()

array([ 0.34288234, -0.13835603,  2.26783585,  0.22257275,  1.1850495 ,
       -0.49928482, -0.25866563,  1.30535909,  0.46319194, -0.01804644,
        0.82412072, -1.22114238, -0.37897522, -1.10083279, -0.8602136 ,
        0.58350153,  0.82412072, -0.25866563, -0.13835603,  0.70381112,
       -1.34145197,  0.34288234, -0.98052319,  0.70381112,  2.50845503,
        1.06473991,  1.06473991,  1.1850495 , -1.22114238, -1.22114238,
       -1.46176157, -0.13835603,  1.06473991, -1.22114238, -1.70238076,
        0.58350153,  0.70381112, -0.739904  , -0.98052319, -0.739904  ,
       -0.01804644,  0.22257275,  1.06473991, -0.49928482, -0.49928482,
       -0.37897522,  0.58350153,  0.70381112,  0.94443031,  1.66628788])

In [33]:
scaler.transform(X_test)

array([[ 0.34288234, -0.50470526,  0.51173177, -0.02555604],
       [-0.13835603,  1.88726279, -1.2249145 , -1.23610527],
       [ 2.26783585, -0.98309887,  1.78527236,  1.45400413],
       [ 0.22257275, -0.26550845,  0.39595535,  0.37796037],
       [ 1.1850495 , -0.50470526,  0.56961997,  0.2434549 ],
       [-0.49928482,  0.93047557, -1.34069092, -1.1015998 ],
       [-0.25866563, -0.26550845, -0.12503853,  0.10894943],
       [ 1.30535909,  0.21288516,  0.7432846 ,  1.45400413],
       [ 0.46319194, -1.93988609,  0.39595535,  0.37796037],
       [-0.01804644, -0.74390206,  0.0486261 , -0.02555604],
       [ 0.82412072,  0.45208196,  0.7432846 ,  1.05048772],
       [-1.22114238, -0.02631165, -1.39857913, -1.50511621],
       [-0.37897522,  1.16967238, -1.45646733, -1.37061074],
       [-1.10083279,  0.21288516, -1.34069092, -1.50511621],
       [-0.8602136 ,  1.88726279, -1.34069092, -1.23610527],
       [ 0.58350153,  0.69127877,  0.51173177,  0.51246584],
       [ 0.82412072, -0.

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn . multiclass import OneVsOneClassifier

In [53]:
lg1 = LogisticRegression(penalty="l1", solver='liblinear')
lg2 = OneVsOneClassifier(LogisticRegression(penalty="l1", solver='liblinear'))

In [54]:
lg1.fit(X_train, y_train)
lg2.fit(X_train, y_train)

OneVsOneClassifier(estimator=LogisticRegression(penalty='l1',
                                                solver='liblinear'))

In [57]:
lg1.score(X_train, y_train)

0.96

In [58]:
lg2.score(X_train, y_train)

0.96

In [55]:
lg1.score(X_test, y_test)

0.98

In [56]:
lg2.score(X_test, y_test)

0.94

In [10]:
type(df)

pandas.core.frame.DataFrame

In [32]:
from sklearn.cluster import KMeans, AgglomerativeClustering

In [33]:
#k = KMeans(n_clusters=3, )
k = AgglomerativeClustering()

In [34]:
k.fit(df)

AgglomerativeClustering()

In [17]:
k.inertia_

78.851441426146

In [19]:
k.get_params()

{'algorithm': 'auto',
 'copy_x': True,
 'init': 'k-means++',
 'max_iter': 300,
 'n_clusters': 3,
 'n_init': 10,
 'n_jobs': 'deprecated',
 'precompute_distances': 'deprecated',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

In [20]:
k.cluster_centers_

array([[6.85      , 3.07368421, 5.74210526, 2.07105263],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [5.9016129 , 2.7483871 , 4.39354839, 1.43387097]])

In [21]:
k.inertia_

78.851441426146

In [35]:
k.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [36]:
# Leaf count of the fitted hierarchical clustering tree; the output below
# shows 150, matching the 150 rows reported by df.shape earlier.
k.n_leaves_

150

In [23]:
k.n_features_in_

4

In [24]:
k.n_iter_

5

In [26]:
from sklearn.base import BaseEstimator
import numpy as np

In [27]:
class AlgoGene(BaseEstimator):
    """Toy estimator illustrating the minimal ``fit``/``predict`` interface.

    Nothing is learned: ``fit`` only hands back a status message and
    ``predict`` answers with ones regardless of the input values.
    """

    def fit(self, X):
        """Return a fixed status message; ``X`` is ignored and no state is stored."""
        status = 'Modelo sendo treinado'
        return status

    def predict(self, X):
        """Return an array of ones with the same shape and dtype as ``X``."""
        predictions = np.ones_like(X)
        return predictions

In [28]:
a = AlgoGene()

In [29]:
a.fit(np.ones((3,3)))

'Modelo sendo treinado'

In [31]:
a.predict(np.zeros((3,3)))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [42]:
from sklearn.linear_model import LogisticRegression

In [38]:
df = load_iris()

In [40]:
df.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [41]:
X = df['data']
y = df['target']

In [45]:
l = LogisticRegression(max_iter=400)

In [46]:
l.fit(X, y)

LogisticRegression(max_iter=400)

In [48]:
l.score(X, y)

0.9733333333333334