# **Minerando Dados** - [www.minerandodados.com.br](http://www.minerandodados.com.br)

_Aprenda Data Science e Alavanque sua Carreira_

**Autor: Rodrigo Santana**

**e-mail: contato@minerandodados.com.br**


# Artigo FeatureUnion

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


In [None]:
# Load the dataset from the comma-separated file 'data.csv'.
dataset = pd.read_csv('data.csv', sep=',')


__Separa dados de treino e classes__

In [None]:
# Split the labels from the features: `pop` returns the 'target' column
# and removes it from `dataset` in place.
classes = dataset.pop('target')

__Pré-processamento de dados__

In [None]:
def remove_features(lista_features):
    """Drop the given columns from the module-level ``dataset`` in place.

    Args:
        lista_features: iterable of column labels; each one is dropped
            with its own ``drop`` call, in iteration order.

    Returns:
        Always ``0``; the useful effect is the mutation of ``dataset``.
    """
    for coluna in lista_features:
        dataset.drop(columns=coluna, inplace=True)
    return 0

In [None]:
# Drop the 'id' and 'song_title' columns.
remove_features(['id','song_title'])

__Aplica o LabelEncoder na coluna 'artist'__

In [None]:
# Label-encode the 'artist' column.

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
inteiros = enc.fit_transform(dataset['artist'])

# Store the encoded values in a new column named 'artist_inteiros',
# then drop the original 'artist' column.

dataset['artist_inteiros'] = inteiros
remove_features(['artist'])

0

__Visualizando os dados__

In [None]:
# Preview the first rows of the prepared dataset.
dataset.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,artist_inteiros
0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286,449
1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588,222
2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173,449
3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23,95
4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904,628


In [None]:
# Row-wise mean (taken across the columns) of the first 10 rows.
dataset.head(10).mean(axis=1)

0    14657.103436
1    23379.485251
2    13302.342974
3    14256.537593
4    28121.121386
5    17984.451628
6    17273.765893
7    25048.888093
8    14522.720793
9    16247.345506
dtype: float64

In [None]:
# Row-wise standard deviation (taken across the columns) of the first 10 rows.
dataset.head(10).std(axis=1)

0     54669.464606
1     87368.739744
2     49621.617927
3     53291.706129
4    104988.706256
5     67162.400181
6     64508.038528
7     93431.806459
8     54205.256516
9     60612.891926
dtype: float64

In [None]:
# Number of columns in the dataset.
len(dataset.columns)

14

__Classe Stats__

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class Stats(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends row-wise statistics as columns.

    ``transform`` returns the input columns plus two extra ones, ``mean``
    and ``std``, holding each row's mean and standard deviation computed
    across the input columns.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, df, y=None):
        """Learn nothing and return ``self``.

        Args:
            df: input data (ignored).
            y: ignored; accepted for scikit-learn API compatibility.
        """
        return self

    def transform(self, df, y=None):
        """Return ``df`` with ``mean`` and ``std`` columns appended.

        Args:
            df: a pandas DataFrame, or any 2-D data accepted by the
                ``pd.DataFrame`` constructor (e.g. a NumPy array).
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            A new DataFrame; the input object is not modified.
        """
        # Other scikit-learn steps hand over plain arrays, which have no
        # `assign`; wrap them so the transformer works in any position.
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame(df)
        # Both statistics are evaluated on the input columns before `assign`
        # adds anything, so `std` does not include the new `mean` column.
        return df.assign(mean=df.mean(axis=1), std=df.std(axis=1))

__Aplica FeatureUnion__

In [None]:
# Import PCA and the FeatureUnion helper.
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion

# PCA configured to keep a single component.
pca = PCA(n_components=1)

# Combine the PCA output and the Stats columns into one transformer.
features = FeatureUnion([('pca', pca), ('stats', Stats())])

# Fit the combined transformer, then replace `dataset` with its output.
# NOTE(review): `dataset` is rebound to the transformed data here, so the
# pipeline evaluated later (which has its own Stats and PCA steps) receives
# these already-augmented columns -- confirm applying them twice is intended.
features.fit(dataset, classes)
dataset = features.transform(dataset)

In [None]:
# Shape of the transformed data as (rows, columns).
dataset.shape

(2017, 17)

In [None]:
# Wrap the transformed data in a DataFrame for easier inspection.
dataset = pd.DataFrame(dataset)
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-41706.163219,0.0102,0.833,204600.0,0.434,0.0219,2.0,0.165,-8.795,1.0,0.431,150.062,4.0,0.286,449.0,14657.103436,54669.464606
1,80626.870843,0.199,0.743,326933.0,0.359,0.00611,1.0,0.137,-10.401,1.0,0.0794,160.083,4.0,0.588,222.0,23379.485251,87368.739744
2,-60599.161129,0.0344,0.838,185707.0,0.412,0.000234,2.0,0.159,-7.148,1.0,0.289,75.044,4.0,0.173,449.0,13302.342974,49621.617927
3,-46893.105551,0.604,0.494,199413.0,0.338,0.51,5.0,0.0922,-15.236,1.0,0.0261,86.468,4.0,0.23,95.0,14256.537593,53291.706129
4,146586.805519,0.18,0.678,392893.0,0.561,0.512,5.0,0.439,-11.648,0.0,0.0694,174.004,4.0,0.904,628.0,28121.121386,104988.706256


__Cria um Pipeline para automatizar todos os passos__

In [None]:
# Fresh single-component PCA instance for the pipeline.
pca = PCA(n_components=1)

# Chain feature extraction, scaling and classification into one estimator.
pipeline = Pipeline(steps=[
    # Concatenate the Stats columns with the PCA component.
    ('Features', FeatureUnion(transformer_list=[
        ('Stats', Stats()),
        ('PCA', pca),
    ])),
    # Scale the combined features.
    ('Scaler', StandardScaler()),
    # Classify with a support vector machine.
    ('SVM', svm.SVC()),
])

In [None]:
# Number of columns in the dataset.
len(dataset.columns)

17

In [None]:
# Steps that make up the pipeline.
pipeline.steps

[('Features', FeatureUnion(n_jobs=1,
         transformer_list=[('Stats', Stats()), ('PCA', PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False))],
         transformer_weights=None)),
 ('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('SVM', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [None]:
# Cross-validated accuracy helper.
def Acuracia(clf, X, y, cv=5):
    """Return the accuracy of ``clf``'s cross-validated predictions.

    Args:
        clf: estimator passed to ``cross_val_predict``.
        X: feature data.
        y: true labels, aligned with ``X``.
        cv: cross-validation setting forwarded to ``cross_val_predict``.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        The ``metrics.accuracy_score`` of ``y`` against the predictions.
    """
    resultados = cross_val_predict(clf, X, y, cv=cv)
    return metrics.accuracy_score(y, resultados)

In [None]:
# Evaluate the pipeline with the 'Acuracia' helper.
Acuracia(pipeline,dataset,classes)

0.6668319286068418