In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Projeto: Classificação de Notícias Curtas em Português utilizando *Machine Learning* (um subprojeto do Projeto Luppar News-Rec)
- 1 - Definição do Problema
- 2 - Preparação dos Dados e *Embeddings*
- 3 - Criação dos Modelos (*Pipelines*)
- 4 - *Deploy* em Produção



## 1. Definição do Problema
Classificação de Notícias Curtas em Português (*uma parte do Projeto Luppar Recommender*, maiores informações em [Luppar News-Rec](https://pessoalex.wordpress.com/2019/11/24/luppar-news-rec-recomendador-inteligente-de-noticias/))

## 2. Preparação dos Dados e *Embeddings*

Importando as Bibliotecas necessárias

In [None]:
from time import time
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import gensim
import pickle
from gensim.models.word2vec import Word2Vec
from gensim.models import FastText
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn import metrics
from sklearn.preprocessing import label_binarize
#from sklearn.utils.fixes import signature
from sklearn.ensemble import RandomForestClassifier

### Criando as Classes Personalizadas

Classes *Embeddings* Médio
- Calcula a média dos vetores de cada uma das palavras do documento - para cada um dos documentos

In [None]:
class E2V_AVG(object):
    def __init__(self, word2vec):
        self.w2v = word2vec
        self.dimensao = 300
    
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.w2v[word] for word in words if word in self.w2v] or [np.zeros(self.dimensao)], axis=0)
            for words in X
        ])

Classe da Abordagem Proposta - **E2V-IDF**

`Essa abordagem representa um documento pela média dos vetores dos seus termos, ponderando cada vetor de termo pelo IDF (Inverso da Frequência nos Documentos) do termo. A intuição por trás desta proposta é que um termo, apresente poder discriminatório diferente dependendo do número de documentos em que esse termo esteja presente, ou seja, o peso dos termos que ocorrem com mais frequência em documentos da coleção tendem a diminuir, e aumentar caso os termos ocorram mais raramente em documentos da coleção (SOUZA, 2019).`

In [None]:
# Referência (SOUZA, 2019)
class E2V_IDF(object):
    def __init__(self, word2vec):
        self.w2v = word2vec
        self.wIDF = None # IDF da palavra na colecao
        self.dimensao = 300
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        maximo_idf = max(tfidf.idf_) # Uma palavra que nunca foi vista (rara) então o IDF padrão é o máximo de idfs conhecidos (exemplo: 9.2525763918954524)
        self.wIDF = defaultdict(
            lambda: maximo_idf, 
            [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()])
        return self
    
    # Gera um vetor de 300 dimensões, para cada documento, com a média dos vetores (embeddings) dos termos * IDF, contidos no documento.
    def transform(self, X):
        return np.array([
                np.mean([self.w2v[word] * self.wIDF[word] for word in words if word in self.w2v] or [np.zeros(self.dimensao)], axis=0)
                for words in X
            ])

### Carregando a Fonte de Dados (z6News)
Nóticias curtas colhidas do site G1 Notícias

Tópicos
- esporteNews
- politicaNews
- tecnologiaNews
- financaPessoal
- educacaonews
- ciencianaturezasaudenews


In [None]:
# Arquivo com nóticias curtas em Português do site G1
X = pickle.load(open('/content/drive/My Drive/0. Business/2. Consultoria em Dados/2. IA, ML/NLP/data/z6News_X.ipy', 'rb'))
# Arquivo com o rótulos das notícias
y = pickle.load(open('/content/drive/My Drive/0. Business/2. Consultoria em Dados/2. IA, ML/NLP/data/z6News_y.ipy', 'rb'))

# Essa fonte de dados é própria e esta disponível aqui no GitHub na Pasta: data
# - Podem utilizar, bastando referenciar o autor: SOUZA, 2019 (descrito na seção Referências)

In [None]:
# Tranformando em Array
X, y = np.array(X), np.array(y)

In [None]:
print ("Total de Notícias - G1: %s" % len(y))

Total de Notícias - G1: 34327


### Treinando os *Embeddings* com a Coleção

Word2Vec - [GENSIM](https://radimrehurek.com/gensim/models/word2vec.html)

Parâmetros
- sg=1 -- Skip Gram


In [None]:
model = Word2Vec(X, size=300, window=5, sg=1, workers=4)
w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.vectors)}

In [None]:
# Verificando tamanho do Vetor do W2V
len(w2v)

7398

In [None]:
# Consultando o vetor embedding de uma das palavras
w2v['internaco']

array([ 0.0827674 ,  0.08832473, -0.01311889,  0.02288111,  0.08373141,
        0.02018465, -0.00747525, -0.2001954 , -0.00445932, -0.02290371,
       -0.10552743,  0.05140657, -0.04853147, -0.11712656, -0.01261191,
       -0.05801427,  0.0759929 , -0.05284167,  0.04576398, -0.00043701,
        0.05200208,  0.05424974,  0.07770283,  0.14550638,  0.01520923,
        0.08429807,  0.07875729, -0.21486288,  0.11415743, -0.20992391,
       -0.0685881 ,  0.03464196,  0.06639262,  0.0711642 , -0.02454626,
        0.08453867, -0.19376495, -0.11627585, -0.09920968, -0.08798337,
       -0.04663217, -0.00564719, -0.07723233,  0.05682064, -0.0043568 ,
        0.04893576, -0.08547731, -0.0648665 ,  0.04425338, -0.03170114,
        0.06667162, -0.02683596,  0.01788042,  0.14344227, -0.03635204,
        0.14050098,  0.04153308, -0.003143  , -0.04599666,  0.10887373,
        0.00666679,  0.1045256 , -0.1121958 ,  0.23045965,  0.04365543,
       -0.02381746,  0.05996049,  0.17308174, -0.03465504,  0.03

FastText - [GENSIM](https://radimrehurek.com/gensim/models/fasttext.html)

Parâmetros
- sg=1 -- Skip Gram

In [None]:
model_ft = FastText(X, size=300, window=5, sg=1, workers=4)
ft  = {w: vec for w, vec in zip(model_ft.wv.index2word, model_ft.wv.vectors)}

In [None]:
# Verificando tamanho do Vetor do FT
len(ft)

7398

In [None]:
# Consultando o vetor embedding de uma das palavras
ft['internaco']

array([ 0.09549961,  0.08079952, -0.05853191,  0.10622388,  0.26360252,
       -0.04130889, -0.0367393 , -0.00132439,  0.00984141, -0.12110252,
        0.05304415, -0.13210458, -0.00318064, -0.04753127, -0.1026172 ,
       -0.00143091,  0.09791364, -0.01845025,  0.07289398,  0.00916332,
       -0.00217504,  0.12655093,  0.12437317, -0.0367079 , -0.05496674,
        0.05962734,  0.04754778,  0.16907352, -0.13044724, -0.04237279,
        0.14104742, -0.14993742,  0.08066963,  0.01977439, -0.17184722,
       -0.0761484 , -0.04307758, -0.3444477 ,  0.01158098, -0.04213077,
        0.21088813,  0.23800279,  0.06659839, -0.0394408 , -0.07801507,
       -0.06512019,  0.11480021,  0.04270018, -0.16348153,  0.24273798,
        0.13051596, -0.00213754,  0.19108306, -0.17877099,  0.10638157,
        0.16212033, -0.1783929 , -0.13039097,  0.01150367, -0.13874988,
        0.05618127,  0.12809561,  0.06574134, -0.12794922, -0.08536585,
        0.0442215 , -0.10086555, -0.01372329, -0.04248967, -0.05

## 3. Criação dos Modelos (*Pipelines*)

#### Classificadores
- SVM + RBF (Support Vector Machine + Radial Basis Function)
- KNN - K-Nearest Neighbors
- Decision Tree
- Random Forest (teste)

#### Representações de Documentos Tradicionais
- BoW

In [None]:
svm_rbf_bow   = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("svm rbf bow"  , OneVsRestClassifier(SVC(kernel="rbf", gamma=0.01, C=1.0)))])

In [None]:
knn_bow   = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("knn bow"  , OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5, p=2)))])

In [None]:
dt_bow   = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("dt bow"  , OneVsRestClassifier(tree.DecisionTreeClassifier(min_samples_split=40), n_jobs=-1))])

In [None]:
rf_bow   = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("rf bow"  , OneVsRestClassifier(RandomForestClassifier(min_samples_split=40, n_estimators=10, n_jobs=-1), n_jobs=-1))])

- TF-IDF

In [None]:
svm_rbf_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("svm rbf tfidf", OneVsRestClassifier(SVC(kernel="rbf", gamma=0.01, C=1.0)))])

In [None]:
knn_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("knn tfidf", OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5, p=2)))])

In [None]:
dt_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("dt tfidf", OneVsRestClassifier(tree.DecisionTreeClassifier(min_samples_split=40), n_jobs=-1))])

In [None]:
rf_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("rf tfidf", OneVsRestClassifier(RandomForestClassifier(min_samples_split=40, n_estimators=10, n_jobs=-1), n_jobs=-1))])

#### Representações de Documentos *Embeddings*
- Word2Vec
 - Vetor médio (padrão)

In [1]:
svm_rbf_w2v  = Pipeline([("w2v", E2V_AVG(w2v))    , ("svm rbf w2v",     OneVsRestClassifier(SVC(kernel="rbf", gamma=0.01, C=1.0), n_jobs=-1))])

NameError: ignored

In [2]:
knn_w2v      = Pipeline([("w2v", E2V_AVG(w2v))    , ("knn w2v",     OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5, p=2)))])

NameError: ignored

In [None]:
dt_w2v       = Pipeline([("w2v", E2V_AVG(w2v))    , ("dt w2v",     OneVsRestClassifier(tree.DecisionTreeClassifier(min_samples_split=40), n_jobs=-1))])

In [None]:
rf_w2v       = Pipeline([("w2v", E2V_AVG(w2v))    , ("rf w2v",     OneVsRestClassifier(RandomForestClassifier(min_samples_split=40, n_estimators=10, n_jobs=-1), n_jobs=-1))])

- Word2Vec
 - Abordagem Proposta **E2V-IDF**

In [None]:
svm_rbf_w2v_idf = Pipeline([("w2v-idf", E2V_IDF(w2v)), ("svm rbf w2v-idf", OneVsRestClassifier(SVC(kernel="rbf", gamma=0.01, C=1.0), n_jobs=-1))])

In [None]:
knn_w2v_idf     = Pipeline([("w2v-idf", E2V_IDF(w2v)), ("knn w2v-idf", OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5, p=2)))])

In [None]:
dt_w2v_idf   = Pipeline([("w2v-idf", E2V_IDF(w2v)), ("dt w2v-idf", OneVsRestClassifier(tree.DecisionTreeClassifier(min_samples_split=40), n_jobs=-1))])

In [None]:
rf_w2v_idf   = Pipeline([("w2v-idf", E2V_IDF(w2v)), ("rf w2v-idf", OneVsRestClassifier(RandomForestClassifier(min_samples_split=40, n_estimators=10, n_jobs=-1), n_jobs=-1))])

- FastText
 - Vetor médio (padrão)

In [None]:
svm_rbf_ft  = Pipeline([("ft", E2V_AVG(ft))    , ("svm rbf ft",     OneVsRestClassifier(SVC(kernel="rbf", gamma=0.01, C=1.0), n_jobs=-1))])

In [None]:
knn_ft      = Pipeline([("ft", E2V_AVG(ft))    , ("knn ft",     OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5, p=2)))])

In [None]:
dt_ft       = Pipeline([("ft", E2V_AVG(ft))    , ("dt ft",     OneVsRestClassifier(tree.DecisionTreeClassifier(min_samples_split=40), n_jobs=-1))])

In [None]:
rf_ft       = Pipeline([("ft", E2V_AVG(ft))    , ("rf ft",     OneVsRestClassifier(RandomForestClassifier(min_samples_split=40, n_estimators=10, n_jobs=-1), n_jobs=-1))])

- FastText
 - Abordagem Proposta **E2V-IDF**

In [None]:
svm_rbf_ft_idf = Pipeline([("ft-idf", E2V_IDF(ft)), ("svm rbf ft-idf", OneVsRestClassifier(SVC(kernel="rbf", gamma=0.01, C=1.0), n_jobs=-1))])

In [None]:
knn_ft_idf     = Pipeline([("ft-idf", E2V_IDF(ft)), ("knn ft-idf", OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5, p=2)))])

In [None]:
dt_ft_idf   = Pipeline([("ft-idf", E2V_IDF(ft)), ("dt ft-idf", OneVsRestClassifier(tree.DecisionTreeClassifier(min_samples_split=40), n_jobs=-1))])

In [None]:
rf_ft_idf   = Pipeline([("ft-idf", E2V_IDF(ft)), ("rf ft-idf", OneVsRestClassifier(RandomForestClassifier(min_samples_split=40, n_estimators=10, n_jobs=-1), n_jobs=-1))])

#### Agrupando os Modelos por Classificador
- SVM



In [None]:
all_models_svm = [
    ("SVM(RBF)+BoW", svm_rbf_bow),
    ("SVM(RBF)+TFIDF", svm_rbf_tfidf),
    ("SVM(RBF)+W2V", svm_rbf_w2v),
    ("SVM(RBF)+W2V-IDF", svm_rbf_w2v_idf),
    ("SVM(RBF)+FT", svm_rbf_ft),
    ("SVM(RBF)+FT-IDF", svm_rbf_ft_idf)
]

In [None]:
# Visualizando
all_models_svm

[('SVM(RBF)+BoW', Pipeline(memory=None,
           steps=[('count_vectorizer',
                   CountVectorizer(analyzer=<function <lambda> at 0x7f1571829598>,
                                   binary=False, decode_error='strict',
                                   dtype=<class 'numpy.int64'>, encoding='utf-8',
                                   input='content', lowercase=True, max_df=1.0,
                                   max_features=None, min_df=1,
                                   ngram_range=(1, 1), preprocessor=None,
                                   stop_words=None, strip_accents=None,
                                   token_pattern='(?u)\\b\\w\\w+\\b',
                                   tokenizer=None, vocabulary=None)),
                  ('svm rbf bow',
                   OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False,
                                                     cache_size=200,
                                                     class_weight=None, c

- KNN

In [None]:
all_models_knn = [
    ("KNN+BoW", knn_bow),
    ("KNN+TFIDF", knn_tfidf),
    ("KNN+W2V", knn_w2v),
    ("KNN+W2V-IDF", knn_w2v_idf),
    ("KNN+FT", knn_ft),
    ("KNN+FT-IDF", knn_ft_idf)
]

- *Decision Tree* (DT)

In [None]:
all_models_dt = [
    ("DT+BoW", dt_bow),
    ("DT+TFIDF", dt_tfidf),
    ("DT+W2V", dt_w2v),
    ("DT+W2V-IDF", dt_w2v_idf),
    ("DT+FT", dt_ft),
    ("DT+FT-IDF", dt_ft_idf)
]

- *Random Forest* (RF)

In [None]:
all_models_rf = [
    ("RF+BoW", rf_bow),
    ("RF+TFIDF", rf_tfidf),
    ("RF+W2V", rf_w2v),
    ("RF+W2V-IDF", rf_w2v_idf),
    ("RF+TF", rf_ft),
    ("RF+TF-IDF", rf_ft_idf)
]

#### Treinamento dos Modelos

- Usando as métricas *F1-Score* e Acurácia
- Average = micro
- Cross-Validation = 10


In [None]:
# Criando a função para a métrica F1-Score
from sklearn.model_selection import KFold
def benchmark_new_f1(model, X, y):
	scores = []
	kf = KFold(n_splits=10, random_state=66, shuffle=True)
	kf.get_n_splits(X, y)
	for train, test in kf.split(X, y):
		X_train, X_test = X[train], X[test]
		y_train, y_test = y[train], y[test]
		scores.append(f1_score(model.fit(X_train, y_train).predict(X_test), y_test, average = 'micro'))
		print (pd.DataFrame(scores)) # Guardar dados das 10 rodadas
	return np.mean(scores)

In [None]:
# Criando a função para a métrica Acurácia
from sklearn.model_selection import KFold
def benchmark_new(model, X, y):
    scores = []
    kf = KFold(n_splits=10, random_state=66, shuffle=True)
    kf.get_n_splits(X, y)
    for train, test in kf.split(X, y):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        scores.append(accuracy_score(model.fit(X_train, y_train).predict(X_test), y_test))
        print (pd.DataFrame(scores)) # Guardar dados das 10 rodadas
    return np.mean(scores)

##### Classificadores
1.   SVM
2.   KNN
3.   Decision Tree
4.   Random Forest



###### *F1-Score*

In [None]:
# SVM
table = []
t0 = time()
for name, model in all_models_svm:
	 print(name)
	 table.append({'model': name, 
				   'f1-score': benchmark_new_f1(model, X, y)})
	 print(table)

df_result_f1 = pd.DataFrame(table)
print(df_result_f1)
print("Resultados (SVM) - F1-Score - DONE in %0.3fs." % (time() - t0))

SVM(RBF)+BoW
          0
0  0.809496
          0
0  0.809496
1  0.814157
          0
0  0.809496
1  0.814157
2  0.800757
          0
0  0.809496
1  0.814157
2  0.800757
3  0.820856
          0
0  0.809496
1  0.814157
2  0.800757
3  0.820856
4  0.804835
          0
0  0.809496
1  0.814157
2  0.800757
3  0.820856
4  0.804835
5  0.801923
          0
0  0.809496
1  0.814157
2  0.800757
3  0.820856
4  0.804835
5  0.801923
6  0.805709
          0
0  0.809496
1  0.814157
2  0.800757
3  0.820856
4  0.804835
5  0.801923
6  0.805709
7  0.798368
          0
0  0.809496
1  0.814157
2  0.800757
3  0.820856
4  0.804835
5  0.801923
6  0.805709
7  0.798368
8  0.803904
          0
0  0.809496
1  0.814157
2  0.800757
3  0.820856
4  0.804835
5  0.801923
6  0.805709
7  0.798368
8  0.803904
9  0.807401
[{'model': 'SVM(RBF)+BoW', 'f1-score': 0.8067407420232937}]
SVM(RBF)+TFIDF
          0
0  0.779202
          0
0  0.779202
1  0.772211
          0
0  0.779202
1  0.772211
2  0.769589
          0
0  0.779202




          0
0  0.739878
1  0.753568
2  0.739295
3  0.747742
4  0.739004
5  0.731430
6  0.736382
7  0.734266




          0
0  0.739878
1  0.753568
2  0.739295
3  0.747742
4  0.739004
5  0.731430
6  0.736382
7  0.734266
8  0.742424




          0
0  0.739878
1  0.753568
2  0.739295
3  0.747742
4  0.739004
5  0.731430
6  0.736382
7  0.734266
8  0.742424
9  0.741259
[{'model': 'SVM(RBF)+BoW', 'f1-score': 0.8067407420232937}, {'model': 'SVM(RBF)+TFIDF', 'f1-score': 0.7731518845267752}, {'model': 'SVM(RBF)+W2V', 'f1-score': 0.7405248455787344}]
SVM(RBF)+W2V-IDF
          0
0  0.777454
          0
0  0.777454
1  0.795514
          0
0  0.777454
1  0.795514
2  0.778328
          0
0  0.777454
1  0.795514
2  0.778328
3  0.789106
          0
0  0.777454
1  0.795514
2  0.778328
3  0.789106
4  0.777163
          0
0  0.777454
1  0.795514
2  0.778328
3  0.789106
4  0.777163
5  0.780076
          0
0  0.777454
1  0.795514
2  0.778328
3  0.789106
4  0.777163
5  0.780076
6  0.783571
          0
0  0.777454
1  0.795514
2  0.778328
3  0.789106
4  0.777163
5  0.780076
6  0.783571
7  0.779138
          0
0  0.777454
1  0.795514
2  0.778328
3  0.789106
4  0.777163
5  0.780076
6  0.783571
7  0.779138
8  0.780012
          0
0  0.777454



          0
0  0.735508
          0
0  0.735508
1  0.744247
          0
0  0.735508
1  0.744247
2  0.734052
          0
0  0.735508
1  0.744247
2  0.734052
3  0.748616
          0
0  0.735508
1  0.744247
2  0.734052
3  0.748616
4  0.736673
          0
0  0.735508
1  0.744247
2  0.734052
3  0.748616
4  0.736673
5  0.729682
          0
0  0.735508
1  0.744247
2  0.734052
3  0.748616
4  0.736673
5  0.729682
6  0.740169




          0
0  0.735508
1  0.744247
2  0.734052
3  0.748616
4  0.736673
5  0.729682
6  0.740169
7  0.733100
          0
0  0.735508
1  0.744247
2  0.734052
3  0.748616
4  0.736673
5  0.729682
6  0.740169
7  0.733100
8  0.738928
          0
0  0.735508
1  0.744247
2  0.734052
3  0.748616
4  0.736673
5  0.729682
6  0.740169
7  0.733100
8  0.738928
9  0.740676
[{'model': 'SVM(RBF)+BoW', 'f1-score': 0.8067407420232937}, {'model': 'SVM(RBF)+TFIDF', 'f1-score': 0.7731518845267752}, {'model': 'SVM(RBF)+W2V', 'f1-score': 0.7405248455787344}, {'model': 'SVM(RBF)+W2V-IDF', 'f1-score': 0.7818915730836792}, {'model': 'SVM(RBF)+FT', 'f1-score': 0.7381652404300234}]
SVM(RBF)+FT-IDF
          0
0  0.774541
          0
0  0.774541
1  0.780658
          0
0  0.774541
1  0.780658
2  0.774833
          0
0  0.774541
1  0.780658
2  0.774833
3  0.786193
          0
0  0.774541
1  0.780658
2  0.774833
3  0.786193
4  0.765803
          0
0  0.774541
1  0.780658
2  0.774833
3  0.786193
4  0.765803
5  0.769007

In [None]:
# KNN
table = []
t0 = time()
for name, model in all_models_knn:
	 print(name)
	 table.append({'model': name, 
				   'f1-score': benchmark_new_f1(model, X, y)})
	 print(table)

df_result_f1 = pd.DataFrame(table)
print(df_result_f1)
print("Resultados (KNN) - F1-Score - DONE in %0.3fs." % (time() - t0))

KNN+BoW
         0
0  0.65453
          0
0  0.654530
1  0.651908
          0
0  0.654530
1  0.651908
2  0.647247
          0
0  0.654530
1  0.651908
2  0.647247
3  0.651908
          0
0  0.654530
1  0.651908
2  0.647247
3  0.651908
4  0.660647
          0
0  0.654530
1  0.651908
2  0.647247
3  0.651908
4  0.660647
5  0.648412
          0
0  0.654530
1  0.651908
2  0.647247
3  0.651908
4  0.660647
5  0.648412
6  0.652199
          0
0  0.654530
1  0.651908
2  0.647247
3  0.651908
4  0.660647
5  0.648412
6  0.652199
7  0.652972
          0
0  0.654530
1  0.651908
2  0.647247
3  0.651908
4  0.660647
5  0.648412
6  0.652199
7  0.652972
8  0.643648
          0
0  0.654530
1  0.651908
2  0.647247
3  0.651908
4  0.660647
5  0.648412
6  0.652199
7  0.652972
8  0.643648
9  0.662587
[{'model': 'KNN+BoW', 'f1-score': 0.6526058609804605}]
KNN+TFIDF
          0
0  0.769298
          0
0  0.769298
1  0.775706
          0
0  0.769298
1  0.775706
2  0.757355
          0
0  0.769298
1  0.775706
2  0.

In [None]:
# Decision Tree
table = []
t0 = time()
for name, model in all_models_dt:
	 print(name)
	 table.append({'model': name, 
				   'f1-score': benchmark_new_f1(model, X, y)})
	 print(table)

df_result_f1 = pd.DataFrame(table)
print(df_result_f1)
print("Resultados (Decision Tree) - F1-Score - DONE in %0.3fs." % (time() - t0))

DT+BoW
          0
0  0.680163
          0
0  0.680163
1  0.692689
          0
0  0.680163
1  0.692689
2  0.681328
          0
0  0.680163
1  0.692689
2  0.681328
3  0.690941
          0
0  0.680163
1  0.692689
2  0.681328
3  0.690941
4  0.681037
          0
0  0.680163
1  0.692689
2  0.681328
3  0.690941
4  0.681037
5  0.672298
          0
0  0.680163
1  0.692689
2  0.681328
3  0.690941
4  0.681037
5  0.672298
6  0.671716
          0
0  0.680163
1  0.692689
2  0.681328
3  0.690941
4  0.681037
5  0.672298
6  0.671716
7  0.675408
          0
0  0.680163
1  0.692689
2  0.681328
3  0.690941
4  0.681037
5  0.672298
6  0.671716
7  0.675408
8  0.673951
          0
0  0.680163
1  0.692689
2  0.681328
3  0.690941
4  0.681037
5  0.672298
6  0.671716
7  0.675408
8  0.673951
9  0.673660
[{'model': 'DT+BoW', 'f1-score': 0.6793190509364411}]
DT+TFIDF
          0
0  0.664142
          0
0  0.664142
1  0.664725
          0
0  0.664142
1  0.664725
2  0.659773
          0
0  0.664142
1  0.664725
2  0.6

In [None]:
# Random Forest
table = []
t0 = time()
for name, model in all_models_rf:
	 print(name)
	 table.append({'model': name, 
				   'f1-score': benchmark_new_f1(model, X, y)})
	 print(table)

df_result_f1 = pd.DataFrame(table)
print(df_result_f1)
print("Resultados (Random Forest) - F1-Score - DONE in %0.3fs." % (time() - t0))

RF+BoW
          0
0  0.774541
          0
0  0.774541
1  0.775415
          0
0  0.774541
1  0.775415
2  0.766094
          0
0  0.774541
1  0.775415
2  0.766094
3  0.771337
          0
0  0.774541
1  0.775415
2  0.766094
3  0.771337
4  0.765511
          0
0  0.774541
1  0.775415
2  0.766094
3  0.771337
4  0.765511
5  0.763181
          0
0  0.774541
1  0.775415
2  0.766094
3  0.771337
4  0.765511
5  0.763181
6  0.774541
          0
0  0.774541
1  0.775415
2  0.766094
3  0.771337
4  0.765511
5  0.763181
6  0.774541
7  0.763112
          0
0  0.774541
1  0.775415
2  0.766094
3  0.771337
4  0.765511
5  0.763181
6  0.774541
7  0.763112
8  0.764569
          0
0  0.774541
1  0.775415
2  0.766094
3  0.771337
4  0.765511
5  0.763181
6  0.774541
7  0.763112
8  0.764569
9  0.771270
[{'model': 'RF+BoW', 'f1-score': 0.768957149753829}]
RF+TFIDF
          0
0  0.763763
          0
0  0.763763
1  0.766385
          0
0  0.763763
1  0.766385
2  0.756773
          0
0  0.763763
1  0.766385
2  0.75

## 3.1 Teste dos Modelos para Notícias Curtas em Português

Abaixo a compilação dos resultados:

- **model	           (f1-score)**
- **SVM(RBF)+BoW     (0.806741)**
- **SVM(RBF)+W2V-IDF (0.781892)**
- SVM(RBF)+FT-IDF  (0.774696)
- SVM(RBF)+TFIDF   (0.773152)
- RF+BoW           (0.768957)
- RF+TFIDF         (0.759868)
- KNN+TFIDF        (0.759518)
- KNN+W2V-IDF      (0.752294)
- KNN+W2V          (0.746992)
- KNN+FT-IDF       (0.742418)
- SVM(RBF)+W2V     (0.740525)
- KNN+FT           (0.740292)
- SVM(RBF)+FT      (0.738165)
- RF+W2V-IDF       (0.732630)
- RF+W2V           (0.730999)
- RF+TF-IDF        (0.721182)
- RF+TF            (0.719608)
- DT+BoW           (0.679319)
- DT+TFIDF         (0.657645)
- KNN+BoW          (0.652606)
- DT+W2V-IDF       (0.640516)
- DT+W2V           (0.636350)
- DT+FT-IDF        (0.624523)
- DT+FT            (0.620765)

### 3.1.1 Validando os 2 melhores modelos
- **SVM(RBF)+BoW**

In [None]:
# "Bizarizando" as classes
from sklearn.preprocessing import label_binarize

name_labels = ['esporteNews', 'politicaNews', 'tecnologiaNews', 'financaPessoal', 'educacaonews', 'ciencianaturezasaudenews']
Y = label_binarize(y, classes=['esporteNews', 'politicaNews', 'tecnologiaNews', 'financaPessoal', 'educacaonews', 'ciencianaturezasaudenews'])

In [None]:
n_classes = Y.shape[1]

In [None]:
# Visualizando o número de classes
n_classes

6

In [None]:
# Criando o conjunto de treinamento e testes
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=66)

In [None]:
# Treinando o modelo RBF+BoW
svm_rbf_bow.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('count_vectorizer',
                 CountVectorizer(analyzer=<function <lambda> at 0x7f949311a840>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('svm rbf bow',
                 OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False,
                                                   cache_size=200,
                                                   class_weight=None, coef0=0.0,
                                    

In [None]:
# Predições
predictions = svm_rbf_bow.predict(X_test)

In [None]:
# Visualizando as métricas
print ("Precision: %s" %precision_score(Y_test, predictions, average="micro"))
print ("Recall...: %s" %recall_score(Y_test, predictions, average="micro"))
print ("F1-Score.: %s" %f1_score(Y_test, predictions, average="micro"))
print ("Accuracy.: %s" %accuracy_score(Y_test, predictions))

print (classification_report(predictions,Y_test))

Precision: 0.9002514668901928
Recall...: 0.625691814739295
F1-Score.: 0.738271180615226
Accuracy.: 0.621176813282843
              precision    recall  f1-score   support

           0       0.84      0.97      0.90       924
           1       0.74      0.85      0.79      1169
           2       0.50      0.83      0.62       592
           3       0.26      0.92      0.41       294
           4       0.69      0.91      0.78       857
           5       0.67      0.91      0.77       936

   micro avg       0.63      0.90      0.74      4772
   macro avg       0.61      0.90      0.71      4772
weighted avg       0.67      0.90      0.76      4772
 samples avg       0.63      0.62      0.62      4772



  _warn_prf(average, modifier, msg_start, len(result))


Podemos observar que as notícias de tecnologia e Finanças não tiveram bons resultados (*será analisado aqui em breve!*)

- **SVM(RBF)+W2V-IDF**

In [None]:
# Training
svm_rbf_w2v_idf.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('w2v-idf', <__main__.E2V_IDF object at 0x7f94931225c0>),
                ('svm rbf w2v-idf',
                 OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False,
                                                   cache_size=200,
                                                   class_weight=None, coef0=0.0,
                                                   decision_function_shape='ovr',
                                                   degree=3, gamma=0.01,
                                                   kernel='rbf', max_iter=-1,
                                                   probability=False,
                                                   random_state=None,
                                                   shrinking=True, tol=0.001,
                                                   verbose=False),
                                     n_jobs=-1))],
         verbose=False)

In [None]:
# Prediction E2VIDF
pred_E2VIDF = svm_rbf_w2v_idf.predict(X_test)

In [None]:
# Reports
print ("Precision: %s" %precision_score(Y_test, pred_E2VIDF, average="micro"))
print ("Recall...: %s" %recall_score(Y_test, pred_E2VIDF, average="micro"))
print ("F1-Score.: %s" %f1_score(Y_test, pred_E2VIDF, average="micro"))
print ("Accuracy.: %s" %accuracy_score(Y_test, pred_E2VIDF))

print (classification_report(pred_E2VIDF,Y_test))

Precision: 0.8612184796613289
Recall...: 0.6814739295077192
F1-Score.: 0.7608748678754371
Accuracy.: 0.6746286047189047
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1008
           1       0.80      0.83      0.82      1319
           2       0.58      0.78      0.66       735
           3       0.33      0.85      0.48       408
           4       0.72      0.89      0.79       925
           5       0.69      0.85      0.76      1038

   micro avg       0.68      0.86      0.76      5433
   macro avg       0.67      0.86      0.74      5433
weighted avg       0.72      0.86      0.78      5433
 samples avg       0.68      0.68      0.68      5433



  _warn_prf(average, modifier, msg_start, len(result))


Podemos observar que as notícias de tecnologia e Finanças não tiveram bons resultados (*será analisado aqui em breve!*)

##4. *Deploy* em Produção (Projeto Completo)
Aplicação em Produção: **Luppar Recommender**

[Luppar News-Rec](http://luppar.com/recommender)




## Versionamento
- **v1.0** 
 - Adicionado mais 1 tópico (saúde) - coleção Z6News;
 - Adaptação para versão em Notebook.
- **v2.0** (*em desenvolvimento*)
 - Melhorias em Parâmetros;
 - Testar com notícias de outras fontes de notíticas;
 - Novos métodos Embeddings;
 - Melhorias em Features.

## Referências
- (SOUZA, 2019) SOUZA, ANTONIO ALEX DE. LUPPAR NEWS-REC: UM RECOMENDADOR INTELIGENTE DE NOTÍCIAS. 2019. 95 f. Dissertação (Mestrado Acadêmico em Computação) – Universidade Estadual do Ceará, , 2019. Disponível em: <http://siduece.uece.br/siduece/trabalhoAcademicoPublico.jsf?id=93501> Acesso em: 27 de fevereiro de 2020

- Alex Souza ([Blog](https://blogdozouza.wordpress.com/))
