# 1. Pré-processamento

## 1.1 Carrega pacotes e dados

In [None]:
import pandas as pd #pacate para a criação de data-frames
import re #pacote para o processamento de string por meio de "regulas expressions"
import nltk #pacote para o processamento de textos
#import spacy #pacote para o processamento de textos
import xgboost #pacote com o algoritmo extreme gradient boosting
import numpy as np #pacote de algoritmos numéricos
#SKLEARN é um pacote com vários algoritmos de processamento de dados e modelos de machine learning
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, tree, neural_network, neighbors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import ensemble
from sklearn.utils import resample
from sklearn.manifold import TSNE #função usada para a redução de dimensionalidade

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import layers, models, optimizers, callbacks

from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

import joblib
from time import time
from datetime import datetime
import imblearn
from imblearn.over_sampling import SMOTE

import warnings
warnings.simplefilter("ignore") #omite os warnings

#KERAS é um pacote para a criação de redes neurais
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers
#Gensim é um pacote para processamento de words embeddings
#from gensim.models import KeyedVectors

import matplotlib.pyplot as plt #pacote para visualizar dados
import seaborn as sns # pacote para visualizar dados baseado no matplotlib

from nltk.tokenize import word_tokenize
from gensim.models import FastText

import pickle
import spacy

import sys
sys.path.insert(1, '..')
import utils


In [None]:
# Load the tweet base. A forward slash is used so the relative path resolves
# on every operating system (the backslash form only works on Windows) and
# matches the "dados/..." paths used elsewhere in this notebook.
df = pd.read_pickle('dados/df_processado.pkl')
# Text of every tweet in the base, labelled or not (to_list() already returns
# a fresh list, so no extra copy is needed).
lista_tweets = df.tweet.to_list()

# Keep only the rows whose manual label contains one of the letters N, E, S
# or C; missing labels are turned into 'nan', which matches none of them.
# .copy() makes the result an independent frame, so the column assignment
# performed right after this does not write into a slice of the loaded frame.
df = df[df['sent_manual'].fillna('nan').str.contains('N|E|S|C')].copy()
def corrige_label(label):
    """Collapse the labels 'S' and 'E' into 'N'; any other label is returned unchanged."""
    return 'N' if label in ('S', 'E') else label
# Merge the 'S' and 'E' labels into 'N' on every remaining row.
df['sent_manual'] = df['sent_manual'].map(corrige_label)

# Working list with the text of the labelled tweets; the following cells
# rewrite its items.
corpus = df['tweet'].tolist()

# Class distribution after the merge (displayed as the cell output).
df['sent_manual'].value_counts()

## 1.2 Pré-processamento dos dados

In [None]:
# Normalise every text of the corpus in place: spelling correction (project
# helper), lower-casing, then removal of URLs, line breaks, digits and
# punctuation.
for pos, texto in enumerate(corpus):
    texto = utils.corrige_ortografia(texto).lower()
    texto = re.sub(r'http\S+', ' ', texto)  # URLs first, while their punctuation is still there
    texto = re.sub('\n', ' ', texto)  # line breaks
    texto = re.sub('[0-9]+', ' ', texto)  # runs of digits
    texto = re.sub(r'[^\w\s]', ' ', texto)  # everything that is neither a word character nor whitespace
    # Leftover symbols are deleted outright (no space is inserted for them).
    for simbolo in ('º', 'ª', '@', '#'):
        texto = re.sub(simbolo, '', texto)
    corpus[pos] = texto
    
    

In [None]:
# Project helper applied to the whole corpus; the returned value replaces it.
# NOTE(review): judging by its name it rewrites selected expressions — confirm
# the exact behaviour in utils.altera_expressoes.
corpus = utils.altera_expressoes(corpus)

In [None]:
# Load the pre-trained Portuguese pipeline. The parser and the named-entity
# recogniser are disabled because only the lemmas are used here.
nlp = spacy.load('pt_core_news_lg', disable=['parser', 'ner'])

# Replace each text by the space-joined lemmas of its tokens. nlp.pipe feeds
# the texts through the pipeline in batches instead of paying the cost of one
# nlp() call per document, and yields the documents in input order.
corpus = [" ".join(token.lemma_ for token in doc) for doc in nlp.pipe(corpus)]



In [None]:
# Project helper applied to the lemmatised corpus; the returned value replaces it.
# NOTE(review): presumably fixes lemmas that the model got wrong — confirm in
# utils.corrige_lema.
corpus = utils.corrige_lema(corpus)

## 1.3 Divisão dos dados em treino, validação e teste

In [None]:
# Dataset preparation: 60% training, 20% validation and 20% test, every cut
# stratified by the manual label.
rotulos = df.sent_manual.to_list()

# First cut: 60% for training, the remaining 40% is held out.
X_train, X_resto, y_train, y_resto = model_selection.train_test_split(
    corpus, rotulos, test_size=0.40, random_state=100, stratify=rotulos)

# Second cut: the held-out part is halved. The first half returned by
# train_test_split is stored as the test set, the second as the validation set.
X_test, X_valid, y_test, y_valid = model_selection.train_test_split(
    X_resto, y_resto, test_size=0.50, random_state=100, stratify=y_resto)

# Report the size of each partition (texts and labels).
for titulo, X_parte, y_parte in (("Treino:", X_train, y_train),
                                 ("Validação:", X_valid, y_valid),
                                 ("Teste:", X_test, y_test)):
    print(titulo, len(X_parte), len(y_parte))

## 1.4 Codifica os labels

In [None]:
# Keep the original (string) labels before they are replaced by their codes.
y_train_labels = list(y_train)
y_valid_labels = list(y_valid)
y_test_labels = list(y_test)

# Encode the target classes as integers. The encoder is fitted on the full
# label column of df and then applied to each partition.
encoder = preprocessing.LabelEncoder()
encoder.fit(df.sent_manual)
y_train, y_valid, y_test = (encoder.transform(y) for y in (y_train, y_valid, y_test))
# Class names known to the encoder; used as row/column names in the reports.
labels = encoder.classes_

## 1.5 Upsampling e Subsampling

In [None]:
# Training frame pairing each text with its encoded label.
trainDF = pd.DataFrame({'text': X_train, 'label': y_train})

# Split by class. This cell treats label 1 ('N') as the majority class and
# label 0 ('C') as the minority class.
df_majority = trainDF[trainDF.label == 1]  # 'N'
df_minority = trainDF[trainDF.label == 0]  # 'C'

# Upsampling: draw minority rows with replacement until both classes have the
# same number of rows.
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=df_majority.shape[0],
                                 random_state=123)
# Join both classes and shuffle the rows. The shuffle is seeded as well,
# otherwise the row order (and any order-sensitive training) would change on
# every run even though resample itself is seeded.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled = df_upsampled.sample(df_upsampled.shape[0], random_state=123)
X_train_UP = df_upsampled.text.to_list()
y_train_UP = df_upsampled.label.to_list()

# Downsampling: draw majority rows without replacement down to the size of the
# minority class.
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=df_minority.shape[0],
                                   random_state=123)
# Join both classes and shuffle the rows (seeded, same reason as above).
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled = df_downsampled.sample(df_downsampled.shape[0], random_state=123)
X_train_SUB = df_downsampled.text.to_list()
y_train_SUB = df_downsampled.label.to_list()

print("X_train:", len(X_train))
print("X_train_UP:", len(X_train_UP))
print("X_train_SUB:", len(X_train_SUB))

## 1.6 Remoção de stopwords

Define as stopwords

Faz uma cópia das listas X_train, X_train_UP, X_train_SUB, X_valid e X_test para serem usadas nas redes neurais mais complexas

In [None]:
mystopwords = utils.mystopwords

# Copies taken BEFORE the stop words are removed; they are the inputs of the
# more complex neural networks.
X_train_NN = X_train.copy()
X_train_UP_NN = X_train_UP.copy()
X_train_SUB_NN = X_train_SUB.copy()
X_valid_NN = X_valid.copy()
X_test_NN = X_test.copy()


def _remove_stopwords(textos, stopwords):
    """Remove the stop words from every text of *textos*, in place.

    Each text is split on single spaces, the words found in *stopwords* are
    dropped and the remaining words are joined back with single spaces.
    """
    for pos, texto in enumerate(textos):
        textos[pos] = ' '.join(w for w in texto.split(" ") if w not in stopwords)


# A set makes each membership test constant-time.
# NOTE(review): assumes utils.mystopwords is a collection of words (list, set
# or tuple) — confirm in utils.
_stopwords_set = set(mystopwords)

# Strip the stop words from the lists used by the classic ML algorithms.
for _textos in (X_train, X_train_UP, X_train_SUB, X_valid, X_test):
    _remove_stopwords(_textos, _stopwords_set)

## 1.7 Matrizes Termo-Documento 

In [None]:
# Vocabulary cap shared by the three vectorizers below.
max_tokens = 2000

# --- DTM: word frequency ---
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                             ngram_range=(1, 1), max_features=max_tokens)
count_vect.fit(X_train)  # the vocabulary is learned from the training texts only
# The size of vocabulary_ is the number of features; reading it avoids
# get_feature_names(), which is not available in every scikit-learn release.
print("count:", len(count_vect.vocabulary_), " tokens")
# Turn each partition into its document-term matrix.
X_train_count = count_vect.transform(X_train)
X_train_count_UP = count_vect.transform(X_train_UP)
X_train_count_SUB = count_vect.transform(X_train_SUB)
X_valid_count = count_vect.transform(X_valid)
X_test_count = count_vect.transform(X_test)
# Persist the vocabulary; the with-block closes the file handle.
with open("dados/count-vocab", 'wb') as arquivo:
    pickle.dump(count_vect.vocabulary_, arquivo)

# --- DTM: binary (word present / absent) ---
binary_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', binary=True,
                              ngram_range=(1, 1), max_features=max_tokens)
binary_vect.fit(X_train)
print("binary:", len(binary_vect.vocabulary_), " tokens")
X_train_binary = binary_vect.transform(X_train)
X_train_binary_UP = binary_vect.transform(X_train_UP)
X_train_binary_SUB = binary_vect.transform(X_train_SUB)
X_valid_binary = binary_vect.transform(X_valid)
X_test_binary = binary_vect.transform(X_test)
with open("dados/binary-vocab", 'wb') as arquivo:
    pickle.dump(binary_vect.vocabulary_, arquivo)

# --- DTM: TF-IDF ---
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                             ngram_range=(1, 1), max_features=max_tokens)
tfidf_vect.fit(X_train)
print("tfidf:", len(tfidf_vect.vocabulary_), " tokens")
X_train_tfidf = tfidf_vect.transform(X_train)
X_train_tfidf_UP = tfidf_vect.transform(X_train_UP)
X_train_tfidf_SUB = tfidf_vect.transform(X_train_SUB)
X_valid_tfidf = tfidf_vect.transform(X_valid)
X_test_tfidf = tfidf_vect.transform(X_test)
with open("dados/tfidf-vocab", 'wb') as arquivo:
    pickle.dump(tfidf_vect.vocabulary_, arquivo)

## 1.8 Word embeddings 

In [None]:
# Word embeddings
EMBEDDING_DIM = 300  # dimensionality of the embedding vectors

# Tokenise every tweet of the full base with NLTK's word_tokenize.
lista_texto_word_tokenized = [word_tokenize(texto) for texto in lista_tweets]

In [None]:
# Tokens-per-tweet statistics (mean, median, 90th percentile and maximum),
# printed to support the manual choice of MAX_NB_WORDS.
x_ = [len(tokens) for tokens in lista_texto_word_tokenized]
comprimentos = np.array(x_)
print(np.mean(comprimentos))
print(np.median(comprimentos))
print(np.percentile(x_, 90))
print(np.max(x_))

In [None]:
# Set by hand from the tweet-length statistics; despite the name, it is used
# as the `maxlen` of pad_sequences, i.e. the length of every padded sequence.
MAX_NB_WORDS = 50 

In [None]:
# Train FastText embeddings on the tokenised tweets.
# NOTE(review): `size` and `iter` are the keyword names of the gensim 3.x API;
# confirm the installed gensim version before upgrading, as later releases
# renamed them.
model = FastText(lista_texto_word_tokenized, size=EMBEDDING_DIM, window=5, min_count=1, iter=10)

# Forward slashes keep the relative path valid on every operating system and
# match the "dados/..." paths used elsewhere in this notebook.
model.save("dados/tweets_gensim_fasttext.model")
#model = FastText.load("dados/tweets_gensim_fasttext.model")

In [None]:
def _vetores_fasttext(textos):
    """Return an array of shape (len(textos), EMBEDDING_DIM) with one FastText vector per text.

    The vectors are read through ``model.wv``, the keyed-vectors object of the
    trained model, instead of indexing the model itself.

    NOTE(review): each whole text is looked up as a single key, so the vector
    is whatever FastText produces for that full string rather than an average
    of per-word vectors — confirm this is intended.
    """
    vetores = np.zeros((len(textos), EMBEDDING_DIM))
    for pos, texto in enumerate(textos):
        # An empty text (e.g. one made only of stop words) is not looked up;
        # its row simply stays all-zero.
        if texto:
            vetores[pos] = model.wv[texto]
    return vetores


X_train_fasttext = _vetores_fasttext(X_train)
X_train_fasttext_UP = _vetores_fasttext(X_train_UP)
X_train_fasttext_SUB = _vetores_fasttext(X_train_SUB)
X_valid_fasttext = _vetores_fasttext(X_valid)
X_test_fasttext = _vetores_fasttext(X_test)


In [None]:
# Keras tokenizer fitted on the whole corpus; word_index maps each word to
# its integer id.
token = text.Tokenizer()
token.fit_on_texts(corpus)
word_index = token.word_index

# Convert the texts (the copies that still contain stop words) into id
# sequences, padded/truncated to MAX_NB_WORDS so every row has the same length.
X_train_seq = sequence.pad_sequences(token.texts_to_sequences(X_train_NN), maxlen=MAX_NB_WORDS)
X_train_seq_UP = sequence.pad_sequences(token.texts_to_sequences(X_train_UP_NN), maxlen=MAX_NB_WORDS)
X_train_seq_SUB = sequence.pad_sequences(token.texts_to_sequences(X_train_SUB_NN), maxlen=MAX_NB_WORDS)
X_valid_seq = sequence.pad_sequences(token.texts_to_sequences(X_valid_NN), maxlen=MAX_NB_WORDS)
X_test_seq = sequence.pad_sequences(token.texts_to_sequences(X_test_NN), maxlen=MAX_NB_WORDS)

# Token-id -> embedding table: row i holds the FastText vector of the word
# whose id is i. One extra row is allocated so that the ids can be used
# directly as row numbers.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    try:
        # Vectors are read through model.wv (the keyed vectors of the model).
        embedding_matrix[i] = model.wv[word]
    except KeyError:
        # No vector can be produced for this word: its row stays all-zero.
        continue



## 1.9 Definição de funções

In [None]:
#função que treina os algoritmos de classificação
def train_model(classifier, train_x, train_y, test_x, test_y, save = False, nome_arquivo = None, nome_modelo = None, parameters = None, smote=False):
    """Fit a classifier, print its evaluation on the given test data and return it.

    The printed metrics assume a binary problem in which the encoded label 0
    is the class of interest (recall and precision use ``pos_label=0``).

    Args:
        classifier: estimator exposing ``fit`` and ``predict``.
        train_x: training inputs (X).
        train_y: training targets (y).
        test_x: inputs used for the evaluation; its ``shape[0]`` is logged
            when ``nome_modelo`` is given.
        test_y: targets used for the evaluation.
        save: when True, dump the fitted classifier with joblib to
            ``dados/class-<nome_arquivo>``.
        nome_arquivo: suffix of the dump file; defaults to the class name of
            the fitted classifier.
        nome_modelo: when given, one line with the metrics is appended to the
            results CSV under this model name.
        parameters: parameter grid handed to GridSearchCV; ``None`` fits the
            classifier as it is.
        smote: when True, oversample the training data with SMOTE before
            fitting.

    Returns:
        The fitted classifier (the best estimator when a grid search ran).
    """
    if smote:
        # Oversample the training data with SMOTE.
        oversample = SMOTE(random_state=100, n_jobs=-1)
        train_x, train_y = oversample.fit_resample(train_x, train_y)

    if __name__ == "__main__" and parameters is not None:
        # Multiprocessing requires the fork to happen in a __main__ protected
        # block, hence the __name__ check before the parallel grid search.
        grid_search = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=0, cv=5)
        grid_search.fit(train_x, train_y)
        classifier = grid_search.best_estimator_
        # Show the winning value of every parameter that was searched.
        best_parameters = classifier.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
    else:
        # Plain fit with the parameters the classifier already carries.
        classifier.fit(train_x, train_y)

    # Class predictions for the evaluation inputs.
    predictions = classifier.predict(test_x)

    # Confusion matrix followed by the per-class report.
    confusionMatrix(predictions, test_y)
    print("\n")
    classificationReport(predictions, test_y)

    kappa = metrics.cohen_kappa_score(test_y, predictions)
    print("Kappa score: {:.3f}\n".format(kappa))
    acc = metrics.accuracy_score(test_y, predictions)
    print("Accuracy score: {:.3f}\n".format(acc))
    f1 = metrics.f1_score(test_y, predictions, average='weighted')
    print("f1 weighted score: {:.3f}\n".format(f1))
    acc_bal = metrics.balanced_accuracy_score(test_y, predictions)
    print("Balanced Accuracy score: {:.3f}\n".format(acc_bal))
    # NOTE(review): the ROC AUC is computed from the predicted classes, not
    # from scores/probabilities — confirm this is the intended figure.
    roc = metrics.roc_auc_score(test_y, predictions)
    print("Area under the ROC curve: {:.3f}\n".format(roc))
    rec = metrics.recall_score(test_y, predictions, pos_label = 0, average='binary')
    print("Recall classe C: {:.3f}\n".format(rec))
    prec = metrics.precision_score(test_y, predictions, pos_label = 0, average='binary')
    print("Precision classe C: {:.3f}\n".format(prec))

    # Persist the fitted classifier when requested.
    if save:
        if not nome_arquivo:
            nome_arquivo = type(classifier).__name__
        joblib.dump(classifier, "dados/class-"+nome_arquivo)

    # Append one CSV line: model name, number of evaluated rows, the metrics
    # above and today's date.
    if nome_modelo:
        campos = [nome_modelo, test_x.shape[0], acc, kappa, f1, acc_bal, roc, rec, prec,
                  datetime.now().strftime("%Y-%m-%d")]
        # Forward slash: valid on every OS and consistent with "dados/" above.
        with open("Classificação de tweets/resultados-classificacao.csv", "a") as myfile:
            myfile.write(",".join(str(campo) for campo in campos) + "\n")

    # The fitted classifier is returned (not the accuracy).
    return classifier

# função que calcula a matriz de confusão
def confusionMatrix(predictions, real):
    """Print the confusion matrix of *predictions* against *real*.

    The matrix is wrapped in a DataFrame whose rows and columns are named
    after the module-level ``labels``, purely for a more readable printout.
    """
    matriz = metrics.confusion_matrix(y_true=real, y_pred=predictions)
    print(pd.DataFrame(np.array(matriz), index=labels, columns=labels))

# função que cria um relatório com base nas previsões realizadas pelo modelo
def classificationReport(predictions, real):
    """Print the classification report of *predictions* against *real*, naming the classes after the module-level ``labels``."""
    relatorio = metrics.classification_report(y_true=real, y_pred=predictions, target_names=labels)
    print(relatorio)


Visualizando dados

In [None]:
#"""
# DATA VISUALISATION

# Dimensionality reduction with t-SNE. Only the FastText projection is active;
# the DTM projections are kept disabled.
#X_train_count_embedded = TSNE(n_components=2).fit_transform(X_train_count.toarray())
#X_train_binary_embedded = TSNE(n_components=2).fit_transform(X_train_binary.toarray())
#X_train_tfidf_embedded = TSNE(n_components=2).fit_transform(X_train_tfidf.toarray())
# random_state is fixed so the saved figure is the same on every run, in line
# with the seeded splits and resampling earlier in the notebook.
X_train_fasttext_embedded = TSNE(n_components=2, random_state=100).fit_transform(X_train_fasttext)

# Disabled plots of the DTM projections.
# NOTE(review): they refer to `train_y_labels`; the variable created earlier
# in this notebook is `y_train_labels` — adjust before re-enabling.
#sns.scatterplot(x=X_train_count_embedded[:,0], y=X_train_count_embedded[:,1], hue=train_y_labels)
#plt.title('DTM - Frequência de Palavras')
#plt.xlabel('x')
#plt.ylabel('y')
#plt.show()

#sns.scatterplot(x=X_train_binary_embedded[:,0], y=X_train_binary_embedded[:,1], hue=train_y_labels)
#plt.title('DTM - Binária')
#plt.xlabel('x')
#plt.ylabel('y')
#plt.show()

#sns.scatterplot(x=X_train_tfidf_embedded[:,0], y=X_train_tfidf_embedded[:,1], hue=train_y_labels)
#plt.title('DTM - TFIDF')
#plt.xlabel('x')
#plt.ylabel('y')
#plt.show()


# Scatter plot of the 2-D FastText projection, coloured by class
# (encoded label 0 is shown as 'C', anything else as 'N').
sns_plot = sns.scatterplot(x=X_train_fasttext_embedded[:,0], y=X_train_fasttext_embedded[:,1], palette=['blue','red'],
                hue=pd.Series(y_train).apply(lambda x:'C' if x==0 else 'N').values)
plt.title('TSNE - FASTTEXT')
plt.xlabel('x')
plt.ylabel('y')
# Forward slash keeps the relative path valid on every operating system.
plt.savefig("imagens/DTM-fasttext.png", dpi=900)
plt.show()
#"""

# 2. Treinamentos

## 2.1 Multinomial Naive Bayes 

In [None]:
# Multinomial Naive Bayes: one model per (representation, resampling strategy).
nome = "MULTINOMIAL NAIVE BAYES"
nome2 = "MultinomialNB"
modelosNB = [naive_bayes.MultinomialNB() for _ in range(16)]
parameters_ = {'alpha': (0.0, 0.5, 1.0)}

# One row per text representation: title printed in the log, suffix written to
# the results file, then the plain / upsampled / downsampled training matrices
# and the test matrix.
# MultinomialNB does not accept negative feature values, so the FastText
# vectors are left out and slots 12-15 of modelosNB stay untrained.
representacoes = [
    ("COUNT VECTORS", "-count", X_train_count, X_train_count_UP, X_train_count_SUB, X_test_count),
    ("BINARY VECTORS", "-binary", X_train_binary, X_train_binary_UP, X_train_binary_SUB, X_test_binary),
    ("TF-IDF VECTORS", "-tfidf", X_train_tfidf, X_train_tfidf_UP, X_train_tfidf_SUB, X_test_tfidf),
]

indice = 0  # slot of modelosNB that receives the next trained model
for titulo, sufixo, X_tr, X_tr_up, X_tr_sub, X_te in representacoes:
    # Four strategies per representation, always evaluated on the same test
    # matrix: plain data, upsampling, downsampling and SMOTE (which starts
    # from the plain data and resamples inside train_model).
    estrategias = [
        ("", "", X_tr, y_train, False),
        (" Upsampling", "-UP", X_tr_up, y_train_UP, False),
        (" Downsampling", "-SUB", X_tr_sub, y_train_SUB, False),
        (" Smote", "-Smote", X_tr, y_train, True),
    ]
    for rotulo, sufixo_estrategia, X_fit, y_fit, usa_smote in estrategias:
        print("\n", nome, " - " + titulo + rotulo)
        modelosNB[indice] = train_model(modelosNB[indice], X_fit, y_fit, X_te, y_test,
                                        nome_modelo=nome2 + sufixo + sufixo_estrategia,
                                        parameters=parameters_, smote=usa_smote)
        indice += 1


## 2.1.1 Gaussian Naive Bayes 

In [None]:
# Gaussian Naive Bayes: one model per (representation, resampling strategy).
nome = "GAUSSIAN NAIVE BAYES"
nome2 = "GaussianNB"
modelosGNB = [naive_bayes.GaussianNB() for _ in range(16)]
parameters_ = None  # no grid search: each model is fitted as it is


def _denso(matriz):
    """Return *matriz* as a dense array; inputs without ``toarray`` are returned unchanged."""
    return matriz.toarray() if hasattr(matriz, "toarray") else matriz


# One row per text representation: title printed in the log, suffix written to
# the results file, then the plain / upsampled / downsampled training matrices
# and the test matrix.
representacoes = [
    ("COUNT VECTORS", "-count", X_train_count, X_train_count_UP, X_train_count_SUB, X_test_count),
    ("BINARY VECTORS", "-binary", X_train_binary, X_train_binary_UP, X_train_binary_SUB, X_test_binary),
    ("TF-IDF VECTORS", "-tfidf", X_train_tfidf, X_train_tfidf_UP, X_train_tfidf_SUB, X_test_tfidf),
    ("FASTTEXT VECTORS", "-fasttext", X_train_fasttext, X_train_fasttext_UP, X_train_fasttext_SUB, X_test_fasttext),
]

indice = 0  # slot of modelosGNB that receives the next trained model
for titulo, sufixo, X_tr, X_tr_up, X_tr_sub, X_te in representacoes:
    # The document-term matrices are handed over as dense arrays. Each one is
    # converted once per representation and reused by the four runs below.
    X_tr, X_tr_up, X_tr_sub, X_te = (_denso(m) for m in (X_tr, X_tr_up, X_tr_sub, X_te))
    # Four strategies per representation, always evaluated on the same test
    # matrix: plain data, upsampling, downsampling and SMOTE (which starts
    # from the plain data and resamples inside train_model).
    estrategias = [
        ("", "", X_tr, y_train, False),
        (" Upsampling", "-UP", X_tr_up, y_train_UP, False),
        (" Downsampling", "-SUB", X_tr_sub, y_train_SUB, False),
        (" Smote", "-Smote", X_tr, y_train, True),
    ]
    for rotulo, sufixo_estrategia, X_fit, y_fit, usa_smote in estrategias:
        print("\n", nome, " - " + titulo + rotulo)
        modelosGNB[indice] = train_model(modelosGNB[indice], X_fit, y_fit, X_te, y_test,
                                         nome_modelo=nome2 + sufixo + sufixo_estrategia,
                                         parameters=parameters_, smote=usa_smote)
        indice += 1


## 2.2 K Nearest Neighbors

In [None]:
# K nearest neighbours: one model per (representation, resampling strategy).
nome = "KNN"
nome2 = "KNeighbors"
modelosKN = [neighbors.KNeighborsClassifier() for _ in range(16)]
# Grid explored by GridSearchCV inside train_model.
parameters_ = {'n_neighbors' : (1, 3, 5, 7, 9),
               'weights' : ('uniform', 'distance'),
               'p' : (1, 2)}

# One row per text representation: title printed in the log, suffix written to
# the results file, then the plain / upsampled / downsampled training matrices
# and the test matrix.
representacoes = [
    ("COUNT VECTORS", "-count", X_train_count, X_train_count_UP, X_train_count_SUB, X_test_count),
    ("BINARY VECTORS", "-binary", X_train_binary, X_train_binary_UP, X_train_binary_SUB, X_test_binary),
    ("TF-IDF VECTORS", "-tfidf", X_train_tfidf, X_train_tfidf_UP, X_train_tfidf_SUB, X_test_tfidf),
    ("FASTTEXT VECTORS", "-fasttext", X_train_fasttext, X_train_fasttext_UP, X_train_fasttext_SUB, X_test_fasttext),
]

indice = 0  # slot of modelosKN that receives the next trained model
for titulo, sufixo, X_tr, X_tr_up, X_tr_sub, X_te in representacoes:
    # Four strategies per representation, always evaluated on the same test
    # matrix: plain data, upsampling, downsampling and SMOTE (which starts
    # from the plain data and resamples inside train_model).
    estrategias = [
        ("", "", X_tr, y_train, False),
        (" Upsampling", "-UP", X_tr_up, y_train_UP, False),
        (" Downsampling", "-SUB", X_tr_sub, y_train_SUB, False),
        (" Smote", "-Smote", X_tr, y_train, True),
    ]
    for rotulo, sufixo_estrategia, X_fit, y_fit, usa_smote in estrategias:
        print("\n", nome, " - " + titulo + rotulo)
        modelosKN[indice] = train_model(modelosKN[indice], X_fit, y_fit, X_te, y_test,
                                        nome_modelo=nome2 + sufixo + sufixo_estrategia,
                                        parameters=parameters_, smote=usa_smote)
        indice += 1


## 2.3 Stochastic Gradient Descent (SGD)

In [None]:
# Stochastic Gradient Descent: one model per (representation, resampling strategy).
nome = "STOCHASTIC GRADIENT DESCENT"
nome2 = "SGDClassifier"
modelosSGD = [linear_model.SGDClassifier() for _ in range(16)]
# Grid explored by GridSearchCV inside train_model. The penalty names are
# written in lower case ('l1', not 'L1'), the spelling scikit-learn documents.
parameters_ = {'penalty': ('l1', 'l2', 'elasticnet')}

# One row per text representation: title printed in the log, suffix written to
# the results file, then the plain / upsampled / downsampled training matrices
# and the test matrix.
representacoes = [
    ("COUNT VECTORS", "-count", X_train_count, X_train_count_UP, X_train_count_SUB, X_test_count),
    ("BINARY VECTORS", "-binary", X_train_binary, X_train_binary_UP, X_train_binary_SUB, X_test_binary),
    ("TF-IDF VECTORS", "-tfidf", X_train_tfidf, X_train_tfidf_UP, X_train_tfidf_SUB, X_test_tfidf),
    ("FASTTEXT VECTORS", "-fasttext", X_train_fasttext, X_train_fasttext_UP, X_train_fasttext_SUB, X_test_fasttext),
]

indice = 0  # slot of modelosSGD that receives the next trained model
for titulo, sufixo, X_tr, X_tr_up, X_tr_sub, X_te in representacoes:
    # Four strategies per representation, always evaluated on the same test
    # matrix: plain data, upsampling, downsampling and SMOTE (which starts
    # from the plain data and resamples inside train_model).
    estrategias = [
        ("", "", X_tr, y_train, False),
        (" Upsampling", "-UP", X_tr_up, y_train_UP, False),
        (" Downsampling", "-SUB", X_tr_sub, y_train_SUB, False),
        (" Smote", "-Smote", X_tr, y_train, True),
    ]
    for rotulo, sufixo_estrategia, X_fit, y_fit, usa_smote in estrategias:
        print("\n", nome, " - " + titulo + rotulo)
        modelosSGD[indice] = train_model(modelosSGD[indice], X_fit, y_fit, X_te, y_test,
                                         nome_modelo=nome2 + sufixo + sufixo_estrategia,
                                         parameters=parameters_, smote=usa_smote)
        indice += 1


## 2.4 Regressão Logística

In [None]:
# Logistic regression experiments: 16 models, one per combination of text
# representation (count, binary, TF-IDF, FastText) and resampling strategy
# (none, upsampling, downsampling, SMOTE).
nome = "REGRESSÃO LOGÍSTICA"
nome2 = "LogisticRegression"
# 'saga' is the only LogisticRegression solver that supports every penalty in
# the grid below ('l1', 'l2' and 'elasticnet'); the default 'lbfgs' solver only
# accepts 'l2', so the other candidates could never be fitted.
# saga is stochastic and converges more slowly than lbfgs, hence the fixed seed
# (same value the Random Forest / XGBoost / MLP cells use) and the larger
# iteration budget.
modelosRL = [
    linear_model.LogisticRegression(solver='saga', max_iter=1000, random_state=100)
    for _ in range(16)
]
# Hyper-parameter candidates handed to train_model (presumably a grid search;
# train_model is defined earlier in the notebook — confirm there).
# 'l1_ratio' is mandatory for penalty='elasticnet' and ignored by 'l1'/'l2'.
parameters_ = {'penalty': ('l1', 'l2', 'elasticnet'),
               'C': (0.5, 1.0),
               'class_weight': ('balanced', None),
               'l1_ratio': (0.5,)}

# One row per experiment, in list-slot order:
# (label printed, train matrix, train labels, test matrix, model-name suffix,
#  extra keyword arguments for train_model).
_runs = [
    # Count vectors (the plain run is also saved to disk)
    (" - COUNT VECTORS", X_train_count, y_train, X_test_count, "-count",
     {"save": True, "nome_arquivo": nome2 + "-count"}),
    (" - COUNT VECTORS Upsampling", X_train_count_UP, y_train_UP, X_test_count, "-count-UP", {}),
    (" - COUNT VECTORS Downsampling", X_train_count_SUB, y_train_SUB, X_test_count, "-count-SUB", {}),
    (" - COUNT VECTORS Smote", X_train_count, y_train, X_test_count, "-count-Smote", {"smote": True}),
    # Binary vectors
    (" - BINARY VECTORS", X_train_binary, y_train, X_test_binary, "-binary", {}),
    (" - BINARY VECTORS Upsampling", X_train_binary_UP, y_train_UP, X_test_binary, "-binary-UP", {}),
    (" - BINARY VECTORS Dowsampling", X_train_binary_SUB, y_train_SUB, X_test_binary, "-binary-SUB", {}),
    (" - BINARY VECTORS Smote", X_train_binary, y_train, X_test_binary, "-binary-Smote", {"smote": True}),
    # TF-IDF vectors
    (" - TF-IDF VECTORS", X_train_tfidf, y_train, X_test_tfidf, "-tfidf", {}),
    (" - TF-IDF VECTORS Upsampling", X_train_tfidf_UP, y_train_UP, X_test_tfidf, "-tfidf-UP", {}),
    (" - TF-IDF VECTORS Downsampling", X_train_tfidf_SUB, y_train_SUB, X_test_tfidf, "-tfidf-SUB", {}),
    (" - TF-IDF VECTORS Smote", X_train_tfidf, y_train, X_test_tfidf, "-tfidf-Smote", {"smote": True}),
    # FastText vectors
    (" - FASTTEXT VECTORS", X_train_fasttext, y_train, X_test_fasttext, "-fasttext", {}),
    (" - FASTTEXT VECTORS Upsampling", X_train_fasttext_UP, y_train_UP, X_test_fasttext, "-fasttext-UP", {}),
    (" - FASTTEXT VECTORS Downsampling", X_train_fasttext_SUB, y_train_SUB, X_test_fasttext, "-fasttext-SUB", {}),
    (" - FASTTEXT VECTORS Smote", X_train_fasttext, y_train, X_test_fasttext, "-fasttext-Smote", {"smote": True}),
]

# Every experiment is always evaluated on the untouched test split (y_test);
# whatever train_model returns replaces the corresponding list slot.
for _idx, (_label, _X_fit, _y_fit, _X_eval, _suffix, _extra) in enumerate(_runs):
    print("\n", nome, _label)
    modelosRL[_idx] = train_model(modelosRL[_idx], _X_fit, _y_fit, _X_eval, y_test,
                                  nome_modelo=nome2 + _suffix, parameters=parameters_,
                                  **_extra)


## 2.5 SVM 

In [None]:
# SVM experiments: 16 SVC models, one per combination of text representation
# (count, binary, TF-IDF, FastText) and resampling strategy (none, upsampling,
# downsampling, SMOTE).
nome = "SVM"
nome2 = "SVM"
modelosSVM = [svm.SVC() for _ in range(16)]
# Hyper-parameter candidates handed to train_model (presumably a grid search;
# train_model is defined earlier in the notebook — confirm there).
# SVC requires C to be strictly positive, so the former 0.0 candidate could
# never be fitted; 0.1 keeps a "strong regularisation" option in the grid.
parameters_ = {'C': (0.1, 0.5, 1.0),
               'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
               'class_weight': ('balanced', None)}

# One row per experiment, in list-slot order:
# (label printed, train matrix, train labels, test matrix, model-name suffix,
#  extra keyword arguments for train_model).
_runs = [
    # Count vectors (the plain run is also saved to disk)
    (" - COUNT VECTORS", X_train_count, y_train, X_test_count, "-count",
     {"save": True, "nome_arquivo": nome2 + "-count"}),
    (" - COUNT VECTORS Upsampling", X_train_count_UP, y_train_UP, X_test_count, "-count-UP", {}),
    (" - COUNT VECTORS Downsampling", X_train_count_SUB, y_train_SUB, X_test_count, "-count-SUB", {}),
    (" - COUNT VECTORS Smote", X_train_count, y_train, X_test_count, "-count-Smote", {"smote": True}),
    # Binary vectors
    (" - BINARY VECTORS", X_train_binary, y_train, X_test_binary, "-binary", {}),
    (" - BINARY VECTORS Upsampling", X_train_binary_UP, y_train_UP, X_test_binary, "-binary-UP", {}),
    (" - BINARY VECTORS Dowsampling", X_train_binary_SUB, y_train_SUB, X_test_binary, "-binary-SUB", {}),
    (" - BINARY VECTORS Smote", X_train_binary, y_train, X_test_binary, "-binary-Smote", {"smote": True}),
    # TF-IDF vectors (the plain run is also saved to disk)
    (" - TF-IDF VECTORS", X_train_tfidf, y_train, X_test_tfidf, "-tfidf",
     {"save": True, "nome_arquivo": nome2 + "-tfidf"}),
    (" - TF-IDF VECTORS Upsampling", X_train_tfidf_UP, y_train_UP, X_test_tfidf, "-tfidf-UP", {}),
    (" - TF-IDF VECTORS Downsampling", X_train_tfidf_SUB, y_train_SUB, X_test_tfidf, "-tfidf-SUB", {}),
    (" - TF-IDF VECTORS Smote", X_train_tfidf, y_train, X_test_tfidf, "-tfidf-Smote", {"smote": True}),
    # FastText vectors
    (" - FASTTEXT VECTORS", X_train_fasttext, y_train, X_test_fasttext, "-fasttext", {}),
    (" - FASTTEXT VECTORS Upsampling", X_train_fasttext_UP, y_train_UP, X_test_fasttext, "-fasttext-UP", {}),
    (" - FASTTEXT VECTORS Downsampling", X_train_fasttext_SUB, y_train_SUB, X_test_fasttext, "-fasttext-SUB", {}),
    (" - FASTTEXT VECTORS Smote", X_train_fasttext, y_train, X_test_fasttext, "-fasttext-Smote", {"smote": True}),
]

# Every experiment is always evaluated on the untouched test split (y_test);
# whatever train_model returns replaces the corresponding list slot.
for _idx, (_label, _X_fit, _y_fit, _X_eval, _suffix, _extra) in enumerate(_runs):
    print("\n", nome, _label)
    modelosSVM[_idx] = train_model(modelosSVM[_idx], _X_fit, _y_fit, _X_eval, y_test,
                                   nome_modelo=nome2 + _suffix, parameters=parameters_,
                                   **_extra)


## 2.6 Árvore de Decisão

In [None]:
# Decision tree experiments: 16 classifiers, one per combination of text
# representation (count, binary, TF-IDF, FastText) and resampling strategy
# (none, upsampling, downsampling, SMOTE).
nome = "DECISION TREE"
nome2 = "DecisionTree"
# The trees are seeded (same value as the Random Forest / XGBoost / MLP cells)
# because the grid below includes splitter='random' and even the 'best'
# splitter permutes features; without a seed the runs are not reproducible.
modelosAD = [tree.DecisionTreeClassifier(random_state=100) for _ in range(16)]
# Hyper-parameter candidates handed to train_model (presumably a grid search;
# train_model is defined earlier in the notebook — confirm there).
parameters_ = {'criterion': ('gini', 'entropy'),
               'splitter': ('best', 'random'),
               'max_depth': (10, 20, 40, 50, None),
               'class_weight': ('balanced', None)
               }

# One row per experiment, in list-slot order:
# (label printed, train matrix, train labels, test matrix, model-name suffix,
#  extra keyword arguments for train_model).
_runs = [
    # Count vectors (the plain run is also saved to disk)
    (" - COUNT VECTORS", X_train_count, y_train, X_test_count, "-count",
     {"save": True, "nome_arquivo": nome2 + "-count"}),
    (" - COUNT VECTORS Upsampling", X_train_count_UP, y_train_UP, X_test_count, "-count-UP", {}),
    (" - COUNT VECTORS Downsampling", X_train_count_SUB, y_train_SUB, X_test_count, "-count-SUB", {}),
    (" - COUNT VECTORS Smote", X_train_count, y_train, X_test_count, "-count-Smote", {"smote": True}),
    # Binary vectors
    (" - BINARY VECTORS", X_train_binary, y_train, X_test_binary, "-binary", {}),
    (" - BINARY VECTORS Upsampling", X_train_binary_UP, y_train_UP, X_test_binary, "-binary-UP", {}),
    (" - BINARY VECTORS Dowsampling", X_train_binary_SUB, y_train_SUB, X_test_binary, "-binary-SUB", {}),
    (" - BINARY VECTORS Smote", X_train_binary, y_train, X_test_binary, "-binary-Smote", {"smote": True}),
    # TF-IDF vectors
    (" - TF-IDF VECTORS", X_train_tfidf, y_train, X_test_tfidf, "-tfidf", {}),
    (" - TF-IDF VECTORS Upsampling", X_train_tfidf_UP, y_train_UP, X_test_tfidf, "-tfidf-UP", {}),
    (" - TF-IDF VECTORS Downsampling", X_train_tfidf_SUB, y_train_SUB, X_test_tfidf, "-tfidf-SUB", {}),
    (" - TF-IDF VECTORS Smote", X_train_tfidf, y_train, X_test_tfidf, "-tfidf-Smote", {"smote": True}),
    # FastText vectors
    (" - FASTTEXT VECTORS", X_train_fasttext, y_train, X_test_fasttext, "-fasttext", {}),
    (" - FASTTEXT VECTORS Upsampling", X_train_fasttext_UP, y_train_UP, X_test_fasttext, "-fasttext-UP", {}),
    (" - FASTTEXT VECTORS Downsampling", X_train_fasttext_SUB, y_train_SUB, X_test_fasttext, "-fasttext-SUB", {}),
    (" - FASTTEXT VECTORS Smote", X_train_fasttext, y_train, X_test_fasttext, "-fasttext-Smote", {"smote": True}),
]

# Every experiment is always evaluated on the untouched test split (y_test);
# whatever train_model returns replaces the corresponding list slot.
for _idx, (_label, _X_fit, _y_fit, _X_eval, _suffix, _extra) in enumerate(_runs):
    print("\n", nome, _label)
    modelosAD[_idx] = train_model(modelosAD[_idx], _X_fit, _y_fit, _X_eval, y_test,
                                  nome_modelo=nome2 + _suffix, parameters=parameters_,
                                  **_extra)


## 2.7 Random Forest

In [None]:
# Random forest experiments: 16 seeded classifiers, one per combination of
# text representation (count, binary, TF-IDF, FastText) and resampling
# strategy (none, upsampling, downsampling, SMOTE).
nome = "RANDOM FOREST"
nome2 = "RandomForest"
modelosRF = [ensemble.RandomForestClassifier(random_state=100) for _ in range(16)]
# Hyper-parameter candidates handed to train_model (presumably a grid search;
# train_model is defined earlier in the notebook — confirm there).
parameters_ = {
    'n_estimators': (50, 75, 100),
    'criterion': ('gini', 'entropy'),
    'max_depth': (20, 40, 50, None),
    'class_weight': ('balanced', 'balanced_subsample', None),
}

# One row per experiment, in list-slot order:
# (label printed, train matrix, train labels, test matrix, model-name suffix,
#  extra keyword arguments for train_model).
_runs = [
    # Count vectors (the plain run is also saved to disk)
    (" - COUNT VECTORS", X_train_count, y_train, X_test_count, "-count",
     {"save": True, "nome_arquivo": nome2 + "-count"}),
    (" - COUNT VECTORS Upsampling", X_train_count_UP, y_train_UP, X_test_count, "-count-UP", {}),
    (" - COUNT VECTORS Downsampling", X_train_count_SUB, y_train_SUB, X_test_count, "-count-SUB", {}),
    (" - COUNT VECTORS Smote", X_train_count, y_train, X_test_count, "-count-Smote", {"smote": True}),
    # Binary vectors
    (" - BINARY VECTORS", X_train_binary, y_train, X_test_binary, "-binary", {}),
    (" - BINARY VECTORS Upsampling", X_train_binary_UP, y_train_UP, X_test_binary, "-binary-UP", {}),
    (" - BINARY VECTORS Dowsampling", X_train_binary_SUB, y_train_SUB, X_test_binary, "-binary-SUB", {}),
    (" - BINARY VECTORS Smote", X_train_binary, y_train, X_test_binary, "-binary-Smote", {"smote": True}),
    # TF-IDF vectors
    (" - TF-IDF VECTORS", X_train_tfidf, y_train, X_test_tfidf, "-tfidf", {}),
    (" - TF-IDF VECTORS Upsampling", X_train_tfidf_UP, y_train_UP, X_test_tfidf, "-tfidf-UP", {}),
    (" - TF-IDF VECTORS Downsampling", X_train_tfidf_SUB, y_train_SUB, X_test_tfidf, "-tfidf-SUB", {}),
    (" - TF-IDF VECTORS Smote", X_train_tfidf, y_train, X_test_tfidf, "-tfidf-Smote", {"smote": True}),
    # FastText vectors
    (" - FASTTEXT VECTORS", X_train_fasttext, y_train, X_test_fasttext, "-fasttext", {}),
    (" - FASTTEXT VECTORS Upsampling", X_train_fasttext_UP, y_train_UP, X_test_fasttext, "-fasttext-UP", {}),
    (" - FASTTEXT VECTORS Downsampling", X_train_fasttext_SUB, y_train_SUB, X_test_fasttext, "-fasttext-SUB", {}),
    (" - FASTTEXT VECTORS Smote", X_train_fasttext, y_train, X_test_fasttext, "-fasttext-Smote", {"smote": True}),
]

# Every experiment is always evaluated on the untouched test split (y_test);
# whatever train_model returns replaces the corresponding list slot.
for _idx, (_label, _X_fit, _y_fit, _X_eval, _suffix, _extra) in enumerate(_runs):
    print("\n", nome, _label)
    modelosRF[_idx] = train_model(modelosRF[_idx], _X_fit, _y_fit, _X_eval, y_test,
                                  nome_modelo=nome2 + _suffix, parameters=parameters_,
                                  **_extra)


## 2.8 XGBoost

In [None]:
# Extreme gradient boosting experiments: 16 seeded XGBClassifier models, one
# per combination of text representation (count, binary, TF-IDF, FastText) and
# resampling strategy (none, upsampling, downsampling, SMOTE).
nome = "EXTREME GRADIENT BOOSTING"
nome2 = "xgboost.XGBC"
modelosXGB = [xgboost.XGBClassifier(seed=100, random_state=100) for _ in range(16)]
# No hyper-parameter candidates are supplied for this model family.
parameters_ = None

# One row per experiment, in list-slot order:
# (label printed, train matrix, train labels, test matrix, model-name suffix,
#  extra keyword arguments for train_model).
_runs = [
    # Count vectors
    (" - COUNT VECTORS", X_train_count, y_train, X_test_count, "-count", {}),
    (" - COUNT VECTORS Upsampling", X_train_count_UP, y_train_UP, X_test_count, "-count-UP", {}),
    (" - COUNT VECTORS Downsampling", X_train_count_SUB, y_train_SUB, X_test_count, "-count-SUB", {}),
    (" - COUNT VECTORS Smote", X_train_count, y_train, X_test_count, "-count-Smote", {"smote": True}),
    # Binary vectors
    (" - BINARY VECTORS", X_train_binary, y_train, X_test_binary, "-binary", {}),
    (" - BINARY VECTORS Upsampling", X_train_binary_UP, y_train_UP, X_test_binary, "-binary-UP", {}),
    (" - BINARY VECTORS Dowsampling", X_train_binary_SUB, y_train_SUB, X_test_binary, "-binary-SUB", {}),
    (" - BINARY VECTORS Smote", X_train_binary, y_train, X_test_binary, "-binary-Smote", {"smote": True}),
    # TF-IDF vectors
    (" - TF-IDF VECTORS", X_train_tfidf, y_train, X_test_tfidf, "-tfidf", {}),
    (" - TF-IDF VECTORS Upsampling", X_train_tfidf_UP, y_train_UP, X_test_tfidf, "-tfidf-UP", {}),
    (" - TF-IDF VECTORS Downsampling", X_train_tfidf_SUB, y_train_SUB, X_test_tfidf, "-tfidf-SUB", {}),
    (" - TF-IDF VECTORS Smote", X_train_tfidf, y_train, X_test_tfidf, "-tfidf-Smote", {"smote": True}),
    # FastText vectors
    (" - FASTTEXT VECTORS", X_train_fasttext, y_train, X_test_fasttext, "-fasttext", {}),
    (" - FASTTEXT VECTORS Upsampling", X_train_fasttext_UP, y_train_UP, X_test_fasttext, "-fasttext-UP", {}),
    (" - FASTTEXT VECTORS Downsampling", X_train_fasttext_SUB, y_train_SUB, X_test_fasttext, "-fasttext-SUB", {}),
    (" - FASTTEXT VECTORS Smote", X_train_fasttext, y_train, X_test_fasttext, "-fasttext-Smote", {"smote": True}),
]

# Every experiment is always evaluated on the untouched test split (y_test);
# whatever train_model returns replaces the corresponding list slot.
for _idx, (_label, _X_fit, _y_fit, _X_eval, _suffix, _extra) in enumerate(_runs):
    print("\n", nome, _label)
    modelosXGB[_idx] = train_model(modelosXGB[_idx], _X_fit, _y_fit, _X_eval, y_test,
                                   nome_modelo=nome2 + _suffix, parameters=parameters_,
                                   **_extra)


## 2.9 Perceptron

In [None]:
# Perceptron experiments: 16 models, one per combination of text
# representation (count, binary, TF-IDF, FastText) and resampling strategy
# (none, upsampling, downsampling, SMOTE).
nome = "PERCEPTRON"
nome2 = "Perceptron"
modelosP = [linear_model.Perceptron() for _ in range(16)]
# Hyper-parameter candidates handed to train_model (presumably a grid search;
# train_model is defined earlier in the notebook — confirm there).
# class_weight only accepts 'balanced', a {label: weight} dict or None; the
# former 'weight' candidate is not a valid value and could never be fitted.
parameters_ = {'penalty': ('l1', 'l2', 'elasticnet'),
               'class_weight': ('balanced', None)}

# One row per experiment, in list-slot order:
# (label printed, train matrix, train labels, test matrix, model-name suffix,
#  extra keyword arguments for train_model).
_runs = [
    # Count vectors
    (" - COUNT VECTORS", X_train_count, y_train, X_test_count, "-count", {}),
    (" - COUNT VECTORS Upsampling", X_train_count_UP, y_train_UP, X_test_count, "-count-UP", {}),
    (" - COUNT VECTORS Downsampling", X_train_count_SUB, y_train_SUB, X_test_count, "-count-SUB", {}),
    (" - COUNT VECTORS Smote", X_train_count, y_train, X_test_count, "-count-Smote", {"smote": True}),
    # Binary vectors
    (" - BINARY VECTORS", X_train_binary, y_train, X_test_binary, "-binary", {}),
    (" - BINARY VECTORS Upsampling", X_train_binary_UP, y_train_UP, X_test_binary, "-binary-UP", {}),
    (" - BINARY VECTORS Dowsampling", X_train_binary_SUB, y_train_SUB, X_test_binary, "-binary-SUB", {}),
    (" - BINARY VECTORS Smote", X_train_binary, y_train, X_test_binary, "-binary-Smote", {"smote": True}),
    # TF-IDF vectors
    (" - TF-IDF VECTORS", X_train_tfidf, y_train, X_test_tfidf, "-tfidf", {}),
    (" - TF-IDF VECTORS Upsampling", X_train_tfidf_UP, y_train_UP, X_test_tfidf, "-tfidf-UP", {}),
    (" - TF-IDF VECTORS Downsampling", X_train_tfidf_SUB, y_train_SUB, X_test_tfidf, "-tfidf-SUB", {}),
    (" - TF-IDF VECTORS Smote", X_train_tfidf, y_train, X_test_tfidf, "-tfidf-Smote", {"smote": True}),
    # FastText vectors
    (" - FASTTEXT VECTORS", X_train_fasttext, y_train, X_test_fasttext, "-fasttext", {}),
    (" - FASTTEXT VECTORS Upsampling", X_train_fasttext_UP, y_train_UP, X_test_fasttext, "-fasttext-UP", {}),
    (" - FASTTEXT VECTORS Downsampling", X_train_fasttext_SUB, y_train_SUB, X_test_fasttext, "-fasttext-SUB", {}),
    (" - FASTTEXT VECTORS Smote", X_train_fasttext, y_train, X_test_fasttext, "-fasttext-Smote", {"smote": True}),
]

# Every experiment is always evaluated on the untouched test split (y_test);
# whatever train_model returns replaces the corresponding list slot.
for _idx, (_label, _X_fit, _y_fit, _X_eval, _suffix, _extra) in enumerate(_runs):
    print("\n", nome, _label)
    modelosP[_idx] = train_model(modelosP[_idx], _X_fit, _y_fit, _X_eval, y_test,
                                 nome_modelo=nome2 + _suffix, parameters=parameters_,
                                 **_extra)


## 2.10 Multi-Layer Perceptron

In [None]:
# Multi-layer perceptron experiments: 16 seeded MLPClassifier models, one per
# combination of text representation (count, binary, TF-IDF, FastText) and
# resampling strategy (none, upsampling, downsampling, SMOTE).
nome = "MULTI-LAYER PERCEPTRON"
nome2 = "MLP"
modelosMLP = [neural_network.MLPClassifier(random_state=100) for _ in range(16)]
# Hyper-parameter candidates handed to train_model (presumably a grid search;
# train_model is defined earlier in the notebook — confirm there).
parameters_ = {'activation': ('relu', 'logistic')}

# One row per experiment, in list-slot order:
# (label printed, train matrix, train labels, test matrix, model-name suffix,
#  extra keyword arguments for train_model).
_runs = [
    # Count vectors
    (" - COUNT VECTORS", X_train_count, y_train, X_test_count, "-count", {}),
    (" - COUNT VECTORS Upsampling", X_train_count_UP, y_train_UP, X_test_count, "-count-UP", {}),
    (" - COUNT VECTORS Downsampling", X_train_count_SUB, y_train_SUB, X_test_count, "-count-SUB", {}),
    (" - COUNT VECTORS Smote", X_train_count, y_train, X_test_count, "-count-Smote", {"smote": True}),
    # Binary vectors
    (" - BINARY VECTORS", X_train_binary, y_train, X_test_binary, "-binary", {}),
    (" - BINARY VECTORS Upsampling", X_train_binary_UP, y_train_UP, X_test_binary, "-binary-UP", {}),
    (" - BINARY VECTORS Dowsampling", X_train_binary_SUB, y_train_SUB, X_test_binary, "-binary-SUB", {}),
    (" - BINARY VECTORS Smote", X_train_binary, y_train, X_test_binary, "-binary-Smote", {"smote": True}),
    # TF-IDF vectors
    (" - TF-IDF VECTORS", X_train_tfidf, y_train, X_test_tfidf, "-tfidf", {}),
    (" - TF-IDF VECTORS Upsampling", X_train_tfidf_UP, y_train_UP, X_test_tfidf, "-tfidf-UP", {}),
    (" - TF-IDF VECTORS Downsampling", X_train_tfidf_SUB, y_train_SUB, X_test_tfidf, "-tfidf-SUB", {}),
    (" - TF-IDF VECTORS Smote", X_train_tfidf, y_train, X_test_tfidf, "-tfidf-Smote", {"smote": True}),
    # FastText vectors
    (" - FASTTEXT VECTORS", X_train_fasttext, y_train, X_test_fasttext, "-fasttext", {}),
    (" - FASTTEXT VECTORS Upsampling", X_train_fasttext_UP, y_train_UP, X_test_fasttext, "-fasttext-UP", {}),
    (" - FASTTEXT VECTORS Downsampling", X_train_fasttext_SUB, y_train_SUB, X_test_fasttext, "-fasttext-SUB", {}),
    (" - FASTTEXT VECTORS Smote", X_train_fasttext, y_train, X_test_fasttext, "-fasttext-Smote", {"smote": True}),
]

# Every experiment is always evaluated on the untouched test split (y_test);
# whatever train_model returns replaces the corresponding list slot.
for _idx, (_label, _X_fit, _y_fit, _X_eval, _suffix, _extra) in enumerate(_runs):
    print("\n", nome, _label)
    modelosMLP[_idx] = train_model(modelosMLP[_idx], _X_fit, _y_fit, _X_eval, y_test,
                                   nome_modelo=nome2 + _suffix, parameters=parameters_,
                                   **_extra)


## 2.11 Voting Classifier

In [None]:
# Voting classifier experiments: for each of the 16 (text representation,
# resampling strategy) slots, a hard-voting ensemble is built from the models
# stored at the same slot of the logistic regression, SVM, random forest,
# XGBoost and MLP lists, and then passed through train_model.
nome = "VOTING CLASSIFIER"
nome2 = "Voting"
# No hyper-parameter candidates are supplied for the ensembles.
parameters_ = None

modelosVC = []
for i in range(16):
    # Members of the i-th ensemble, as (name, estimator) pairs.
    _members = [
        ('log_reg', modelosRL[i]),
        ('SVM', modelosSVM[i]),
        ('RF', modelosRF[i]),
        ('xgb', modelosXGB[i]),
        ('MLP', modelosMLP[i]),
    ]
    eclf = ensemble.VotingClassifier(estimators=_members, voting='hard')
    modelosVC.append(eclf)

# One row per experiment, in list-slot order:
# (label printed, train matrix, train labels, test matrix, model-name suffix,
#  extra keyword arguments for train_model).
_runs = [
    # Count vectors
    (" - COUNT VECTORS", X_train_count, y_train, X_test_count, "-count", {}),
    (" - COUNT VECTORS Upsampling", X_train_count_UP, y_train_UP, X_test_count, "-count-UP", {}),
    (" - COUNT VECTORS Downsampling", X_train_count_SUB, y_train_SUB, X_test_count, "-count-SUB", {}),
    (" - COUNT VECTORS Smote", X_train_count, y_train, X_test_count, "-count-Smote", {"smote": True}),
    # Binary vectors
    (" - BINARY VECTORS", X_train_binary, y_train, X_test_binary, "-binary", {}),
    (" - BINARY VECTORS Upsampling", X_train_binary_UP, y_train_UP, X_test_binary, "-binary-UP", {}),
    (" - BINARY VECTORS Dowsampling", X_train_binary_SUB, y_train_SUB, X_test_binary, "-binary-SUB", {}),
    (" - BINARY VECTORS Smote", X_train_binary, y_train, X_test_binary, "-binary-Smote", {"smote": True}),
    # TF-IDF vectors
    (" - TF-IDF VECTORS", X_train_tfidf, y_train, X_test_tfidf, "-tfidf", {}),
    (" - TF-IDF VECTORS Upsampling", X_train_tfidf_UP, y_train_UP, X_test_tfidf, "-tfidf-UP", {}),
    (" - TF-IDF VECTORS Downsampling", X_train_tfidf_SUB, y_train_SUB, X_test_tfidf, "-tfidf-SUB", {}),
    (" - TF-IDF VECTORS Smote", X_train_tfidf, y_train, X_test_tfidf, "-tfidf-Smote", {"smote": True}),
    # FastText vectors
    (" - FASTTEXT VECTORS", X_train_fasttext, y_train, X_test_fasttext, "-fasttext", {}),
    (" - FASTTEXT VECTORS Upsampling", X_train_fasttext_UP, y_train_UP, X_test_fasttext, "-fasttext-UP", {}),
    (" - FASTTEXT VECTORS Downsampling", X_train_fasttext_SUB, y_train_SUB, X_test_fasttext, "-fasttext-SUB", {}),
    (" - FASTTEXT VECTORS Smote", X_train_fasttext, y_train, X_test_fasttext, "-fasttext-Smote", {"smote": True}),
]

# Every experiment is always evaluated on the untouched test split (y_test);
# whatever train_model returns replaces the corresponding list slot.
for _idx, (_label, _X_fit, _y_fit, _X_eval, _suffix, _extra) in enumerate(_runs):
    print("\n", nome, _label)
    modelosVC[_idx] = train_model(modelosVC[_idx], _X_fit, _y_fit, _X_eval, y_test,
                                  nome_modelo=nome2 + _suffix, parameters=parameters_,
                                  **_extra)


# 3. Redes Neurais 

## Definição de Funções

In [None]:
# Builds a simple feed-forward neural network (one hidden layer).
def create_model_architecture(input_size, hidden_units=10000, dropout_rate=0.25, num_classes=2):
    """Build and compile a single-hidden-layer feed-forward classifier.

    Args:
        input_size: Number of input features; the input layer is declared
            sparse, so sparse feature matrices can be fed directly.
        hidden_units: Width of the hidden ReLU layer (default 10000).
        dropout_rate: Dropout fraction applied after the hidden layer
            (default 0.25).
        num_classes: Number of output units / one-hot classes (default 2).

    Returns:
        A compiled Keras ``Model`` using the Adam optimizer, categorical
        cross-entropy loss and accuracy as the tracked metric.
    """
    # Input layer.
    input_layer = layers.Input((input_size, ), sparse=True)

    # Hidden layer followed by dropout for regularisation.
    hidden_layer = layers.Dense(hidden_units, activation="relu")(input_layer)
    hidden_layer = layers.Dropout(dropout_rate)(hidden_layer)

    # Output layer. Softmax (not sigmoid) is used because the model is trained
    # with categorical cross-entropy, which expects the outputs to form a
    # single probability distribution over the classes.
    output_layer = layers.Dense(num_classes, activation="softmax")(hidden_layer)

    # Assemble and compile the network.
    classifier = models.Model(inputs=input_layer, outputs=output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return classifier

# Train a compiled Keras classifier, report its metrics and optionally persist model and results.
def train_model_neural_network(classifier, train_x, train_y, valid_x, valid_y, test_x, test_y, save = False, nome_arquivo = None, epochs=1, nome_modelo = None):
    """Fit ``classifier``, print its evaluation and optionally save model and metrics.

    Targets are one-hot encoded with ``pd.get_dummies``. Training uses early
    stopping on ``val_loss`` plus a checkpoint file that is reloaded after
    ``fit``, so the evaluation runs on the checkpointed weights. Class indices
    are mapped back to names through the module-level ``labels`` (which must
    support indexing with an integer array).

    Args:
        classifier: Compiled Keras model.
        train_x, train_y: Training features and targets.
        valid_x, valid_y: Validation features and targets (used during ``fit``
            and for the printed validation loss/accuracy).
        test_x, test_y: Test features and targets used for all reported metrics.
        save: When true, the trained model is written with ``classifier.save``.
        nome_arquivo: Target path for ``save``; defaults to the model class name.
        epochs: Maximum number of training epochs.
        nome_modelo: When given, a metrics row with this name is appended to
            the results CSV.

    Returns:
        None.
    """
    # One-hot encode the targets of the three splits.
    train_y_dummies = pd.get_dummies(train_y).values
    valid_y_dummies = pd.get_dummies(valid_y).values
    test_y_dummies = pd.get_dummies(test_y).values
    valid = (valid_x, valid_y_dummies)

    checkpoint_filepath = 'tmp\\weights.hdf5'
    cbks = [
        # Stop once val_loss has not improved by at least 1e-4 for 5 epochs.
        callbacks.EarlyStopping(
            monitor="val_loss",
            min_delta=1e-4,
            patience=5,
            verbose=1,
        ),
        # Only overwrite the checkpoint when the monitored value improves.
        callbacks.ModelCheckpoint(filepath=checkpoint_filepath, verbose=1, save_best_only=True),
    ]

    # Train, then restore the checkpointed weights before evaluating.
    classifier.fit(train_x, train_y_dummies, epochs=epochs, validation_data=valid, callbacks=cbks)
    classifier.load_weights(checkpoint_filepath)

    # Predict on the test split and print the confusion matrix and report.
    predictions = classifier.predict(test_x)
    print("\n")
    confusionMatrix_neural_networks(predictions, test_y_dummies)
    print("\n")
    classificationReport_neural_networks(predictions, test_y_dummies)

    # Map argmax indices back to label names.
    test_labels=labels[test_y_dummies.argmax(1)]
    predictions_labels=labels[predictions.argmax(1)]

    # evaluate() returns [loss, accuracy] for the validation split.
    accuracy = classifier.evaluate(valid_x, valid_y_dummies)
    print('Valid set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accuracy[0], accuracy[1]))
    print("Test set:")
    kappa = metrics.cohen_kappa_score(test_labels, predictions_labels)
    print("Kappa score: {:.3f}\n".format(kappa))
    acc = metrics.accuracy_score(test_labels, predictions_labels)
    print("Accuracy score: {:.3f}\n".format(acc))
    # The F1 is support-weighted, so the printed label says "weighted"
    # (it used to claim "macro" while computing average='weighted').
    f1 = metrics.f1_score(test_labels, predictions_labels, average='weighted')
    print("f1 weighted score: {:.3f}\n".format(f1))
    acc_bal = metrics.balanced_accuracy_score(test_labels, predictions_labels)
    print("Balanced Accuracy score: {:.3f}\n".format(acc_bal))
    # NOTE(review): the AUC is computed from hard class indices, not from
    # predicted scores; kept unchanged so logged values stay comparable.
    roc = metrics.roc_auc_score(test_y, predictions.argmax(1))
    print("Area under the ROC curve: {:.3f}\n".format(roc))
    rec = metrics.recall_score(test_labels, predictions_labels, pos_label = 'C', average='binary')
    print("Recall classe C: {:.3f}\n".format(rec))
    prec = metrics.precision_score(test_labels, predictions_labels, pos_label = 'C', average='binary')
    print("Precision classe C: {:.3f}\n".format(prec))

    # Persist the trained model when requested.
    if save:
        if not nome_arquivo:
            nome_arquivo = type(classifier).__name__
        classifier.save(nome_arquivo)

    # Append one metrics row to the shared results file.
    if nome_modelo:
        dateTimeObj = datetime.now()
        campos = [
            nome_modelo, str(test_x.shape[0]), str(acc), str(kappa), str(f1),
            str(acc_bal), str(roc), str(rec), str(prec),
            dateTimeObj.strftime("%Y-%m-%d"),
        ]
        with open("Classificação de tweets\\resultados-classificacao.csv", "a") as myfile:
            myfile.write(",".join(campos)+"\n")

    return
# Print the confusion matrix of a neural-network prediction.
def confusionMatrix_neural_networks(predictions, y_dummies):
    """Print a labelled confusion matrix.

    Both arguments are 2-D arrays with one column per class; the column with
    the highest value in each row selects the class name from the
    module-level ``labels``.
    """
    true_names = labels[y_dummies.argmax(1)]
    predicted_names = labels[predictions.argmax(1)]
    counts = metrics.confusion_matrix(y_true=true_names, y_pred=predicted_names)
    # Wrap the counts in a DataFrame so rows and columns carry the class names.
    print(pd.DataFrame(counts, index=labels, columns=labels))
    return
# Print the classification report of a neural-network prediction.
def classificationReport_neural_networks(predictions, y_dummies):
    """Print sklearn's classification report for one-column-per-class arrays.

    The arg-max column of each row is translated to a class name through the
    module-level ``labels``, which also provides the report's target names.
    """
    true_names = labels[y_dummies.argmax(1)]
    predicted_names = labels[predictions.argmax(1)]
    report = metrics.classification_report(y_true=true_names, y_pred=predicted_names, target_names=labels)
    print(report)
    return

## 3.1 Rede neural simples 

Esse treino utiliza as matrizes Termo-Documento como dataset

In [None]:
# SIMPLE (FEED-FORWARD) NEURAL NETWORK
nome = "REDE NEURAL SIMPLES"
nome2 = "NN"

# One row per experiment:
# (title suffix, results-name suffix, train X, train y, validation X, test X, needs .todense())
_experimentos_nn = (
    (" - COUNT VECTORS", "-count", X_train_count, y_train, X_valid_count, X_test_count, True),
    (" - COUNT VECTORS Upsampling", "-count-UP", X_train_count_UP, y_train_UP, X_valid_count, X_test_count, True),
    (" - COUNT VECTORS Downsampling", "-count-SUB", X_train_count_SUB, y_train_SUB, X_valid_count, X_test_count, True),
    (" - BINARY VECTORS", "-binary", X_train_binary, y_train, X_valid_binary, X_test_binary, True),
    (" - BINARY VECTORS Upsampling", "-binary-UP", X_train_binary_UP, y_train_UP, X_valid_binary, X_test_binary, True),
    (" - BINARY VECTORS Downsampling", "-binary-SUB", X_train_binary_SUB, y_train_SUB, X_valid_binary, X_test_binary, True),
    (" - TF-IDF VECTORS", "-tfidf", X_train_tfidf, y_train, X_valid_tfidf, X_test_tfidf, True),
    (" - TF-IDF VECTORS Upsampling", "-tfidf-UP", X_train_tfidf_UP, y_train_UP, X_valid_tfidf, X_test_tfidf, True),
    (" - TF-IDF VECTORS Downsampling", "-tfidf-SUB", X_train_tfidf_SUB, y_train_SUB, X_valid_tfidf, X_test_tfidf, True),
    (" - FASTTEXT VECTORS", "-fasttext", X_train_fasttext, y_train, X_valid_fasttext, X_test_fasttext, False),
    (" - FASTTEXT VECTORS Upsampling", "-fasttext-UP", X_train_fasttext_UP, y_train_UP, X_valid_fasttext, X_test_fasttext, False),
    (" - FASTTEXT VECTORS Downsampling", "-fasttext-SUB", X_train_fasttext_SUB, y_train_SUB, X_valid_fasttext, X_test_fasttext, False),
)

for _titulo, _sufixo, _X_tr, _y_tr, _X_va, _X_te, _densificar in _experimentos_nn:
    print("\n", nome, _titulo)
    # A fresh network per experiment, sized to the width of the training matrix.
    classifier = create_model_architecture(_X_tr.shape[1])
    if _densificar:
        # The term-document matrices are converted with .todense() before training.
        _X_tr, _X_va, _X_te = (_m.todense() for _m in (_X_tr, _X_va, _X_te))
    train_model_neural_network(classifier, _X_tr, _y_tr, _X_va, y_valid, _X_te, y_test, epochs=50, nome_modelo=nome2 + _sufixo)


## 3.2 Rede Neural Convolucional

In [None]:
# CONVOLUTIONAL NEURAL NETWORK


def create_cnn():
    """Build and compile the convolutional classifier.

    Input of shape ``(MAX_NB_WORDS,)`` -> frozen embedding initialised from
    ``embedding_matrix`` -> spatial dropout -> 1-D convolution (100 filters,
    window 3) -> global max pooling -> 50-unit ReLU layer with dropout ->
    2-unit sigmoid output. ``MAX_NB_WORDS``, ``word_index``, ``EMBEDDING_DIM``
    and ``embedding_matrix`` are read from module scope.

    Returns:
        The compiled Keras model (Adam, categorical cross-entropy, accuracy).
    """
    sequence_input = layers.Input((MAX_NB_WORDS, ))

    # Layers in the order they are applied to the input.
    stack = [
        layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
        layers.SpatialDropout1D(0.3),
        layers.Convolution1D(100, 3, activation="relu"),
        layers.GlobalMaxPool1D(),
        layers.Dense(50, activation="relu"),
        layers.Dropout(0.25),
        # A softmax output was tried before; the sigmoid variant is the active one.
        layers.Dense(2, activation="sigmoid"),
    ]
    tensor = sequence_input
    for layer in stack:
        tensor = layer(tensor)

    model = models.Model(inputs=sequence_input, outputs=tensor)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
nome = "REDE NEURAL CONVOLUCIONAL"
nome2 = "CNN"

# (title suffix, results-name suffix, training sequences, training targets)
for _titulo, _sufixo, _X_tr, _y_tr in (
    ("", "", X_train_seq, y_train),
    (" Upsampling", "-UP", X_train_seq_UP, y_train_UP),
    (" Downsampling", "-SUB", X_train_seq_SUB, y_train_SUB),
):
    print("\n"+nome+_titulo)
    # A fresh model per variant; validation and test splits are shared.
    classifier = create_cnn()
    train_model_neural_network(classifier, _X_tr, _y_tr, X_valid_seq, y_valid, X_test_seq, y_test, epochs=50, nome_modelo=nome2+_sufixo)


## 3.3 Rede Neural Recorrente

In [None]:
# RECURRENT NEURAL NETWORK (SimpleRNN)

def create_rnn():
    """Build and compile the plain recurrent classifier.

    Input of shape ``(MAX_NB_WORDS,)`` -> frozen embedding initialised from
    ``embedding_matrix`` -> spatial dropout -> ``SimpleRNN`` with 100 units ->
    50-unit ReLU layer with dropout -> 2-unit sigmoid output. The embedding
    settings come from module-level names.

    Returns:
        The compiled Keras model (Adam, categorical cross-entropy, accuracy).
    """
    sequence_input = layers.Input((MAX_NB_WORDS, ))

    # Layers in the order they are applied to the input.
    stack = [
        layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
        layers.SpatialDropout1D(0.3),
        layers.SimpleRNN(100),
        layers.Dense(50, activation="relu"),
        layers.Dropout(0.25),
        # A softmax output was tried before; the sigmoid variant is the active one.
        layers.Dense(2, activation="sigmoid"),
    ]
    tensor = sequence_input
    for layer in stack:
        tensor = layer(tensor)

    model = models.Model(inputs=sequence_input, outputs=tensor)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:

nome = "REDE NEURAL RECORRENTE"
nome2 = "RNN"

# (title suffix, results-name suffix, training sequences, training targets)
for _titulo, _sufixo, _X_tr, _y_tr in (
    ("", "", X_train_seq, y_train),
    (" Upsampling", "-UP", X_train_seq_UP, y_train_UP),
    (" Downsampling", "-SUB", X_train_seq_SUB, y_train_SUB),
):
    print("\n"+nome+_titulo)
    # A fresh model per variant; validation and test splits are shared.
    classifier = create_rnn()
    train_model_neural_network(classifier, _X_tr, _y_tr, X_valid_seq, y_valid, X_test_seq, y_test, epochs=50, nome_modelo=nome2+_sufixo)


## 3.4 Rede Neural Recorrente - LSTM 

In [None]:
# RECURRENT NEURAL NETWORK - LSTM

def create_rnn_lstm():
    """Build and compile the LSTM classifier.

    Input of shape ``(MAX_NB_WORDS,)`` -> frozen embedding initialised from
    ``embedding_matrix`` -> spatial dropout -> ``LSTM`` with 100 units ->
    50-unit ReLU layer with dropout -> 2-unit sigmoid output. The embedding
    settings come from module-level names.

    Returns:
        The compiled Keras model (Adam, categorical cross-entropy, accuracy).
    """
    sequence_input = layers.Input((MAX_NB_WORDS, ))

    # Layers in the order they are applied to the input.
    stack = [
        layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
        layers.SpatialDropout1D(0.3),
        layers.LSTM(100),
        layers.Dense(50, activation="relu"),
        layers.Dropout(0.25),
        # A softmax output was tried before; the sigmoid variant is the active one.
        layers.Dense(2, activation="sigmoid"),
    ]
    tensor = sequence_input
    for layer in stack:
        tensor = layer(tensor)

    model = models.Model(inputs=sequence_input, outputs=tensor)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:

nome = "REDE NEURAL RECORRENTE - LSTM"
nome2 = "LSTM"

# (title suffix, results-name suffix, training sequences, training targets)
for _titulo, _sufixo, _X_tr, _y_tr in (
    ("", "", X_train_seq, y_train),
    (" Upsampling", "-UP", X_train_seq_UP, y_train_UP),
    (" Downsampling", "-SUB", X_train_seq_SUB, y_train_SUB),
):
    print("\n"+nome+_titulo)
    # A fresh model per variant; validation and test splits are shared.
    classifier = create_rnn_lstm()
    train_model_neural_network(classifier, _X_tr, _y_tr, X_valid_seq, y_valid, X_test_seq, y_test, epochs=50, nome_modelo=nome2+_sufixo)


## 3.5 Rede Neural Recorrente - GRU 

In [None]:
# RECURRENT NEURAL NETWORK - GRU

def create_rnn_gru():
    """Build and compile the GRU classifier.

    Input of shape ``(MAX_NB_WORDS,)`` -> frozen embedding initialised from
    ``embedding_matrix`` -> spatial dropout -> ``GRU`` with 100 units ->
    50-unit ReLU layer with dropout -> 2-unit sigmoid output. The embedding
    settings come from module-level names.

    Returns:
        The compiled Keras model (Adam, categorical cross-entropy, accuracy).
    """
    sequence_input = layers.Input((MAX_NB_WORDS, ))

    # Layers in the order they are applied to the input.
    stack = [
        layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
        layers.SpatialDropout1D(0.3),
        layers.GRU(100),
        layers.Dense(50, activation="relu"),
        layers.Dropout(0.25),
        # A softmax output was tried before; the sigmoid variant is the active one.
        layers.Dense(2, activation="sigmoid"),
    ]
    tensor = sequence_input
    for layer in stack:
        tensor = layer(tensor)

    model = models.Model(inputs=sequence_input, outputs=tensor)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:

nome = "REDE NEURAL RECORRENTE - GRU"
nome2 = "GRU"

# (title suffix, results-name suffix, training sequences, training targets)
for _titulo, _sufixo, _X_tr, _y_tr in (
    ("", "", X_train_seq, y_train),
    (" Upsampling", "-UP", X_train_seq_UP, y_train_UP),
    (" Downsampling", "-SUB", X_train_seq_SUB, y_train_SUB),
):
    print("\n"+nome+_titulo)
    # A fresh model per variant; validation and test splits are shared.
    classifier = create_rnn_gru()
    train_model_neural_network(classifier, _X_tr, _y_tr, X_valid_seq, y_valid, X_test_seq, y_test, epochs=50, nome_modelo=nome2+_sufixo)


## 3.6 Rede Neural Recorrente Bidirecional

In [None]:
# BIDIRECTIONAL RECURRENT NEURAL NETWORK

def create_bidirectional_rnn():
    """Build and compile the bidirectional GRU classifier.

    Input of shape ``(MAX_NB_WORDS,)`` -> frozen embedding initialised from
    ``embedding_matrix`` -> spatial dropout -> ``Bidirectional(GRU(100))`` ->
    50-unit ReLU layer with dropout -> 2-unit sigmoid output. The embedding
    settings come from module-level names.

    Returns:
        The compiled Keras model (Adam, categorical cross-entropy, accuracy).
    """
    sequence_input = layers.Input((MAX_NB_WORDS, ))

    # Layers in the order they are applied to the input.
    stack = [
        layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
        layers.SpatialDropout1D(0.3),
        layers.Bidirectional(layers.GRU(100)),
        layers.Dense(50, activation="relu"),
        layers.Dropout(0.25),
        # A softmax output was tried before; the sigmoid variant is the active one.
        layers.Dense(2, activation="sigmoid"),
    ]
    tensor = sequence_input
    for layer in stack:
        tensor = layer(tensor)

    model = models.Model(inputs=sequence_input, outputs=tensor)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:

nome = "REDE NEURAL RECORRENTE - BIDIRECIONAL"
nome2 = "RNN-Bidirectional"

# (title suffix, results-name suffix, training sequences, training targets)
for _titulo, _sufixo, _X_tr, _y_tr in (
    ("", "", X_train_seq, y_train),
    (" Upsampling", "-UP", X_train_seq_UP, y_train_UP),
    (" Downsampling", "-SUB", X_train_seq_SUB, y_train_SUB),
):
    print("\n"+nome+_titulo)
    # A fresh model per variant; validation and test splits are shared.
    classifier = create_bidirectional_rnn()
    train_model_neural_network(classifier, _X_tr, _y_tr, X_valid_seq, y_valid, X_test_seq, y_test, epochs=50, nome_modelo=nome2+_sufixo)


## 3.7 Rede Neural Recorrente Convolucional 

In [None]:
# RECURRENT CONVOLUTIONAL NEURAL NETWORK

def create_rcnn():
    """Build and compile the recurrent-convolutional classifier.

    Input of shape ``(MAX_NB_WORDS,)`` -> frozen embedding initialised from
    ``embedding_matrix`` -> spatial dropout -> bidirectional ``GRU(50)``
    returning full sequences -> 1-D convolution (100 filters, window 3) ->
    global max pooling -> 50-unit ReLU layer with dropout -> 2-unit sigmoid
    output. The embedding settings come from module-level names.

    Returns:
        The compiled Keras model (Adam, categorical cross-entropy, accuracy).
    """
    sequence_input = layers.Input((MAX_NB_WORDS, ))

    # Layers in the order they are applied to the input.
    stack = [
        layers.Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
        layers.SpatialDropout1D(0.3),
        # return_sequences=True keeps one output per time step for the convolution.
        layers.Bidirectional(layers.GRU(50, return_sequences=True)),
        layers.Convolution1D(100, 3, activation="relu"),
        layers.GlobalMaxPool1D(),
        layers.Dense(50, activation="relu"),
        layers.Dropout(0.25),
        # A softmax output was tried before; the sigmoid variant is the active one.
        layers.Dense(2, activation="sigmoid"),
    ]
    tensor = sequence_input
    for layer in stack:
        tensor = layer(tensor)

    model = models.Model(inputs=sequence_input, outputs=tensor)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy',metrics=['accuracy'])

    return model

In [None]:


nome = "REDE NEURAL RECORRENTE CONVOLUCIONAL"
nome2 = "RCNN"

# (title suffix, results-name suffix, training sequences, training targets)
for _titulo, _sufixo, _X_tr, _y_tr in (
    ("", "", X_train_seq, y_train),
    (" Upsampling", "-UP", X_train_seq_UP, y_train_UP),
    (" Downsampling", "-SUB", X_train_seq_SUB, y_train_SUB),
):
    print("\n"+nome+_titulo)
    # A fresh model per variant; validation and test splits are shared.
    classifier = create_rcnn()
    train_model_neural_network(classifier, _X_tr, _y_tr, X_valid_seq, y_valid, X_test_seq, y_test, epochs=50, nome_modelo=nome2+_sufixo)


# 4. Lexicon

In [None]:


from datetime import datetime

# `df` is loaded at the start of the notebook; it could be reloaded here with
# pd.read_pickle('dados\\df_processado.pkl').
# Keep only rows whose manual label contains N, E, S or C. Missing labels are
# replaced by the string 'nan' (which matches none of them), so rows without an
# annotation or labelled only D (delete) are dropped.
df2 = df.loc[df['sent_manual'].fillna('nan').str.contains('N|E|S|C')].copy()

In [None]:
def corrige_label(label):
    """Collapse the labels 'S' and 'E' into 'N'; return any other value unchanged."""
    return 'N' if label in ('S', 'E') else label

In [None]:
# Collapse the manual labels S and E into N (see corrige_label).
df2['sent_manual'] = df2['sent_manual'].apply(corrige_label)
# NOTE(review): this expression is not the last statement of the cell, so a
# notebook would not display its result.
df2['sent_manual'].value_counts()
# NOTE(review): the three lists below are built from `df` (all rows), not from
# the filtered `df2` - confirm this is intended.
lista_index = df.index.values.copy()
lista_texto = df.tweet_limpo.to_list().copy()
lista_label = df.sent_manual.to_list().copy()



# For every lexicon: a score column is computed with the corresponding
# utils.*_Absolute_Proportional_Difference function, and utils.polaridade2 then
# derives the column that is later compared against 'sent_manual'.
df2['sent_oplexicon3_ABP'] = df2['tweet_limpo'].apply(utils.oplexicon3_Absolute_Proportional_Difference)
df2['oplexicon3'] = df2['sent_oplexicon3_ABP'].apply(utils.polaridade2)


# WordNetAffectBR is scored on the output of utils.retorna_lemas applied to
# 'tweet_para_traducao' instead of on 'tweet_limpo'.
df2['tweet_lema'] = df2['tweet_para_traducao'].apply(utils.retorna_lemas)
df2['sent_wordnetaffectbr_ABP'] = df2['tweet_lema'].apply(utils.wordnetaffectbr_Absolute_Proportional_Difference)
df2['wordnetaffectbr'] = df2['sent_wordnetaffectbr_ABP'].apply(utils.polaridade2)

df2['sent_LIWC_ABP'] = df2['tweet_limpo'].apply(utils.LIWC_Absolute_Proportional_Difference)
df2['LIWC'] = df2['sent_LIWC_ABP'].apply(utils.polaridade2)

df2['sent_SentiLex_ABP'] = df2['tweet_limpo'].apply(utils.SentiLex_Absolute_Proportional_Difference)
df2['SentiLex'] = df2['sent_SentiLex_ABP'].apply(utils.polaridade2)

df2['sent_Dicionario_conjunto'] = df2['tweet_limpo'].apply(utils.Dicionario_conjunto_Absolute_Proportional_Difference)
df2['dic_conjunto'] = df2['sent_Dicionario_conjunto'].apply(utils.polaridade2)

# The ensemble output goes through corrige_label, i.e. S and E become N.
df2['ensemble_portugues'] = df2['tweet_limpo'].apply(utils.ensemble_portugues).apply(corrige_label)


In [None]:
def comparaResultados(serie, nome_modelo):
    """Print evaluation metrics of lexicon predictions and log them to the results CSV.

    The predictions are compared against ``df2['sent_manual']`` (module-level
    DataFrame). Recall, precision and ROC AUC treat 'C' as the positive class.

    Args:
        serie: Predicted labels, aligned with ``df2['sent_manual']``.
        nome_modelo: Name written in the first CSV column; when falsy no row
            is appended.

    Returns:
        None.
    """
    y_true = df2['sent_manual']

    confusionMatrix(serie)
    classificationReport(serie)
    kappa = metrics.cohen_kappa_score(y_true,serie)
    print("Kappa score: ",kappa,"\n")
    acc = metrics.accuracy_score(y_true,serie)
    print("Accuracy score: ", acc,"\n")
    # The F1 is support-weighted, so the printed label says "weighted"
    # (it used to claim "micro" while computing average='weighted').
    f1 = metrics.f1_score(y_true=y_true,y_pred=serie, average='weighted')
    print("F1 weighted score: ", f1,"\n")
    acc_bal = metrics.balanced_accuracy_score(y_true,serie)
    print("Balanced Accuracy score: ", acc_bal,"\n")
    # Binary indicators ("is class C") on both sides, i.e. AUC of hard predictions.
    roc = metrics.roc_auc_score(y_true=='C',serie=='C')
    print("Area under the ROC curve: {:.3f}\n".format(roc))
    rec = metrics.recall_score(y_true,serie, pos_label = 'C', average='binary')
    print("Recall classe C: {:.3f}\n".format(rec))
    prec = metrics.precision_score(y_true,serie, pos_label = 'C', average='binary')
    print("Precision classe C: {:.3f}\n".format(prec))

    # Append one metrics row to the shared results file.
    if nome_modelo:
        dateTimeObj = datetime.now()
        campos = [
            nome_modelo, str(serie.shape[0]), str(acc), str(kappa), str(f1),
            str(acc_bal), str(roc), str(rec), str(prec),
            dateTimeObj.strftime("%Y-%m-%d"),
        ]
        with open("Classificação de tweets\\resultados-classificacao.csv", "a") as myfile:
            myfile.write(",".join(campos)+"\n")
    return


def confusionMatrix(predictions):
    """Print the confusion matrix of ``predictions`` against ``df2['sent_manual']``.

    Rows and columns of the printed table are labelled 'C' and 'N'.
    """
    classes = ['C', 'N']
    counts = metrics.confusion_matrix(y_true=df2['sent_manual'], y_pred=predictions)
    # A labelled DataFrame prints more readably than the raw array.
    print(pd.DataFrame(counts, index=classes, columns=classes))
    return

# Print a classification report for the predictions made by a model.
def classificationReport(predictions):
    """Print sklearn's classification report against ``df2['sent_manual']``.

    The report rows are named 'C' and 'N'.
    """
    report = metrics.classification_report(y_true=df2['sent_manual'], y_pred=predictions, target_names=['C','N'])
    print(report)
    return

In [None]:
# (printed header, df2 column holding the predictions, name logged in the results CSV)
for _cabecalho, _coluna, _nome_lex in (
    ('\noplexicon3:', 'oplexicon3', 'lex-oplexicon3'),
    ('\nwordnetaffectbr:', 'wordnetaffectbr', 'lex-wordnetaffectbr'),
    ('\nLIWC:', 'LIWC', 'lex-LIWC'),
    ('\nSentiLex:', 'SentiLex', 'lex-SentiLex'),
    ('\nDicionário Conjunto:', 'dic_conjunto', 'lex-dic_conjunto'),
    ('\nEnsemble Português:', 'ensemble_portugues', 'lex-ensemble_portugues'),
):
    print(_cabecalho)
    comparaResultados(df2[_coluna], _nome_lex)


In [None]:
# Distinct values present in the ensemble prediction column.
df2['ensemble_portugues'].unique()

# 5. Modelo SVM com todos os dados

In [None]:
# Encode the manual labels of every row of `df` with the encoder fitted earlier
# in the notebook.
y_labels = df.sent_manual.to_list()
y = encoder.transform(y_labels)
print(len(corpus))

# Remove stop words from every document, updating `corpus` in place.
for i, texto in enumerate(corpus):
    words_new = [w for w in texto.split(" ") if w not in mystopwords]
    corpus[i] = ' '.join(words_new)

# TF-IDF over single words, fitted on the whole (stop-word free) corpus.
tfidf_vect2 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                            ngram_range=(1, 1), max_features = max_tokens)
tfidf_vect2.fit(corpus)
X_tfidf =  tfidf_vect2.transform(corpus)
# Persist the vocabulary; the context manager guarantees the file is closed
# (and therefore fully flushed) once the dump finishes.
with open("dados/tfidf-vocab-full", 'wb') as vocab_file:
    pickle.dump(tfidf_vect2.vocabulary_, vocab_file)

classifier = svm.SVC(probability=True)
# C must be strictly positive for SVC: the former 0.0 candidate could only
# produce failed fits, so it is left out of the grid.
parameters = {'C': (0.5, 1.0),
               'kernel':('linear','poly','rbf','sigmoid'),
               'class_weight' : ('balanced',None)}

if (__name__ == "__main__"):
    # 5-fold grid search over the SVM hyper-parameters using all CPU cores.
    grid_search = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=0,cv=5)
    t0 = time()
    grid_search.fit(X_tfidf, y)
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Keep the estimator refitted with the best parameter combination.
    classifier = grid_search.best_estimator_

# Persist whichever classifier is bound at this point (the tuned one when the
# grid search above ran).
joblib.dump(classifier, "dados/class-SVM-tfidf-full")