In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import time

import nltk
from nltk.tokenize import TweetTokenizer

from emoji.unicode_codes import UNICODE_EMOJI

from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from gensim.models import word2vec

In [8]:
alysson = pd.read_csv('MARCADOS_CONSISTENCIA/Alysson.csv', sep=';', encoding='utf-8')
alysson.columns = ['id', 'tweet', 'candidato', 'marcador', 'classe']
alysson = alysson[alysson.marcador == 'alysson']
alysson = alysson.drop_duplicates()

raul = pd.read_csv('MARCADOS_CONSISTENCIA/Raul.csv', sep=';', encoding='utf-8')
raul.columns = ['id', 'tweet', 'candidato', 'marcador', 'classe']
raul = raul[raul.marcador == 'raul']
raul = raul.drop_duplicates()

dalai = pd.read_csv('MARCADOS_CONSISTENCIA/Dalai.csv', sep=';', encoding='utf-8')
dalai.columns = ['id', 'tweet', 'candidato', 'marcador', 'classe']
dalai = dalai[dalai.marcador == 'dalai']
dalai = dalai.drop_duplicates()

romulo = pd.read_csv('MARCADOS_CONSISTENCIA/Romulo.csv', sep=';', encoding='utf-8')
romulo.columns = ['id', 'tweet', 'candidato', 'marcador', 'classe']
romulo = romulo[romulo.marcador == 'romulo']
romulo = romulo.drop_duplicates()

In [12]:
data = pd.concat([alysson,raul,dalai, romulo])

In [13]:
data.shape

(1412, 5)

In [15]:
data = data[data['classe'] != 'Não sei']
data['classe'][data['classe'] == 'Rejeição'] = 'REJEICAO'
data['classe'][data['classe'] == 'Neutro'] = 'NEUTRO'
data['classe'][data['classe'] == 'Aprovação'] = 'APROVACAO'
data = data.dropna()

In [19]:
data.shape

(1151, 5)

In [16]:
def preProcessing(twitterText):
    #Remover \n
    twitterText = re.sub("\n+"," ",twitterText)

    #Remover multiplos espaços
    twitterText = re.sub(" +"," ",twitterText)
    
    #(@usuário) pelo termo ’AT_USER’ tal como sugerido em [Almatrafi et al., 2015].
    twitterText = re.sub("@\w+","atuser",twitterText)

    #Remove links
    twitterText = re.sub(r"http\S+", "",twitterText)

    #Remover caracteres especiais
    twitterText = re.sub("[@|#|“|”|’|‘|®|,|!|?||\[|\]|\.|\"|%|:|\-|_|/|ª|\(|\)|°|\*|🇧|🇷|\'|️|=]",'',twitterText)

    #Remover números
    twitterText = re.sub("[0-9]+",'',twitterText)

    #Tokenize
    twitterTokens = TweetTokenizer().tokenize(twitterText)

    #transforme emojis em textcode
    twitterTokensEmojisCode = []
    for token in twitterTokens:
        if(token in UNICODE_EMOJI):
            twitterTokensEmojisCode.append(UNICODE_EMOJI[token])
        else:
            twitterTokensEmojisCode.append(token)
    twitterTokens = twitterTokensEmojisCode

    #remove stopwords
    stopwords = nltk.corpus.stopwords.words('portuguese')
    stopwords.remove("não")
    stopwords.remove("num")
    twitterTokens = [token for token in twitterTokens if (token not in stopwords) ]
    
    #Lower case
    twitterText = "".join(twitterText)
    twitterText = twitterText.lower()

    return twitterText

In [17]:
data['tweet'] = data['tweet'].apply(lambda x : preProcessing(x))

In [18]:
data.head(10)

Unnamed: 0,id,tweet,candidato,marcador,classe
2,twe984802485360582656,ao invés dos petistas estarem buscando livrar ...,alckmin,alysson,REJEICAO
3,twe977572021361168389,atuser atuser atuser atuser o problema caio é ...,manuela,alysson,NEUTRO
4,twe977558211447443457,a pergunta é séria atuser quer me pagar logo o...,manuela,alysson,REJEICAO
5,twe977347407011897345,disparado,bolsonaro,alysson,NEUTRO
6,twe984494027956543488,o que acontece agora que o inquérito de alckmi...,alckmin,alysson,NEUTRO
7,twe981677295676084224,aécio é flagrado pedindo grana a empresário ...,temer,alysson,REJEICAO
8,twe978200570740822017,se a bunda de algum ministro sentar sobre o pr...,lula,alysson,REJEICAO
9,twe982682347484180482,hoje não há lado certo ou lado errado lula é f...,temer,alysson,NEUTRO
10,twe984980191066849280,decisões do stf e do stj de encaminhar process...,alckmin,alysson,NEUTRO
11,twe983555366133944320,para os que ainda não sabem a précandidata à p...,marina,alysson,NEUTRO


In [50]:
#Bag of words
def featureextractionBOW(X):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(X)
    #Redução de dimensionalidade Truncated SVD (PCA para matrizes espaças)
    svd = TruncatedSVD(n_components=300)
    X_truncated = svd.fit_transform(X)
    return X_truncated

#TFIDF
def featureextractionTFIDF(X):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(X)
    tfidf = TfidfTransformer()
    X = tfidf.fit_transform(X)
    #Redução de dimensionalidade Truncated SVD (PCA para matrizes espaças)
    svd = TruncatedSVD(n_components=300)
    X_truncated = svd.fit_transform(X)
    return X_truncated

class FeatureGeneratorMedia:
    def __init__(self, X, w2vmodel, num_features):
        self.X = X
        self.w2vmodel = w2vmodel
        self.num_features = num_features
        self.features_vec = None

    def gen_features_dataset(self):
        self.X = self.X.apply(lambda text: TweetTokenizer().tokenize(text) )
        X_array = []
        self.X.apply(lambda listText: X_array.append(self.make_features_vec(listText)) )
        
        return np.matrix(X_array)
        
    def make_features_vec(self, tweet):
        featureVec = np.zeros(self.num_features)
        nwords = 0.0
        index2word_set = set(self.w2vmodel.wv.index2word)
        for word in tweet:
            if word in index2word_set:
                featureVec = np.add(featureVec, self.w2vmodel[word])
                nwords += 1
        if nwords == 0.0:
            nwords = 1.0
        return np.divide(featureVec, nwords)

def featureextractionWord2VecMean(X):
    num_features=300
    model = word2vec.Word2Vec.load("/home/alysson/Documents/Tweets/word2vec_files/tweets_presidential_elections_300_min1_cont2_cbow")
    featureGeneratorMedia = FeatureGeneratorMedia(X,model,num_features)
    return featureGeneratorMedia.gen_features_dataset()

In [51]:
X = data['tweet']
Y = data['classe']

featureExtractionList = [("BOW",featureextractionBOW),("TFIDF",featureextractionTFIDF),("W2Vmean",featureextractionWord2VecMean)]

modelsName = ['XGBoost', 'GradientBoosting', 'kNN']
listModels = [XGBClassifier(),GradientBoostingClassifier(),KNeighborsClassifier()]

In [52]:
it = 1
for extractionName,funcFeatureExtraction in featureExtractionList:
    X_features = funcFeatureExtraction(X)
    
    # train test split (sempre mesmo split)
    X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y, test_size=0.3, random_state=1)
    
    for nameModel,model in zip(modelsName,listModels):
        configuration = extractionName+"+"+nameModel
        print("Configuration # "+str(it)+": "+configuration)
        
        start_time =  time.time()
        #Treina modelo
        print("inicio treino")
        model.fit(X_train,Y_train)
        print("fim treino")
        #Submeter-se dados de treino ao modelo - Teste
        Y_pred = model.predict(X_test)
        #Avalia modelo
        metricsStr = classification_report(Y_test,Y_pred)
        print(metricsStr)
        #print(precision_recall_fscore_support(Y_test,Y_pred))
        timeexec = (time.time() - start_time)
        print("Time: ",timeexec)
        print("\n")

Configuration # 1: BOW+XGBoost
inicio treino
fim treino
             precision    recall  f1-score   support

  APROVACAO       0.47      0.18      0.26        51
     NEUTRO       0.50      0.41      0.45        78
   REJEICAO       0.73      0.88      0.80       217

avg / total       0.64      0.67      0.64       346

Time:  6.867495059967041


Configuration # 1: BOW+GradientBoosting
inicio treino


  if diff:


fim treino
             precision    recall  f1-score   support

  APROVACAO       0.42      0.20      0.27        51
     NEUTRO       0.48      0.41      0.44        78
   REJEICAO       0.73      0.86      0.79       217

avg / total       0.63      0.66      0.64       346

Time:  11.093000411987305


Configuration # 1: BOW+kNN
inicio treino
fim treino
             precision    recall  f1-score   support

  APROVACAO       0.15      0.31      0.20        51
     NEUTRO       0.32      0.40      0.35        78
   REJEICAO       0.71      0.47      0.56       217

avg / total       0.54      0.43      0.46       346

Time:  0.22736024856567383


Configuration # 1: TFIDF+XGBoost
inicio treino
fim treino
             precision    recall  f1-score   support

  APROVACAO       0.42      0.10      0.16        51
     NEUTRO       0.53      0.36      0.43        78
   REJEICAO       0.71      0.92      0.80       217

avg / total       0.63      0.67      0.62       346

Time:  7.768298149

  if diff:


fim treino
             precision    recall  f1-score   support

  APROVACAO       0.32      0.12      0.17        51
     NEUTRO       0.48      0.31      0.38        78
   REJEICAO       0.70      0.90      0.79       217

avg / total       0.60      0.65      0.60       346

Time:  10.239217519760132


Configuration # 1: TFIDF+kNN
inicio treino
fim treino
             precision    recall  f1-score   support

  APROVACAO       0.23      0.33      0.27        51
     NEUTRO       0.28      0.78      0.41        78
   REJEICAO       0.94      0.22      0.35       217

avg / total       0.69      0.36      0.35       346

Time:  0.15668010711669922






Configuration # 1: W2Vmean+XGBoost
inicio treino
fim treino
             precision    recall  f1-score   support

  APROVACAO       0.32      0.12      0.17        51
     NEUTRO       0.50      0.50      0.50        78
   REJEICAO       0.75      0.86      0.80       217

avg / total       0.63      0.67      0.64       346

Time:  7.59485387802124


Configuration # 1: W2Vmean+GradientBoosting
inicio treino


  if diff:


fim treino
             precision    recall  f1-score   support

  APROVACAO       0.31      0.18      0.23        51
     NEUTRO       0.55      0.54      0.54        78
   REJEICAO       0.76      0.84      0.80       217

avg / total       0.64      0.67      0.65       346

Time:  7.96818470954895


Configuration # 1: W2Vmean+kNN
inicio treino
fim treino
             precision    recall  f1-score   support

  APROVACAO       0.28      0.25      0.27        51
     NEUTRO       0.51      0.38      0.44        78
   REJEICAO       0.74      0.82      0.78       217

avg / total       0.62      0.64      0.63       346

Time:  0.14643406867980957


