In [1]:
import re
import sys
import numpy as np
import pandas as pd
from time import time
from sklearn import metrics
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from nltk.stem import PorterStemmer
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_predict, KFold

In [8]:
class Preprocessamento(object):
    def __init__(self):
        self.all_twitter_messages = None
        self.polarity_tweets = None
        self.tweets_stemming = None
        self.palavras = []

    def read_tweets_from_file(self, dataset):
        self.all_twitter_messages = dataset['content'].values

        return self.all_twitter_messages

    def read_polarity_from_file(self, dataset):
        self.polarity_tweets = dataset['sentiment'].values

        return self.polarity_tweets

    def clean_tweets(self, tweet):
        tweet = re.sub('@(\w{1,15})\b', '', tweet)
        tweet = tweet.replace("via ", "")
        tweet = tweet.replace("RT ", "")
        tweet = tweet.lower()

        return tweet

    def clean_url(self, tweet):
        tweet = re.sub(r'(https|http)?://(\w|\.|/|\?|=|&|%)*\b', '', tweet, flags=re.MULTILINE)
        tweet = tweet.replace("http", "")
        tweet = tweet.replace("htt", "")

        return tweet

    def remove_stop_words(self, tweet):
        english_stops = set(stopwords.words('english'))

        words = [i for i in tweet.split() if not i in english_stops]

        return (" ".join(words))

    def stemming_tweets(self, tweet):
        ps = PorterStemmer()

        self.tweets_stemming = ps.stem(tweet)

        return self.tweets_stemming

In [9]:
dataset_train = pd.read_csv('hcr-train.csv')
dataset_test = pd.read_csv('hcr-test.csv')

In [10]:
dataset_train.head()

Unnamed: 0,tweet id,user id,username,content,sentiment,target,annotator id,comment,dispute
0,10237553563,69128478.0,,RT @angelsmomaw: #HCR is unwanted because it w...,negative,hcr,aluckhardt,,
1,10239984258,7713202.0,GOPLeader,RT @WMRepublicans President's Remarks Yesterda...,negative,hcr,aluckhardt,,
2,10240791063,34927577.0,cnsnews_com,RT @johnboehner: Pelosi on #HCR: ''We have to ...,negative,dems,aluckhardt,,
3,"10253203734,16930489,ExJon,""RT @vermontaigne C...",,,,,,,,
4,"10255459398,15350894,LJSearles,""RT @HealthRefo...",,,,,,,,


In [11]:
dataset_test.head()

Unnamed: 0,tweet id,user id,username,content,sentiment,target,annotator id,comment,dispute
0,10729879540,19500327.0,willmckinley,50% of FoxNews.com readers think #hcr won't pa...,negative,conservatives,acoyne,,
1,10740632762,33972168.0,FREETeaPartyArt,NEW #teaparty sign download ---- Taking back o...,positive,conservatives,supadhyay,Updated after multiple students pointed out sl...,
2,"10780349787,16082787,slackadjuster,RT @Marnus3...",,,,,,,,
3,10781009520,18407451.0,BrazenlyLiberal,RT @hippieprof: RT @loudhearted: RT @quaigee: ...,negative,conservatives,acoyne,,
4,"10727641557,16175061,kpangrace,I'm confident t...",,,,,,,,


In [12]:
class RetiraPolaridade(object):
    def __init__(self):
        self.all_tweets = None
        self.polaridade = None

    def retira_polaridade(self, tweets, polaridade):
        self.all_tweets = []
        self.polaridade = []

        for i in range(len(tweets)):
            if polaridade[i] == 'positive' or polaridade[i] == 'negative' or polaridade[i] == 'neutral':
                self.all_tweets.append(tweets[i])
                self.polaridade.append(polaridade[i])

        return self.all_tweets, self.polaridade

In [13]:
rp = RetiraPolaridade()
pre = Preprocessamento()

tweets_train = pre.read_tweets_from_file(dataset_train)
polarity_train = pre.read_polarity_from_file(dataset_train)

tweets_test = pre.read_tweets_from_file(dataset_test)
polarity_test = pre.read_polarity_from_file(dataset_test)

tweets_train, polarity_train = rp.retira_polaridade(tweets_train, polarity_train)
tweets_test, polarity_test = rp.retira_polaridade(tweets_test, polarity_test)

In [83]:
print(len(tweets_train), len(tweets_test))

(852, 327)


In [71]:
TEST_SIZE = len(tweets_test)

''' Mesclando os dados de treino com os dados de teste '''

all_tweets = []
classes = []

for tweet in tweets_train:
    all_tweets.append(tweet)

for tweet in tweets_test:
    all_tweets.append(tweet)

for classe in polarity_train:
    classes.append(classe)

for classe in polarity_test:
    classes.append(classe)

DATA_SIZE = len(all_tweets)

In [72]:
''' Preprocessamento dos dados de teste '''

for i in range(len(all_tweets)):
    all_tweets[i] = pre.clean_tweets(all_tweets[i])
    all_tweets[i] = pre.clean_url(all_tweets[i])
    all_tweets[i] = pre.remove_stop_words(all_tweets[i])
    #all_tweets[i] = pre.stemming_tweets(all_tweets[i])

In [73]:
class TfidfEmbeddingVectorizer(object):
    def __init__(self, glove):
        self.glove = glove
        self.gloveweight = None
        self.dim = len(glove.itervalues().next())

    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.gloveweight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.glove[w] * self.gloveweight[w]
                         for w in words if w in self.glove] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [74]:
def reading_glove(tweets, dim):
    if dim == 25:
        with open("glove.twitter.27B.25d.txt", "rb") as lines:
            glove = {line.split()[0]: np.array(map(float, line.split()[1:]))
                for line in lines}

    elif dim == 50:
        with open("glove.twitter.27B.50d.txt", "rb") as lines:
            glove = {line.split()[0]: np.array(map(float, line.split()[1:]))
                for line in lines}

    elif dim == 100:
        with open("glove.twitter.27B.100d.txt", "rb") as lines:
            glove = {line.split()[0]: np.array(map(float, line.split()[1:]))
                for line in lines}

    elif dim == 200:
        with open("glove.twitter.27B.200d.txt", "rb") as lines:
            glove = {line.split()[0]: np.array(map(float, line.split()[1:]))
                for line in lines}

    else:
        raise IOError("Dimensão do Word Embedding GloVe incorreta.")

    vec = TfidfEmbeddingVectorizer(glove)
    vec.fit(tweets)
    matrix = vec.transform(tweets)

    return matrix

In [75]:
from sklearn.model_selection import train_test_split

In [89]:
class CriaLexicon(object):
    def __init__(self):
        self.matriz = []

    def opinion_lexicon(self, lex_positivo, lex_negativo, all_tweets):
        for tweet in all_tweets:
            cont = [0.0 for i in range(3)]
            contPos = 0
            contNeg = 0

            for word in word_tokenize(tweet.lower()):
                if word in lex_positivo:
                    contPos += 1

                if word in lex_negativo:
                    contNeg += 1

            #print(contPos, contNeg)
            if contPos > contNeg:
                cont[0] = 1.0
            elif contNeg > contPos:
                cont[1] = 1.0
            else:
                cont[2] = 1.0

            self.matriz.append(cont)

        return self.matriz

In [90]:
pos = pd.read_csv('opinion_lexicon/positive-words.csv')
neg = pd.read_csv('opinion_lexicon/negative-words.csv')

lex = CriaLexicon()

pos = pos['pos']
neg = neg['neg']

matrix_lex = lex.opinion_lexicon(list(pos), list(neg), all_tweets)

In [93]:
matrix_lex[:5]

[[0.0, 1.0, 0.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0]]

In [76]:
# Criando modelo Bag-of-Words a partir de features do dataset
vec = CountVectorizer(binary=True)
vec.fit(all_tweets)
matrix_bow = vec.transform(all_tweets).toarray()

In [77]:
matrix_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [78]:
matrix_bow.shape

(1179, 4025)

In [80]:
matrix_embedding = reading_glove(all_tweets, 25)
matrix_embedding.shape

(1179, 25)

In [94]:
matrix = np.concatenate((matrix_embedding, matrix_bow, matrix_lex), axis=1)
matrix.shape

(1179, 4053)

In [96]:
# PREDIÇÃO COM MODELO WORD EMBEDDING DE 25 DIMENSÕES + DICIONÁRIO LÉXICO
matrix_embedding = reading_glove(all_tweets, 25)

matrix = np.concatenate((matrix_embedding, matrix_bow, matrix_lex), axis=1)

size = ((TEST_SIZE * 100) / DATA_SIZE) / 100
size = size * 0.01

X_train, X_test, y_train, y_test = train_test_split(matrix, classes, test_size=0.2)

lr = LogisticRegression()

lr.fit(X_train, y_train)

resultados = lr.predict(X_test)

sentimento = ['positive', 'negative', 'neutral']

print("Acurácia...: %.2f" %(metrics.accuracy_score(y_test,resultados) * 100))
print("Precision..: %.2f" %(metrics.precision_score(y_test,resultados,average='macro') * 100))
print("Recall.....: %.2f" %(metrics.recall_score(y_test,resultados, average='macro') * 100))
print("F1-Score...: %.2f" %(metrics.f1_score(y_test,resultados, average='macro') * 100))
#print()
print(metrics.classification_report(y_test,resultados,sentimento,digits=4))

Acurácia...: 65.68
Precision..: 63.49
Recall.....: 60.29
F1-Score...: 61.14
             precision    recall  f1-score   support

   positive     0.6190    0.5909    0.6047        66
   negative     0.6912    0.8103    0.7460       116
    neutral     0.5946    0.4074    0.4835        54

avg / total     0.6489    0.6568    0.6464       236



In [97]:
# PREDIÇÃO COM MODELO WORD EMBEDDING DE 50 DIMENSÕES + DICIONÁRIO LÉXICO
matrix_embedding = reading_glove(all_tweets, 50)

matrix = np.concatenate((matrix_embedding, matrix_bow, matrix_lex), axis=1)

size = ((TEST_SIZE * 100) / DATA_SIZE) / 100
size = size * 0.01

X_train, X_test, y_train, y_test = train_test_split(matrix, classes, test_size=0.2)

lr = LogisticRegression()

lr.fit(X_train, y_train)

resultados = lr.predict(X_test)

sentimento = ['positive', 'negative', 'neutral']

print("Acurácia...: %.2f" %(metrics.accuracy_score(y_test,resultados) * 100))
print("Precision..: %.2f" %(metrics.precision_score(y_test,resultados,average='macro') * 100))
print("Recall.....: %.2f" %(metrics.recall_score(y_test,resultados, average='macro') * 100))
print("F1-Score...: %.2f" %(metrics.f1_score(y_test,resultados, average='macro') * 100))
#print()
print(metrics.classification_report(y_test,resultados,sentimento,digits=4))

Acurácia...: 68.22
Precision..: 70.31
Recall.....: 63.34
F1-Score...: 65.34
             precision    recall  f1-score   support

   positive     0.6500    0.5821    0.6142        67
   negative     0.6713    0.8276    0.7413       116
    neutral     0.7879    0.4906    0.6047        53

avg / total     0.6914    0.6822    0.6745       236



In [99]:
# PREDIÇÃO COM MODELO WORD EMBEDDING DE 100 DIMENSÕES + DICIONÁRIO LÉXICO
matrix_embedding = reading_glove(all_tweets, 100)

matrix = np.concatenate((matrix_embedding, matrix_bow, matrix_lex), axis=1)

size = ((TEST_SIZE * 100) / DATA_SIZE) / 100
size = size * 0.01

X_train, X_test, y_train, y_test = train_test_split(matrix, classes, test_size=0.2)

lr = LogisticRegression(C=2.0)

lr.fit(X_train, y_train)

resultados = lr.predict(X_test)

sentimento = ['positive', 'negative', 'neutral']

print("Acurácia...: %.2f" %(metrics.accuracy_score(y_test,resultados) * 100))
print("Precision..: %.2f" %(metrics.precision_score(y_test,resultados,average='macro') * 100))
print("Recall.....: %.2f" %(metrics.recall_score(y_test,resultados, average='macro') * 100))
print("F1-Score...: %.2f" %(metrics.f1_score(y_test,resultados, average='macro') * 100))
#print()
print(metrics.classification_report(y_test,resultados,sentimento,digits=4))

Acurácia...: 62.29
Precision..: 58.92
Recall.....: 55.56
F1-Score...: 55.91
             precision    recall  f1-score   support

   positive     0.5556    0.5797    0.5674        69
   negative     0.6765    0.7931    0.7302       116
    neutral     0.5357    0.2941    0.3797        51

avg / total     0.6107    0.6229    0.6068       236



In [118]:
# PREDIÇÃO COM MODELO WORD EMBEDDING DE 200 DIMENSÕES + DICIONÁRIO LÉXICO
matrix_embedding = reading_glove(all_tweets, 200)

matrix = np.concatenate((matrix_embedding, matrix_lex, matrix_bow), axis=1)

size = ((TEST_SIZE * 100) / DATA_SIZE) / 100
size = size * 0.01

X_train, X_test, y_train, y_test = train_test_split(matrix, classes, test_size=0.2)

lr = LogisticRegression()

lr.fit(X_train, y_train)

resultados = lr.predict(X_test)

sentimento = ['positive', 'negative', 'neutral']

print("Acurácia...: %.2f" %(metrics.accuracy_score(y_test,resultados) * 100))
print("Precision..: %.2f" %(metrics.precision_score(y_test,resultados,average='macro') * 100))
print("Recall.....: %.2f" %(metrics.recall_score(y_test,resultados, average='macro') * 100))
print("F1-Score...: %.2f" %(metrics.f1_score(y_test,resultados, average='macro') * 100))
#print()
print(metrics.classification_report(y_test,resultados,sentimento,digits=4))

Acurácia...: 63.14
Precision..: 61.11
Recall.....: 57.59
F1-Score...: 58.48
             precision    recall  f1-score   support

   positive     0.6949    0.5125    0.5899        80
   negative     0.6383    0.7965    0.7087       113
    neutral     0.5000    0.4186    0.4557        43

avg / total     0.6323    0.6314    0.6223       236

