In [1]:
# Importando bibliotecas
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Carregamento conjunto de dados
data_path = Path("../data/raw/data.csv")

In [3]:
# Leitura do conjunto de dados
df = pd.read_csv(data_path, sep=";").sample(10)
df

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
27891,1045428393611808768,Ou seja se encomendar as yeezy vou pagar 50 eu...,Thu Sep 27 21:42:06 +0000 2018,1,:)
4376,1049284285230931969,@duducaralho @brendaccarvalho ótimo. agora vam...,Mon Oct 08 13:04:03 +0000 2018,1,:)
37430,1049282536524922881,@Dvrkskinsayan :( eu não percebo desculpa,Mon Oct 08 12:57:06 +0000 2018,0,:(
93208,1038587601542963203,Analisando #FAKE ou #FATO sobre a chamada do @...,Sun Sep 09 00:39:14 +0000 2018,2,#fato
12352,1047557531315388416,@SantoosIbra @BrunoMsbe @futtmais @PortalMessi...,Wed Oct 03 18:42:32 +0000 2018,1,:)
10153,1048423407019352065,opa solteira :) pena que es muito novinha kkk ...,Sat Oct 06 04:03:13 +0000 2018,1,:)
69076,1049305919182393345,&gt;@EstadaoPolitica Bolsonaro defende Paulo G...,Mon Oct 08 14:30:00 +0000 2018,2,estadao
14749,1047487432617316353,@019matilde Tu sabi Mozão :)),Wed Oct 03 14:03:59 +0000 2018,1,:)
39880,1049236837578018816,Eu só queria f1 antes de entrar na aula :(,Mon Oct 08 09:55:30 +0000 2018,0,:(
88643,1052566326705713152,"""InBrands em busca de um sócio"" https://t.co/A...",Wed Oct 17 14:25:42 +0000 2018,2,#trabalho


In [4]:
# Correção do fuso horário
df['tweet_date'] = pd.to_datetime(df['tweet_date'])
df['tweet_date'] = df.tweet_date.dt.tz_convert('Brazil/East')
df

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
27891,1045428393611808768,Ou seja se encomendar as yeezy vou pagar 50 eu...,2018-09-27 18:42:06-03:00,1,:)
4376,1049284285230931969,@duducaralho @brendaccarvalho ótimo. agora vam...,2018-10-08 10:04:03-03:00,1,:)
37430,1049282536524922881,@Dvrkskinsayan :( eu não percebo desculpa,2018-10-08 09:57:06-03:00,0,:(
93208,1038587601542963203,Analisando #FAKE ou #FATO sobre a chamada do @...,2018-09-08 21:39:14-03:00,2,#fato
12352,1047557531315388416,@SantoosIbra @BrunoMsbe @futtmais @PortalMessi...,2018-10-03 15:42:32-03:00,1,:)
10153,1048423407019352065,opa solteira :) pena que es muito novinha kkk ...,2018-10-06 01:03:13-03:00,1,:)
69076,1049305919182393345,&gt;@EstadaoPolitica Bolsonaro defende Paulo G...,2018-10-08 11:30:00-03:00,2,estadao
14749,1047487432617316353,@019matilde Tu sabi Mozão :)),2018-10-03 11:03:59-03:00,1,:)
39880,1049236837578018816,Eu só queria f1 antes de entrar na aula :(,2018-10-08 06:55:30-03:00,0,:(
88643,1052566326705713152,"""InBrands em busca de um sócio"" https://t.co/A...",2018-10-17 11:25:42-03:00,2,#trabalho


## Bag of Words

In [5]:
# Chamando o conversor de texto para token
countvec = CountVectorizer(strip_accents='ascii', 
                      lowercase=True)


In [6]:
# Transformando as frases em tokens
bag_of_words = countvec.fit_transform(df.tweet_text.tolist())

In [7]:
# Criando dataframe com as contagens das palavras
bow = pd.DataFrame(bag_of_words.toarray(), columns=countvec.get_feature_names_out())
bow

Unnamed: 0,019matilde,50,afvbttexu7,agora,agressor,algo,analisando,antes,ao,arma,...,um,uma,vai,vamos,ver,visao,vivo,volta,vou,yeezy
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,2,0,1,0,0,0,0,...,1,1,1,1,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,2,0,1,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [8]:
# Palavras mais frequentes
frequente = bow.sum()
frequente.sort_values(ascending=False)

de          7
https       5
co          5
que         5
vai         3
           ..
es          1
era         1
entrar      1
entender    1
yeezy       1
Length: 134, dtype: int64

## Stemming

In [9]:
# importações
import nltk
nltk.download('rslp')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package rslp to /home/amanda/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to /home/amanda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Criando objeto stemmer
stemmer = nltk.stem.RSLPStemmer()

In [11]:
# Tokenizando frases 
word_tokens = df.apply(lambda row: word_tokenize(row["tweet_text"]), axis=1)
word_tokens

27891    [Ou, seja, se, encomendar, as, yeezy, vou, pag...
4376     [@, duducaralho, @, brendaccarvalho, ótimo, .,...
37430    [@, Dvrkskinsayan, :, (, eu, não, percebo, des...
93208    [Analisando, #, FAKE, ou, #, FATO, sobre, a, c...
12352    [@, SantoosIbra, @, BrunoMsbe, @, futtmais, @,...
10153    [opa, solteira, :, ), pena, que, es, muito, no...
69076    [&, gt, ;, @, EstadaoPolitica, Bolsonaro, defe...
14749            [@, 019matilde, Tu, sabi, Mozão, :, ), )]
39880    [Eu, só, queria, f1, antes, de, entrar, na, au...
88643    [``, InBrands, em, busca, de, um, sócio, '', h...
dtype: object

In [12]:
# Realizando stemmer nas frases
frase_stemmer = word_tokens.apply(lambda x: [stemmer.stem(y) for y in x])
frase_stemmer

27891    [ou, sej, se, encomend, as, yeezy, vou, pag, 5...
4376     [@, duducaralh, @, brendaccarvalh, ótim, ., ag...
37430    [@, dvrkskinsayan, :, (, eu, não, perceb, desc...
93208    [analis, #, fak, ou, #, fat, sobr, a, cham, do...
12352    [@, santoosibr, @, brunomsb, @, futtm, @, port...
10153    [opa, solt, :, ), pen, que, es, muit, nov, kkk...
69076    [&, gt, ;, @, estadaopoli, bolsonar, defend, p...
14749                 [@, 019matild, tu, sab, mo, :, ), )]
39880     [eu, só, quer, f1, ant, de, entr, na, aul, :, (]
88643    [``, inbrand, em, busc, de, um, sóci, '', http...
dtype: object

## Lemmatization

In [13]:
# instalando spacy pt
#!python -m spacy download pt

In [14]:
# importações
import spacy
nlp = spacy.load('pt_core_news_sm')

In [15]:
# criando lista com todas as palavras da primeira frase
doc = word_tokens.apply(lambda x: nlp(str([y for y in x])))
doc

27891    ([, ', Ou, ', ,, ', seja, ', ,, ', se, ', ,, '...
4376     ([, ', @, ', ,, ', duducaralho, ', ,, ', @, ',...
37430    ([, ', @, ', ,, ', Dvrkskinsayan, ', ,, ', :, ...
93208    ([, ', Analisando, ', ,, ', #, ', ,, ', FAKE, ...
12352    ([, ', @, ', ,, ', SantoosIbra, ', ,, ', @, ',...
10153    ([, ', opa, ', ,, ', solteira, ', ,, ', :, ', ...
69076    ([, ', &, ', ,, ', gt, ', ,, ', ;, ', ,, ', @,...
14749    ([, ', @, ', ,, ', 019matilde, ', ,, ', Tu, ',...
39880    ([, ', Eu, ', ,, ', só, ', ,, ', queria, ', ,,...
88643    ([, ', `, `, ', ,, ', InBrands, ', ,, ', em, '...
dtype: object

In [16]:
# Realizando lematização
frase_lema = doc.apply(lambda row: "".join([w.lemma_ for w in row]))
frase_lema

27891    ['ou','ser','se','encomendar','o','yeezy','ir'...
4376     ['@','duducaralho','@','brendaccarvalho','bom'...
37430    ['@','Dvrkskinsayan',':','(','eu','não','perce...
93208    ['analisar','#','FAKE','ou','#','FATO','sobre'...
12352    ['@','SantoosIbra','@','BrunoMsbe','@','futtma...
10153    ['opa','solteira',':',')','pena','que','es','m...
69076    ['&','gt',';','@','EstadaoPolitica','Bolsonaro...
14749    ['@','019matilde','Tu','sabi','Mozão',':',')',...
39880    ['eu','só','querer','f1','antes','de','entrar'...
88643    ['``','InBrands','em','busca','de','um','sócio...
dtype: object

## Removendo stop words

In [17]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package stopwords to /home/amanda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
stop_words = set(stopwords.words('portuguese') + list(punctuation))

In [19]:
frase_sem_stopword = word_tokens.apply(lambda x: [word for word in x if not word.lower() in stop_words])
frase_sem_stopword

27891    [encomendar, yeezy, vou, pagar, 50, euros, por...
4376     [duducaralho, brendaccarvalho, ótimo, agora, v...
37430                   [Dvrkskinsayan, percebo, desculpa]
93208    [Analisando, FAKE, FATO, sobre, chamada, Jorna...
12352    [SantoosIbra, BrunoMsbe, futtmais, PortalMessi...
10153    [opa, solteira, pena, es, novinha, kkk, —, Ih,...
69076    [gt, EstadaoPolitica, Bolsonaro, defende, Paul...
14749                            [019matilde, sabi, Mozão]
39880                    [queria, f1, antes, entrar, aula]
88643    [``, InBrands, busca, sócio, '', https, //t.co...
dtype: object

## Stemming

In [20]:
# Realizando stemmer nas frases
frase_stemmer_sem_stopword = frase_sem_stopword.apply(lambda x: [stemmer.stem(y) for y in x])
frase_stemmer_sem_stopword

27891           [encomend, yeezy, vou, pag, 50, eur, port]
4376     [duducaralh, brendaccarvalh, ótim, agor, vam, ...
37430                     [dvrkskinsayan, perceb, desculp]
93208    [analis, fak, fat, sobr, cham, jornaloglob, so...
12352    [santoosibr, brunomsb, futtm, portalmess, vai,...
10153    [opa, solt, pen, es, nov, kkk, —, ih, http, //...
69076    [gt, estadaopoli, bolsonar, defend, paul, gued...
14749                                 [019matild, sab, mo]
39880                           [quer, f1, ant, entr, aul]
88643    [``, inbrand, busc, sóci, '', http, //t.co/afv...
dtype: object

## Lemmatization

In [21]:
# criando lista com todas as palavras da primeira frase
doc_sem_stopword = frase_sem_stopword.apply(lambda x: nlp(str([y for y in x])))
doc_sem_stopword

27891    ([, ', encomendar, ', ,, ', yeezy, ', ,, ', vo...
4376     ([, ', duducaralho, ', ,, ', brendaccarvalho, ...
37430    ([, ', Dvrkskinsayan, ', ,, ', percebo, ', ,, ...
93208    ([, ', Analisando, ', ,, ', FAKE, ', ,, ', FAT...
12352    ([, ', SantoosIbra, ', ,, ', BrunoMsbe, ', ,, ...
10153    ([, ', opa, ', ,, ', solteira, ', ,, ', pena, ...
69076    ([, ', gt, ', ,, ', EstadaoPolitica, ', ,, ', ...
14749    ([, ', 019matilde, ', ,, ', sabi, ', ,, ', Moz...
39880    ([, ', queria, ', ,, ', f1, ', ,, ', antes, ',...
88643    ([, ', `, `, ', ,, ', InBrands, ', ,, ', busca...
dtype: object

In [22]:
# Realizando lematização
frase_lema = doc_sem_stopword.apply(lambda row: "".join([w.lemma_ for w in row]))
frase_lema

27891    ['encomendar','yeezy','ir','pagar','50','euro'...
4376     ['duducaralho','brendaccarvalho','bom','agora'...
37430               ['Dvrkskinsayan','percebo','desculpa'_
93208    ['analisar','FAKE','FATO','sobre','chamar','Jo...
12352    ['SantoosIbra','BrunoMsbe','futtmal','PortalMe...
10153    ['opa','solteira','pena','es','novinho','kkk',...
69076    ['gt','EstadaoPolitica','Bolsonaro','defender'...
14749                        ['019matilde','sabi','Mozão'_
39880              ['querer','f1','antes','entrar','aula'_
88643    ['``','InBrands','busca','sócio',"''",'https',...
dtype: object