In [14]:
import pandas as pd
import numpy as np
import re
import string
import itertools
import unidecode


from nltk.corpus import stopwords
from tqdm import tqdm_notebook
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
df = pd.read_csv('../raw_data/dilma.csv')
#df_dilma.head(10)

In [11]:
df['content'].sample(10)

55508     "@Flordelismk: #Dilma VOCÊ ACHOU A TAMPA PARA ...
6987      Me lembro tanto da Erenice lépida e faceira at...
7386      #Dilma cumpriu todas as promessas feitas a tra...
103451    O que vai acontecer no Brasil caso a receita d...
19749     Nem com a maior boa vontade dá prá votar nas #...
92463     Por campanha de #Dilma, militância petista rea...
97780     Quem bate cartão não vota em patrão!!! #Dilma1...
37813     A foto é linda. Embora passe uma imagem triste...
94386              @MudaMais  #Dilma esta linda nesta foto!
60312     Kkkkkkkkkkkkk A #Dilma esqueceu de pagar esse ...
Name: content, dtype: object

In [12]:
def clean_tweet(tweet):
    '''
    tweet: String
           Input Data
    tweet: String
           Output Data
           
    func: Convert tweet to lower case
          Replace URLs with a space in the message
          Replace ticker symbols with space. The ticker symbols are any stock symbol that starts with $.
          Replace  usernames with space. The usernames are any word that starts with @.
          Replace everything not a letter or apostrophe with space
          Remove single letter words
          lemmatize, tokenize (nouns and verb), remove stop words, filter all the non-alphabetic words, then join
          them again

    '''
    tweet = tweet.lower()
    tweet = re.sub(" \d+", "", tweet)
    tweet = re.sub('\@[a-zA-Z0-9]*', ' ', tweet) # remove username start with @
    tweet = re.sub('https?:\/\/[a-zA-Z0-9@:%._\/+~#=?&;-]*', ' ', tweet) # remove link in the tweet
    tweet = re.sub('\$[a-zA-Z0-9]*', ' ', tweet) # remove the variable start with $
    #tweet = re.sub('[^a-zA-Z\']', ' ', tweet)
    for punctuation in new_punc: # remove panctuations 
            tweet = tweet.replace(punctuation, '')
    tweet = ' '.join( [w for w in tweet.split() if len(w)>1] ) #remove one letter words
    tweet = ''.join(c[0] for c in itertools.groupby(tweet)) #remove duplicated letters
    tweet = unidecode.unidecode(tweet)  # normalizar as letras com acentos 
    tweet = ' '.join(word for word in tweet.split(' ') if word not in stop_words) # remove stopwords
    
                          
    #doc = nlp(tweet)
    #tweet= [token.lemma_ for token in doc if token.text not in stop_words]
    
    return tweet 

In [15]:
%%time

new_punc = list(string.punctuation)
del new_punc[2]
stop_words = stopwords.words('portuguese')
stop_words.remove('não')
addicional = [
            'd', 'ta', 'q', 'tah', 'tao', 'eh', 'vc', 'voce',
            'pq', 'quedê', 'mto', 'mt', 'bj', 'bjs','vcs','bb',
            'b', 'sao', 'axo', 'mano', 'ae', 'neh', 'aí',
            'kkk', 'porque', 'né', 'no', 'iai', 'tbm', 'msm', 'jah', 'yahoo', 'yahoobr','rt']
stop_words.extend(addicional)
df['content'] = df['content'].astype(str)
df = df.drop_duplicates(subset="content",keep="first")

df["content_clean"]=df["content"].apply(lambda x:clean_tweet(x))

df=df[df["lang"]=="pt"]
df=df[["date","username","content_clean"]]
df['date'] = pd.to_datetime(df['date'])
df

CPU times: user 7.52 s, sys: 108 ms, total: 7.63 s
Wall time: 8.17 s


Unnamed: 0,date,username,content_clean
0,2014-09-19 23:58:17,luiz_carlosar,boechat diz surpreso solidez campanha #dilma
1,2014-09-19 23:57:12,alves_auri,permitame compartilhar pensamento #dilma outra...
2,2014-09-19 23:55:33,Sem_NeuroseYT,#dilma compra pesquisa compra copa almenta pas...
3,2014-09-19 23:51:37,NaldoValenca,alo coxinha #minhacasaminhavida #dilma investi...
4,2014-09-19 23:49:07,Colatto,hauahuahu #dilma entao
...,...,...,...
104625,2014-09-10 00:03:52,UDN_tofora,matriculas ensino superior crescem8 #dilma #pt...
104626,2014-09-10 00:03:12,O_Apedeuta,contra tudocontra revistacontra comentaristas ...
104627,2014-09-10 00:02:16,TerezaBraSiLL,alguns investimentos governo #dilma paulo
104628,2014-09-10 00:02:04,cavlis,programa tv dilma hoje tv excelente direto cla...


In [89]:
%%time
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

data_vectorized = vectorizer.fit_transform(df['content_clean'])

lda_model = LatentDirichletAllocation(n_components=8, learning_method='online', learning_offset=20, max_iter=10, random_state=42)

lda_vectors = lda_model.fit_transform(data_vectorized)


CPU times: user 3min 50s, sys: 171 ms, total: 3min 50s
Wall time: 3min 51s


In [93]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(lda_model, vectorizer.get_feature_names(), 15)


Topic 0:
dilma fez veja vai pf número governo vamos brasil turno quer milhões educação povo ter
Topic 1:
dilma aecio marina brasil debate não nao vota padilha vez aqui inflação vida eleições2014 debatenaband
Topic 2:
dilma pt lula não psdb aecio rt globo diz contra bem presidente jn ibope falar
Topic 3:
dilma gov 13 voto brasil nacional debatenaglobo paulo continuar minas mil sabe entrevista hoje lei
Topic 4:
dilma presos pra agora país anos copa aecio brasil tá pesquisa fhc 12 obras frente
Topic 5:
dilma não sobre esa votar presidenta povo petrobras corupção ser política fala gente presidente apoio
Topic 6:
dilma não brasil iso dia diz novo campanha faz ese ainda bom via fazer nada
Topic 7:
coruptos cair crisenapf dilma melhor tudo sim sempre discurso datafolha vote pesquisas olha bilhões mal


In [76]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = df['content_clean'] 

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=stop_words)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=stop_words)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 9

# Run NMF
nmf = NMF(n_components=no_topics, random_state=42, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=42).fit(tf)

no_top_words = 15
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
crisenapf presos cair número pf coruptos gov veja fez rj 14 oficial filho santos pimentel
Topic 1:
pt psdb globo jn governo contra vaitercopa agora campanha foradilma forapt corupção copa mídia nacional
Topic 2:
não povo diz nada iso pode sabe pra ser esa governo país corupção quer presidente
Topic 3:
aecio debate psdb eleições2014 pesquisa neves sobre minas eleicoes2014 votos ibope turno datafolha debatenaglobo eleições
Topic 4:
brasil copa diz mundo todo melhor vamos presidente governo via ainda desemprego educação continuar sempre
Topic 5:
vai pra ser votar ter agora copa presidenta esa ganhar dizer vota rt voa tomar
Topic 6:
13 voto sim votar iso dia todos vamos motivos intenções pra povo presidenta agora hoje
Topic 7:
marina diz turno sobre silva governo segundo datafolha pesquisa ibope 2º presidente eleições2014 via política
Topic 8:
lula psdb globo rt governos jn vaitercopa petrobras milhões saúde fhc anos campos sobre médicos
Topic 0:
aecio marina lula sobre brasil tur

In [91]:
%%time
neighbour_cluster = 10
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2') # paraphrase-MiniLM-L6-v2
hdbscan_model = HDBSCAN(min_cluster_size=neighbour_cluster, metric='euclidean', #min_cluster_size=10,100
                        cluster_selection_method='eom', prediction_data=True)
umap_model = UMAP(n_neighbors=neighbour_cluster, n_components=30, min_dist=0.0, metric='cosine', low_memory=True, random_state=42) # n_neighbors=15,30, 100
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words, min_df=5)         # min_df means "ignore terms that appear in less than n documents".

topic_model = BERTopic(#language="multilingual",
                       top_n_words=10,                                                    
                       min_topic_size=10,
                       nr_topics="auto",                                                           # Don't set nr_topics if you're already converging topics to pre-determined labels
                       low_memory=True,                                                            # If you have a big dataset set low_memory to True - but it's slower
                       calculate_probabilities=False,  # Set to False if big data
                       seed_topic_list=None,                                                       # If you want to converge to a list of your own topics
                       embedding_model=sentence_model,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       vectorizer_model=vectorizer_model,
                       verbose=True                                                                # Get logging
                      )

CPU times: user 356 ms, sys: 96.4 ms, total: 452 ms
Wall time: 8.65 s


In [92]:
%%time
docs = df['content_clean'].to_list()  
topics, probs = topic_model.fit_transform(docs) 

Batches:   0%|          | 0/3170 [00:00<?, ?it/s]

2022-06-03 13:38:08,912 - BERTopic - Transformed documents to Embeddings
2022-06-03 13:45:57,860 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [157]:
%%time
#Converting the "clean_tweet" column in the format supported by embeddings.
sent = [row for row in df_pt["content_clean"]]
#use Gensim Phrases package to automatically detect common phrases (bigrams) from a list of sentences.
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[100]

CPU times: user 1.48 s, sys: 6.93 ms, total: 1.49 s
Wall time: 1.5 s


'muita gente contra #dilma emprego facilidades muita gente favor #dilma têm ctps filhos faculdadeetc'

In [158]:
e_sentences=[]
for i in range(0,len(sentences)):
    e_sentences.append(sentences[i].split(' '))

e_sentences

[['#dilma', 'recebeu', 'hj', 'atletas', 'destacaram'],
 ['caminhada', '#dilma', '#padilha', 'santo', 'amaro'],
 ['marina',
  'silvacandidata',
  'banqueiros',
  'itau',
  'nao',
  'povonao',
  'deixe',
  'enganarvota',
  'conscientevota',
  '#dilma'],
 ['não',
  'vota',
  'marina',
  'silva',
  'pro',
  'brasil',
  'permanecer',
  'independente',
  'sistema',
  'financeiro',
  'causou',
  'atual',
  'crise',
  'econômica',
  'mundial',
  'vota',
  '#dilma'],
 ['governo',
  'mentirosos',
  'basta',
  'ver',
  'campanha',
  '#dilma',
  'não',
  'honesta',
  '#vaipracubapt',
  '#foradilma',
  '#abaixodecretodadilma'],
 ['ciro',
  'gomes',
  'sobre',
  'marina',
  'silvaé',
  'vazio',
  'absolutoaproveita',
  'inocencia',
  'juventude',
  'mal',
  'informada',
  'pig',
  'vote',
  '#dilma'],
 ['tacalhe', 'pau', '#dilma', '🙌'],
 ['alô',
  'coxinha',
  '#dilma',
  'agraciou',
  'sp',
  '#maismédicos',
  'municípios',
  'estado',
  'beneficiando',
  'milhões',
  'pessoas',
  'suíça'],
 ['não'

In [160]:
%%time
#Initializing the word2vec model

import multiprocessing

w2v_model = Word2Vec(min_count=4,
                     window=5,
                     vector_size =300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     seed= 42,
                     workers=multiprocessing.cpu_count()-1)


#building vocab of the word2vec model from the custom data
w2v_model.build_vocab(e_sentences, progress_per=50000)

# https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483



CPU times: user 205 ms, sys: 149 µs, total: 205 ms
Wall time: 208 ms


In [161]:
%%time
w2v_model.train(e_sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

CPU times: user 12.8 s, sys: 122 ms, total: 12.9 s
Wall time: 5.21 s


(1716223, 7703040)

In [163]:
#w2v_model.wv.key_to_index
w2v_model.wv.most_similar("dilma")

[('faz', 0.9995744228363037),
 ('enqto', 0.999555230140686),
 ('viajando', 0.9995530247688293),
 ('vou', 0.999552845954895),
 ('el', 0.9995509386062622),
 ('bem', 0.9995501041412354),
 ('novos', 0.9995485544204712),
 ('#veja', 0.999548077583313),
 ('real', 0.9995476007461548),
 ('cresce', 0.9995447993278503)]

In [164]:
%%time
model = KMeans(n_clusters=3, max_iter=1000, random_state=42, n_init=50).fit(X=w2v_model.wv.vectors.astype('double'))


CPU times: user 7.1 s, sys: 3.35 s, total: 10.4 s
Wall time: 3.6 s


In [168]:
# check what we have in each cluster to label the clusters
w2v_model.wv.similar_by_vector(model.cluster_centers_[0], topn=100, restrict_vocab=None)


[('excelente', 0.9998299479484558),
 ('comandar', 0.9998290538787842),
 ('dna', 0.9998289942741394),
 ('jornais', 0.9998283386230469),
 ('acredita', 0.9998263716697693),
 ('coitada', 0.9998262524604797),
 ('palavrão', 0.9998262524604797),
 ('maiores', 0.9998261332511902),
 ('#eunaovejonaglobo', 0.9998258948326111),
 ('única', 0.9998252391815186),
 ('continuará', 0.9998244643211365),
 ('#maiseducação', 0.999824047088623),
 ('sinto', 0.9998239278793335),
 ('exemplar', 0.9998228549957275),
 ('finaliza', 0.9998227953910828),
 ('lavada', 0.9998227953910828),
 ('rápido', 0.9998226165771484),
 ('sanepar', 0.9998222589492798),
 ('putz', 0.999821662902832),
 ('culpar', 0.9998214840888977),
 ('#coracaovalente', 0.9998212456703186),
 ('ampliação', 0.9998209476470947),
 ('#naovamosdesistirdobrasil', 0.9998207092285156),
 ('especialidades', 0.9998201131820679),
 ('estourou', 0.9998197555541992),
 ('emoção', 0.9998196363449097),
 ('gratidão', 0.9998192191123962),
 ('defesa', 0.9998189210891724),
 ('

# Build Your own embedding 

In [None]:
#Cretae your own embedding
# This initializes a Keras utilities that does all the tokenization for you
tokenizer = Tokenizer()

# The tokenization learns a dictionary that maps a token (integer) to each word
# It can be done only on the train set - we are not supposed to know the test set!
# This tokenization also lowercases your words, apply some filters, and so on 
tokenizer.fit_on_texts(X_train)
    
# We apply the tokenization to the train and test set
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_token, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_test_token, dtype='float32', padding='post')

#Your X_train_token and X_test_token contain sequences of different lengths.
#a neural network has to have a tensor as input. For this reason, you have to pad your data.

vocab_size = len(tokenizer.word_index)


# Use pretrained embeddings

In [59]:
%%time
word_vectors= KeyedVectors.load_word2vec_format('glove_s100.txt') # load pretrained glove_s100, 


CPU times: user 1min 23s, sys: 2.2 s, total: 1min 26s
Wall time: 1min 28s


In [62]:
result = word_vectors.most_similar('política')
result

[('económica', 0.8212239146232605),
 ('politica', 0.7895896434783936),
 ('políticas', 0.781465470790863),
 ('estratégia', 0.7725036144256592),
 ('social', 0.7609491348266602),
 ('perspectiva', 0.7572699189186096),
 ('reforma', 0.7452260851860046),
 ('europeia', 0.7418079972267151),
 ('externa', 0.7416183948516846),
 ('opinião', 0.73964524269104)]

In [None]:


embedding_dimension = 300

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size + 1, output_dim=embedding_dimension, mask_zero=True))
model.add(layers.LSTM(20))
model.add(layers.Dense(10, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [None]:
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)
# First, train a word2vec model (with the arguments that you want) on your training sentence. Store it into the word2vec variable.
word2vec = Word2Vec(sentences=X_train, vector_size=60, min_count=10, window=10)