### Código utilizado no artigo "Using Transformers in Clustering Narrow Domain Short Documents" submetido ao KDMILE 2022

Autora: Alessandra Gomes

In [1]:
#instalações e importações necessárias

import pandas as pd

!pip install sentence_transformers
from sentence_transformers import SentenceTransformer

from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 41.1 MB/s 
Installing collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.1 jedi-0.18.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.23.0-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 44.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |██████

### A Clusterização dos Narrow Domain Short Documents (Tweets)

In [2]:
# Sentence encoder used further down to embed the tweets of every dataset.
EMBEDDING_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)



<b>Carregar os arquivos dos datasets</b>

In [3]:
# Locations of the three dataset CSV files (fill these in before running).
path1 = "<path to the dataset1 file>"
path2 = "<path to the dataset2 file>"
path3 = "<path to the dataset3 file>"

# Read each CSV into its own DataFrame, in the same order as the paths above.
df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in (path1, path2, path3))

<b>Execução do pré-processamento</b>

In [4]:
# The dataset files loaded here had already gone through these pre-processing steps:
# - removal of emojis, URLs, duplicated spaces, line breaks and the #debatenaband hashtag
# - removal of tweets with repeated content (de-duplication)
# - removal of tweets with fewer than two words

# The only step applied here is lower-casing the `text` column of each dataset,
# yielding one plain Python list of strings per dataset.
sentences1, sentences2, sentences3 = (
    frame['text'].str.lower().to_list() for frame in (df1, df2, df3)
)



### Definição do Total de Clusters com Silhouette

<b> Embedding dos tweets do dataset1 com Transformers</b>

In [5]:
# Embed the lower-cased dataset1 tweets with the sentence encoder held in `model`.
sentence_embeddings1 = model.encode(sentences=sentences1)



<b> Cálculo do Silhouette para valores de total de clusters entre 2 e 10 (dataset1)</b>

Para o <b>dataset1</b>, a maior média silhouette foi obtida com <b>n_cluster = 4</b>

In [6]:
# Scan candidate cluster counts (2 to 10) for dataset1 and print the mean
# silhouette obtained with Ward agglomerative clustering for each of them.
import tempfile

# For such small n_clusters values scikit-learn builds the full Ward merge tree
# (compute_full_tree='auto'), and that tree does not depend on n_clusters.
# Passing a cache directory through `memory` lets every iteration after the
# first reuse the stored tree instead of rebuilding it; the labels are unchanged.
with tempfile.TemporaryDirectory() as tree_cache_dir:
  for n_cluster in range(2, 11):
    clustering = AgglomerativeClustering(
        n_clusters=n_cluster, linkage="ward", memory=tree_cache_dir
    ).fit(sentence_embeddings1)

    silhouette_avg = silhouette_score(sentence_embeddings1, clustering.labels_)
    print("n_cluster: {}; silhouette_avg: {}".format(n_cluster, silhouette_avg))


n_cluster: 2; silhouette_avg: 0.04671502113342285
n_cluster: 3; silhouette_avg: 0.05241701751947403
n_cluster: 4; silhouette_avg: 0.05473967269062996
n_cluster: 5; silhouette_avg: 0.02598670683801174
n_cluster: 6; silhouette_avg: 0.025218751281499863
n_cluster: 7; silhouette_avg: 0.024409614503383636
n_cluster: 8; silhouette_avg: 0.011379833333194256
n_cluster: 9; silhouette_avg: 0.014934797771275043
n_cluster: 10; silhouette_avg: 0.017248624935746193


<b> Embedding dos tweets do dataset2 com Transformers</b>

In [7]:
# Embed the lower-cased dataset2 tweets with the sentence encoder held in `model`.
sentence_embeddings2 = model.encode(sentences=sentences2)



Para o <b>dataset2</b>, a maior média silhouette também foi obtida com <b>n_cluster = 4</b>

In [8]:
# Scan candidate cluster counts (2 to 10) for dataset2 and print the mean
# silhouette obtained with Ward agglomerative clustering for each of them.
import tempfile

# For such small n_clusters values scikit-learn builds the full Ward merge tree
# (compute_full_tree='auto'), and that tree does not depend on n_clusters.
# Passing a cache directory through `memory` lets every iteration after the
# first reuse the stored tree instead of rebuilding it; the labels are unchanged.
with tempfile.TemporaryDirectory() as tree_cache_dir:
  for n_cluster in range(2, 11):
    clustering = AgglomerativeClustering(
        n_clusters=n_cluster, linkage="ward", memory=tree_cache_dir
    ).fit(sentence_embeddings2)

    silhouette_avg = silhouette_score(sentence_embeddings2, clustering.labels_)
    print("n_cluster: {}; silhouette_avg: {}".format(n_cluster, silhouette_avg))


n_cluster: 2; silhouette_avg: 0.027884025126695633
n_cluster: 3; silhouette_avg: 0.03555232658982277
n_cluster: 4; silhouette_avg: 0.03821224346756935
n_cluster: 5; silhouette_avg: 0.03662513196468353
n_cluster: 6; silhouette_avg: 0.01459056418389082
n_cluster: 7; silhouette_avg: 0.016314417123794556
n_cluster: 8; silhouette_avg: 0.01993398554623127
n_cluster: 9; silhouette_avg: 0.020444225519895554
n_cluster: 10; silhouette_avg: 0.013897251337766647


<b> Embedding dos tweets do dataset3 com Transformers</b>

In [9]:
# Embed the lower-cased dataset3 tweets with the sentence encoder held in `model`.
sentence_embeddings3 = model.encode(sentences=sentences3)



Para o <b>dataset3</b>, a maior média silhouette também foi obtida com <b>n_cluster = 4</b>

In [10]:
# Scan candidate cluster counts (2 to 10) for dataset3 and print the mean
# silhouette obtained with Ward agglomerative clustering for each of them.
import tempfile

# For such small n_clusters values scikit-learn builds the full Ward merge tree
# (compute_full_tree='auto'), and that tree does not depend on n_clusters.
# Passing a cache directory through `memory` lets every iteration after the
# first reuse the stored tree instead of rebuilding it; the labels are unchanged.
with tempfile.TemporaryDirectory() as tree_cache_dir:
  for n_cluster in range(2, 11):
    clustering = AgglomerativeClustering(
        n_clusters=n_cluster, linkage="ward", memory=tree_cache_dir
    ).fit(sentence_embeddings3)

    silhouette_avg = silhouette_score(sentence_embeddings3, clustering.labels_)
    print("n_cluster: {}; silhouette_avg: {}".format(n_cluster, silhouette_avg))


n_cluster: 2; silhouette_avg: 0.037607062608003616
n_cluster: 3; silhouette_avg: 0.04364657774567604
n_cluster: 4; silhouette_avg: 0.04610239341855049
n_cluster: 5; silhouette_avg: 0.022414514794945717
n_cluster: 6; silhouette_avg: 0.021959206089377403
n_cluster: 7; silhouette_avg: 0.025396622717380524
n_cluster: 8; silhouette_avg: 0.026761522516608238
n_cluster: 9; silhouette_avg: 0.020612133666872978
n_cluster: 10; silhouette_avg: 0.022969214245676994


### **Topic Modeling com BERTopic**

In [11]:
!pip install bertopic
from bertopic import BERTopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.12.0-py2.py3-none-any.whl (90 kB)
[?25l[K     |███▋                            | 10 kB 29.8 MB/s eta 0:00:01[K     |███████▎                        | 20 kB 6.5 MB/s eta 0:00:01[K     |██████████▉                     | 30 kB 9.3 MB/s eta 0:00:01[K     |██████████████▌                 | 40 kB 4.2 MB/s eta 0:00:01[K     |██████████████████              | 51 kB 4.3 MB/s eta 0:00:01[K     |█████████████████████▊          | 61 kB 5.0 MB/s eta 0:00:01[K     |█████████████████████████▎      | 71 kB 5.5 MB/s eta 0:00:01[K     |█████████████████████████████   | 81 kB 6.2 MB/s eta 0:00:01[K     |████████████████████████████████| 90 kB 4.2 MB/s 
Collecting pyyaml<6.0
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 28.5 MB/s 
[?25hCollecting umap-learn>=0.5.0
  Downlo

Com o resultado encontrado anteriormente, o algoritmo <b>AgglomerativeClustering</b> foi executado novamente para <b>n_cluster = 4</b>. 
Os resultados foram salvos em três arquivos diferentes, um para cada dataset.

In [16]:
# File with the saved clustering result of dataset1; only this file is read below.
path1 = "<path to the result for the dataset1 clustering>"
# Result files of the other two datasets (left disabled in this run):
#path2 = "<path to the result for the dataset2 clustering>"
#path3 = "<path to the result for the dataset3 clustering>"

df_original = pd.read_csv(path1)

# Boolean masks selecting the rows whose CLUSTER label is 0, 1, 2 and 3.
filtro1, filtro2, filtro3, filtro4 = (
    df_original['CLUSTER'].eq(cluster_label) for cluster_label in range(4)
)

# One DataFrame per cluster, in label order.
df_c1, df_c2, df_c3, df_c4 = (
    df_original.loc[mask] for mask in (filtro1, filtro2, filtro3, filtro4)
)

# Report the size of each cluster, one number per line.
for cluster_df in (df_c1, df_c2, df_c3, df_c4):
  print(len(cluster_df))

# Lower-cased TEXT column of each cluster as plain Python lists.
tweets_c1, tweets_c2, tweets_c3, tweets_c4 = (
    subset['TEXT'].str.lower().to_list() for subset in (df_c1, df_c2, df_c3, df_c4)
)


14023
13096
1556
1325


Carrega o modelo BERTopic

In [17]:
# Topic model fitted on each cluster in the cells that follow.
# NOTE: this rebinds `model`, which until this point held the SentenceTransformer
# encoder; re-running an embedding cell after this one would call the BERTopic
# object instead of the encoder.
model = BERTopic(language="multilingual", verbose=False)

Execução do Topic Modeling para os clusters referentes ao <b>dataset1</b>:

<b>Cluster 1 do Dataset 1:</b>

In [18]:
# Fit the topic model on the cluster-1 tweets, keeping both values it returns.
topics1, probs1 = model.fit_transform(tweets_c1)

# Print the (term, weight) pairs of topics 0 through 4.
for topic_id in range(5):
  print(model.get_topic(topic_id))

2022-09-14 09:28:27,653 - BERTopic - Transformed documents to Embeddings
2022-09-14 09:28:43,889 - BERTopic - Reduced dimensionality
2022-09-14 09:28:44,540 - BERTopic - Clustered reduced embeddings


[('mentira', 0.021626236885192926), ('lula', 0.020644651286268554), ('debate', 0.008215109178075343), ('ciro', 0.007849885613017395), ('bolsonaro', 0.00620234208321244)]
[('tchutchuca', 0.1279054157029289), ('centrão', 0.08842765016117593), ('soraya', 0.048704748087821784), ('chamou', 0.03282477397562908), ('aluna', 0.013313153556864259)]
[('kkkkkk', 0.0370198020490028), ('kkkkkkkkk', 0.036474233524119826), ('kkkkk', 0.03478731383782638), ('kkkkkkk', 0.030816427496416512), ('kkkkkkkk', 0.024566100303862925)]
[('band', 0.048102379627875315), ('microfone', 0.0320737455852317), ('áudio', 0.020334780129737037), ('debate', 0.017779407137800515), ('som', 0.010562391964679615)]
[('ceará', 0.027170254136255966), ('votar', 0.025589858659658234), ('paris', 0.01805542748226199), ('vá', 0.0149178162331637), ('criticando', 0.009072031180758555)]


<b>Cluster 2 do Dataset 1:</b>

In [19]:
# Fit the topic model on the cluster-2 tweets, keeping both values it returns.
topics2, probs2 = model.fit_transform(tweets_c2)

# Print the (term, weight) pairs of topics 0 through 4.
for topic_id in range(5):
  print(model.get_topic(topic_id))

2022-09-14 09:29:20,023 - BERTopic - Transformed documents to Embeddings
2022-09-14 09:29:35,471 - BERTopic - Reduced dimensionality
2022-09-14 09:29:36,004 - BERTopic - Clustered reduced embeddings


[('presidente', 0.05338560260075639), ('chamando', 0.019163826108645476), ('lula', 0.015170963720129299), ('atual', 0.010629962973989916), ('bolsonaro', 0.009347550471395448)]
[('jornalista', 0.07120292317147899), ('vera', 0.023495142708890908), ('magalhães', 0.019600514750078506), ('atacar', 0.013576153157412313), ('vergonha', 0.012863703706752327)]
[('corrupção', 0.07255453877811492), ('governo', 0.015101937266339313), ('lula', 0.01492676054422267), ('petrobras', 0.010633603609236835), ('pergunta', 0.010015354131237685)]
[('mentiroso', 0.04772884829844927), ('mentira', 0.04031786011821581), ('mentir', 0.03394767851634503), ('dna', 0.021027793315175146), ('bolsonaro', 0.014345452116425395)]
[('auxílio', 0.08544139001920943), ('200', 0.05336431973064136), ('600', 0.05186107757148673), ('emergencial', 0.04480732042077957), ('reais', 0.036939293582413886)]


<b>Cluster 3 do Dataset 1:</b>

In [20]:
# Fit the topic model on the cluster-3 tweets, keeping both values it returns.
topics3, probs3 = model.fit_transform(tweets_c3)

# Print the (term, weight) pairs of topics 0 through 4.
for topic_id in range(5):
  print(model.get_topic(topic_id))

2022-09-14 09:29:57,364 - BERTopic - Transformed documents to Embeddings
2022-09-14 09:30:04,416 - BERTopic - Reduced dimensionality
2022-09-14 09:30:04,484 - BERTopic - Clustered reduced embeddings


[('fome', 0.13287051278876844), ('bolsonaro', 0.03574592589156996), ('milhões', 0.03331882822427128), ('pessoas', 0.03291927585386459), ('ciro', 0.032211054304423545), ('33', 0.029877300717703094)]
[('contra', 0.2308410945801971), ('auxílio', 0.1014663285920785), ('600', 0.03797660603320903), ('reais', 0.031558586899262775), ('bolsonaro', 0.030674451111630488), ('pt', 0.028555354960901766)]
[('educação', 0.12299982793305986), ('professores', 0.032375187639433416), ('professor', 0.029705731249502498), ('brasil', 0.02684318933373508), ('ensino', 0.011379745569547932)]
[('paris', 0.11808934487713506), ('espero', 0.03748612039391569), ('haddad', 0.0341013444373616), ('ciro', 0.0335331876356973), ('eleições', 0.027717163327712173)]
[('agro', 0.11115068085041885), ('agronegócio', 0.08952066975133405), ('sustentável', 0.07111556673355826), ('mundo', 0.043099243595060364), ('amazônia', 0.03438025699902708)]


<b>Cluster 4 do Dataset 1:</b>

In [21]:
# Fit the topic model on the cluster-4 tweets, keeping both values it returns.
topics4, probs4 = model.fit_transform(tweets_c4)

# Print the (term, weight) pairs of topics 0 through 4.
for topic_id in range(5):
  print(model.get_topic(topic_id))

2022-09-14 09:31:04,198 - BERTopic - Transformed documents to Embeddings
2022-09-14 09:31:10,650 - BERTopic - Reduced dimensionality
2022-09-14 09:31:10,704 - BERTopic - Clustered reduced embeddings


[('feminismo', 0.06763587004947759), ('presidente', 0.04731533415212314), ('simone', 0.030439813671810964), ('mulher', 0.028475612915837226), ('bolsonaro', 0.027769937375677217), ('candidato', 0.02672077149241989), ('tebet', 0.0263206250106208)]
[('jornalista', 0.10274378595785215), ('cresce', 0.035014822800870996), ('cima', 0.03345590719658092), ('bolsonaro', 0.029049559162712745),  ('mulher', 0.022650419998623966), ('atacar', 0.01999962879202793)]
[('odeia', 0.12639097031428817), ('ódio', 0.06583101156237622), ('bolsonaro', 0.0644206114796136), ('mulheres', 0.0513782102106709), ('tanto', 0.04073006350712872)]
[('debate', 0.14083118283909987), ('perguntar', 0.10470966817925151), ('homem', 0.08137971975702661), ('ciro', 0.0687504773565023), ('escolheu', 0.05137669098382707), ('mulheres', 0.04131756944445433)]
[('violência', 0.05094204420422454), ('armas', 0.040304928475154767), ('mulheres', 0.034870221659189966), ('bolsonaro', 0.03146811305524682), ('contra', 0.029558222404568776) , ('