## Modelagem de tópicos
BERTopic: Neural topic modeling with a class-based TF-IDF procedure: https://arxiv.org/abs/2203.05794

In [None]:
import pandas as pd
from bertopic import BERTopic

In [2]:
df = pd.read_parquet('dados/id_desc_clr.parquet')
df.shape

(3452, 6)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3452 entries, 0 to 3615
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                3452 non-null   object
 1   desc              3452 non-null   object
 2   desc_sent         3452 non-null   object
 3   desc_token_clr    3452 non-null   object
 4   desc_bigram_clr   3452 non-null   object
 5   desc_trigram_clr  3452 non-null   object
dtypes: object(6)
memory usage: 188.8+ KB


### Bertopic

In [18]:
# Transforma lista de tokens (frases limpas) em string
df['desc_token_clr_str'] = df['desc_token_clr'].apply(lambda tokens: ' '.join(tokens))

In [19]:
docs = df['desc_token_clr_str'].to_list()

model = BERTopic(verbose=True)
topics, probs = model.fit_transform(docs)

Batches: 100%|██████████| 108/108 [03:17<00:00,  1.83s/it]
2023-05-14 22:54:30,479 - BERTopic - Transformed documents to Embeddings
2023-05-14 22:55:03,772 - BERTopic - Reduced dimensionality
2023-05-14 22:55:04,044 - BERTopic - Clustered reduced embeddings


In [27]:
topics = model.get_topic_info()
topics

Unnamed: 0,Topic,Count,Name
0,-1,837,-1_data_platform_read_company
1,0,1259,0_health_care_healthcare_medical
2,1,278,1_security_data_cloud_risk
3,2,132,2_payments_payment_financial_card
4,3,116,3_ai_data_learning_machine
5,4,85,4_insurance_claims_underwriting_insurers
6,5,63,5_logistics_freight_fulfillment_shipping
7,6,53,6_retailers_retail_commerce_store
8,7,47,7_marketing_customer_data_marketers
9,8,44,8_video_tv_radio_streaming


#### Principais tokens dos maiores tópicos 

In [43]:
def get_words_topic(id_topic):
    print('Tópico:', topics.loc[id_topic + 1]['Name'])
    for word in model.get_topic(id_topic):
        print(word)

get_words_topic(0)

Tópico: 0_health_care_healthcare_medical
('health', 0.03248818327705299)
('care', 0.026168910394805345)
('healthcare', 0.025673155823679475)
('medical', 0.01922112850815286)
('patients', 0.019172840498210098)
('patient', 0.018953114277789448)
('clinical', 0.01594525779297723)
('company', 0.015588634835925734)
('read', 0.015133208001758892)
('platform', 0.012035358125167967)


In [44]:
get_words_topic(1)

Tópico: 1_security_data_cloud_risk
('security', 0.04811445486468763)
('data', 0.0286439043375488)
('cloud', 0.02304353304686613)
('risk', 0.020622542594590742)
('identity', 0.017755221924920638)
('cyber', 0.01765499679157528)
('privacy', 0.017545480793287157)
('fraud', 0.017360341062895596)
('secure', 0.01728304192296843)
('cybersecurity', 0.016260121870367288)


In [45]:
get_words_topic(2)

Tópico: 2_payments_payment_financial_card
('payments', 0.052220918469135826)
('payment', 0.05182115611528438)
('financial', 0.024126405522818833)
('card', 0.023502197600770908)
('credit', 0.02286815667517169)
('businesses', 0.022619607742527824)
('mobile', 0.021682612187382454)
('fintech', 0.020771686200952846)
('cash', 0.020580558809828325)
('banking', 0.016777425584113662)


In [46]:
get_words_topic(3)

Tópico: 3_ai_data_learning_machine
('ai', 0.06293226864555679)
('data', 0.03863918959884666)
('learning', 0.03776199027033455)
('machine', 0.03414366876093293)
('science', 0.024930699608162406)
('ml', 0.023642918254833695)
('models', 0.022402929228872784)
('model', 0.02205243852093044)
('platform', 0.01951109228083915)
('intelligence', 0.018139348563747718)


### Visualização de tópicos

In [50]:
model.visualize_topics()

In [51]:
model.visualize_barchart()

### Exporta empresas e seus tópicos

In [52]:
df_topics = model.get_document_info(docs)
df_topics.head(5)

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,vectra cybersecurity platform uses ai detect a...,1,1_security_data_cloud_risk,security - data - cloud - risk - identity - cy...,1.0,False
1,roadzen global leader p c insurance space insu...,4,4_insurance_claims_underwriting_insurers,insurance - claims - underwriting - insurers -...,0.820254,False
2,restream multi streaming solution allows produ...,8,8_video_tv_radio_streaming,video - tv - radio - streaming - live - onrad ...,0.763877,False
3,dlp works 1 companies fortunately cyberhaven 9...,1,1_security_data_cloud_risk,security - data - cloud - risk - identity - cy...,1.0,False
4,shift technology delivers ai native decision a...,4,4_insurance_claims_underwriting_insurers,insurance - claims - underwriting - insurers -...,0.726881,False


In [54]:
df_topics.to_parquet('dados/id_desc_topics.parquet')