<a href="https://colab.research.google.com/github/andreea-bodea/bachelors-thesis-informatics/blob/main/BT%20INFO%3A%20Model%200%3A%20BERTopic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Model 0: BERTopic on Parler

BERTopic with the Sentence Transformers embedding model "all-MiniLM-L6-v2"

Reference (BERTopic embedding models, visual overview): https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html#visual-overview


In [None]:
%%capture
# Install BERTopic into the notebook runtime; %%capture keeps pip's log out of the cell output.
!pip install bertopic

In [None]:
%%capture
# Pin joblib to exactly 1.1.0.
# NOTE(review): presumably a compatibility workaround for one of BERTopic's
# dependencies — confirm and record the reason for the pin here.
!pip install joblib==1.1.0

In [None]:
from bertopic import BERTopic 
from umap import UMAP

In [None]:
# Upload the CSV file with posts to Google Colab
# Sample for topic modelling: parleys_test (~300.000 posts)
# The returned object is indexed by file name in the next cell
# (uploaded['parleys_test.csv']) to obtain the file's raw bytes.
from google.colab import files
uploaded = files.upload()

Saving parleys_test.csv to parleys_test.csv


In [None]:
# Parse the uploaded CSV into a pandas dataframe.
import io
import pandas as pd

# The upload result holds the file content as bytes under its file name;
# wrap it in an in-memory buffer so read_csv can consume it like a file.
csv_bytes = uploaded['parleys_test.csv']
parleys_test = pd.read_csv(io.BytesIO(csv_bytes))
parleys_test

Unnamed: 0,body
0,glad see parler free speech actually alive wel...
1,not enough year minimum
2,wonder kamalaharris blm think white guy placed...
3,agreed seemed like close race till inner city ...
4,well well abercrombie fitch president canada e...
...,...
309063,politician concerned covering ass not represen...
309064,rent kid hell barack mike rented them
309065,whom biden carry anything especially itcome pe...
309066,pedo fly head never lie


In [None]:
# Transform the 'body' column of the dataframe into a plain list of post strings.
# Missing values are dropped first: a field that is empty in the CSV is read back
# by pandas as NaN (a float), which is not a text document. The remaining values
# are coerced to str so the list contains strings only.
# Note: if rows are dropped, list positions no longer line up with parleys_test rows.
Parler_posts_test = parleys_test['body'].dropna().astype(str).tolist()
# Preview the first few posts instead of echoing the entire list into the output
Parler_posts_test[:5]

In [None]:
# UMAP configuration that is passed to BERTopic in the training cell.
# A fixed random_state prevents stochastic behavior, so the results can be
# reproduced (at the expense of performance).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

In [None]:
# Train the topic model on the list of posts.
# No embedding_model argument is passed, so BERTopic falls back to its default
# embeddings; per the notebook heading this is sentence-transformers "all-MiniLM-L6-v2"
# (NOTE(review): the default depends on the installed BERTopic version — confirm).
# nr_topics=10 is the requested number of topics; umap_model is the seeded UMAP instance.
# Extract topics and generate probabilities
topic_model = BERTopic(nr_topics=10, umap_model=umap_model)
topics, probs = topic_model.fit_transform(Parler_posts_test)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Access information about all topics that were generated
# (columns in the output: Topic id, Count of posts, auto-generated Name)
# -1 refers to all outliers and should typically be ignored
topics_df = topic_model.get_topic_info()
topics_df
# Optional CSV export of this table (disabled):
# topics_df.to_csv('Topics_Model_0.csv', index=False);

Unnamed: 0,Topic,Count,Name
0,-1,272216,-1_not_trump_people_get
1,0,6300,0_china_chinese_ccp_virus
2,1,5843,1_parler_maga_follow_welcome
3,2,5427,2_party_republican_democrat_rinos
4,3,3856,3_antifa_blm_terrorist_protest
5,4,3286,4_god_pray_praying_prayer
6,5,2980,5_patriot_proud_boy_true
7,6,2353,6_georgia_recount_vote_election
8,7,2328,7_treason_hang_dementia_tribunal
9,8,2303,8_racist_white_black_house


In [None]:
# Access all topics: a dict keyed by topic id (including -1) whose values are
# lists of (word, score) tuples, as shown in the cell output
all_topics = topic_model.get_topics()
all_topics

{-1: [('not', 0.037793528622727966),
  ('trump', 0.022207403989859334),
  ('people', 0.02151355504773762),
  ('get', 0.018254629962069082),
  ('like', 0.018082120729975164),
  ('need', 0.017230872959814767),
  ('would', 0.016629986279411205),
  ('one', 0.01584802638288666),
  ('president', 0.015659602080616588),
  ('know', 0.015349673265615432)],
 0: [('china', 0.20266936875980343),
  ('chinese', 0.08637975079851005),
  ('ccp', 0.0449583340150664),
  ('virus', 0.0370570547094948),
  ('biden', 0.036021241644270764),
  ('not', 0.030983120939773744),
  ('communist', 0.025914228631651964),
  ('russia', 0.02280847144986725),
  ('beijing', 0.021949646873435095),
  ('country', 0.02140411436513911)],
 1: [('parler', 0.14733444426992792),
  ('maga', 0.06724952834223188),
  ('follow', 0.06099502179310923),
  ('welcome', 0.04804466915140653),
  ('glad', 0.0451351121248835),
  ('content', 0.03685775944632153),
  ('trump', 0.035228032068443806),
  ('supporter', 0.03418226416153417),
  ('you', 0.032

In [None]:
# Transform topics to a dataframe (one row per topic, one column per top word)
# and save it as a CSV file.
# Row and column labels are derived from all_topics instead of being hard-coded:
# a fixed label list makes pd.DataFrame raise ValueError as soon as the number of
# topics (or of words per topic) differs from the hard-coded count.
# NOTE: this rebinds topics_df, which previously held the get_topic_info() table.

# Keep only the word of every (word, score) entry, in topic order
list_with_all_topics = [
    [entry[0] for entry in all_topics[key]]
    for key in all_topics
]
print(list_with_all_topics)

# '-1' for the outlier topic, 'Topic <id>' for every other topic
row_labels = ['-1' if key == -1 else 'Topic {}'.format(key) for key in all_topics]
# One 'Word <n>' column per word position; default=0 covers an empty topic dict
words_per_topic = max((len(words) for words in list_with_all_topics), default=0)
column_labels = ['Word {}'.format(position) for position in range(1, words_per_topic + 1)]

topics_df = pd.DataFrame(list_with_all_topics, index=row_labels, columns=column_labels)
topics_df.to_csv('Model_0_Topics_Complete.csv')
# Last expression of the cell, so the table is actually displayed
topics_df

[['not', 'trump', 'people', 'get', 'like', 'need', 'would', 'one', 'president', 'know'], ['china', 'chinese', 'ccp', 'virus', 'biden', 'not', 'communist', 'russia', 'beijing', 'country'], ['parler', 'maga', 'follow', 'welcome', 'glad', 'content', 'trump', 'supporter', 'you', 'truly'], ['party', 'republican', 'democrat', 'rinos', 'rino', 'dems', 'gop', 'coward', 'not', 'need'], ['antifa', 'blm', 'terrorist', 'protest', 'supporter', 'not', 'trump', 'peaceful', 'people', 'riot'], ['god', 'pray', 'praying', 'prayer', 'bless', 'president', 'trump', 'you', 'thank', 'amen'], ['patriot', 'proud', 'boy', 'true', 'thank', 'american', 'you', 'not', 'god', 'bless'], ['georgia', 'recount', 'vote', 'election', 'senate', 'not', 'ballot', 'state', 'republican', 'runoff'], ['treason', 'hang', 'dementia', 'tribunal', 'treasonous', 'firing', 'squad', 'military', 'death', 'not'], ['racist', 'white', 'black', 'house', 'racism', 'color', 'race', 'not', 'people', 'matter'], ['fox', 'news', 'newsmax', 'watch'

In [None]:
# Extract representative docs for all topics
# representative_docs = topic_model.get_representative_docs()

# Extract representative docs of a specific topic
# representative_docs = topic_model.get_representative_docs(0)

# Extract representative docs for all topics as dataframe and save as CSV file.
# The rows are collected in a list and the dataframe is built once:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
representative_rows = [
    {'Topic': key, 'Representative Post': representative_doc}
    for key in all_topics.keys()
    if key != -1  # skip the outlier topic
    for representative_doc in topic_model.get_representative_docs(key)
]
# Passing columns explicitly keeps both headers even when no rows were collected
all_topics_representative_docs_df = pd.DataFrame(
    representative_rows, columns=['Topic', 'Representative Post']
)
all_topics_representative_docs_df.to_csv('Model_0_Topics_Representative_Posts.csv')
all_topics_representative_docs_df

Unnamed: 0,Topic,Representative Post
0,0,they are owned china
1,0,taiwan worried usa politics sorry israelite su...
2,0,let every patriot refuse purchase good made ch...
3,0,stated would shut campaign went hiding pandemi...
4,0,always case not death level case twisted mean ...
...,...,...
913,9,not osama bin ladin
914,9,thing happen both buried sea least shot bin la...
915,9,right not sure anything lin might distracting ...
916,9,glad see others making switch


Visualizations of Topics

In [None]:
# Visualize Topics -> Intertopic Distance Map
# The call's return value is the cell's last expression, so the notebook displays it.
topic_model.visualize_topics()

In [None]:
# Visualize Topics -> Barchart
# The call's return value is the cell's last expression, so the notebook displays it.
topic_model.visualize_barchart()

In [None]:
# Visualize Topics -> Hierarchy
# The call's return value is the cell's last expression, so the notebook displays it.
topic_model.visualize_hierarchy()

In [None]:
!pip install hdbscan --no-cache-dir --no-binary :all: --no-build-isolation
!pip install hdbscan --no-cache-dir --no-binary :all:
!pip uninstall numpy
!pip install numpy==1.19

In [None]:
# Optional snippets, kept for reference and intentionally not executed.
# They are written as comments rather than as a bare triple-quoted string,
# because a string literal that ends a cell is echoed back as the cell's output.
# NOTE(review): the file name below says "Model_1" although this notebook is
# Model 0 — confirm the intended name before enabling.
#
# # Save topic model
# topic_model.save("Model_1")
# # Load saved model
# loaded_model = BERTopic.load("Model_1")
# # Access single topic -> topic 0 = most frequent topic that was generated
# topic_model.get_topic(0)
# # Find topics most similar to a search_term
# similar_topics, similarity = topic_model.find_topics("election", top_n=5)
# topic_model.get_topic(similar_topics[0])
# # Visualize Topic -> Similarity (Heatmap)
# topic_model.visualize_heatmap()