In [10]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Term extractor for the topic representations: English stop words removed,
# 2- to 4-word n-grams only, sublinear term-frequency scaling enabled.
# NOTE(review): BERTopic documents `vectorizer_model` as a CountVectorizer —
# confirm that TF-IDF weighting at this stage is intentional.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(2, 4),
    sublinear_tf=True,
)

# Sentence-embedding model handed to BERTopic as its embedding backend.
sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Assemble the topic model from the two components above.
topic_model = BERTopic(
    embedding_model=sentence_model,
    vectorizer_model=vectorizer,
)

## Read in data

In [19]:
# Load the dataset from amp.csv (path is relative to the working directory).
df = pd.read_csv("amp.csv")

In [20]:
# Preview the first rows.
# NOTE(review): the rendered preview includes the authorId column, which
# contains an email address — clear or redact this output before sharing.
df.head()

Unnamed: 0,date,attachments,textTranslated,detectedLanguage,likeCount,countries,authorId,url,platform,commentCount,...,submittedLanguage,themeIds,issueIds,text,updatedAt,id,index,type,issueName,themeName
0,2023-06-03,2023/06/08/evidence/749a5972-c537-4f93-80de-ad...,[object Object],en,76.0,NG,rhysoneill@gmail.com,https://www.facebook.com/instablog9ja/posts/pf...,Facebook,9.0,...,en,e83rfYgBnMmOXZCbaYt2,g80Cf4gBnMmOXZCbX4tn,actor jamie foxx reportedly par lyzed and bl n...,2023-06-08T16:14:34.393Z,749a5972-c537-4f93-80de-adb0c73527a2,prod-evidence-v1,_doc,COVID-19,Vaccine Side Effects
1,2023-06-05,,[object Object],en,2.0,KE,rhysoneill@gmail.com,https://www.facebook.com/robertalai/posts/pfbi...,Facebook,,...,en,fs31fogBnMmOXZCbnosl,g80Cf4gBnMmOXZCbX4tn,use alot of fresh ginger when it strikes your ...,2023-06-08T17:13:15.540Z,11b135fd-d34f-4bc5-9e51-b4ed28f22a22,prod-evidence-v1,_doc,COVID-19,Home Remedies
2,2023-06-07,,[object Object],en,1.0,ZA,rhysoneill@gmail.com,https://www.facebook.com/permalink.php?story_f...,Facebook,,...,en,,g80Cf4gBnMmOXZCbX4tn,vaccine kills government pushed a unapproved v...,2023-06-08T18:24:00.164Z,9f635c18-49c4-4d26-ada9-5d6ae2706f1c,prod-evidence-v1,_doc,COVID-19,
3,2023-05-17,2023/06/08/evidence/af0d9542-930b-4101-a786-20...,[object Object],en,,"ZA,Africa",rhysoneill@gmail.com,https://t.me/SAAwakened/138313,Telegram,,...,en,es3rfYgBnMmOXZCbaYt2,g80Cf4gBnMmOXZCbX4tn,reminder share share share stop medical col...,2023-06-08T14:30:29.975Z,af0d9542-930b-4101-a786-202710a13f26,prod-evidence-v1,_doc,COVID-19,Corruption
4,2023-06-08,2023/06/08/evidence/085206d6-bbb6-460e-b96e-98...,[object Object],en,,ZA,rhysoneill@gmail.com,,WhatsApp,,...,en,e83rfYgBnMmOXZCbaYt2,,the south african government has established a...,2023-06-08T14:19:09.429Z,ea9ee102-2d4f-48f3-b1f4-13b2cb4714ba,prod-evidence-v1,_doc,,Vaccine Side Effects


### Fit model

In [11]:
# Fit BERTopic on the text column; `topics` holds one topic id per document,
# in the same order as the rows of df.
# - Documents are passed as a plain list of strings, the input type the
#   BERTopic API documents.
# - The second return value gets a real name instead of `_`: assigning to `_`
#   in IPython/Jupyter shadows the kernel's last-output variable for the rest
#   of the session.
topics, probs = topic_model.fit_transform(df['text'].tolist())

In [25]:
# NOTE(review): `topic_grams` is assigned in a later cell of this notebook, so
# on a fresh kernel this cell raises NameError unless that cell has been run
# first — consider moving this display below the cell that builds it.
topic_grams

[{'topic number': 0,
  'topic ngram 1': 'omicron xbb',
  'topic ngram 2': 'covid omicron xbb',
  'topic ngram 3': 'covid omicron',
  'topic ngram 4': 'covid 19',
  'topic ngram 5': 'vaccine injury',
  'topic ngram 6': 'reduced immunity',
  'topic ngram 7': 'covid 19 vaccines',
  'topic ngram 8': 'public health',
  'topic ngram 9': 'covid 19 injections',
  'topic ngram 10': 'hydrogel covid 19'},
 {'topic number': 1,
  'topic ngram 1': 'orange juice',
  'topic ngram 2': 'half cup',
  'topic ngram 3': 'day week',
  'topic ngram 4': 'cut pieces',
  'topic ngram 5': 'daily week',
  'topic ngram 6': 'times day',
  'topic ngram 7': 'lime orange',
  'topic ngram 8': 'bottle honey',
  'topic ngram 9': 'lime orange juice',
  'topic ngram 10': 'day weeks'}]

In [12]:
# Build one record per topic holding its top n-grams, then tabulate them.
# Each record has the key 'topic number' plus 'topic ngram 1', 'topic ngram 2', ...
TOP_N_NGRAMS = 10  # maximum number of n-grams kept per topic

topic_grams = []
# Walk the topic ids that were actually assigned instead of assuming they are
# the contiguous range 0..N-1. Negative ids (the -1 outlier bucket) were never
# visited by the original range-based loop and are still left out.
for k in sorted(t for t in set(topics) if t >= 0):
    cur_top = topic_model.get_topic(k)
    if not cur_top:
        # Nothing usable came back for this id — skip it.
        continue
    cur_d = {'topic number': int(k)}
    # Slice rather than index so a topic with fewer than TOP_N_NGRAMS terms
    # yields a shorter record instead of raising IndexError.
    for j, term in enumerate(cur_top[:TOP_N_NGRAMS], start=1):
        # Each entry is indexable; position 0 is the n-gram text.
        cur_d[f'topic ngram {j}'] = term[0]
    topic_grams.append(cur_d)
topics_df = pd.DataFrame(topic_grams)

In [13]:
# Show the per-topic n-gram table.
topics_df

Unnamed: 0,topic number,topic ngram 1,topic ngram 2,topic ngram 3,topic ngram 4,topic ngram 5,topic ngram 6,topic ngram 7,topic ngram 8,topic ngram 9,topic ngram 10
0,0,omicron xbb,covid omicron xbb,covid omicron,covid 19,vaccine injury,reduced immunity,covid 19 vaccines,public health,covid 19 injections,hydrogel covid 19
1,1,orange juice,half cup,day week,cut pieces,daily week,times day,lime orange,bottle honey,lime orange juice,day weeks


In [14]:
# Summary table of the fitted topics (columns shown below: Topic, Count, Name).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,30,-1_clot brain_dr drew_lyzed bl_lyzed bl nd
1,0,22,0_omicron xbb_covid omicron xbb_covid omicron_...
2,1,10,1_orange juice_half cup_day week_cut pieces


### Add topic assignments back to the original dataframe

In [21]:
# Attach each row's topic id as a new column.
# Assumes `topics` is aligned row-for-row with df (it was produced from
# df['text']) — verify if df has been filtered or reordered since fitting.
df['topicId'] = topics

In [None]:
# Left-join the per-topic summary onto every row via its topic id.
# DataFrame.merge requires a DataFrame or Series on the right-hand side; the
# BERTopic model object is neither, so join against its topic-info table,
# which carries the `Topic` key column used by `right_on`.
df.merge(topic_model.get_topic_info(), how="left", left_on="topicId", right_on="Topic")