In [1]:
import pandas as pd
import bertopic

# 1. Data Check

In [2]:
INPUT = "/VData/scro4316/ct_prevalence/calvin_posts_cleaned.parquet"
df = pd.read_parquet(INPUT)
total = df['post_text'].shape[0]
na_rows = df['post_text'].isna().sum()
missing_rate = na_rows / total * 100
platform_counts = df['platform'].value_counts()

print(f"The dataset calvin_posts contains: {total} records")
print(f"However, {na_rows} records are missing values (worth checking) - {missing_rate:.2f}% missing rate")
print("Platform distribution for bertopic is executed for following data:")
print(platform_counts)

The dataset calvin_posts contains: 555872 records
However, 0 records are missing values (worth checking) - 0.00% missing rate
Platform distribution for bertopic is executed for following data:
platform
truthsocial    186506
gab            112722
X               86061
4chan           62638
bluesky         53242
gettr           42091
fediverse       12612
Name: count, dtype: int64


In [None]:
df.columns


Index(['id', 'platform', 'topic', 'post_text', 'post_clean', 'embed_id'], dtype='object')

In [4]:
print(df['platform'].value_counts())
print(df['topic'].value_counts())

platform
truthsocial    186506
gab            112722
X               86061
4chan           62638
bluesky         53242
gettr           42091
fediverse       12612
Name: count, dtype: int64
topic
KEYWORDS_COVID19    275457
KEYWORDS_NWO        218558
KEYWORDS_ALIEN       49343
KEYWORDS_9_11         8147
KEYWORDS_MOON         4367
Name: count, dtype: int64


# 2. Preprocessing
- remove NA content, duplicates, urls, from 'post_text' column.
- use index str as `embed id` to avoid matching erros in long-id strings (with letters, numbers and special characters).

In [25]:
from langdetect import detect, LangDetectException
from tqdm import tqdm
import re
import pandas as pd

tqdm.pandas()

# ===== PRE-COMPILE ALL REGEX PATTERNS (ONE TIME ONLY) =====
URL_PATTERN = re.compile(r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
PARTIAL_URL_PATTERN = re.compile(r'https?://\S*')
DOMAIN_PATTERN = re.compile(r'\s(?:www\.|(?:[\w-]+\.)+(?:com|net|org|edu|gov|mil|biz|info|io|me|tv|[\w]{2,}))\S*')
FRAGMENTS_PATTERN = re.compile(r'(?:press\.coop|gab\.com|youtube\.com|bitchute\.com|imdb\.com)\/\S*')
ASCII_PATTERN = re.compile(r'[^\x00-\x7F]+')
SPACE_PATTERN = re.compile(r'\s+')

def remove_non_english(text):
    """
    Language detection using langdetect
    """
    if not isinstance(text, str) or len(text.strip()) < 3:
        return text
    
    try:
        detected_lang = detect(text)
        if detected_lang != 'en':
            return None
        return text
    except LangDetectException:
        # If detection fails, keep the text
        return text    

def remove_urls(text):
    """
    Remove all URLs from the text using pre-compiled patterns
    """
    if not isinstance(text, str):
        return text
    
    # Use pre-compiled patterns (NO recompilation)
    text = URL_PATTERN.sub('', text)
    text = PARTIAL_URL_PATTERN.sub('', text)
    text = DOMAIN_PATTERN.sub(' ', text)
    text = FRAGMENTS_PATTERN.sub('', text)
    
    return text

def clean_text(text):
    """
    Comprehensive text cleaning function with optimized regex
    """
    if not isinstance(text, str) or len(text.strip()) == 0:
        return None
    
    # First check if text is English (before removing content)
    text = remove_non_english(text)
    if text is None:
        return None
        
    # Then remove URLs
    text = remove_urls(text)
    
    # Remove emojis and non-ASCII characters using pre-compiled pattern
    text = ASCII_PATTERN.sub('', text)
    
    # Remove multiple spaces using pre-compiled pattern
    text = SPACE_PATTERN.sub(' ', text).strip()
    
    # Return None if text becomes empty after cleaning
    return text if len(text) > 0 else None

In [26]:
df = df.dropna()
df = df.drop_duplicates(subset=['id'])
df['post_clean'] = df['post_text'].progress_apply(clean_text)

  0%|          | 86/555872 [00:00<10:48, 856.89it/s]

100%|██████████| 555872/555872 [14:42<00:00, 630.19it/s] 


In [27]:
# save cleaned data
df = df.reset_index(drop=True)
df['embed_id'] = df.index.astype(str)
OUTPUT = "/VData/scro4316/ct_prevalence/calvin_posts_cleaned.parquet"
df.to_parquet(OUTPUT, index=False)


# 3. pre-embed texts with Mpnet

In [6]:
df = pd.read_parquet("/VData/scro4316/ct_prevalence/calvin_posts_cleaned.parquet")

In [7]:
import os
from openai import OpenAI
import datetime
import numpy as np
import sentence_transformers
from sentence_transformers import SentenceTransformer
import torch
import gc

# Clear any existing CUDA memory first
torch.cuda.empty_cache()
gc.collect()

# Set GPU device to 1 (and verify it)
torch.cuda.set_device(1)
DEVICE = "cuda:1"
print("Current CUDA device:", torch.cuda.current_device())
print("Using device:", torch.cuda.get_device_properties(1))

SERVER = "/VData/scro4316/ct_prevalence"
OUTPUT_PATH = f"{SERVER}/results"
EMBEDDINGS_OUTPUT_PATH = f"{SERVER}/embeddings"

#  Embed claim text and save it to the embedding output ######
# Prepare data for embedding (using unique texts only)
ids = df['embed_id'].astype(str).tolist()  # Representative post_ids for unique texts
docs = df['post_clean'].tolist()  # Unique texts only

# Additional validation - ensure all docs are strings
clean_docs = []
clean_ids = []
# ensure no 0-length strings after cleaning
for i, doc in enumerate(docs):
    if isinstance(doc, str) and len(doc.strip()) > 0:
        clean_docs.append(doc.strip())
        clean_ids.append(ids[i])

print(f"Final validation: {len(clean_docs)} valid documents out of {len(docs)}")
ids = clean_ids
docs = clean_docs
CHUNK_SIZE = 10000  # Number of documents per chunk

#### 3. Generate embeddings and save them in chunks ######
def generate_and_save_embeddings(docs, ids, output_path, chunk_size=CHUNK_SIZE):
    """
    Generate embeddings for documents using OpenAI API and save them in chunks.
    """
    os.makedirs(output_path, exist_ok=True)
    
    # Verify GPU device before loading model
    print(f"Current GPU before model load: {torch.cuda.current_device()}")
    
    # Load embedding model
    print("Loading all-mpnet-base-v2 model...")
    print("Before model load:", torch.cuda.memory_allocated(device=DEVICE) / 1e9, "GB")
    model = sentence_transformers.SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=DEVICE)
    print(f"Model device: {next(model.parameters()).device}")
    print("After model load:", torch.cuda.memory_allocated(device=DEVICE) / 1e9, "GB")
    
    # Split into chunks
    docs_chunks = [docs[x:x+chunk_size] for x in range(0, len(docs), chunk_size)]
    ids_chunks = [ids[x:x+chunk_size] for x in range(0, len(ids), chunk_size)]
    
    for i in range(len(docs_chunks)):
        out_file = f"{output_path}/embeddings_{i+1}.npy"
        if os.path.isfile(out_file):
            print(f"Chunk {i+1} already exists, skipping...")
            continue
            
        print(f"Processing chunk {i+1} of {len(docs_chunks)}")
        print(f"Starting at {datetime.datetime.now()}")
        print(f"Current GPU device: {torch.cuda.current_device()}")  # Verify GPU for each chunk
        
        # Generate embeddings for this chunk
        embeddings = model.encode(docs_chunks[i], show_progress_bar=True, batch_size=32, device=DEVICE)
        embeddings_dict = dict(zip(ids_chunks[i], embeddings))
        
        # Save chunk to file
        np.save(out_file, embeddings_dict)
        print(f"Saved {len(embeddings_dict)} embeddings to {out_file}")
        
        # Clear GPU memory
        del embeddings
        del embeddings_dict
        torch.cuda.empty_cache()
        gc.collect()
        
        print('Memory after cleanup:', round(torch.cuda.memory_allocated(1)/1024**3,1), 'GB')
    
    print("Embedding generation completed!")

def load_embeddings(file_path, ids_order=None):
    """Load embeddings from a numpy file."""
    try:
        embeddings_dict = np.load(file_path, allow_pickle=True).item()
        if ids_order:
            embeddings_ordered = {id: embeddings_dict[id] for id in ids_order if id in embeddings_dict}
            return embeddings_ordered
        return embeddings_dict
    except Exception as e:
        print(f"Failed to load embeddings from {file_path} with error {e}")
        return None

def load_all_embeddings(folder_path, ids_order=None):
    """Load all embeddings from a directory."""
    embeddings_dict = {}
    
    if not os.path.exists(folder_path):
        print(f"Embeddings folder {folder_path} does not exist!")
        return embeddings_dict
    
    # Get filenames and sort by number
    filenames = sorted(os.listdir(folder_path), key=lambda x: int(re.search(r'\d+', x).group()))
    
    for file_name in filenames:
        if file_name.endswith('.npy'):
            chunk_embeddings = load_embeddings(os.path.join(folder_path, file_name), ids_order)
            if chunk_embeddings:
                embeddings_dict.update(chunk_embeddings)
                print(f"Loaded {len(chunk_embeddings)} embeddings from {file_name}")
    
    print(f"Total embeddings loaded: {len(embeddings_dict)}")
    return embeddings_dict

Current CUDA device: 1
Using device: _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81105MB, multi_processor_count=132, uuid=1cc4a5f1-9c3a-e533-51f5-2db38a687abb, L2_cache_size=50MB)
Final validation: 554943 valid documents out of 555872


In [None]:
generate_and_save_embeddings(docs, ids, EMBEDDINGS_OUTPUT_PATH)

# 4. Bertopic modeling by topic and platforms

In [8]:
import re
embeddings = load_all_embeddings(EMBEDDINGS_OUTPUT_PATH, ids)

Loaded 10000 embeddings from embeddings_1.npy
Loaded 10000 embeddings from embeddings_2.npy
Loaded 10000 embeddings from embeddings_3.npy
Loaded 10000 embeddings from embeddings_4.npy
Loaded 10000 embeddings from embeddings_5.npy
Loaded 10000 embeddings from embeddings_6.npy
Loaded 10000 embeddings from embeddings_7.npy
Loaded 10000 embeddings from embeddings_8.npy
Loaded 10000 embeddings from embeddings_9.npy
Loaded 10000 embeddings from embeddings_10.npy
Loaded 10000 embeddings from embeddings_11.npy
Loaded 10000 embeddings from embeddings_12.npy
Loaded 10000 embeddings from embeddings_13.npy
Loaded 10000 embeddings from embeddings_14.npy
Loaded 10000 embeddings from embeddings_15.npy
Loaded 10000 embeddings from embeddings_16.npy
Loaded 10000 embeddings from embeddings_17.npy
Loaded 10000 embeddings from embeddings_18.npy
Loaded 10000 embeddings from embeddings_19.npy
Loaded 10000 embeddings from embeddings_20.npy
Loaded 10000 embeddings from embeddings_21.npy
Loaded 10000 embedding

In [None]:
import numpy as np
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import matplotlib.pyplot as plt
from typing import List, Dict, Any
def analyze_topic_platform(df: pd.DataFrame, 
                         topic: str, 
                         platform: str, 
                         embeddings_dict: Dict[str, np.ndarray],
                         min_cluster_size: int = 20) -> tuple:
    """
    Analyze documents for a specific topic and platform combination.
    Returns topic model and document embeddings.
    """
    # Filter data
    mask = (df['topic'] == topic) & (df['platform'] == platform)
    subset_df = df[mask].copy()
    
    # Print diagnostic information
    print(f"Found {len(subset_df)} documents for {topic} on {platform}")
    
    # Filter for documents that have embeddings
    subset_df = subset_df[subset_df['embed_id'].astype(str).isin(embeddings_dict.keys())].copy()
    
    if len(subset_df) < min_cluster_size:
        print(f"Insufficient data for {topic} on {platform}: {len(subset_df)} documents")
        return None, None, None
    
    # Get documents and their embeddings
    doc_ids = subset_df['embed_id'].astype(str).tolist()
    documents = subset_df['post_clean'].tolist()
    
    print(f"Processing {len(documents)} documents after filtering")
    
    try:
        doc_embeddings = np.array([embeddings_dict[id_] for id_ in doc_ids])
        
        # Initialize models with more conservative parameters
        umap_model = UMAP(
            n_neighbors=min(15, len(documents)-1),  # Ensure n_neighbors is less than n_samples
            n_components=2,
            min_dist=0.1,
            metric='cosine',
            random_state=42
        )
        
        vectorizer_model = CountVectorizer(
            stop_words="english",
            ngram_range=(1, 3),
            min_df=2  # At least 2 documents must contain the term
        )
        
        # Initialize BERTopic with more robust parameters
        topic_model = BERTopic(
            vectorizer_model=vectorizer_model,
            min_topic_size=max(min_cluster_size, 5),  # Ensure minimum size is reasonable
            nr_topics="auto",
            calculate_probabilities=True,
            verbose=True
        )
        
        # Fit the model
        topics, _ = topic_model.fit_transform(documents, doc_embeddings)
        
        # Check if any topics were found
        if len(topic_model.get_topic_info()) <= 1:  # Only -1 topic means no clusters found
            print(f"No meaningful topics found for {topic} on {platform}")
            return None, None, None
            
        # Reduce embeddings for visualization
        reduced_embeddings = umap_model.fit_transform(doc_embeddings)
        
        return topic_model, documents, reduced_embeddings
        
    except Exception as e:
        print(f"Error processing {topic} on {platform}: {str(e)}")
        return None, None, None

def visualize_results(topic_model: BERTopic, 
                     documents: List[str], 
                     reduced_embeddings: np.ndarray,
                     topic: str,
                     platform: str) -> None:
    """
    Create and save visualizations for topic modeling results.
    """
    # Get topic info
    topic_info = topic_model.get_topic_info()
    top_topics = topic_info[topic_info['Topic'] != -1].head(10)
    
    # Create visualization
    plt.figure(figsize=(15, 10))
    
    # Plot each topic
    unique_topics = top_topics['Topic'].tolist()
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_topics)))
    
    # Plot points
    for idx, t in enumerate(unique_topics):
        mask = np.array(topic_model.topics_) == t
        if np.any(mask):
            points = reduced_embeddings[mask]
            # Get top 10 words for this topic
            words = topic_model.get_topic(t)[:10]
            label = '_'.join([word[0] for word in words])
            plt.scatter(points[:, 0], points[:, 1], 
                       c=[colors[idx]], 
                       label=f"Topic {t}: {label}",
                       alpha=0.6, s=20)
    
    plt.title(f'Topic Distribution: {topic} on {platform}')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.tight_layout()
    
    # Save the plot
    plt.savefig(f'/VData/scro4316/ct_prevalence/results/topic_viz_{topic}_{platform}.png', 
                bbox_inches='tight', dpi=300)
    plt.close()

In [32]:
# Process each topic-platform combination
# topics = df['topic'].unique().tolist()
# topics = ['KEYWORDS_9_11','KEYWORDS_ALIEN', 'KEYWORDS_MOON']
topics = ['KEYWORDS_COVID19','KEYWORDS_NWO']

platforms = df['platform'].unique().tolist()
print("Starting analysis for each topic-platform combination...")
results_summary = []

for topic in topics:
    for platform in platforms:
        print(f"\nAnalyzing {topic} on {platform}")
        print("="*50)
        
        try:
            topic_model, documents, reduced_embeddings = analyze_topic_platform(
                df, topic, platform, embeddings
            )
            
            if topic_model is not None and documents is not None:
                # Get and display topic information
                topic_info = topic_model.get_topic_info()
                if len(topic_info) > 1:  # More than just the -1 topic
                    top_topics = topic_info[topic_info['Topic'] != -1].head(10)
                    
                    # Store results
                    results_summary.append({
                        'topic': topic,
                        'platform': platform,
                        'n_documents': len(documents),
                        'top_topics': top_topics
                    })
                    
                    # Create visualization
                    visualize_results(topic_model, documents, reduced_embeddings, topic, platform)
                    
                    # Print summary
                    print(f"\nResults for {topic} on {platform}:")
                    print(f"Total documents: {len(documents)}")
                    print("\nTop 10 topics:")
                    print(top_topics[['Topic', 'Count', 'Name']].to_string())
                else:
                    print(f"No meaningful topics found for {topic} on {platform}")
            
        except Exception as e:
            print(f"Error processing {topic} on {platform}: {str(e)}")
            continue
            
        print("\n" + "="*80 + "\n")

Starting analysis for each topic-platform combination...

Analyzing KEYWORDS_COVID19 on X
Found 34998 documents for KEYWORDS_COVID19 on X
Processing 34900 documents after filtering


2025-11-03 12:14:22,916 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-03 12:14:50,162 - BERTopic - Dimensionality - Completed ✓
2025-11-03 12:14:50,165 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 12:15:59,790 - BERTopic - Cluster - Completed ✓
2025-11-03 12:15:59,793 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 12:16:02,326 - BERTopic - Representation - Completed ✓
2025-11-03 12:16:02,329 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 12:16:02,408 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 12:16:04,675 - BERTopic - Representation - Completed ✓
2025-11-03 12:16:04,678 - BERTopic - Topic reduction - Reduced number of topics from 220 to 4
2025-11-03 12:16:34,927 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Results for KEYWORDS_COVID19 on X:
Total documents: 34900

Top 10 topics:
   Topic  Count                                           Name
1      0  18817               0_covid_vaccine_vaccines_covid19
2      1     96                                 1_la_il_di_che
3      2     25  2_english_golden_vaccine created_like covid19



Analyzing KEYWORDS_COVID19 on fediverse
Found 6096 documents for KEYWORDS_COVID19 on fediverse
Processing 6082 documents after filtering


2025-11-03 12:16:43,169 - BERTopic - Dimensionality - Completed ✓
2025-11-03 12:16:43,170 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 12:16:43,873 - BERTopic - Cluster - Completed ✓
2025-11-03 12:16:43,874 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 12:16:45,250 - BERTopic - Representation - Completed ✓
2025-11-03 12:16:45,252 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 12:16:45,266 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 12:16:46,491 - BERTopic - Representation - Completed ✓
2025-11-03 12:16:46,494 - BERTopic - Topic reduction - Reduced number of topics from 64 to 28
2025-11-03 12:16:55,017 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Results for KEYWORDS_COVID19 on fediverse:
Total documents: 6082

Top 10 topics:
    Topic  Count                                         Name
1       0   1949             0_covid_vaccine_vaccines_covid19
2       1    125                    1_woke_patriot_says_trump
3       2    124                      2_jews_lie_russians_jew
4       3    115                  3_usaid_funded_musk_funding
5       4     66  4_scamdemic_cdc grooming_grooming_new world
6       5     65               5_economy_assets_wealth_market
7       6     56        6_canada_canadian_minister_government
8       7     52          7_ivermectin_cancer_drug_remdesivir
9       8     49        8_covid_memes_covid1984_covid covid19
10      9     48       9_cancer_cancers_aggressive_soonshiong



Analyzing KEYWORDS_COVID19 on bluesky
Found 27487 documents for KEYWORDS_COVID19 on bluesky
Processing 27467 documents after filtering


2025-11-03 12:17:14,835 - BERTopic - Dimensionality - Completed ✓
2025-11-03 12:17:14,836 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 12:17:27,545 - BERTopic - Cluster - Completed ✓
2025-11-03 12:17:27,546 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 12:17:29,205 - BERTopic - Representation - Completed ✓
2025-11-03 12:17:29,206 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 12:17:29,236 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 12:17:30,849 - BERTopic - Representation - Completed ✓
2025-11-03 12:17:30,852 - BERTopic - Topic reduction - Reduced number of topics from 122 to 55



Results for KEYWORDS_COVID19 on bluesky:
Total documents: 27467

Top 10 topics:
    Topic  Count                                          Name
1       0  11054               0_covid_people_vaccines_vaccine
2       1    384                     1_masks_mask_wear_wearing
3       2    276                     2_uk_tories_labour_brexit
4       3    176                         3_shes_covid_lady_did
5       4    172        4_fauci_anthony_anthony fauci_function
6       5    163                  5_musk_elon_trump_trump musk
7       6    116               6_canada_trudeau_ford_canadians
8       7    104  7_facebook_zuckerberg_censor_mark zuckerberg
9       8    103  8_plandemic_real plandemic_real_plandemic 20
10      9     98                  9_ballots_2020_election_mail



Analyzing KEYWORDS_COVID19 on truthsocial
Found 98878 documents for KEYWORDS_COVID19 on truthsocial
Processing 98799 documents after filtering


2025-11-03 12:17:53,926 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-03 12:21:28,204 - BERTopic - Dimensionality - Completed ✓
2025-11-03 12:21:28,208 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 16:53:03,482 - BERTopic - Cluster - Completed ✓
2025-11-03 16:53:03,496 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 16:53:16,055 - BERTopic - Representation - Completed ✓
2025-11-03 16:53:16,060 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 16:53:17,338 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 16:53:28,543 - BERTopic - Representation - Completed ✓
2025-11-03 16:53:28,556 - BERTopic - Topic reduction - Reduced number of topics from 1017 to 811



Results for KEYWORDS_COVID19 on truthsocial:
Total documents: 98799

Top 10 topics:
    Topic  Count                                                                 Name
1       0   1583                                             0_china_chinese_ccp_land
2       1   1414  1_involved_collusion hoax_involved russia_involved russia collusion
3       2   1243                                          2_hes_heaposs_needs_nursing
4       3   1114                                      3_mrna_mortality_studies_excess
5       4   1024                                                 4_gop_zero_math_dems
6       5    965                                    5_fauci_dr fauci_aids_fauci needs
7       6    952                                                  6_flu_came_lab_died
8       7    935                                         7_bird_bird flu_chickens_flu
9       8    822               8_zuckerberg_ad_rockefeller foundation_mark zuckerberg
10      9    747                                       

2025-11-03 16:55:40,476 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-03 16:55:51,408 - BERTopic - Dimensionality - Completed ✓
2025-11-03 16:55:51,411 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 16:55:53,495 - BERTopic - Cluster - Completed ✓
2025-11-03 16:55:53,496 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 16:55:55,733 - BERTopic - Representation - Completed ✓
2025-11-03 16:55:55,735 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 16:55:55,747 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 16:55:57,854 - BERTopic - Representation - Completed ✓
2025-11-03 16:55:57,858 - BERTopic - Topic reduction - Reduced number of topics from 61 to 36



Results for KEYWORDS_COVID19 on 4chan:
Total documents: 16358

Top 10 topics:
    Topic  Count                                 Name
1       0   5273          0_covid_vaccine_people_just
2       1    627              1_jews_jewish_jew_covid
3       2    523       2_clot_clot shot_shot_clotshot
4       3    232           3_ukraine_russia_war_putin
5       4    223              4_people_shit_like_just
6       5    201             5_eggs_chickens_egg_bird
7       6    175  6_canada_canadians_trudeau_canadian
8       7    154             7_women_white_men_people
9       8    128              8_god_beast_mark_christ
10      9    113                  9_5g_cloud_ngo_able



Analyzing KEYWORDS_COVID19 on gettr
Found 28834 documents for KEYWORDS_COVID19 on gettr
Processing 28762 documents after filtering


2025-11-03 16:56:09,054 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-03 16:56:31,625 - BERTopic - Dimensionality - Completed ✓
2025-11-03 16:56:31,627 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 17:00:24,049 - BERTopic - Cluster - Completed ✓
2025-11-03 17:00:24,052 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 17:00:28,070 - BERTopic - Representation - Completed ✓
2025-11-03 17:00:28,075 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 17:00:28,283 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 17:00:32,258 - BERTopic - Representation - Completed ✓
2025-11-03 17:00:32,262 - BERTopic - Topic reduction - Reduced number of topics from 377 to 34



Results for KEYWORDS_COVID19 on gettr:
Total documents: 28762

Top 10 topics:
    Topic  Count                                           Name
1       0  17830               0_covid_vaccine_covid19_vaccines
2       1    105                       1_jews_israel_jewish_jew
3       2     95    2_plandemic_plan_plandemic 20_new plandemic
4       3     74                   3_newsom_california_la_gavin
5       4     74                 4_dangerous_uk_government_just
6       5     65              5_birx_covid_deborah birx_deborah
7       6     60              6_carney_canada_trudeau_canadians
8       7     57       7_prosecuted_fully_willful_fully exposed
9       8     55  8_davos_party_davos party_virus covid vaccine
10      9     50                 9_pope_church_vatican_catholic



Analyzing KEYWORDS_COVID19 on gab
Found 62795 documents for KEYWORDS_COVID19 on gab
Processing 62634 documents after filtering


2025-11-03 17:00:53,074 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-03 17:01:42,579 - BERTopic - Dimensionality - Completed ✓
2025-11-03 17:01:42,582 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 17:44:13,829 - BERTopic - Cluster - Completed ✓
2025-11-03 17:44:13,831 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 17:44:30,052 - BERTopic - Representation - Completed ✓
2025-11-03 17:44:30,060 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 17:44:30,731 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 17:44:46,558 - BERTopic - Representation - Completed ✓
2025-11-03 17:44:46,568 - BERTopic - Topic reduction - Reduced number of topics from 693 to 37



Results for KEYWORDS_COVID19 on gab:
Total documents: 62634

Top 10 topics:
    Topic  Count                                                     Name
1       0  39514                             0_covid_people_vaccine_trump
2       1    779                             1_born_prime_nonprime_verses
3       2    134                        2_clotshot_dies_age_notabledeaths
4       3    124                       3_cabal_obsidian_digital euro_usdr
5       4    111  4_whats_trump2020_socialist_national security apparatus
6       5     62                                5_dea_fbi_news_precursors
7       6     53                           6_lesson_ninja_ninja kids_rick
8       7     51                     7_aluminum_rifles_243_243 winchester
9       8     50                         8_uk_globalist_pakistani_muslims
10      9     46             9_said_nothingwhen_said nothingwhen_illegals



Analyzing KEYWORDS_NWO on X
Found 32517 documents for KEYWORDS_NWO on X


2025-11-03 17:45:32,403 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Processing 32406 documents after filtering


2025-11-03 17:45:56,145 - BERTopic - Dimensionality - Completed ✓
2025-11-03 17:45:56,147 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 17:46:27,843 - BERTopic - Cluster - Completed ✓
2025-11-03 17:46:27,846 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 17:46:30,241 - BERTopic - Representation - Completed ✓
2025-11-03 17:46:30,244 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 17:46:30,307 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 17:46:32,503 - BERTopic - Representation - Completed ✓
2025-11-03 17:46:32,507 - BERTopic - Topic reduction - Reduced number of topics from 191 to 133
2025-11-03 17:46:51,911 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Results for KEYWORDS_NWO on X:
Total documents: 32406

Top 10 topics:
    Topic  Count                                                              Name
1       0   1477                          0_illuminati_globalist_illuminatiam_join
2       1   1123                         1_freemasonry_freemasons_freemason_israel
3       2   1060  2_depopulation_depopulation agenda_population_population control
4       3    721                                     3_obama_deep state_deep_biden
5       4    695                                4_canada_carney_canadians_canadian
6       5    544                   5_wwg1wga_ncswic_wwg1wga wwg1wga_ncswic wwg1wga
7       6    540                              6_nwo_antichrist_nwo nwo_welcome nwo
8       7    485                             7_psyop_psyop psyop_psyop just_psyops
9       8    427                             8_reset_great reset_great_reset great
10      9    398                                       9_elon_musk_trump_elon musk



Analyzing KEY

2025-11-03 17:47:02,512 - BERTopic - Dimensionality - Completed ✓
2025-11-03 17:47:02,514 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 17:47:02,670 - BERTopic - Cluster - Completed ✓
2025-11-03 17:47:02,671 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 17:47:03,201 - BERTopic - Representation - Completed ✓
2025-11-03 17:47:03,202 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 17:47:03,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 17:47:03,674 - BERTopic - Representation - Completed ✓
2025-11-03 17:47:03,676 - BERTopic - Topic reduction - Reduced number of topics from 31 to 17
2025-11-03 17:47:14,659 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Results for KEYWORDS_NWO on fediverse:
Total documents: 2845

Top 10 topics:
    Topic  Count                                                  Name
1       0   1071                         0_world_new_order_world order
2       1    171                         1_vaccines_mrna_covid_covid19
3       2    137  2_human blood_blood sacrifice_real illuminati_helped
4       3    130      3_illuminati_freemasonry_freemasons_organization
5       4    130  4_population_depopulation_control_population control
6       5     74                         5_psyop_jfk_cia_assassination
7       6     56                      6_musk_elon_elon musk_technology
8       7     46                       7_reset_great reset_great_going
9       8     43                        8_canada_carney_mark carney_eu
10      9     35                  9_globohomo_science_truth_reptilians



Analyzing KEYWORDS_NWO on bluesky
Found 13094 documents for KEYWORDS_NWO on bluesky
Processing 13082 documents after filtering


2025-11-03 17:47:24,234 - BERTopic - Dimensionality - Completed ✓
2025-11-03 17:47:24,236 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 17:47:27,573 - BERTopic - Cluster - Completed ✓
2025-11-03 17:47:27,574 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 17:47:28,277 - BERTopic - Representation - Completed ✓
2025-11-03 17:47:28,278 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 17:47:28,297 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 17:47:28,937 - BERTopic - Representation - Completed ✓
2025-11-03 17:47:28,940 - BERTopic - Topic reduction - Reduced number of topics from 92 to 51



Results for KEYWORDS_NWO on bluesky:
Total documents: 13082

Top 10 topics:
    Topic  Count                                                                      Name
1       0   2535                             0_world order_order_new world_new world order
2       1   1095                             1_illuminati_whatsapp_great illuminati_member
3       2    897                            2_population_population control_control_people
4       3    777                   3_controlled opposition_controlled_opposition_democrats
5       4    602  4_freemason scheme_democracy freemason_democracy freemason scheme_scheme
6       5    258                                   5_nwo_fuse_long_secret world government
7       6    240                              6_freemasons_freemasonry_demon_strange oaths
8       7    192                                    7_psyop_government psyop_cia_cia psyop
9       8    188                                            8_israel_gaza_genocide_zionist
10      9    

2025-11-03 17:47:38,148 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-03 17:48:49,984 - BERTopic - Dimensionality - Completed ✓
2025-11-03 17:48:49,989 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 18:51:27,792 - BERTopic - Cluster - Completed ✓
2025-11-03 18:51:27,796 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 18:51:37,898 - BERTopic - Representation - Completed ✓
2025-11-03 18:51:37,905 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 18:51:38,622 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 18:51:48,539 - BERTopic - Representation - Completed ✓
2025-11-03 18:51:48,549 - BERTopic - Topic reduction - Reduced number of topics from 733 to 538



Results for KEYWORDS_NWO on truthsocial:
Total documents: 76107

Top 10 topics:
    Topic  Count                                                             Name
1       0   5014                                 0_paper_ballots_people need_hand
2       1   2438                       1_world order_world_world government_order
3       2   2124               2_population_depopulation_population control_gates
4       3   1601                                  3_putin_ukraine_russia_zelensky
5       4   1346  4_amen thank wwg1wga_amen thank_wwg1wga amen thank_wwg1wga amen
6       5   1340                                         5_jews_jewish_israel_jew
7       6   1222                                          6_nwo_nwo nwo_rt_nwo rt
8       7    970                    7_freemasons_freemason_freemasonry_illuminati
9       8    903                                          8_smart_fires_la_cities
10      9    687                             9_canada_carney_north_north american



Analyzing KEYW

2025-11-03 18:52:58,351 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-03 18:53:21,704 - BERTopic - Dimensionality - Completed ✓
2025-11-03 18:53:21,708 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 18:54:05,211 - BERTopic - Cluster - Completed ✓
2025-11-03 18:54:05,215 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 18:54:09,401 - BERTopic - Representation - Completed ✓
2025-11-03 18:54:09,404 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 18:54:09,472 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 18:54:13,484 - BERTopic - Representation - Completed ✓
2025-11-03 18:54:13,489 - BERTopic - Topic reduction - Reduced number of topics from 196 to 118
2025-11-03 18:54:34,969 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Results for KEYWORDS_NWO on 4chan:
Total documents: 33948

Top 10 topics:
    Topic  Count                                                               Name
1       0   4973                                     0_jews_jewish_freemasons_psyop
2       1   2911                                      1_ukraine_russia_nato_russian
3       2    574                                          2_china_japan_chinese_ccp
4       3    535                                     3_covid_vaccine_vaccines_virus
5       4    514  4_ai_total surveillance_killing jews politicians_jews politicians
6       5    380                                          5_illuminati_im_know_like
7       6    307                                 6_canada_carney_canadian_canadians
8       7    304                                        7_elon_musk_trump_elon musk
9       8    289          8_white men_public enemy number_public enemy_enemy number
10      9    288   9_population_population control_depopulation_depopulation agenda



2025-11-03 18:54:42,849 - BERTopic - Dimensionality - Completed ✓
2025-11-03 18:54:42,852 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 18:54:46,181 - BERTopic - Cluster - Completed ✓
2025-11-03 18:54:46,182 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 18:54:48,623 - BERTopic - Representation - Completed ✓
2025-11-03 18:54:48,626 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 18:54:48,647 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 18:54:50,961 - BERTopic - Representation - Completed ✓
2025-11-03 18:54:50,965 - BERTopic - Topic reduction - Reduced number of topics from 98 to 33



Results for KEYWORDS_NWO on gettr:
Total documents: 11536

Top 10 topics:
    Topic  Count                               Name
1       0   5737          0_world_trump_people_just
2       1    338             1_nwo_sb_divided_trump
3       2    313     2_communist_fires_smart_cities
4       3    311  3_yhwh_swamp_america_swamp combat
5       4    240  4_canada_carney_trudeau_greenland
6       5    192      5_ukraine_zelensky_war_russia
7       6    174        6_elon_musk_elon musk_musks
8       7     92        7_shes_world_eu_world order
9       8     76         8_women_men_trans_children
10      9     71            9_ccp_china_chinese_guo



Analyzing KEYWORDS_NWO on gab
Found 48373 documents for KEYWORDS_NWO on gab
Processing 48303 documents after filtering


2025-11-03 18:54:58,885 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-03 18:55:35,746 - BERTopic - Dimensionality - Completed ✓
2025-11-03 18:55:35,748 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-03 19:05:42,743 - BERTopic - Cluster - Completed ✓
2025-11-03 19:05:42,746 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-03 19:05:55,713 - BERTopic - Representation - Completed ✓
2025-11-03 19:05:55,719 - BERTopic - Topic reduction - Reducing number of topics
2025-11-03 19:05:56,041 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-03 19:06:09,141 - BERTopic - Representation - Completed ✓
2025-11-03 19:06:09,151 - BERTopic - Topic reduction - Reduced number of topics from 491 to 329



Results for KEYWORDS_NWO on gab:
Total documents: 48303

Top 10 topics:
    Topic  Count                                                     Name
1       0   2626                          0_zionist_zionist jews_jews_usa
2       1   1764                       1_honoring_noahide_kushner_deceive
3       2   1107                            2_nwo_nwo wants_trump nwo_hes
4       3   1064                  3_climate_climate change_science_change
5       4   1010                           4_vaccines_gates_vaccine_fauci
6       5    784                  5_pope_catholic_vatican_catholic church
7       6    724  6_hollywood illuminati_hollywood_illuminati_deep church
8       7    706       7_trump2020_socialist_stopthesteal_trump trump2020
9       8    543                            8_north_screen_china_japanese
10      9    477                            9_white_whites_racism_kalergi




In [33]:
import pandas as pd
import numpy as np
from pathlib import Path

# Create results directory if it doesn't exist
results_dir = Path("/VData/scro4316/ct_prevalence/results")
results_dir.mkdir(exist_ok=True)

# Store results for each topic-platform combination
topic_platform_results = {}

# First, save individual results as CSVs
for result in results_summary:
    topic = result['topic']
    platform = result['platform']
    top_topics_df = result['top_topics']
    
    # Add representative words column
    top_topics_df['Rep_Words'] = top_topics_df['Name'].apply(lambda x: ', '.join(x.split('_')[:10]))
    top_topics_df['Doc_Count'] = top_topics_df['Count']
    
    # Save to CSV
    filename = f"{topic}_{platform}_topics.csv"
    top_topics_df.to_csv(results_dir / filename)
    
    # Store in dictionary for table creation
    if topic not in topic_platform_results:
        topic_platform_results[topic] = {}
    topic_platform_results[topic][platform] = {
        'topics': top_topics_df['Rep_Words'].tolist(),
        'counts': top_topics_df['Count'].tolist()
    }

topics = ['KEYWORDS_COVID19','KEYWORDS_NWO']
# Create comparison tables for each topic
for topic in topics:
    print(f"\n{'-'*40}")
    print(f"Topic: {topic}")
    print(f"{'-'*40}")
    
    # Create DataFrame with platforms as columns
    table_data = []
    for i in range(10):  # For each of the top 10 topics
        row = {'Topic_Num': f'Topic {i}'}
        for platform in platforms:
            if platform in topic_platform_results.get(topic, {}):
                platform_data = topic_platform_results[topic][platform]
                if i < len(platform_data['topics']):
                    words = platform_data['topics'][i]
                    count = platform_data['counts'][i]
                    row[platform] = f"[{words}], n={count}"
                else:
                    row[platform] = "null"
            else:
                row[platform] = "null"
        table_data.append(row)
    
    # Create and display the table
    comparison_df = pd.DataFrame(table_data)
    comparison_df.set_index('Topic_Num', inplace=True)
    
    # Save comparison table
    filename = f"{topic}_platform_comparison.csv"
    comparison_df.to_csv(results_dir / filename)
    
    # Display the table
    pd.set_option('display.max_colwidth', None)
    print(comparison_df)
    print("\n")

print("Results have been saved to:", results_dir)


----------------------------------------
Topic: KEYWORDS_COVID19
----------------------------------------
                                                                   X  \
Topic_Num                                                              
Topic 0              [0, covid, vaccine, vaccines, covid19], n=18817   
Topic 1                                   [1, la, il, di, che], n=96   
Topic 2    [2, english, golden, vaccine created, like covid19], n=25   
Topic 3                                                         null   
Topic 4                                                         null   
Topic 5                                                         null   
Topic 6                                                         null   
Topic 7                                                         null   
Topic 8                                                         null   
Topic 9                                                         null   

                            