In [None]:
from bertopic import BERTopic
import pandas as pd
import os
import csv
from functools import reduce
import json
import configparser
import matplotlib.pyplot as plt
from bertopic.representation import KeyBERTInspired
import urllib.request

In [None]:
# Local file names of the corpus (CSV) and the topic dictionary (JSON)
corpus_file = 'democracy_reports_corpus.csv'
dictionary_file = 'dimension_dictionary.json'

# URLs the two files are fetched from when they are not present locally
corpus_file_url = "https://github.com/backdem/democracy-datasets/raw/main/democracy_reports_corpus.csv"
dictionary_file_url = "https://raw.githubusercontent.com/backdem/democracy-datasets/main/dimension_dictionary.json"

In [None]:
# Download each dataset unless a local copy already exists
for url, path in (
    (corpus_file_url, corpus_file),
    (dictionary_file_url, dictionary_file),
):
    if os.path.exists(path):
        continue
    urllib.request.urlretrieve(url, path)

In [None]:
# Load the data set of all countries, years and sources.
# `year` is kept as a string; lines starting with '#' are treated as comments.
all_countries_data = pd.read_csv(corpus_file, dtype={'year': str}, comment='#')
# Cast the sentence column to string BEFORE measuring it: a missing sentence is
# read as a float NaN, and calling .split() on it would raise AttributeError.
# NOTE(review): after the cast a missing sentence is presumably the literal
# string 'nan' (one "word") — consider dropping such rows instead.
all_countries_data['sentence'] = all_countries_data['sentence'].astype(str)
# Sentence length, counted in whitespace-separated words
all_countries_data["sentence_len"] = all_countries_data["sentence"].apply(lambda x: len(x.split()))

In [None]:
# Show every distinct country present in the corpus so that one can be chosen
countries = all_countries_data['country'].unique()
print(countries)

In [None]:
# Choose the country (or countries) to analyse. The values are matched against
# the `country` column of the corpus; only the first entry is used when naming
# the output files.
country = ["france"]

In [None]:
df = pd.DataFrame(all_countries_data)
# Keep only the rows of the selected country/countries.
# reset index; needed for proper parsing by BERT
country_data = df[df['country'].isin(country)].reset_index(drop=True)
corpus_size = len(country_data)
# Fail early with a readable message: an unknown country name yields an empty
# selection, which would otherwise surface later as an obscure error.
if corpus_size == 0:
    raise ValueError(
        f"No sentences found for country selection {country!r}; "
        "check it against the values of the 'country' column."
    )
# Total word count; the built-in column sum replaces the hand-rolled reduce().
number_of_words = int(country_data["sentence_len"].sum())

In [None]:
# Report the size of the selected corpus (sentences and words)
print(f"corpus size is {corpus_size} sentences.")
print(f"total number of words is  {number_of_words}.")
# Preview the first 10 sentences
print(country_data['sentence'].head(10))

In [None]:
# Load the dictionary of topics.
# Structure: [{"name": TOPIC_NAME, "words": NGRAMS_OF_KEYWORDS}, ...]
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform default, which differs between operating systems.
with open(dictionary_file, 'r', encoding='utf-8') as file:
    dictionary = json.load(file)
    
def get_seed_lists(dictionary, ngram_size):
    """Build one seed-keyword list per dictionary topic.

    Parameters
    ----------
    dictionary : iterable of dict
        Topic entries; each entry must provide a "words" list of keyword strings.
    ngram_size : int
        Maximum number of whitespace-separated tokens a keyword may contain
        in order to be kept.

    Returns
    -------
    list of list of str
        For every topic, in dictionary order, the keywords with at most
        ``ngram_size`` tokens. A topic without any qualifying keyword
        contributes an empty list.
    """
    return [
        [keyword for keyword in entry["words"] if len(keyword.split()) <= ngram_size]
        for entry in dictionary
    ]

In [None]:
# Create the seed lists from the dictionary, keeping only single-word keywords
# (ngram_size=1), and print them for inspection
seeds = get_seed_lists(dictionary, 1)
print(seeds)

In [None]:
# Prepare embeddings
from sentence_transformers import SentenceTransformer
# Hand the sentences over as a plain list of strings, the input type documented
# by both SentenceTransformer.encode and BERTopic. A pandas Series only works as
# long as its index is exactly 0..n-1, because items are looked up by position.
docs = country_data["sentence"].tolist()
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Use KeyBERTInspired representation model.
# This gives better names to the topics.
representation_model = KeyBERTInspired()
# Pre-compute the sentence embeddings once so they can be reused by later cells.
embeddings = sentence_model.encode(docs, show_progress_bar=True)

In [None]:
# Fit the guided BERTopic model on the pre-computed sentence embeddings.
# Embedding model choices: paraphrase-MiniLM-L3-v2 (multilingual) or all-MiniLM-L6-v2 (english).
# min_topic_size is set to 50 and n-grams range from 1 to 3;
# we need to explore these parameters. Other parameters:
# https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html
# guided topic modeling: https://maartengr.github.io/BERTopic/getting_started/guided/guided.html
# A seed topic list is a list of keyword lists, e.g.
# [["corruption"], ["elections", "election", "assembly"], ["freedom", "liberty"]]
seed_topic_list = get_seed_lists(dictionary, 3)
topic_model = BERTopic(representation_model=representation_model,
                       seed_topic_list=seed_topic_list,
                       verbose=True,
                       # Reuse the SentenceTransformer instance that produced
                       # `embeddings`: the model is not loaded a second time and
                       # always matches the model behind the document embeddings.
                       embedding_model=sentence_model,
                       min_topic_size=50,
                       n_gram_range=(1, 3)
                      ).fit(docs, embeddings)

In [None]:
# visualize the documents and their topics, reusing the pre-computed embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

In [None]:
# get topic information
info = topic_model.get_topic_info()
# normalize counts: each topic's count as a fraction of the corpus size
# (vectorized division instead of a per-element apply)
info["norm_count"] = info["Count"] / corpus_size
# print topic information
# NOTE(review): this is the row count of the info table, so it presumably
# includes the outlier topic -1 when present — confirm whether that is intended.
print(f"Number of topics: {len(info)}")
print(info.head())

In [None]:
# visualize the words/n-grams of the topics as bar charts (limited to 10 topics)
fig = topic_model.visualize_barchart(top_n_topics=10)
fig.show()

In [None]:
# visualize the fitted topics
topic_model.visualize_topics()

In [None]:
# visualize topic clusters as a hierarchy (limited to 30 topics)
topic_model.visualize_hierarchy(top_n_topics=30)

In [None]:
# Search for the fitted topic closest to each dictionary category.
# Every keyword of a category is matched against the model; the keyword/topic
# pair with the highest similarity is kept as the category's best match.
dimensions = []
for cat in dictionary:
    print(f'Closest topic to category: {cat["name"]}.')

    # Reset the running best for every category. Without this, a category with
    # no usable match would silently reuse the previous category's result (or
    # raise NameError on the very first category).
    most_similar = None
    best_topic = None
    # Start below any possible score so that a best match is always selected,
    # even when every similarity is zero or negative.
    max_similarity = float("-inf")

    for topic in cat["words"]:
        similar_topics, similarities = topic_model.find_topics(topic, top_n=1)
        if len(similar_topics) == 0:
            # nothing returned for this keyword
            continue
        if similarities[0] > max_similarity:
            max_similarity = similarities[0]
            most_similar = similar_topics[0]
            best_topic = topic

    if most_similar is None:
        # Category without keywords, or no keyword produced a topic.
        print("No matching topic found for this category; skipping.")
        print("-----")
        continue

    # Use a dedicated name so the notebook-wide `info` table is not overwritten.
    topic_info = topic_model.get_topic_info(most_similar)
    topic_words = topic_model.get_topic(most_similar)
    # add normalized counts
    normalized_count = topic_info["Count"][0] / corpus_size

    # Field order matches the header written by the CSV export cell.
    dimensions.append([
        cat["name"],
        most_similar,
        topic_info["Name"][0],
        best_topic,
        max_similarity,
        normalized_count,
        topic_words,
    ])

    print(f"Most Similar Topic Info: {topic_words}")
    print(f"Most Similar Topic Number: {most_similar}")
    print(f"Best seed match: {best_topic}")
    print(f"Similarity Score: {max_similarity}")
    print(f"Topic normalized count: {normalized_count}")
    print(f"Topic info: {topic_info}")
    print("-----")


In [None]:
# Output location and naming for the result files written by the export cells.
result_folder = "results"
# Label used in the output file names. The export cells reference `year`, but
# no cell defined it, so they failed with NameError on a fresh kernel. The
# corpus is not filtered by year in this notebook, hence the label "all".
year = "all"
# Create the output folder up front so that the writes do not fail when it is missing.
os.makedirs(result_folder, exist_ok=True)

In [None]:
# Write the best-matching topic of every dictionary dimension as CSV.
# NOTE(review): "_dimenstions" looks like a typo for "_dimensions"; it is left
# unchanged so that anything consuming the existing file name keeps working.
file_name = country[0] + "_" + year + "_dimenstions.csv"
# Explicit UTF-8: the platform default encoding may be unable to encode
# non-ASCII characters in the topic words.
with open(os.path.join(result_folder, file_name), mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Dimension", "Topic_No", "Topic_Name",  "Best_Dict_Word_Match", "Similarity", "Normalized_Count", "Topic_Words"])
    writer.writerows(dimensions)

In [None]:
# get document info
doc_info = topic_model.get_document_info(country_data.sentence)
# write csv: one row per sentence with its topic name, topic number and probability
file_name = country[0] + "_" + year + "_sentences.csv"
# Explicit UTF-8: the platform default encoding may be unable to encode
# non-ASCII characters in the sentences.
with open(os.path.join(result_folder, file_name), mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Sentence", "Topic_Name", "Topic_No", "Probability"])
    # Only the exported columns are zipped, in header order.
    writer.writerows(
        zip(doc_info["Document"], doc_info["Name"], doc_info["Topic"], doc_info["Probability"])
    )

In [None]:
# save topic information
info = topic_model.get_topic_info()
file_name = country[0] + "_" + year + "_topics.csv"
# Explicit UTF-8: the platform default encoding may be unable to encode
# non-ASCII characters in the topic names/words.
with open(os.path.join(result_folder, file_name), mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Topic_No", "Topic_Name", "Count", "Topic_Words"])
    for topic_no, topic_name, count in zip(info["Topic"], info["Name"], info["Count"]):
        # topic -1 is left out of the export
        if topic_no == -1:
            continue
        writer.writerow((topic_no, topic_name, count, topic_model.get_topic(topic_no)))

In [None]:
# save the fitted model inside the result folder; the file name is built from
# the first selected country and `year`
topic_model.save(f"{result_folder}/{country[0]}_{year}")

# loading model
# model=BERTopic.load("../data/file")