# Topic modelling using BERTopic

## Libraries/data required

In [32]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
# Stop-word filtering for the topic representations: without it, several topic
# representations consisted largely of stop words.
# ngram_range gives the (min, max) number of words in a term; (1, 1) keeps single words only.

vectorizer_model = CountVectorizer(
    ngram_range=(1, 1),
    stop_words="english",
)

In [35]:
# Load the articles: the first CSV column is used as the index and "date" is parsed as a date
df = pd.read_csv(
    "data/articles_topics.csv",
    parse_dates=["date"],
    index_col=0,
)
print(df.shape)  # (number of rows, number of columns)

# The topic model is fitted on the article summaries, passed as a plain Python list
docs = df["summary"].to_list()

df.head()  # Preview the first 5 rows

(13616, 21)


Unnamed: 0_level_0,summary,date,name,lat,lng,id,paragraphs,publisher,tags,sentiment_summary,...,sentiment_roberta_sum,sentiment_roberta,location_article,hunger,refugees,humanitarian,conflict,economy,agriculture,demographics
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125,0,Juba — THE parliament of South Sudan has passe...,New Vision (Kampala),"['South Sudan', 'East Africa', 'Governance', '...",POSITIVE,...,neutral,neutral,Juba,False,False,False,False,False,False,False
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,1,What is the logic of Khartoum's military actio...,South Sudan News Agency,"['South Sudan', 'East Africa', 'Business', 'Co...",NEGATIVE,...,negative,negative,Abyei,False,False,False,False,False,False,False
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,2,Secretary-General Ban Ki-moon today welcomed t...,UN News Service,"['Sudan', 'East Africa', 'Governance', 'Extern...",POSITIVE,...,neutral,neutral,Southern Kordofan,False,False,False,False,False,False,False
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,3,AS South Sudan readies itself to become the wo...,The Moment (London),"['Sudan', 'East Africa', 'Africa', 'South Sudan']",POSITIVE,...,neutral,neutral,South Sudan,False,False,False,False,False,False,False
4,The article discusses the United States' respo...,2011-07-06,Addis Ababa,8.980603,38.757761,5,U.S. DEPARTMENT OF STATE Office of the Spokesp...,"United States Department of State (Washington,...","['Sudan', 'East Africa', 'Governance', 'Extern...",POSITIVE,...,neutral,neutral,Addis Ababa,False,False,False,False,False,False,False


## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [36]:
# Fit a new model only when no saved model is found; otherwise reuse the saved one.
# NOTE(review): a loaded model is not refitted on `docs`. Later cells pair
# `bertopic.topics_` with the rows of `df`, so this presumably assumes the saved
# model was fitted on the same summaries in the same order — confirm.
if not os.path.exists('southsudan_model'):
    # Initialize BERTopic with the stop-word-removing vectorizer
    bertopic = BERTopic(
        vectorizer_model=vectorizer_model,
        language="english",
        calculate_probabilities=True,
        verbose=True,
    )

    bertopic.fit_transform(docs)  # Fit on the list of article summaries
    bertopic.save("southsudan_model")  # Persist the fitted model for later runs
else:
    bertopic = BERTopic.load('southsudan_model')

## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

### Creating smaller topics

Within our list of topics, we find the topics that are semantically closest to a set of keywords for each of the following themes:

"Hunger", "Refugees", "Humanitarian", "Conflict", "Economy", "Demographics", and "Agriculture".

**Feel free to change this approach!**

In [37]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Return the topics most relevant to one or more keyword queries.


    Parameters:
        bertopic_model: a (fitted) BERTopic model object; its `find_topics`
                        method is called once per query

        keywords:   either a single string (one query, which may itself contain
                    several words) or a list of such strings.

                    With a list, up to top_n topics are retrieved for every
                    element; the pooled results are then reduced to the top_n
                    topics with the highest relevancy.

                    !!!
                    Relevancy is evaluated per list element, never against the
                    list as a whole. A returned topic may therefore be strongly
                    related to just one element and unrelated to the rest, and
                    a topic that relates moderately to all elements together
                    may be missing from the result.
                    !!!

        top_n: an integer, the number of relevant topics to retrieve


        Return: a list of at most top_n (topic_id, relevancy) tuples, ordered
                from most to least relevant. Each topic appears once, with the
                greatest relevancy it reached for any of the queries.
    '''

    # A bare string counts as a single query
    queries = [keywords] if type(keywords) is str else keywords

    # Pool the (topic_id, relevancy) pairs found for every query
    scored_topics = []
    for query in queries:
        found = bertopic_model.find_topics(query, top_n=top_n)
        # found[0] holds the topic ids, found[1] the matching relevancies
        scored_topics += zip(found[0], found[1])

    # Order from least to most relevant so that, when the pairs are turned into
    # a dict, the greatest relevancy of a repeated topic is the value that is kept
    ascending = sorted(scored_topics, key=lambda pair: pair[1])
    best_per_topic = dict(ascending)

    # Rank the unique topics from most to least relevant and keep the first top_n
    ranked = sorted(best_per_topic.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [38]:
# Get the 20 topics most relevant to any of the hunger-related keywords
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['hunger', 'food insecurity', 'food', 'malnutrition', 'famine'],
    top_n=20,
)

# Keep only the topic IDs; every entry is a (topic_id, relevancy) pair
topic_ids = [topic_id for topic_id, _relevancy in relevant_topics]

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["hunger"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
109,27,109_malnutrition_nutrition_children_unicef,"[malnutrition, nutrition, children, unicef, br...",[The article discusses the severe acute malnut...
13,114,13_food_fao_million_hunger,"[food, fao, million, hunger, famine, insecurit...",[The article discusses the major food security...
171,16,171_refugees_food_maban_refugee,"[refugees, food, maban, refugee, malnutrition,...",[The article discusses the UNHCR and WFP calli...
29,70,29_wfp_food_assistance_programme,"[wfp, food, assistance, programme, world, mill...",[The article discusses the European Commission...
146,19,146_pibor_gumuruk_dorein_labrab,"[pibor, gumuruk, dorein, labrab, food, distrib...",[The article discusses the ongoing humanitaria...
57,42,57_flooding_floods_flood_affected,"[flooding, floods, flood, affected, floodaffec...",[The article discusses aid organizations compl...
16,105,16_agriculture_agricultural_food_farmers,"[agriculture, agricultural, food, farmers, far...",[The article discusses the need for cooperatio...
73,38,73_malaria_nets_cases_disease,"[malaria, nets, cases, disease, kalaazar, heal...",[The article discusses an unprecedented malari...
9,140,9_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, billi...",[The article discusses the United States' anno...
126,22,126_refugees_refugee_ethiopia_million,"[refugees, refugee, ethiopia, million, funding...",[The article discusses the urgent appeal for f...


In [39]:
# Get the 10 topics most relevant to the keywords 'refugees' or 'displaced'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['refugees', 'displaced'],
    top_n=10,
)

# Keep only the topic IDs; every entry is a (topic_id, relevancy) pair
topic_ids = [topic_id for topic_id, _relevancy in relevant_topics]

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["refugees"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
68,39,68_refugees_adjumani_uganda_refugee,"[refugees, adjumani, uganda, refugee, district...",[The article discusses the experience of a Sou...
116,25,116_refugees_ethiopia_number_unhcr,"[refugees, ethiopia, number, unhcr, refugee, f...",[The article discusses the increasing number o...
126,22,126_refugees_refugee_ethiopia_million,"[refugees, refugee, ethiopia, million, funding...",[The article discusses the urgent appeal for f...
69,39,69_yida_refugees_camp_thok,"[yida, refugees, camp, thok, unhcr, refugee, a...",[The article discusses the ongoing war in Sout...
163,17,163_refugees_unhcr_nip_refugee,"[refugees, unhcr, nip, refugee, relocation, ca...",[The article discusses efforts by UNHCR and it...
129,22,129_kakuma_camp_refugee_kenya,"[kakuma, camp, refugee, kenya, refugees, camps...",[The article discusses the UNFPA-supported Kak...
171,16,171_refugees_food_maban_refugee,"[refugees, food, maban, refugee, malnutrition,...",[The article discusses the UNHCR and WFP calli...
135,21,135_refugees_water_nile_blue,"[refugees, water, nile, blue, jamam, shortages...",[The article discusses the urgent need for rel...
195,13,195_education_refugee_school_her,"[education, refugee, school, her, children, sh...","[The article discusses Grace Taban Genova, a w..."
4,183,4_displaced_malakal_un_people,"[displaced, malakal, un, people, civilians, hu...",[The article discusses the high number of inte...


In [40]:
# Get the 10 topics most relevant to the keywords 'humanitarian', 'red cross' or 'aid'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['humanitarian', 'red cross', 'aid'],
    top_n=10,
)

# Keep only the topic IDs; every entry is a (topic_id, relevancy) pair
topic_ids = [topic_id for topic_id, _relevancy in relevant_topics]

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["humanitarian"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,140,9_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, billi...",[The article discusses the United States' anno...
175,15,175_lanzer_toby_coordinator_humanitarian,"[lanzer, toby, coordinator, humanitarian, mr, ...",[The article discusses a press conference with...
4,183,4_displaced_malakal_un_people,"[displaced, malakal, un, people, civilians, hu...",[The article discusses the high number of inte...
126,22,126_refugees_refugee_ethiopia_million,"[refugees, refugee, ethiopia, million, funding...",[The article discusses the urgent appeal for f...
143,20,143_workers_maban_aid_mabanese,"[workers, maban, aid, mabanese, militia, count...",[The article discusses the killing of at least...
106,28,106_red_cross_icrc_ifrc,"[red, cross, icrc, ifrc, crescent, medical, wo...",[The article discusses the three-day visit of ...
26,72,26_msf_medical_patients_facilities,"[msf, medical, patients, facilities, sans, hea...",[The article discusses the looting of Médecins...
171,16,171_refugees_food_maban_refugee,"[refugees, food, maban, refugee, malnutrition,...",[The article discusses the UNHCR and WFP calli...
160,18,160_un_condemnation_civilians_hostilities,"[un, condemnation, civilians, hostilities, arm...",[The article discusses the condemnation by the...
21,97,21_darfur_unamid_doha_ddpd,"[darfur, unamid, doha, ddpd, peacekeepers, nya...",[The article discusses a former rebel leader's...


In [41]:
# Get the 15 topics most relevant to any of the conflict-related keywords
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['conflict', 'fighting', 'murder', 'attack', 'military', 'bomb'],
    top_n=15,
)

# Keep only the topic IDs; every entry is a (topic_id, relevancy) pair
topic_ids = [topic_id for topic_id, _relevancy in relevant_topics]

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["conflict"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
122,24,122_ceasefire_rebels_upper_nassir,"[ceasefire, rebels, upper, nassir, truce, viol...",[The article discusses clashes between governm...
132,21,132_ddr_excombatants_reintegration_program,"[ddr, excombatants, reintegration, program, de...",[The article discusses the plans of the South ...
176,15,176_jonglei_conflicts_intertribal_wanglei,"[jonglei, conflicts, intertribal, wanglei, tri...",[The article discusses the increase of violenc...
27,72,27_border_kordofan_blue_accusations,"[border, kordofan, blue, accusations, both, su...",[The article discusses the decision by the Sud...
199,12,199_conflict_civil_ruling_resulted,"[conflict, civil, ruling, resulted, displaced,...",[The article discusses the ongoing civil war i...
177,15,177_her_kenyan_shot_died,"[her, kenyan, shot, died, veronika, missionary...",[The article discusses the death of Sr. Veroni...
32,67,32_machar_riek_former_president,"[machar, riek, former, president, vice, kiir, ...",[The article discusses the call from internall...
56,43,56_talks_detainees_ababa_addis,"[talks, detainees, ababa, addis, political, pe...",[The article discusses the involvement of seve...
202,12,202_peacekeeping_battalion_infantry_chinas,"[peacekeeping, battalion, infantry, chinas, ch...",[The article discusses the deployment of a 700...
65,40,65_abraham_isaiah_assassination_awuol,"[abraham, isaiah, assassination, awuol, murder...",[The article discusses the assassination of So...


In [42]:
# Get the 10 topics most relevant to any of the economy-related keywords
# (open question from the author: this category might be removed)
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['economy', 'invest', 'bank', 'money'],
    top_n=10,
)

# Keep only the topic IDs; every entry is a (topic_id, relevancy) pair
topic_ids = [topic_id for topic_id, _relevancy in relevant_topics]

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["economy"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
79,35,79_bank_banks_kcb_banking,"[bank, banks, kcb, banking, equity, growth, cf...",[The article discusses Kenya Commercial Bank's...
92,32,92_investment_investors_opportunities_business,"[investment, investors, opportunities, busines...",[The article discusses South Sudan's plan to h...
180,15,180_prices_goods_retailers_price,"[prices, goods, retailers, price, market, trad...",[The article discusses the debts owed to the E...
210,10,210_timber_congo_export_ugandas,"[timber, congo, export, ugandas, markets, lowe...",[The article discusses how lower prices of met...
167,16,167_uap_business_investment_opportunities,"[uap, business, investment, opportunities, eas...",[The article discusses Tanzanian investors who...
88,32,88_bank_imf_world_development,"[bank, imf, world, development, fund, finance,...",[The article discusses a general cooperation a...
43,53,43_currency_bank_devaluation_exchange,"[currency, bank, devaluation, exchange, pound,...",[The article discusses how the Central Bank of...
205,11,205_inflation_beverages_price_consumer,"[inflation, beverages, price, consumer, nonalc...",[The article discusses a decrease in inflation...
121,24,121_traders_ugandan_trade_compensation,"[traders, ugandan, trade, compensation, goods,...","[The article discusses how the Ugandan body, S..."
168,16,168_fuel_shortage_petrol_trucks,"[fuel, shortage, petrol, trucks, petroleum, el...",[The article discusses how the ministry of pet...


In [43]:
# Get the 15 topics most relevant to any of the keywords used for the "demographics" category
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['health', 'education', 'home', 'electricity'],
    top_n=15,
)

# Keep only the topic IDs; every entry is a (topic_id, relevancy) pair
topic_ids = [topic_id for topic_id, _relevancy in relevant_topics]

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["demographics"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
105,28,105_education_school_schools_primary,"[education, school, schools, primary, children...",[The article discusses the challenges faced by...
11,122,11_students_university_education_higher,"[students, university, education, higher, univ...",[The article discusses a row between Arabic-sp...
187,13,187_education_examination_school_schools,"[education, examination, school, schools, resu...",[The article discusses the release of the resu...
25,76,25_health_maternal_mortality_care,"[health, maternal, mortality, care, healthcare...",[The article discusses the challenges faced by...
211,10,211_textbooks_books_primary_teacher,"[textbooks, books, primary, teacher, textbook,...",[The article discusses the distribution of new...
195,13,195_education_refugee_school_her,"[education, refugee, school, her, children, sh...","[The article discusses Grace Taban Genova, a w..."
183,14,183_vocational_training_skills_centres,"[vocational, training, skills, centres, labour...",[The article discusses the South Sudanese gove...
139,20,139_language_arabic_english_languages,"[language, arabic, english, languages, instruc...",[The article discusses the debate about using ...
83,34,83_youth_conference_sports_youths,"[youth, conference, sports, youths, ghazal, yo...",[The article discusses the appointment of Hon ...
109,27,109_malnutrition_nutrition_children_unicef,"[malnutrition, nutrition, children, unicef, br...",[The article discusses the severe acute malnut...


In [44]:
# Get the 15 topics most relevant to any of the hand-picked agriculture-related keywords
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=['agriculture', 'farm', 'weather', 'livestock', 'flood'],
    top_n=15,
)

# Keep only the topic IDs; every entry is a (topic_id, relevancy) pair
topic_ids = [topic_id for topic_id, _relevancy in relevant_topics]

# Boolean column: True when the article's assigned topic is one of the relevant topics
df["agriculture"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
145,19,145_livestock_animal_fisheries_cattle,"[livestock, animal, fisheries, cattle, animals...",[The article discusses the importance of lives...
16,105,16_agriculture_agricultural_food_farmers,"[agriculture, agricultural, food, farmers, far...",[The article discusses the need for cooperatio...
57,42,57_flooding_floods_flood_affected,"[flooding, floods, flood, affected, floodaffec...",[The article discusses aid organizations compl...
37,57,37_cattle_warrap_raiding_county,"[cattle, warrap, raiding, county, rustling, co...",[The article discusses the National Legislativ...
13,114,13_food_fao_million_hunger,"[food, fao, million, hunger, famine, insecurit...",[The article discusses the major food security...
180,15,180_prices_goods_retailers_price,"[prices, goods, retailers, price, market, trad...",[The article discusses the debts owed to the E...
210,10,210_timber_congo_export_ugandas,"[timber, congo, export, ugandas, markets, lowe...",[The article discusses how lower prices of met...
29,70,29_wfp_food_assistance_programme,"[wfp, food, assistance, programme, world, mill...",[The article discusses the European Commission...
109,27,109_malnutrition_nutrition_children_unicef,"[malnutrition, nutrition, children, unicef, br...",[The article discusses the severe acute malnut...
118,25,118_land_grabbing_demolition_equatoria,"[land, grabbing, demolition, equatoria, lease,...",[The article discusses the issue of land grabb...


In [50]:
# Re-read the articles as they are stored on disk
original_df = pd.read_csv("data/articles_topics.csv", parse_dates=["date"], index_col=0)

# The topic flags computed in this notebook
topic_columns = [
    "hunger",
    "refugees",
    "humanitarian",
    "conflict",
    "economy",
    "agriculture",
    "demographics",
]

# One row of flags per distinct summary. Without this, a summary that occurs
# more than once would be matched against every one of its copies and the merge
# would multiply rows. For repeated summaries the flags of the first occurrence
# are used.
topic_flags = df[["summary", *topic_columns]].drop_duplicates(subset="summary")

# Combine article summaries with the newly created features.
# Topic columns already stored in the file are dropped up front, so the merge
# produces no "_x"/"_y" suffixes. Stripping those suffixes afterwards by
# substring would also drop or rename any unrelated column whose name happens
# to contain "_x" or "_y".
df = original_df.drop(columns=topic_columns, errors="ignore").merge(
    topic_flags,
    how="left",
    on="summary",
    validate="many_to_one",  # fail loudly if the right side is not unique per summary
)
assert len(df) == len(original_df), "merging the topic flags must not add or drop articles"

# NOTE(review): the file is read with index_col=0 but written with index=False,
# so the column consumed as the index on read is not written back — confirm
# that re-running the notebook on the saved file still finds a "summary" column.
df.to_csv(
    "data/articles_topics.csv", index=False
)  # Save DataFrame to articles_topics.csv

In [21]:
# TODO: the number of articles that fall into none of the categories is still
# unsatisfactorily high; revisit when there is more time.
# Open question: if an article does not fall in these categories, is it likely not relevant?

category_columns = [
    "hunger",
    "refugees",
    "humanitarian",
    "conflict",
    "economy",
    "agriculture",
    "demographics",
]

# True for the articles whose flag is False in every category column
uncategorised = df[category_columns].eq(False).all(axis=1)

print(len(df))  # total number of articles
print(len(df[uncategorised]))  # number of articles without any category

13616
10742


### Separate all categories

Split each article category into three sentiment-specific columns: `<category>_positive`, `<category>_neutral` and `<category>_negative`.

In [52]:
topics = [
    "hunger",
    "refugees",
    "humanitarian",
    "conflict",
    "economy",
    "agriculture",
    "demographics",
]

# For every category, add one boolean column per sentiment label: the column is
# True when the article is in the category AND its "sentiment_roberta" value
# equals that label.
for topic in topics:
    for sentiment in ("positive", "negative", "neutral"):
        has_sentiment = df["sentiment_roberta"] == sentiment
        df[f"{topic}_{sentiment}"] = df[topic] & has_sentiment

# Overwrite the articles file with the extended DataFrame (index not written)
df.to_csv("data/articles_topics.csv", index=False)