In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geotext import GeoText
import pycountry
import re
from collections import Counter
from collections import defaultdict
import matplotlib as mpl
from itertools import islice

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

In [2]:
# Build the lookup table used by get_country(): each alternative place
# spelling (column 'yer') maps to a lowercase country code (column 'ulke').
# NOTE(review): 'yer'/'ulke' look like Turkish for "place"/"country" — confirm
# against the spreadsheet.
df_country_list_different_spellings = pd.read_excel(
    r"C:\Users\Yasaman\Downloads\country_list_different_spellings_arabspring.xlsx"
)
df_country_list_different_spellings['ulke'] = (
    df_country_list_different_spellings['ulke'].map(lambda code: code.lower())
)
map_country_list_different_spellings = dict(
    zip(
        df_country_list_different_spellings['yer'],
        df_country_list_different_spellings['ulke'],
    )
)
# Manual addition on top of the spreadsheet entries.
map_country_list_different_spellings['Turkey'] = 'tur'

def get_country(txt):
    """Return the countries mentioned in ``txt`` as a sorted list of codes.

    Detection runs in two passes over the cleaned text:

    1. ``GeoText`` extracts country names. Each name is converted to a
       lowercase ISO alpha-3 code through ``pycountry``; names pycountry does
       not know are looked up in ``map_country_list_different_spellings`` and
       kept verbatim if they are not in the map either.
    2. Every spelling listed in ``map_country_list_different_spellings`` is
       searched for as a whole word and mapped to its code.

    Parameters
    ----------
    txt : str
        Text to scan. Non-string values (for example NaN from a missing
        title or abstract) yield an empty list.

    Returns
    -------
    list of str
        Unique country codes (or raw country names when no code could be
        resolved), sorted so that the result is reproducible across runs.
    """
    # A missing value cannot mention a country; avoid a TypeError on `in`.
    if not isinstance(txt, str):
        return []

    # Keep only the text before a copyright marker.
    for copyright_mark in ['©', 'Copyright (C)']:
        if copyright_mark in txt:
            txt = txt.split(copyright_mark)[0]

    # Strip phrases that contain a place name before detection runs
    # (presumably to avoid false positives such as "US dollar" -> USA).
    for tag in [ 'US dollar','New Mexico','Turkish','US$','US $','United States Dollar','USD','HK', 'Congo Red',
               'Congo red', 'US-Dollar', 'Michael Jordan','Guinea pig']:
        if tag in txt:
            txt=txt.replace(tag, '')

    places = GeoText(txt)
    country_codes = set()

    for country_name in set(places.countries):
        country_code = pycountry.countries.get(name=country_name)
        if country_code:
            country_codes.add(country_code.alpha_3.lower())
        else:
            country_codes.add(map_country_list_different_spellings.get(country_name, country_name))

    # Regex alternation is leftmost-first: once one alternative matches, longer
    # ones listed later are never tried. Put the longest spellings first so a
    # short spelling cannot shadow a longer one that starts with it.
    spellings = sorted(
        (name for name in map_country_list_different_spellings if isinstance(name, str) and name),
        key=len,
        reverse=True,
    )

    # With no spellings the pattern would be r'\b()\b', which matches the empty
    # string and then fails on the map lookup, so skip the pass entirely.
    if spellings:
        pattern = r'\b(' + '|'.join(re.escape(name) for name in spellings) + r')\b'
        for match in re.findall(pattern, txt):
            country_codes.add(map_country_list_different_spellings[match])

    # Normalise the non-ISO code 'uae' to 'are'.
    if 'uae' in country_codes:
        country_codes.remove('uae')
        country_codes.add('are')

    # Sets have no stable order; sort for deterministic output.
    return sorted(country_codes, key=str)


In [6]:

# Load the Scopus CSV and prepare one text field per record for country tagging.
df = pd.read_csv(r'C:\Users\Yasaman\Arab Spring Paper\Arab Spring Data\Study Datasets\Arab Spring related research\Scopus-arabspring.csv')
df = df.drop(columns=['Link', 'EID'])

# Keep only the abstract text before the '©' marker. Missing abstracts are
# treated as empty strings so that .split() does not fail on NaN.
df['Abstract'] = df['Abstract'].fillna('').apply(lambda x: x.split('©')[0])
df.loc[df['Abstract'] == '[No abstract available]', 'Abstract'] = ''

# Title + abstract is the text that is searched for country mentions; a missing
# title becomes '' so that 'Text' is always a string.
df['Text'] = df['Title'].fillna('') + ' ' + df['Abstract']

# .copy() makes the filtered frame independent of the original, so adding the
# 'Mentions' column below does not raise SettingWithCopyWarning.
df = df[df['Year'] > 2010].copy()
df['Mentions'] = df['Text'].apply(get_country)

In [9]:
def get_topics(texts, timestamps, extra_stop_words=("would", "could", "might", "also"),
               ngram_range=(1, 3), datetime_format="%Y"):
    """Fit a BERTopic model on ``texts`` and return its topics over time.

    Parameters
    ----------
    texts : iterable of str
        Documents to model.
    timestamps : iterable
        One timestamp per document, in the same order as ``texts``.
    extra_stop_words : iterable of str, optional
        Words removed in addition to scikit-learn's English stop-word list.
    ngram_range : tuple of (int, int), optional
        Lower and upper n-gram size passed to ``CountVectorizer``; the default
        covers unigrams, bigrams and trigrams.
    datetime_format : str, optional
        Format string forwarded to ``BERTopic.topics_over_time``.

    Returns
    -------
    The object returned by ``BERTopic.topics_over_time`` for the fitted model.
    """
    # CountVectorizer expects a list of stop words, not a frozenset.
    stop_words = list(text.ENGLISH_STOP_WORDS.union(extra_stop_words))

    # Vectorizer used by BERTopic to build the topic representations.
    vectorizer_model = CountVectorizer(
        stop_words=stop_words,
        ngram_range=ngram_range,
    )

    topic_model = BERTopic(vectorizer_model=vectorizer_model, verbose=True)

    # BERTopic documents its inputs as lists; converting here lets callers pass
    # NumPy arrays or pandas Series without relying on their indexing rules.
    documents = list(texts)
    document_timestamps = list(timestamps)

    # Only the fitted model is needed afterwards, so the returned topic
    # assignments and probabilities are not kept.
    topic_model.fit_transform(documents)

    return topic_model.topics_over_time(
        documents,
        document_timestamps,
        datetime_format=datetime_format,
    )
    

In [10]:
# Fit one topic model per country group. A record belongs to a group when its
# 'Mentions' list contains at least one of the group's country codes.
# NOTE(review): 'tun' appears in both the first and the second group — confirm
# that this overlap is intended.
country_groups = [
    ['egy', 'tun'],
    ['tun', 'lby', 'yem'],
    ['mar', 'kwt', 'bhr', 'omn', 'jor'],
]

topics_over_time_list = []
for group in country_groups:
    group_codes = set(group)
    in_group = df['Mentions'].apply(lambda mentions: not group_codes.isdisjoint(mentions))
    df_group = df[in_group]
    texts = df_group['Text'].values
    timestamps = df_group['Year'].values
    topics = get_topics(texts, timestamps)
    topics_over_time_list.append(topics)

2025-09-08 10:57:29,997 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

2025-09-08 10:58:22,235 - BERTopic - Embedding - Completed ✓
2025-09-08 10:58:22,237 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-08 10:58:23,847 - BERTopic - Dimensionality - Completed ✓
2025-09-08 10:58:23,849 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-08 10:58:23,940 - BERTopic - Cluster - Completed ✓
2025-09-08 10:58:23,952 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-08 10:58:24,760 - BERTopic - Representation - Completed ✓
9it [00:04,  2.04it/s]
2025-09-08 10:58:29,485 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

2025-09-08 10:59:11,458 - BERTopic - Embedding - Completed ✓
2025-09-08 10:59:11,459 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-08 10:59:12,552 - BERTopic - Dimensionality - Completed ✓
2025-09-08 10:59:12,557 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-08 10:59:12,644 - BERTopic - Cluster - Completed ✓
2025-09-08 10:59:12,653 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-08 10:59:13,434 - BERTopic - Representation - Completed ✓
9it [00:03,  2.82it/s]
2025-09-08 10:59:16,917 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

2025-09-08 10:59:41,090 - BERTopic - Embedding - Completed ✓
2025-09-08 10:59:41,091 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-08 10:59:41,738 - BERTopic - Dimensionality - Completed ✓
2025-09-08 10:59:41,740 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-08 10:59:41,770 - BERTopic - Cluster - Completed ✓
2025-09-08 10:59:41,778 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-08 10:59:42,181 - BERTopic - Representation - Completed ✓
9it [00:02,  4.46it/s]


In [None]:
# Notebook location: C:\Users\Yasaman\Arab Spring Paper\Arab Spring Code\Supp-Material\Attention\Topics of attention AS\analysis.ipynb

In [22]:

# Display each topics-over-time table and write it to its own CSV file. The
# labels are paired with the tables by position in topics_over_time_list.
output_stem = r'~\Arab Spring Paper\Arab Spring Code\Supp-Material\Attention\Topics of attention AS\topics_over_time_'
group_labels = ['GO', 'CW', 'GC']
for dataframe, group in zip(topics_over_time_list, group_labels):
    display(dataframe)
    output_path = output_stem + group + '.csv'
    dataframe.to_csv(output_path, index=False)

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"arab, egypt, political, tunisia, revolution",19,2011
1,0,"egypt, egyptian, political, military, muslim",12,2011
2,1,"media, social, social media, arab, new",6,2011
3,2,"arab, uprisings, egypt, change, world",9,2011
4,3,"study, hotel, egyptian, egypt, hotel industry",4,2011
...,...,...,...,...
90,4,"public, sexual, women, sexual violence, female",4,2019
91,5,"provisional, provisional administrations, admi...",3,2019
92,6,"eu, eus, democracy promotion, promotion, democ...",3,2019
93,7,"spring, arab spring, arab, countries, developm...",1,2019


Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"arab, political, tunisia, middle, spring",21,2011
1,0,"libya, libyan, libyas, intervention, women",5,2011
2,1,"media, arab, wikileaks, social, arab spring",3,2011
3,2,"arab, uprisings, egypt, change, political",8,2011
4,3,"syria, strategy, war, region, arab",5,2011
...,...,...,...,...
107,6,"family law, law, started, legislation, womenfr...",1,2019
108,7,"eu, security, regional, eus, cooperation",5,2019
109,8,"services, fp, conflict, yemen, pac",2,2019
110,9,"provisional, provisional administrations, admi...",1,2019


Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"arab, political, middle, middle east, east",7,2011
1,0,"morocco, alqaeda, political, moroccan, arab",6,2011
2,1,"social, wikileaks, media, women, facebook blog...",3,2011
3,3,"bahrain, gulf, states, saudi, iran",3,2011
4,4,"regimes, civil, nonviolence, armies, libya",3,2011
...,...,...,...,...
63,2,"health, workers, health care, care workers, he...",3,2019
64,3,"middle eastern, eastern, monarchies, saudi, re...",2,2019
65,4,"regimes, outcomes, following, international in...",1,2019
66,5,"oil, stock, price, oil price, relationship",5,2019
