In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geotext import GeoText
import pycountry
import re
from collections import Counter
from collections import defaultdict
import matplotlib as mpl
from itertools import islice

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

In [2]:
# Spreadsheet of alternative country spellings: the 'yer' column holds a
# spelling as it may appear in text, the 'ulke' column the code it maps to.
df_country_list_different_spellings = pd.read_excel(
    r"C:\Users\Yasaman\Downloads\country_list_different_spellings_arabspring.xlsx"
)

# Lower-case every code so lookups always yield lower-case values.
df_country_list_different_spellings['ulke'] = [
    code.lower() for code in df_country_list_different_spellings['ulke']
]

# spelling -> code lookup used by get_country().
map_country_list_different_spellings = pd.Series(
    df_country_list_different_spellings['ulke'].values,
    index=df_country_list_different_spellings['yer'],
).to_dict()

# Manual override for Turkey.
map_country_list_different_spellings.update({'Turkey': 'tur'})

def get_country(txt):
    """Return the country codes mentioned in ``txt``.

    Detection runs in two passes whose results are merged:

    1. ``GeoText`` extracts country names. Each name is looked up with
       ``pycountry`` and, when found, contributes its lower-cased ``alpha_3``
       code. Names ``pycountry`` does not resolve are looked up in
       ``map_country_list_different_spellings`` and, failing that, are kept
       verbatim.
    2. Every key of ``map_country_list_different_spellings`` is searched for
       as a whole word (case-sensitive) and contributes its mapped value.

    Before matching, the text is cut at the first copyright marker and a
    fixed list of phrases (currency names, "New Mexico", "Guinea pig", ...)
    is deleted; presumably these caused false country matches.

    Parameters
    ----------
    txt : str
        Text to scan.

    Returns
    -------
    list of str
        The unique codes found, sorted so that repeated runs give the same
        order. The list is empty when nothing is found.
    """
    # Keep only the text that precedes a copyright marker.
    for copyright_mark in ('©', 'Copyright (C)'):
        txt = txt.split(copyright_mark)[0]

    # Delete phrases that should not count as country mentions. The order is
    # significant (e.g. 'US dollar' must go before 'USD').
    for tag in ('US dollar', 'New Mexico', 'Turkish', 'US$', 'US $',
                'United States Dollar', 'USD', 'HK', 'Congo Red',
                'Congo red', 'US-Dollar', 'Michael Jordan', 'Guinea pig'):
        txt = txt.replace(tag, '')

    country_codes = set()

    # Pass 1: GeoText country names -> codes.
    for country_name in set(GeoText(txt).countries):
        country = pycountry.countries.get(name=country_name)
        if country:
            country_codes.add(country.alpha_3.lower())
        else:
            country_codes.add(
                map_country_list_different_spellings.get(country_name, country_name)
            )

    # Pass 2: whole-word search for every known alternative spelling.
    # Regex alternation takes the first alternative that matches, not the
    # longest, so longer spellings must come first; otherwise a short spelling
    # that is a prefix of a longer one would shadow it. Blank or non-string
    # keys (e.g. from empty spreadsheet cells) are skipped.
    spellings = sorted(
        (s for s in map_country_list_different_spellings if isinstance(s, str) and s),
        key=len,
        reverse=True,
    )
    # With no spellings the pattern would degenerate to r'\b()\b', which
    # matches the empty string at every word boundary; skip the search then.
    if spellings:
        pattern = r'\b(' + '|'.join(re.escape(s) for s in spellings) + r')\b'
        for match in re.findall(pattern, txt):
            country_codes.add(map_country_list_different_spellings[match])

    # Normalise the 'uae' code to 'are'.
    if 'uae' in country_codes:
        country_codes.discard('uae')
        country_codes.add('are')

    # Sets have no stable order; sort for reproducible output.
    return sorted(country_codes)


In [3]:

# Load the Scopus export and drop the columns that are not used below.
df = pd.read_csv(r'C:\Users\Yasaman\Arab Spring Paper\Arab Spring Data\Study Datasets\Arab Spring related research\Scopus-arabspring.csv')
df = df.drop(columns=['Link', 'EID'])

# Keep only the abstract text that precedes the copyright notice. Missing
# abstracts are treated as empty instead of crashing on NaN.split.
df['Abstract'] = df['Abstract'].fillna('').apply(lambda x: x.split('©')[0])
df.loc[df['Abstract'] == '[No abstract available]', 'Abstract'] = ''

# Title and abstract together form the text analysed for each record; a
# missing title must not turn the whole text into NaN.
df['Text'] = df['Title'].fillna('') + ' ' + df['Abstract']

# Restrict to records published after 2010. The .copy() makes the filtered
# frame independent so that adding columns to it does not raise
# SettingWithCopyWarning.
df = df[df['Year'] > 2010].copy()

# Country codes mentioned in each record.
df['Mentions'] = df['Text'].apply(get_country)

In [4]:
def get_topics(texts, timestamps):
    """Fit a BERTopic model on ``texts`` and compute its topics over time.

    The vectorizer uses 1- to 3-grams and scikit-learn's English stop words
    extended with "would", "could", "might" and "also".

    Parameters
    ----------
    texts
        Documents to model (callers pass the ``Text`` column values).
    timestamps
        One timestamp per document (callers pass the ``Year`` column values).

    Returns
    -------
    tuple
        ``(topics_over_time, topics)``: the result of
        ``BERTopic.topics_over_time`` followed by the topic assignment
        returned by ``fit_transform``.
    """
    extra_stop_words = ["would", "could", "might", "also"]
    stop_words = list(text.ENGLISH_STOP_WORDS.union(extra_stop_words))

    topic_model = BERTopic(
        vectorizer_model=CountVectorizer(stop_words=stop_words, ngram_range=(1, 3)),
        verbose=True,
    )

    # fit_transform also returns probabilities, which are not used here.
    fitted = topic_model.fit_transform(texts)
    topics = fitted[0]

    topics_over_time = topic_model.topics_over_time(
        texts, timestamps, datetime_format="%Y"
    )
    return topics_over_time, topics
    

In [None]:
# Fit one separate topic model per country group and collect the resulting
# topics-over-time tables, in group order.
country_groups = [['egy', 'tun'], ['syr', 'lby', 'yem'], ['mar', 'kwt', 'bhr', 'omn', 'jor']]

topics_over_time_list = []
for group in country_groups:
    # A record belongs to the group when it mentions at least one member.
    mentions_group = df['Mentions'].apply(
        lambda mentions: not set(mentions).isdisjoint(group)
    )
    df_group = df[mentions_group]

    texts = df_group['Text'].values
    timestamps = df_group['Year'].values

    # get_topics returns (topics_over_time, per-document topics); only the
    # first is kept here.
    topics, _ = get_topics(texts, timestamps)
    topics_over_time_list.append(topics)

2025-09-25 08:30:57,959 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

2025-09-25 08:32:19,820 - BERTopic - Embedding - Completed ✓
2025-09-25 08:32:19,820 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-25 08:32:40,399 - BERTopic - Dimensionality - Completed ✓
2025-09-25 08:32:40,400 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-25 08:32:40,476 - BERTopic - Cluster - Completed ✓
2025-09-25 08:32:40,484 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-25 08:32:41,776 - BERTopic - Representation - Completed ✓
9it [00:05,  1.68it/s]
2025-09-25 08:32:47,759 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

2025-09-25 08:33:46,009 - BERTopic - Embedding - Completed ✓
2025-09-25 08:33:46,009 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-25 08:33:47,347 - BERTopic - Dimensionality - Completed ✓
2025-09-25 08:33:47,347 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-25 08:33:47,396 - BERTopic - Cluster - Completed ✓
2025-09-25 08:33:47,404 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-25 08:33:48,238 - BERTopic - Representation - Completed ✓
9it [00:07,  1.19it/s]
2025-09-25 08:33:56,305 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

2025-09-25 08:34:29,686 - BERTopic - Embedding - Completed ✓
2025-09-25 08:34:29,686 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-25 08:34:30,193 - BERTopic - Dimensionality - Completed ✓
2025-09-25 08:34:30,193 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-25 08:34:30,210 - BERTopic - Cluster - Completed ✓
2025-09-25 08:34:30,218 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-25 08:34:30,620 - BERTopic - Representation - Completed ✓
9it [00:01,  5.41it/s]


In [None]:

# Write each group's topics-over-time table to its own CSV; the labels are
# paired positionally with the entries of topics_over_time_list.
output_prefix = r'~\Arab Spring Paper\Arab Spring Code\Supp-Material\Attention\Topics of attention AS\topics_over_time_'
for group, dataframe in zip(['GO', 'CW', 'GC'], topics_over_time_list):
    dataframe.to_csv(output_prefix + group + '.csv', index=False)

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"arab, political, egypt, tunisia, social",17,2011
1,0,"egypt, egyptian, political, military, muslim",13,2011
2,1,"media, social, social media, arab, new",6,2011
3,2,"arab, uprisings, egypt, world, middle",11,2011
4,3,"political, tunisia, tunisian, civil, activities",3,2011
...,...,...,...,...
104,5,"eu, foreign, policy, north, eus",4,2019
105,6,"spring, arab spring, arab, region, countries",2,2019
106,7,"family law, started, law, legislation, womenfr...",1,2019
107,9,"public, sexual, sexual violence, female, egypt",3,2019


Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"arab, political, spring, arab spring, middle",10,2011
1,0,"syria, syrian, regime, middle, political",11,2011
2,1,"libya, libyan, libyas, women, intervention",5,2011
3,2,"arab, civil, egypt, tunisia, uprisings",6,2011
4,3,"wikileaks, revelations, wiki revelations, reve...",1,2011
...,...,...,...,...
71,3,"media, social media, social, political, region...",3,2019
72,4,"turkeys, kurdish, syria, turkey, ecology",3,2019
73,5,"services, fp, yemen, conflict, pac",2,2019
74,6,"asylum, design, refugee, refugees, camps",2,2019


Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"middle east, east, middle, arab, government",5,2011
1,0,"political, morocco, arab, alqaeda, moroccan",11,2011
2,1,"wikileaks, women, journalism, social, arab",3,2011
3,2,"bahrain, gulf, states, saudi, iran",3,2011
4,4,"price, petrochemical, diversify, gulf, need",1,2011
5,-1,"political, women, arab, water, social",7,2012
6,0,"arab, political, morocco, 2011, tunisia",18,2012
7,1,"social, dubai, media, bahrain, movement",4,2012
8,2,"policy, arab, gulf, foreign policy, foreign",2,2012
9,3,"water, jordan, arab, zionist, palestine",6,2012


In [22]:
# Country groups; each key becomes a boolean flag column on df.
groups = {
    "GO": ["egy", "tun"],
    "CW": ["syr", "lby", "yem"],
    "GC": ["mar", "kwt", "bhr", "omn", "jor"]
}

# A record is flagged for a group when it mentions at least one member.
for g, members in groups.items():
    df[g] = (
        df["Mentions"]
        .apply(lambda m: any(c in m for c in members))
        .astype(bool)
    )

# Keep only records that belong to at least one group.
in_any_group = df[list(groups)].any(axis=1)
df = df[in_any_group].reset_index(drop=True)

# Fit a single topic model on all remaining records.
texts = df['Text'].values
timestamps = df['Year'].values
topics, topics_per_document = get_topics(texts, timestamps)

2025-09-25 17:48:47,936 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

2025-09-25 17:49:17,604 - BERTopic - Embedding - Completed ✓
2025-09-25 17:49:17,605 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-25 17:49:27,697 - BERTopic - Dimensionality - Completed ✓
2025-09-25 17:49:27,698 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-25 17:49:27,752 - BERTopic - Cluster - Completed ✓
2025-09-25 17:49:27,755 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-25 17:49:29,039 - BERTopic - Representation - Completed ✓
9it [00:16,  1.87s/it]


In [41]:
# Store the per-document topic assignment (second value returned by
# get_topics) as a new column of df.
df['Topic']=topics_per_document

In [45]:
# Save the records together with their group flags and topic assignment.
one_model_data_path = r'~\Arab Spring Paper\Arab Spring Code\Supp-Material\Attention\Topics of attention AS\one_model_data_with_topics.csv'
df.to_csv(one_model_data_path, index=False)

In [46]:
# Save the topics-over-time table of the single combined model.
one_model_topics_path = r'~\Arab Spring Paper\Arab Spring Code\Supp-Material\Attention\Topics of attention AS\one_model_topics.csv'
topics.to_csv(one_model_topics_path, index=False)

In [35]:
# Show the word lists recorded for topic 10, one entry per timestamp row.
topics.loc[topics['Topic'] == 10, 'Words'].values

array(['bahrain, gulf, monarchies, saudi, states',
       'saudi, saudi arabia, arabia, gulf, monarchies',
       'monarchies, 1970 coup, coup, dynastic, 1970',
       'bahrain, monarchies, gulf, oman, arab',
       'monarchies, gulf, arab, qatar, gcc',
       'qatar, arab, saudi, emirates, monarchies',
       'saudi, policies, revivalist movement, revivalist, monarchical',
       'saudi, gcc, arab, nationalism, perceptions',
       'monarchies, middle eastern, eastern, middle eastern monarchies, eastern monarchies'],
      dtype=object)

In [16]:
# Number of records flagged for both groups of each pair:
# (GO & CW, GO & GC, CW & GC).
tuple(
    int((df[first] & df[second]).sum())
    for first, second in [('GO', 'CW'), ('GO', 'GC'), ('CW', 'GC')]
)

(201, 148, 107)