In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geotext import GeoText
import pycountry
import re
from collections import Counter
from collections import defaultdict
import matplotlib as mpl
from itertools import islice




In [123]:

# Load the Scopus export and drop the two columns this analysis never reads.
df = pd.read_csv(r'C:\Users\Yasaman\Arab Spring Paper\Arab Spring Data\Study Datasets\Arab Spring related research\Scopus-arabspring.csv')
df = df.drop(columns=['Link', 'EID'])

# read_csv yields NaN for empty cells, and NaN has no .split(); normalise missing
# abstracts to '' first so a single blank cell cannot abort the whole load.
# Only the text before the first '©' is kept (presumably this removes a trailing
# publisher copyright notice -- confirm against the raw export).
df['Abstract'] = df['Abstract'].fillna('').map(lambda abstract: str(abstract).split('©')[0])

# Scopus' placeholder text is treated the same as an empty abstract.
df.loc[df['Abstract'] == '[No abstract available]', 'Abstract'] = ''

In [124]:
# Population table: keep only the country code and the 2019 column, discard rows
# where either is missing, and store the 2019 figure as an integer.
pop_df = (
    pd.read_csv(r'C:\Users\Yasaman\Downloads\World_bank_population.csv', skiprows=3)
    [['Country Code', '2019']]
    .dropna()
    .astype({'2019': int})
)

# Candidate countries: lower-cased codes of rows whose 2019 value is at least one million.
possible_countries = [
    code.lower()
    for code in pop_df.loc[pop_df['2019'] >= 1000000, 'Country Code']
]


# Codes removed from the candidate list, keyed by ISO alpha-3 code with the
# country name as the value for readability.
# NOTE(review): the reason for excluding these is not stated in the notebook --
# presumably their names are ambiguous in free text; confirm with the author.
_EXCLUDED_COUNTRY_NAMES = {
    "IRL": "Ireland",
    "SSD": "South Sudan",
    "SDN": "Sudan",
    "COG": "Republic of the Congo",
    "COD": "Democratic Republic of the Congo",
    "GIN": "Guinea",
    "GNB": "Guinea-Bissau",
    "GNQ": "Equatorial Guinea",
    "PNG": "Papua New Guinea",
    "XKX": "Kosovo (unofficial)",
    "MNE": "Montenegro",
    "SRB": "Serbia",
    "TLS": "Timor-Leste",
    "GEO": "Georgia",
}

# Lower-cased to match the casing of possible_countries.
excluded_iso3_codes = [iso3.lower() for iso3 in _EXCLUDED_COUNTRY_NAMES]
possible_countries = list(set(possible_countries).difference(excluded_iso3_codes))

In [None]:
# Lookup table of alternative spellings: column 'yer' holds the spelling as it appears
# in text, column 'ulke' the identifier it maps to (the column names look Turkish,
# presumably "place" and "country" -- confirm against the spreadsheet).
df_country_list_different_spellings = pd.read_excel(r"C:\Users\Yasaman\Downloads\country_list_different_spellings_arabspring.xlsx")

# Identifiers are stored lower-case.
df_country_list_different_spellings['ulke'] = [
    code.lower() for code in df_country_list_different_spellings['ulke']
]

# spelling -> identifier; 'Turkey' is added by hand on top of the spreadsheet rows.
map_country_list_different_spellings = dict(
    zip(
        df_country_list_different_spellings['yer'],
        df_country_list_different_spellings['ulke'],
    )
)
map_country_list_different_spellings['Turkey'] = 'tur'

def get_country(txt):
    """Return the country identifiers mentioned in ``txt``.

    Processing steps:

    1. Everything from the first '©' / 'Copyright (C)' onward is discarded.
    2. A fixed list of phrases that contain country-like words but do not refer
       to a country (e.g. 'US dollar', 'Guinea pig') is deleted from the text.
    3. Country names reported by ``GeoText(txt).countries`` are converted to
       lower-case ISO alpha-3 codes through ``pycountry``. Names pycountry does
       not know are looked up in ``map_country_list_different_spellings``; if
       that fails too, the raw name itself is kept.
    4. Every whole-word occurrence of a key of
       ``map_country_list_different_spellings`` contributes the mapped value.
    5. The identifier 'uae' is normalised to 'are'.

    Parameters
    ----------
    txt : str
        Free text to scan. Any non-string value (for example NaN coming from a
        missing title or abstract) is treated as text without mentions.

    Returns
    -------
    list of str
        Unique identifiers in sorted order, so repeated runs give identical output.
    """
    # A missing title/abstract reaches this function as float NaN; it mentions nothing.
    if not isinstance(txt, str):
        return []

    # split() returns the whole string when the marker is absent, so no membership
    # test is needed before cutting.
    for copyright_mark in ['©', 'Copyright (C)']:
        txt = txt.split(copyright_mark)[0]

    for tag in [ 'US dollar','New Mexico','Turkish','US$','US $','United States Dollar','USD','HK', 'Congo Red',
               'Congo red', 'US-Dollar', 'Michael Jordan','Guinea pig']:
        txt = txt.replace(tag, '')

    country_codes = set()

    for country_name in set(GeoText(txt).countries):
        country = pycountry.countries.get(name=country_name)
        if country:
            country_codes.add(country.alpha_3.lower())
        else:
            # Fall back to the custom spelling map, then to the raw name.
            country_codes.add(map_country_list_different_spellings.get(country_name, country_name))

    # Regex alternation takes the first alternative that matches, so longer spellings
    # must be listed first; otherwise a short key that is a prefix of a longer one
    # would win. Empty or non-string keys are skipped because they cannot be matched
    # meaningfully.
    spellings = sorted(
        (key for key in map_country_list_different_spellings if isinstance(key, str) and key),
        key=len,
        reverse=True,
    )
    # With no spellings the pattern would degenerate to r'\b()\b', which matches the
    # empty string at every word boundary and then fails the dictionary lookup.
    if spellings:
        pattern = r'\b(' + '|'.join(re.escape(spelling) for spelling in spellings) + r')\b'
        for match in re.findall(pattern, txt):
            country_codes.add(map_country_list_different_spellings[match])

    if 'uae' in country_codes:
        country_codes.remove('uae')
        country_codes.add('are')

    return sorted(country_codes)
# Title and abstract are scanned together. String concatenation with NaN yields NaN,
# so a record missing either field would lose the other one as well; filling with ''
# keeps whatever text is available.
df['Text'] = df['Title'].fillna('') + ' ' + df['Abstract'].fillna('')

# One list of country identifiers per record.
df['Mentions'] = df['Text'].apply(get_country)


In [None]:
# Lower-case three-letter codes of the countries this study keeps.
arabspring_countries = ['egy', 'tun', 'lby', 'syr', 'mar', 'kwt', 'bhr', 'yem', 'omn', 'jor']

# Keep a record when at least one of its mentions is in the list above.
has_arabspring_mention = df['Mentions'].apply(
    lambda mentions: not set(arabspring_countries).isdisjoint(mentions)
)
df = df[has_arabspring_mention]

In [None]:
# Words dropped during tokenization: common English function words plus a few
# corpus-specific fillers ('has', 'article', 'paper', 'have').
STOPWORDS = set(
    """
    a an and or the of to in on for with at by from
    is are was were be been being as that this these those
    it its into about not but so than then over under between
    within without across against during before after above below
    up down out off again further more most some such no nor
    too very can will just do does did doing their them they
    you your we our i me my has article paper have
    """.split()
)


def tokenize(text: str):
    """Split ``text`` into lower-case word tokens.

    Every character other than a-z, the apostrophe and whitespace is turned into
    a space; the text is then split on whitespace and apostrophes are removed from
    each piece. Pieces that end up in STOPWORDS or are at most one character long
    are discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    letters_only = re.sub(r"[^a-z'\s]", " ", text.lower())
    kept = []
    for piece in letters_only.split():
        # Apostrophes are stripped before the stop-word test, so "it's" is
        # compared as "its".
        word = piece.replace("'", "")
        if len(word) > 1 and word not in STOPWORDS:
            kept.append(word)
    return kept

def trigrams(tokens):
    """Return every run of three consecutive tokens, each joined by single spaces.

    Fewer than three tokens produce an empty list.
    """
    # Zipping the sequence with itself shifted by one and by two positions gives
    # the sliding window of width three; zip stops at the shortest input.
    return [" ".join(window) for window in zip(tokens, tokens[1:], tokens[2:])]

# year -> Counter mapping each trigram to its number of occurrences in that year.
year_trigram_counts = defaultdict(Counter)

# Only two columns are needed, so iterate over them directly rather than building
# a Series per row with iterrows().
for year, text in zip(df["Year"], df["Text"]):
    year = int(year)
    for tri in trigrams(tokenize(str(text))):
        year_trigram_counts[year][tri] += 1

# Long format: one row per (year, trigram) pair.
rows = [
    {"Year": year, "Trigram": tri, "Count": cnt}
    for year, ctr in year_trigram_counts.items()
    for tri, cnt in ctr.items()
]

# Naming the columns explicitly keeps the frame well-formed when no trigram was
# found at all; without them an empty frame has no 'Year'/'Count' columns and
# sort_values raises KeyError.
tri_df = pd.DataFrame(rows, columns=["Year", "Trigram", "Count"]).sort_values(
    ["Year", "Count"], ascending=[True, False]
)

# Rank 1 is the most frequent trigram of a year; method="first" breaks ties by
# row order so every rank within a year is unique.
tri_df["Rank_in_Year"] = tri_df.groupby("Year")["Count"].rank(method="first", ascending=False).astype(int)

# Number of trigrams retained per year.
TOP_N = 20
top_by_year = tri_df[tri_df["Rank_in_Year"] <= TOP_N].copy()


In [130]:
# Write the per-year top trigrams to disk; the DataFrame index is not written.
trigram_csv_path = r"C:\Users\Yasaman\Arab Spring Paper\Arab Spring Data\Study Datasets\Arab Spring related research\Scopus-arabspring-top-trigrams.csv"
top_by_year.to_csv(trigram_csv_path, index=False)


* **2002** — Focus on *social development and activism* in the Middle East (urban grassroots, social development/activism).
* **2003** — *Palestine, culture, and civil society*: revolutionary framing alongside analyses of Palestinian songs and movements.
* **2004** — *Conflict/controls in Yemen* and regional peace-process talk (Yemeni highlands, policing demonstrators; Middle East peace process).
* **2005** — *Risk and geopolitics*: country-risk spillovers tied to war/terror, “road map” peace, and upheavals in the Middle East.
* **2006** — Mixed year (some non-core/medical trigrams) plus *U.S./non-aligned movement* and Gulf mentions; little Arab Spring framing yet.
* **2007** — *Foreign/nuclear policy and Maghreb geopolitics* (British policy, nuclear choices; Libya/Maghreb socio-political developments).
* **2008** — *Leadership and urban movements*: Muammar al-Qaddafi appears; social movements in cities.
* **2009** — *Pre-uprising tensions & governance*: Houthi insurgency (Yemen), “Egypt’s quest \[for] democracy,” Syrian cultural diplomacy, Islamist violence in Saudi Arabia.
* **2010** — *Media systems and the Gulf*: pan-Arab satellite/news, TV/media ecosystems; Gulf/West Asia framing — a prelude to social-media politics.
* **2011** — *The Arab Spring proper*: MENA-wide focus (Tunisia–Egypt–Libya), Ben Ali/Mubarak/Assad named, *internet/social media* and Libya’s *National Transitional Council*; repression vs. regime survival.
* **2012** — *Transitions and media*: Al Jazeera English, social networking sites, transition processes in Egypt/Libya/Syria; continued MENA framing.
* **2013** — *Uprisings to transitions*: Arab Spring uprisings + *civil–military relations*, Turkey/AKP foreign policy, Tahrir Square; early “aftermath” language enters.
* **2014** — *Regional diplomacy & “post–Arab Spring”*: GCC, Turkey/Jordan ties, pre/post-Arab-Spring contrasts; social media still present.
* **2015** — *Institutional/foreign-policy lens*: GCC and Turkish foreign policy, “Arab Spring countries,” disaster preparedness and human-rights protection.
* **2016** — *Legal/economic and humanitarian turn*: anti–money-laundering (incl. Jordan), *Syrian civil war* and *refugee women*, UAE/GCC; explicit *post–Arab Spring* framing.
* **2017** — *Society & state consolidation*: Egypt under *Fattah al-Sisi*, security-sector reform; *health/justice* and social-network sites appear together.
* **2018** — *Seven-years-on reflections*: “following/post Arab Spring,” rights/violence (incl. sexual minorities), *Libya–Syria–Yemen* cluster; post-revolutionary Egypt.
* **2019** — *State capacity & economy*: non-state actors, *health-care workers*, *stock-market returns*; ongoing country triads (Algeria/Egypt/Libya; Morocco/Tunisia/Libya) and “post–Arab Spring” retrospectives.

### How it evolves (in one breath)

The corpus moves from **pre-2011 social development, conflict hot-spots, and media infrastructure** (2002–2010) → **uprising-centric, country-specific waves with strong social-media and transition language** (2011–2013) → **region-level policy, institutions, and “post–Arab Spring” consolidation** (2014–2016) → **social consequences and governance stabilization** (2017–2019), where *health, non-state actors, finance/markets, and foreign policy* share the stage with ongoing case studies (Egypt, Libya, Tunisia, Syria, Yemen).


In [138]:
# Top trigrams for 2018, shown as the cell's output.
top_by_year.loc[top_by_year['Year'].eq(2018)]

Unnamed: 0,Year,Trigram,Count,Rank_in_Year
18752,2018,following arab spring,7,1
19839,2018,post arab spring,7,2
20621,2018,middle east north,5,3
20622,2018,east north africa,5,4
20803,2018,egypt arab spring,5,5
20630,2018,aftermath arab spring,4,6
23573,2018,violence sexual minorities,4,7
25206,2018,libya syria yemen,4,8
32617,2018,arab spring uprisings,4,9
19084,2018,arab spring study,3,10
