# Notes
* Each major section of this document is designed to be run independently.

* To print something, use print(). In addition, Jupyter automatically displays the value of the last line of a cell if it is an expression.

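* For keyword analysis, a "keywords.txt" file is needed in the same directory as this notebook (the directory that contains the "CDCS" folder). Each entry is a tag title on one line, followed on the next line by all words to match, delimited by "|".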

# Downloading CDCS

In [1]:
!pip install beautifulsoup4 --upgrade



In [3]:
import requests
import os
from bs4 import BeautifulSoup

#
#   Python script to scrape CDCS documents from usaid.gov
#   This script uses bs4. You may need to install bs4:
#       pip install bs4
#
# Wrapper to get the text content (html) from a page
def get_html(link, timeout=30):
    """Download a page and return its body decoded as text.

    Args:
        link: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body as a str (requests' ``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # The context manager closes the response even if decoding the body raises
    with requests.get(link, timeout=timeout) as res:
        return res.text

# Wrapper to get the raw content (bytes) from a page
def get_content(link, timeout=30):
    """Download a resource and return its body as raw bytes.

    Args:
        link: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body as bytes (requests' ``Response.content``).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # The context manager closes the response even if reading the body raises
    with requests.get(link, timeout=timeout) as res:
        return res.content
    
# Turn a site-relative link (one that starts with '/') into an absolute URL
def check_link(link):
    """Return an absolute URL for *link*.

    Links that start with '/' are prefixed with the usaid.gov host; every
    other link (including an empty string) is returned unchanged.

    Args:
        link: The href value as a str.

    Returns:
        The absolute URL as a str.
    """
    # startswith() is safe on an empty string, unlike indexing link[0]
    if link.startswith('/'):
        return 'https://www.usaid.gov' + link
    return link


# Download the CDCS index page and parse it so the country links can be selected
index_html = get_html('https://www.usaid.gov/results-and-data/planning/country-strategies-cdcs')
soup = BeautifulSoup(index_html, 'html.parser')

    
# Make a directory to store the pdfs and move into it, unless we are already there.
# os.path.basename works with the path separator of the current platform;
# splitting on "\\" only worked on Windows and made re-runs nest CDCS/CDCS elsewhere.
if os.path.basename(os.getcwd()) != "CDCS":
    os.makedirs("CDCS", exist_ok=True)
    os.chdir("CDCS")


# Header of the results table printed while scraping
print("Pages Results")
print("====================================================================") 

# Obtain links to pdfs located on main site
# soup.select uses a CSS selector to obtain the link
# pages maps the link text of each entry to the pdf URL chosen for it,
# or to "Not found" when its subpage has no pdf link.
pages = {}
idx = 1
for link in soup.select('div[class*="wysiwyg"] > a:first-child'):
    # Access the subpage and collect every anchor whose href mentions "pdf"
    child = BeautifulSoup(get_html(check_link(link.get("href"))), 'html.parser')
    pdf_attr = child.select('a[href*="pdf"]')

    # No pdf on the subpage: record it and move on
    if not pdf_attr:
        pages[link.text] = "Not found"
        print("%d:%16s\t[%d] %s" % (idx, link.text[:12], 0, "Not found"))
        idx += 1
        continue

    # Default to the first pdf link. When there are several, the last anchor
    # that passes the filters below wins; valid counts how many passed.
    pdf_link = pdf_attr[0].get("href")
    valid = 0
    if len(pdf_attr) > 1:
        for lnk in pdf_attr:
            # Language edgecase. Not comprehensive. TODO?
            # Skip anchors titled as Spanish versions or whose title lacks "Country"
            if "title" in lnk.attrs and ("Español" in lnk["title"] or "Spanish" in lnk["title"]
                                         or "Country" not in lnk["title"]):
                continue

            if "class" in lnk.attrs and "usaid-link" in lnk["class"]:
                valid += 1
                pdf_link = lnk.get("href")

    # Make the chosen link absolute
    pdf_link = check_link(pdf_link)

    # Download only when the file is not already on disk
    pdf_name = link.text + " CDCS.pdf"
    if not os.path.isfile(pdf_name):
        # Fetch the bytes BEFORE creating the file: if the download raises,
        # no empty pdf is left behind to be mistaken for a finished download
        # on the next run.
        # MAYBE TODO: it is possible that the link returns a 404
        pdf_bytes = get_content(pdf_link)
        with open(pdf_name, "wb") as pdf:
            pdf.write(pdf_bytes)

    # Record and report the result (same line for cached and fresh downloads)
    pages[link.text] = pdf_link
    print("%d:%16s\t[%d] %s" % (idx, link.text[:12], valid, pdf_link))
    idx += 1



Pages Results
1:          Angola	[1] https://www.usaid.gov/sites/default/files/2022-05/Angola_CDCS_2014-2019.pdf
2:    Democratic R	[1] https://www.usaid.gov/sites/default/files/2022-05/Public_CDCS-DRC-12-2025.pdf
3:        Ethiopia	[1] https://www.usaid.gov/sites/default/files/2022-05/Ethiopia-CDCS_2019-2024_Final-Public-Dec-2019-2.pdf
4:          Ghana 	[1] https://www.usaid.gov/sites/default/files/2022-05/CDCS-Ghana-August-2025x.pdf
5:          Guinea	[1] https://www.usaid.gov/sites/default/files/2022-05/CDCS_Guinea_Dec_2025_2.pdf
6:           Kenya	[1] https://www.usaid.gov/sites/default/files/2022-05/Kenya_CDCS_External_Sept_2021.pdf
7:         Liberia	[1] https://www.usaid.gov/sites/default/files/2022-05/CDCS_Liberia_June_2024.pdf
8:      Madagascar	[1] https://www.usaid.gov/sites/default/files/2022-06/CDCS-Madagascar-September-2025_112020.pdf
9:          Malawi	[1] https://www.usaid.gov/sites/default/files/2022-05/EXTERNAL_CDCS_Malawi_2020-2025_-_Exp_April_2025_508_1.50.43_PM.pd

# Preprocessing into Bag of Words

In [1]:
!pip install PyPDF2



In [48]:
from PyPDF2 import PdfReader
import os
import pandas as pd
import sklearn.feature_extraction as skft
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
import json


# If not in the CDCS directory, move into it.
# os.path.basename works with the path separator of the current platform;
# splitting on "\\" only recognised the directory on Windows.
if os.path.basename(os.getcwd()) != "CDCS":
    os.chdir("CDCS")

# Get text from all pdfs
# titles[n] is the file name without its " CDCS.pdf" suffix and
# documents[n] is the text extracted from that same file.
titles = []
documents = []
i = 0
for fname in os.listdir():
    # Only files whose name contains "CDCS.pdf" are read
    if "CDCS.pdf" not in fname:
        continue

    # Try reading, catching errors.
    # "except Exception" (not a bare except) keeps Ctrl-C / kernel interrupt working
    # while still skipping any file the reader cannot open.
    try:
        reader = PdfReader(fname)
    except Exception:
        print("\nError reading", fname)
        continue

    # Extract text from a pdf; join once instead of repeated string +=
    all_text = "".join(page.extract_text() for page in reader.pages)

    documents.append(all_text)
    # Drop "CDCS.pdf" and the single character before it (the separating space)
    titles.append(fname[:fname.find("CDCS.pdf") - 1])

    # Print for progress: a line break is emitted before every fourth file read
    i += 1
    if i % 4 == 0:
        print("")
        i = 0
    print(fname, end=" ")
print("")

Albania CDCS.pdf Angola CDCS.pdf Armenia CDCS.pdf 
Bangladesh CDCS.pdf Bosnia and Herzegovina CDCS.pdf Cambodia CDCS.pdf Central Asia CDCS.pdf 
Democratic Republic of the Congo CDCS.pdf Egypt CDCS.pdf El Salvador CDCS.pdf Ethiopia CDCS.pdf 
Georgia CDCS.pdf Ghana  CDCS.pdf Guatemala CDCS.pdf Guinea CDCS.pdf 
Honduras CDCS.pdf India CDCS.pdf Indonesia CDCS.pdf Jamaica CDCS.pdf 
Jordan CDCS.pdf Kenya CDCS.pdf Kosovo CDCS.pdf Kyrgyz Republic CDCS.pdf 
Lebanon CDCS.pdf Liberia CDCS.pdf Madagascar CDCS.pdf Malawi CDCS.pdf 
Mali CDCS.pdf Mexico CDCS.pdf Moldova CDCS.pdf Morocco CDCS.pdf 
Mozambique CDCS.pdf Nepal CDCS.pdf Niger CDCS.pdf Pakistan CDCS.pdf 
Paraguay CDCS.pdf Peru CDCS.pdf Philippines CDCS.pdf Regional Development Cooperation Strategy CDCS.pdf 
Error reading Rwanda CDCS.pdf

Serbia CDCS.pdf Somalia CDCS.pdf South Africa CDCS.pdf Sri Lanka CDCS.pdf 
Tanzania CDCS.pdf Timor-Leste CDCS.pdf Tunisia CDCS.pdf Ukraine CDCS.pdf 
Vietnam CDCS.pdf West Africa Regional CDCS.pdf Zambia CDC

In [46]:
# Save results
# The results are stored in the same directory as the jupyter notebook
# Step back out of the current directory, then write each list to its own
# UTF-8 JSON file (non-ASCII characters are kept as-is, 4-space indent).
os.chdir("..")
for out_name, payload in (('title.json', titles), ('cdcsraw.json', documents)):
    with open(out_name, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)

# Analysis

### Reload the file if necessary
The files should be in the same directory as the notebook

In [15]:
import pandas as pd
import json
    
#bag = pd.read_csv("cdcs_bag.csv")
#bag = bag.rename(columns = {'Unnamed: 0':'Nation'}).set_index("Nation")

# Both files are written as UTF-8 with ensure_ascii=False, so they must be read
# back as UTF-8 too. Relying on the platform default encoding can mis-decode
# non-ASCII characters in titles (e.g. a non-breaking space showing up as "Â ").
with open('title.json', encoding="utf8") as f:
    titles = json.load(f)

with open('cdcsraw.json', encoding="utf8") as f:
    documents = json.load(f)

## Keyword analysis
Keyword analysis uses a "keywords.txt" file that must be in the same directory as this jupyter notebook

In [19]:
import re
import pandas as pd


# The format of the keywords file should be
# name
# keyword|keyword2|keyword3              (on a single line)
# kwords maps each tag name to its raw "kw1|kw2|..." line, which is later used
# directly as a regular expression.
kwords = {}
with open('keywords.txt') as k:
    kdoc = k.read()

    # Pair up consecutive non-blank lines: first the tag name, then its keywords.
    # Blank and whitespace-only lines are skipped. The previous test,
    # re.match("\s+", s), did not match an empty line, so a blank line right
    # after a tag name was stored as an empty pattern (which matches everywhere).
    tag_name = None
    for s in kdoc.split("\n"):
        if not s.strip():
            continue
        if tag_name is None:
            tag_name = s
        else:
            kwords[tag_name] = s
            tag_name = None
kwords.keys()

dict_keys(['Civil Society', 'Governance', 'Corruption', 'Human Rights', 'Independent Media and Free Flow of Information', 'Political Competition and Consensus Building', 'Rule of Law', "Women's Political Participation and Leadership", 'Youth', 'Environment & Climate Change', 'Migration', 'Misinformation'])

In [40]:
# General helper functions
# -----------------------------------------------------
# Count every match of a keyword category in a text
# The category's "kw1|kw2|..." line is applied as one regular expression
def kwords_count(string, kword):
    """Return the number of matches of category *kword*'s pattern in *string*."""
    pattern = kwords[kword]
    matches = re.findall(pattern, string)
    return len(matches)

# Break a keyword category down into its individual keywords
# Returns a dict mapping each "|"-separated keyword to its number of matches
def get_keywords_breakdown(string, kword):
    """Return {keyword: match count in *string*} for every keyword of *kword*."""
    return {
        term: len(re.findall(term, string))
        for term in kwords[kword].split("|")
    }
        

# Count the matches of EVERY keyword category in a text
# Returns a dict mapping each category name to a one-element list holding its count
def kwords_category_count(string):
    """Return {category: [match count in *string*]} for all categories in kwords."""
    return {category: [kwords_count(string, category)] for category in kwords}


# User functions
# -----------------------------------------------------


# Look up the raw CDCS text stored for a title
# titles and documents are parallel lists, so the title's position selects the text
def get_country(country):
    """Return the document at the position where *country* first appears in titles.

    Raises ValueError (from list.index) when *country* is not in titles.
    """
    position = titles.index(country)
    return documents[position]

# Build one table of keyword-category counts covering every title
def make_keyword_df():
    """Return a DataFrame indexed by "country" with one column per keyword category.

    Each cell is the category's match count in that title's document.
    """
    merged = None
    for nation in titles:
        # {category: [count]} for this nation's document
        counts = kwords_category_count(get_country(nation))
        if merged:
            # Append this nation's single count to every category's running list
            for category, hits in counts.items():
                merged[category] += hits
            continue
        # Nothing accumulated yet: start from this dictionary
        merged = counts
    frame = pd.DataFrame.from_dict(merged)
    frame["country"] = titles
    return frame.set_index("country")

# Pick the highest-valued columns of one row
def get_top_words(df, country, num):
    """Return the *num* largest entries of row *country* in *df*, highest first."""
    row = df.loc[country, :]
    ranked = row.sort_values(ascending=False)
    return ranked.head(num)

# Per-keyword hit counts of one category for one title, highest first
def make_kwords_cat(country, cat):
    """Return a Series of match counts for each keyword of *cat* in *country*'s text."""
    text = get_country(country)
    hits = get_keywords_breakdown(text, cat)
    series = pd.Series(data=hits, index=hits.keys())
    return series.sort_values(ascending=False)
    

In [27]:
# Get keywords category count for all countries:
# one row per title (index "country"), one column per keyword category.
# The bare name on the last line makes the notebook display the table.
keyword_df = make_keyword_df()
keyword_df

Unnamed: 0_level_0,Civil Society,Governance,Corruption,Human Rights,Independent Media and Free Flow of Information,Political Competition and Consensus Building,Rule of Law,Women's Political Participation and Leadership,Youth,Environment & Climate Change,Migration,Misinformation
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Albania,33,62,33,3,17,3,33,0,15,4,0,0
Angola,352,237,4,45,70,31,18,60,59,8,1,0
Armenia,154,318,36,35,116,42,76,13,35,39,2,8
Bangladesh,156,256,52,139,96,22,93,49,66,57,9,0
BosniaÂ and Herzegovina,183,341,43,67,150,29,52,40,26,22,15,1
Cambodia,279,369,46,168,176,19,97,61,129,82,6,1
Central Asia,101,158,2,26,50,14,38,21,30,35,54,0
Democratic Republic of the Congo,209,254,34,79,74,33,68,79,44,32,2,0
Egypt,68,251,18,72,59,17,23,93,42,28,0,0
El Salvador,137,292,54,79,51,25,213,25,50,34,154,0


In [28]:
# Example: the (up to) 20 keyword categories with the highest counts in the
# "India" row of keyword_df, sorted from highest to lowest
get_top_words(keyword_df, "India", 20)

Governance                                        121
Political Competition and Consensus Building       80
Civil Society                                      68
Independent Media and Free Flow of Information     62
Human Rights                                       52
Youth                                              34
Environment & Climate Change                       32
Women's Political Participation and Leadership     25
Rule of Law                                        17
Corruption                                          2
Migration                                           2
Misinformation                                      0
Name: India, dtype: int64

In [55]:
# Get how many "hits" each individual keyword of a keyword category had
# in one title's text, sorted from highest to lowest
make_kwords_cat("Angola", "Civil Society")

economic               30
institution            19
government             16
road                    6
cost                    5
economy                 3
transport               2
procure                 1
legis                   1
budget                  1
constitution            0
administration          0
government official     0
audit                   0
public service          0
public information      0
civil servant           0
civil service           0
records                 0
police                  0
tax                     0
dtype: int64

## Bag of words analysis

In [42]:
# These names are otherwise only imported in the Preprocessing section; importing
# them here lets the Analysis section run on its own.
import sklearn.feature_extraction as skft
from sklearn.feature_extraction.text import CountVectorizer

# Stopwords: scikit-learn's English list plus terms specific to these documents
ADD_STOPWORDS = ["usaid", "usaids", "agency", 
                 "international", "development", 
                 "unclassified", "embassy", "approved", 
                 "public", "release", "cdcs", "mission", "country"]
stwords = list(skft.text.ENGLISH_STOP_WORDS.union(ADD_STOPWORDS))


# Convert documents into bag of words
# This also drops terms that do not appear in at least 5% of the documents
vectorizer = CountVectorizer(stop_words=stwords, lowercase=True, min_df=0.05)
raw_bag = vectorizer.fit_transform(documents)
# One row per title, one column per retained term, cells are term counts
bag = pd.DataFrame(data=raw_bag.toarray(), columns = vectorizer.get_feature_names_out(), index=titles)

In [23]:
# Example: the (up to) 20 terms with the highest counts in the "India" row
# of the bag-of-words table, sorted from highest to lowest
get_top_words(bag, "India", 20)

india         223
health         97
pollution      67
sector         53
private        51
ir             43
air            42
economic       42
energy         40
education      38
improve        36
social         31
self           31
region         30
regional       30
management     29
support        29
reliance       29
growth         28
poverty        28
Name: India, dtype: int64

In [24]:
from sklearn.decomposition import LatentDirichletAllocation

# Looking at LDA: fit an 8-topic model on the bag-of-words counts
# TODO: "ir" was meant to be removed, but nothing here drops it from bag
lda = LatentDirichletAllocation(n_components=8)
lda.fit(bag)

# Print top words associated with each component:
# the 10 highest-weighted terms per topic, in ascending order of weight
for weights in lda.components_:
    top_positions = weights.argsort()[-10:]
    print("".join(bag.columns[pos] + " " for pos in top_positions))

marginalized strategy private sector capacity institutions government economic women somalia 
support local morocco youth honduras private sector economic education ir 
support governance women education capacity private government economic sector health 
lanka women social ir sri private youth government sector economic 
systems risk services ir niger kenya women youth health climate 
energy war los growth wildlife tb agent environmental hiv vietnam 
reliance services self support capacity health private government economic sector 
rdo gender support health central countries asia economic region regional 


In [53]:
# Print the raw extracted text stored for one title, for manual inspection
print(get_country("Bangladesh"))

i 
   
 
 
 
   
   
 
UNCLASSIFIED  
BANGLADESH  
 
 COUNTRY DEVELOPMENT  
COOPERATION STRATEGY 
(CDCS)  
 
DECEMBER 2020 – DECEMBER  2025 
 
APPROVED FOR PUBLIC RELEASE  
i 
  TABLE OF CONTENTS 
 
TABLE OF CONTENTS  ................................ ................................ ................................ ................................ ...................  i 
TABLES AND FIGURES  ................................ ................................ ................................ ................................ .................. iii 
ABBREVIATIONS AND ACRONYMS  ................................ ................................ ................................ .......................  iv 
EXECUTIVE SUMMARY  ................................ ................................ ................................ ................................ .................. 1 
COUNTRY CONTEXT  ................................ ................................ ................................ .....

UNCLASSIFIED  
 
FOR PUBLIC RELEASE   
 
 
   
UNCLASSIFIED  
ARMENIA  
 
 SEPTEMBER 24, 2020 – SEPTEMBER 24, 2025  
 COUNTRY DEVELOPMENT  
COOPERATION STRATEGY  
(CDCS)  
 
Glen Garanich for Reuters  UNCLASSIFIED  
 
FOR PUBLIC RELEASE  Table of Contents  
 
Acronyms  1 
I. Executive Summary  2 
II. Country Context  5 
The Velvet Revolution  5 
The Impact of COVID -19 5 
U.S. Foreign Policy Priorities  6 
J2SR Roadmap Analysis: Commitment  7 
J2SR Roadmap Analysis: Capacity  8 
J2SR Roadmap Analysis: The Full Picture  8 
Regional Actors  9 
III. Strategic Approach  11 
The Journey to Self -Reliance  11 
A Different Ap proach  12 
Redefining the Relationship: Toward Strategic Transition  13 
Redefining the Relationship: Resilience  13 
Inclusive Development  15 
Climate Change and Biodiversity in Armenia  15 
Milestones  16 
IV. Results Framework  17 
Goal Statement and Narrative  17 
Development Objective 1: Democratic Transition Advanced  17 
Development Objective 2: Economic Securit