In [318]:
!pip install -U selenium
!pip install python-docx
!pip install pyldavis
#!pip install -U getpass



## Requires a browser and corresponding driver
Navigate to 
https://selenium-python.readthedocs.io/installation.html#drivers

A driver allows a web browser that you have installed to be automatically ran.
Download one of the drivers. The driver may come in a .zip file which you may need to unzip.
Place the downloaded driver inside the same directory of this notebook and change the constants below.

In [86]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By


# Change USING_CHROME to False if you're not using Chrome
USING_CHROME = True

# Change driver path with the path to your driver
DRIVER_PATH = 'chromedriver.exe'

# Change "Chrome" to another browser i.e. "Firefox" if not using Chrome
driver = webdriver.Chrome(DRIVER_PATH)
if (USING_CHROME):
    # Allow multiple downloads - chrome only
    chrome_options = webdriver.ChromeOptions()

    prefs = {'profile.default_content_setting_values.automatic_downloads': 1}
    chrome_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(options = chrome_options) 
    
driver.get('https://dfafacts.gov/');

# Click the "I understand" button
elem = driver.find_element(By.CSS_SELECTOR, 'input')
elem.click()

# Select the login name
usern = driver.find_element(By.CSS_SELECTOR, 'input#loginName')
passw = driver.find_element(By.CSS_SELECTOR, 'input#Password')
authc = driver.find_element(By.CSS_SELECTOR, 'input[id="Authenticator Code"]')
logb = driver.find_element(By.CSS_SELECTOR, 'button#btnLogin')


  driver = webdriver.Chrome(DRIVER_PATH)


## Login

Login directly by opening up the browser and entering credentials

## Downloading OPs
You may need to enable "Download Multiple Files" when prompted if your browser requests it.
The downloaded files will be in your default download location i.e. Downloads

In [83]:
import time

# Gotta add a delay cause FactsInfo is so laggy
# To reduce errors, increase the value of DELAY
DELAY = 3

# Helper function to repeatedly try to click
# Tries 10 times before stopping
def click(CSSorElem, lag=1):
    attempt = 0
    while attempt < 10:
        try:
            if type(CSSorElem) == str:
                driver.find_element(By.CSS_SELECTOR, CSSorElem).click()
            else:
                CSSorElem.click()
            attempt = 99
        except Exception as e:
            print("Could not click... " + str(e))
            time.sleep(DELAY)
        attempt += 1
    
    # How much to sleep after finishing
    time.sleep(DELAY * lag)
    return attempt

# Click the OP button
click('i.fa-rocket', lag=3)

# Click reports
click('div.col-md-12 > ul > li:nth-child(5) > a')

100

In [84]:
# Get all options for "Bureau"
bureaus = driver.find_elements(By.CSS_SELECTOR, 'select#selOpBureau > option')
for bureau in bureaus:
    click(bureau, lag=2)
    
    # Get all options for "Operational Unit"
    units = driver.find_elements(By.CSS_SELECTOR, 'select#selOpUnit > option')
    for unit in units:
        click(unit)
        print(bureau.text + " - " + unit.text)
        
        # Attempt to download the operational plan 
        click('i[title="Click to Generate Full Operation Plan Report"]', lag=2)
print("Done")

East Asia and Pacific - Burma
East Asia and Pacific - Cambodia
East Asia and Pacific - China
East Asia and Pacific - Fiji
East Asia and Pacific - Indonesia
East Asia and Pacific - Laos
East Asia and Pacific - Malaysia
East Asia and Pacific - Marshall Islands
East Asia and Pacific - Micronesia
East Asia and Pacific - Mongolia
East Asia and Pacific - North Korea
East Asia and Pacific - Papua New Guinea
East Asia and Pacific - Philippines
East Asia and Pacific - Samoa
East Asia and Pacific - Singapore
East Asia and Pacific - Thailand
East Asia and Pacific - Timor-Leste
East Asia and Pacific - Tonga
East Asia and Pacific - Vietnam
East Asia and Pacific - Pacific Islands Regional
East Asia and Pacific - State East Asia and Pacific Regional
East Asia and Pacific - USAID Regional Development Mission-Asia (RDM/A)
Europe and Eurasia - Albania
Europe and Eurasia - Armenia
Europe and Eurasia - Azerbaijan
Europe and Eurasia - Belarus
Europe and Eurasia - Bosnia and Herzegovina
Europe and Eurasia -

## Reading OPs

In [1]:
import docx
import glob
import re

# Get all documents, turn each document into [paragraph, table] objects
documents = []
for file_name in glob.glob('./OP/*.docx'):
    doc = docx.Document(file_name)
    documents.append([doc.paragraphs, doc.tables])    
print("Done")

Done


In [2]:
# Turn it into a more readable object
# pdocs is a dictionary: {"title" : {"par": paragraphs, "tab" : tables, "im" : list of ims}}
# ims: [table : string]   
pdocs = {}
for i in range(len(documents)):
    title = ""
    for j in range(len(documents[i][0])):
        if re.search("[A-z]+", documents[i][0][j].text):
            title = documents[i][0][j].text
            break
    pdocs[title] = {"par" : documents[i][0], "tab" : documents[i][1], "im" : []}
print("Done")

Done


In [3]:
# Build "im" list - list of all IM tables for a document
for title in pdocs:
    # Wipe 
    pdocs[title]["im"] = []
    p_last = 0
    for i in range(len(pdocs[title]["tab"])):
        table = pdocs[title]["tab"][i]
        #print("%s %s" % (len(table.rows), len(table.columns)))
        
        # Catch bugged tables
        try:
            table.cell(0, 0)
        except:
            print(title, "phantom table")
            continue
        
        
        if table.cell(0, 0).text.find("Mechanism") != -1:
            # Found an IM table, now find the corresponding paragraph with IM table
            # Get im number
            im_num = table.cell(0, 1).text
            
            # Find the corresponding summary.
            pars = pdocs[title]["par"]
            for j in range(p_last, len(pars)):
                if pars[j].text.find(im_num) != -1:
                    break
                    
            if j == len(pars) - 1:
                print("ERROR: %s matching paragraph could not be found" % im_num)
                pdocs[title]["im"].append([table, "ERROR"])
                continue
            
            # Append all text corresponding to the table
            corr_par = ""
            for j2 in range(j+1, len(pars)):
                # Skip this line
                if pars[j2].text.find("IMPLEMENTING MECHANISM NARRATIVE") != -1 or re.match("[\n ]+", pars[j].text):
                    continue
                
                # Stop at "FUNDING SUMMARY"
                if pars[j2].text.find("FUNDING SUMMARY") != -1:
                    break
                
                corr_par += pars[j2].text.strip() + "\n"
            
            # Try to optimize it a bit
            p_last = j
            
            # Append the table, summary
            pdocs[title]["im"].append([table, corr_par]) 
print("Done")

Togo phantom table
Done


In [4]:
# Helper functions
# Print out a IM table
def print_im(table):
    # IM will always have 2 columns
    for i in range(len(table.rows)):
        print("%-30s %s" % (table.cell(i, 0).text, table.cell(i, 1).text))
    print()

# Return a flat list of contents of im
def flat_im(table):
    return [x.text for x in table.column_cells(1)]

In [5]:
# Example: print out first 5 IM tables for Afhganistan
i = 0
for ims in pdocs["Afghanistan"]["im"]:
    if i == 5:
        break
    print_im(ims[0])
    print(ims[1])
    i += 1

Mechanism Number               46246
Implementing Mechanism Name:   Central Contraceptives Procurement (CCP), field support mechanism
Prime Partner:                 John Snow, Incorporated 
Award Number:                  306-AID-OAA-TO-10-00066
Implementing Mechanism Type:   Direct Contracts
Source Agency:                 U.S. Agency for International Development
Implementing Agency:           U.S. Agency for International Development
Planned Funding:               $3,000,000
Start Date:                    04/20/2015
End Date:                      11/28/2023
Total Estimated Cost:          $25,000,000

The Global Health Supply Chain- Procurement and Supply Management Project (GHS-PSM) serves as the central procurement mechanism for USAID Missions worldwide to purchase high quality contraceptives and other essential public health supplies. Through various contractors, this activity implements the USAID policy of centralized contraceptive procurement by providing a simplified mechanism fo

In [6]:
# Get all IM descriptions
# This fills titles with the mission tag + their IM number
im_docs = []
titles = []
for title in pdocs:
    ims = pdocs[title]["im"]
    for im in ims:
        titles.append(title[:3] + " " + im[0].cell(0, 1).text)
        im_docs.append(im[1])

In [7]:
import pandas as pd
import sklearn.feature_extraction as skft
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

ADD_STOPWORDS = ["usaid", "usaids", "agency", 
                 "international", "development", 
                 "unclassified", "embassy", "approved", 
                 "public", "release", "cdcs", "mission", "country"]
stwords = list(skft.text.ENGLISH_STOP_WORDS.union(ADD_STOPWORDS))

# Convert documents into bag of words
# throw out words that do noy appear in 3% of docs
vectorizer = CountVectorizer(stop_words=stwords, lowercase=True, min_df=0.03)
raw_bag = vectorizer.fit_transform(im_docs)
bag = pd.DataFrame(data=raw_bag.toarray(), columns = vectorizer.get_feature_names_out(), index=titles)

tfidf_vect = TfidfTransformer()
tfidf_bag = tfidf_vect.fit_transform(bag)
tfidf = pd.DataFrame(data=tfidf_bag.toarray(), columns = vectorizer.get_feature_names_out(), index=titles)

In [8]:
bag

Unnamed: 0,000,19,2020,2021,2022,2023,500,ability,able,access,...,work,workers,workforce,working,works,world,year,years,young,youth
Afg 46246,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Afg 46786,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
Afg 46809,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Afg 100772,0,0,0,0,1,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
Afg 102903,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zim 113484,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
Zim 113497,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Zim 113498,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
Zim 113499,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [26]:
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.lda_model

lda = LatentDirichletAllocation(n_components=9)
lda.fit(tfidf)
pyLDAvis.lda_model.prepare(lda, tfidf_bag, vectorizer)

  and should_run_async(code)
  default_term_info = default_term_info.sort_values(


In [148]:
lda2 = LatentDirichletAllocation(n_components=9)
lda2.fit(bag)
pyLDAvis.lda_model.prepare(lda2, raw_bag, vectorizer)

  and should_run_async(code)
  default_term_info = default_term_info.sort_values(


In [152]:
# TODO: we need better names cause this stuff is confusing / bug prone
bag_topic_dist = lda2.transform(raw_bag)
#tfidf_prob = pd.DataFrame(data=, columns = vectorizer.get_feature_names_out(), index=titles)

  and should_run_async(code)


In [153]:
tfidf_topic_dist.argsort(1)

  and should_run_async(code)


array([[3, 6, 8, ..., 1, 4, 7],
       [6, 2, 7, ..., 5, 3, 8],
       [2, 5, 3, ..., 6, 4, 1],
       ...,
       [3, 6, 5, ..., 1, 0, 7],
       [3, 6, 1, ..., 2, 5, 7],
       [3, 6, 2, ..., 5, 0, 7]], dtype=int64)

In [155]:
bag_topic_dist

  and should_run_async(code)


array([[0.00236493, 0.00236518, 0.07815769, ..., 0.00236569, 0.9052856 ,
        0.00236539],
       [0.0019168 , 0.07292676, 0.27613524, ..., 0.00191662, 0.00191623,
        0.00191683],
       [0.00584913, 0.00584878, 0.95320725, ..., 0.00584886, 0.00584935,
        0.00584928],
       ...,
       [0.00113446, 0.00113443, 0.00113475, ..., 0.00113458, 0.79527055,
        0.00113433],
       [0.00171004, 0.00171038, 0.00171009, ..., 0.00171019, 0.98631781,
        0.00171064],
       [0.00182214, 0.00182273, 0.00182225, ..., 0.00182218, 0.51272064,
        0.00182252]])

In [156]:
import numpy as np
a = bag_topic_dist.argsort(1)[:, [-1]]

# Quick and dirty unvectorized way of getting all the top probabilities
# Not sure how to do it the numpy way
b = []
for i in range(len(bag_topic_dist)):
    b.append(bag_topic_dist[i,a[i]][0])

  and should_run_async(code)


In [157]:
topic_df = pd.DataFrame(data={"topic":a.flatten(), "probability":b}, index=titles)
topic_df

  and should_run_async(code)


Unnamed: 0,topic,probability
Afg 46246,7,0.905286
Afg 46786,5,0.639438
Afg 46809,2,0.953207
Afg 100772,4,0.571326
Afg 102903,2,0.967071
...,...,...
Zim 113484,4,0.621622
Zim 113497,7,0.738817
Zim 113498,7,0.795271
Zim 113499,7,0.986318


In [158]:
def get_tg_topics(df, regex):
    r = re.compile(regex)
    #return topic_df.loc[list(filter(r.match, topic_df.index)), "topic"]
    return topic_df.loc[list(filter(r.match, topic_df.index)), "topic"].value_counts()

  and should_run_async(code)


In [160]:
get_tg_topics(topic_df, "Afg.*")

  and should_run_async(code)


5    5
8    5
2    4
4    4
7    3
1    2
3    2
0    1
6    1
Name: topic, dtype: int64

In [141]:
set([x[:3] for x in topic_df.index])

  and should_run_async(code)


{'Afg',
 'Afr',
 'Alb',
 'Alg',
 'Ang',
 'Arm',
 'Aze',
 'Bah',
 'Ban',
 'Bar',
 'Bel',
 'Ben',
 'Bhu',
 'Bos',
 'Bra',
 'Bul',
 'Bur',
 'Cab',
 'Cam',
 'Cen',
 'Cha',
 'Chi',
 'Col',
 'Cot',
 'Dem',
 'Dji',
 'Ecu',
 'Egy',
 'El ',
 'Est',
 'Eth',
 'Gam',
 'Geo',
 'Gha',
 'Gua',
 'Gui',
 'Hai',
 'Hon',
 'Ind',
 'Int',
 'Ira',
 'Isr',
 'Jam',
 'Jor',
 'Kaz',
 'Ken',
 'Kos',
 'Kyr',
 'Lao',
 'Leb',
 'Lib',
 'Mad',
 'Mal',
 'Mar',
 'Mau',
 'Mex',
 'Mic',
 'Mid',
 'Mol',
 'Mon',
 'Mor',
 'Moz',
 'Mul',
 'Nea',
 'Nep',
 'Nic',
 'Nig',
 'Nit',
 'Nor',
 'Oma',
 'Org',
 'Pac',
 'Pak',
 'Pan',
 'Pap',
 'Par',
 'Per',
 'Phi',
 'R/G',
 'Rwa',
 'S/G',
 'Sen',
 'Sie',
 'Som',
 'Sou',
 'Sri',
 'Sta',
 'Sud',
 'Syr',
 'Taj',
 'Tan',
 'Tha',
 'Tim',
 'Tog',
 'Tun',
 'Tur',
 'USA',
 'Uga',
 'Uzb',
 'Ven',
 'Vie',
 'Wes',
 'Yem',
 'Zam',
 'Zim'}

In [164]:
# Unused
for topic in range(0, len(lda2.components_)):
    print(topic)
    print("".join(bag.columns[x] + " " for x in lda2.components_[topic].argsort()[-10:]))
    a = lda2.components_[topic].copy()
    a.sort()
    print(a[-10:])
    print()

0
trafficking justice enforcement military capacity support law training security tb 
[ 313.79577709  346.04967062  407.05514796  469.11097309  543.60825131
  556.11200858  593.09295698  653.62543051  732.55903422 1188.2546918 ]

1
business increase growth youth opportunities women sector private economic activity 
[ 414.86057395  436.04150845  453.20186682  465.09817276  478.51173684
  610.77810546  864.4025696   879.28158434 1109.74471246 1567.23415183]

2
provide funding staff programs activities assistance technical funds program support 
[ 403.93310777  451.62627467  494.28046981  509.86110816  528.17535499
  647.09357572  647.10698486  659.10053509 1178.23899889 1587.54356381]

3
regional resilience activities security global support research countries climate food 
[374.96335421 388.1110431  393.40169842 413.73153888 420.91446171
 453.85012134 521.74630797 553.39418914 705.84628969 706.82977817]

4
support resources access sustainable local improve management services water acti

  and should_run_async(code)
