In [17]:
import time

import requests
import re
import json
import os
import string
import hashlib

import bs4
from bs4 import BeautifulSoup
import tqdm 

# Retrieve All Embassy Website URLs

We first retrieve all the urls to each embassy's website.

In [2]:
us_embassy_main_url = 'https://www.usembassy.gov/post-sitemap.xml'
us_embassy_main_request = requests.get(us_embassy_main_url)
us_embassy_main_request

<Response [200]>

In [3]:
us_embassy_main_html = us_embassy_main_request.content
us_embassy_main_soup = BeautifulSoup(us_embassy_main_html, 'lxml')

In [4]:
embassy_url_list = [loc.string for loc in us_embassy_main_soup.find_all('loc')]
embassy_url_list[:5]

['https://www.usembassy.gov/south-africa/',
 'https://www.usembassy.gov/indonesia/',
 'https://www.usembassy.gov/malaysia/',
 'https://www.usembassy.gov/laos/',
 'https://www.usembassy.gov/marshall-islands/']

In [95]:
def embassy_url_prefix(x):
    a = re.match("(http|https)://(?!www).*\.(usmission|usembassy|usconsulate).*", x['href'])
    return a.group() if a else None

In [52]:
embassy_url_map = {}
for embassy in embassy_url_list:
    country_name = embassy.split('/')[-2]
    print(embassy.split('/')[-2])
    embassy_request = requests.get(embassy)
    embassy_html = embassy_request.content
    embassy_soup = BeautifulSoup(embassy_html, 'lxml')
    # cities = embassy_soup.find(class_='cityname1')
    # country_website = '/'.join(cities.a['href'].split('/')[:3])
    country_website_list = list(
        filter(
            None,
            list(
                map(
                    embassy_url_prefix,
                    embassy_soup.find_all('a')
                )
            )
        )
    )
    if len(country_website_list)>0:
        country_website = country_website_list[0]
        country_website = '/'.join(country_website.split('/')[:3])
        print(country_website)
        embassy_url_map[country_name] = country_website
    time.sleep(2)

south-africa
https://za.usembassy.gov
indonesia
https://id.usembassy.gov
malaysia
https://my.usembassy.gov
laos
https://la.usembassy.gov
marshall-islands
https://mh.usembassy.gov
samoa
https://ws.usembassy.gov
thailand
https://th.usembassy.gov
vietnam
https://vn.usembassy.gov
austria
https://at.usembassy.gov
belarus
https://by.usembassy.gov
czech-republic
https://cz.usembassy.gov
denmark
https://dk.usembassy.gov
france
https://fr.usembassy.gov
ireland
https://ie.usembassy.gov
italy
https://it.usembassy.gov
latvia
https://lv.usembassy.gov
malta
https://mt.usembassy.gov
poland
https://pl.usembassy.gov
romania
https://ro.usembassy.gov
sweden
https://se.usembassy.gov
united-kingdom
https://uk.usembassy.gov
jordan
https://jo.usembassy.gov
saudi-arabia
https://sa.usembassy.gov
bangladesh
https://bd.usembassy.gov
kazakhstan
https://kz.usembassy.gov
pakistan
https://pk.usembassy.gov
sri-lanka
https://lk.usembassy.gov
argentina
https://ar.usembassy.gov
chile
https://cl.usembassy.gov
ecuador
htt

## Missing URLs

The `embassy_url_prefix` function is not exhaustive. There are a few remaining embassies left to categorize, which we do manually.

In [86]:
def embassy_country(embassy_url):
    country_name = embassy_url.split('/')[-2]
    return country_name
missing_embassies = []
# set(map(d, embassy_url_list)) - set(embassy_url_map.keys())
for i, country in list(enumerate(embassy_url_list)):
    if embassy_country(country) not in set(embassy_url_map.keys()):
        missing_embassies.append(country)
missing_embassies

['https://www.usembassy.gov/hong-kong/',
 'https://www.usembassy.gov/guatemala/',
 'https://www.usembassy.gov/bermuda/',
 'https://www.usembassy.gov/curacao/',
 'https://www.usembassy.gov/usau/',
 'https://www.usembassy.gov/taiwan-2/']

In [97]:
for url in missing_embassies:
    country_name = d(url)
    url_request = requests.get(url)
    url_html = url_request.content
    url_soup = BeautifulSoup(url_html, 'lxml')
    url_links = list(
        filter(
            None,
            list(
                map(
                    embassy_url_prefix,
                    url_soup.find_all('a')
                )
            )
        )
    )
    print(country_name)
    if len(url_links)>0:
        country_website = url_links[0]
        country_website = '/'.join(country_website.split('/')[:3])
        print(country_website)
        embassy_url_map[country_name] = country_website
    time.sleep(2)

hong-kong
https://hk.usconsulate.gov
guatemala
http://gt.usembassy.gov
bermuda
https://bm.usconsulate.gov
curacao
https://cw.usconsulate.gov
usau
taiwan-2


In [98]:
set(map(d, embassy_url_list)) - set(embassy_url_map.keys())

{'taiwan-2', 'usau'}

In [99]:
embassy_url_map['usau'] = 'https://www.usau.usmission.gov/'
embassy_url_map['taiwan-2'] = 'https://www.ait.org.tw'

## Save/Load your Work

In [101]:
with open("embassy_url_map.json", 'w') as f:
    json.dump(embassy_url_map, f, indent=6)

In [7]:
with open(os.path.join(os.getcwd(), 'data', "embassy_url_map.json"), 'r') as f:
    embassy_url_map = json.load(f)
list(embassy_url_map.items())[:5]

[('south-africa', 'https://za.usembassy.gov'),
 ('indonesia', 'https://id.usembassy.gov'),
 ('malaysia', 'https://my.usembassy.gov'),
 ('laos', 'https://la.usembassy.gov'),
 ('marshall-islands', 'https://mh.usembassy.gov')]

# Retrieve Posts from a Specific Country

## First, we need to get all the existing posts of an embassy's website

In [8]:
target_country = 'south-africa'
sitemap_url = f"{embassy_url_map[target_country]}/post-sitemap.xml"
sitemap_request = requests.get(sitemap_url)
sitemap_html = sitemap_request.content
sitemap_soup = BeautifulSoup(sitemap_html, 'lxml')
embassy_posts = [loc.string for loc in sitemap_soup.find_all('loc')]
embassy_posts

['https://za.usembassy.gov/pepfar-nike-foundation-and-gates-foundation-partnership-restores-hope-for-an-aids-free-future-for-girls/',
 'https://za.usembassy.gov/long-fight-with-tb-the-story-of-thabo-pelesane/',
 'https://za.usembassy.gov/the-power-of-partnership-extraordinary-progress-lessons-learned-and-great-hope-for-the-future-in-south-africa/',
 'https://za.usembassy.gov/pepfar-stories-of-survival-in-rural-mpumalanga-province/',
 'https://za.usembassy.gov/siyayinqoba-beat-it-conquers-fear-of-hiv-testing/',
 'https://za.usembassy.gov/desmond-tutu-blesses-panel-from-aids-quilt-returning-to-washington/',
 'https://za.usembassy.gov/thibela-tb-has-miners-covered/',
 'https://za.usembassy.gov/icap-leaves-proud-legacy-in-eastern-cape/',
 'https://za.usembassy.gov/pepfar-partner-egpafs-successful-project-transitions-to-south-african-organizations/',
 'https://za.usembassy.gov/sm_090415/',
 'https://za.usembassy.gov/sm_200815/',
 'https://za.usembassy.gov/updated-worldwide-caution/',
 'http

## Reading all posts of an embassy's website

In [19]:
data_path = os.path.join(os.getcwd(), 'data', target_country)
if not os.path.exists(data_path):
    os.makedirs(data_path)
for i, embassy_link in enumerate(tqdm.tqdm(embassy_posts)):
    r_post = requests.get(embassy_link)
    html_post = r_post.content
    soup_post = BeautifulSoup(html_post, 'lxml')
    
    post_content = ''
    
    post_title = soup_post.find(class_='mo-breadcrumbs').find('h1').string.strip()
    post_title = post_title.translate(str.maketrans('', '', string.punctuation))
    post_title_hash = hashlib.sha1(post_title.encode('utf-8')).hexdigest()
    
    f = open(
        os.path.join(
            data_path,
            post_title_hash,
        ),
        'w'
    )
    
    f.write(post_title + '\n') 
    for sibling in soup_post.find(class_='main').article.find(class_='entry-content').div.next_siblings:
        if type(sibling) is bs4.element.Tag:
            if sibling.name == 'p':
                text = f"{sibling.text if sibling.text else ''} "
                if sibling.attrs:
                    if 'class' in sibling.attrs:
                        if 'byline' not in sibling.attrs['class']:
                            f.write(text)
                else:
                    f.write(text)
    # print(post_content+"\n")
    f.close()
    time.sleep(5)


  0%|          | 0/950 [00:00<?, ?it/s][A
  0%|          | 1/950 [00:05<1:23:28,  5.28s/it][A
  0%|          | 2/950 [00:14<1:58:37,  7.51s/it][A
  0%|          | 3/950 [00:23<2:09:10,  8.18s/it][A
  0%|          | 4/950 [00:31<2:11:39,  8.35s/it][A
  1%|          | 5/950 [00:40<2:14:38,  8.55s/it][A
  1%|          | 6/950 [00:50<2:20:18,  8.92s/it][A
  1%|          | 7/950 [00:59<2:18:24,  8.81s/it][A
  1%|          | 8/950 [01:07<2:16:42,  8.71s/it][A
  1%|          | 9/950 [01:12<1:59:55,  7.65s/it][A
  1%|          | 10/950 [01:19<1:53:10,  7.22s/it][A
  1%|          | 11/950 [01:24<1:45:11,  6.72s/it][A
  1%|▏         | 12/950 [01:30<1:40:41,  6.44s/it][A
  1%|▏         | 13/950 [01:35<1:35:32,  6.12s/it][A
  1%|▏         | 14/950 [01:41<1:31:28,  5.86s/it][A
  2%|▏         | 15/950 [01:50<1:47:39,  6.91s/it][A
  2%|▏         | 16/950 [01:59<1:56:57,  7.51s/it][A
  2%|▏         | 17/950 [02:08<2:02:58,  7.91s/it][A
  2%|▏         | 18/950 [02:17<2:09:08,  8.31s/

 16%|█▌        | 151/950 [20:43<2:07:22,  9.57s/it][A
 16%|█▌        | 152/950 [20:53<2:07:01,  9.55s/it][A
 16%|█▌        | 153/950 [21:02<2:05:28,  9.45s/it][A
 16%|█▌        | 154/950 [21:11<2:01:58,  9.19s/it][A
 16%|█▋        | 155/950 [21:19<2:00:52,  9.12s/it][A
 16%|█▋        | 156/950 [21:29<2:01:33,  9.19s/it][A
 17%|█▋        | 157/950 [21:41<2:12:47, 10.05s/it][A
 17%|█▋        | 158/950 [21:52<2:18:09, 10.47s/it][A
 17%|█▋        | 159/950 [21:59<2:01:14,  9.20s/it][A
 17%|█▋        | 160/950 [22:14<2:24:59, 11.01s/it][A
 17%|█▋        | 161/950 [22:27<2:33:29, 11.67s/it][A
 17%|█▋        | 162/950 [22:38<2:31:36, 11.54s/it][A
 17%|█▋        | 163/950 [22:49<2:26:22, 11.16s/it][A
 17%|█▋        | 164/950 [23:00<2:25:48, 11.13s/it][A
 17%|█▋        | 165/950 [23:06<2:06:54,  9.70s/it][A
 17%|█▋        | 166/950 [23:12<1:53:47,  8.71s/it][A
 18%|█▊        | 167/950 [23:19<1:45:02,  8.05s/it][A
 18%|█▊        | 168/950 [23:25<1:39:00,  7.60s/it][A
 18%|█▊   

 31%|███▏      | 299/950 [39:18<1:33:15,  8.60s/it][A
 32%|███▏      | 300/950 [39:30<1:43:47,  9.58s/it][A
 32%|███▏      | 301/950 [39:39<1:43:16,  9.55s/it][A
 32%|███▏      | 302/950 [39:49<1:43:15,  9.56s/it][A
 32%|███▏      | 303/950 [39:58<1:42:21,  9.49s/it][A
 32%|███▏      | 304/950 [40:05<1:33:17,  8.66s/it][A
 32%|███▏      | 305/950 [40:14<1:34:48,  8.82s/it][A
 32%|███▏      | 306/950 [40:23<1:34:24,  8.80s/it][A
 32%|███▏      | 307/950 [40:32<1:33:13,  8.70s/it][A
 32%|███▏      | 308/950 [40:41<1:35:31,  8.93s/it][A
 33%|███▎      | 309/950 [40:49<1:33:52,  8.79s/it][A
 33%|███▎      | 310/950 [40:59<1:35:32,  8.96s/it][A
 33%|███▎      | 311/950 [41:08<1:36:39,  9.08s/it][A
 33%|███▎      | 312/950 [41:17<1:36:26,  9.07s/it][A
 33%|███▎      | 313/950 [41:27<1:39:42,  9.39s/it][A
 33%|███▎      | 314/950 [41:36<1:36:17,  9.08s/it][A
 33%|███▎      | 315/950 [41:44<1:33:32,  8.84s/it][A
 33%|███▎      | 316/950 [41:52<1:31:37,  8.67s/it][A
 33%|███▎ 

 47%|████▋     | 447/950 [1:01:46<1:08:28,  8.17s/it][A
 47%|████▋     | 448/950 [1:01:52<1:03:41,  7.61s/it][A
 47%|████▋     | 449/950 [1:02:00<1:05:02,  7.79s/it][A
 47%|████▋     | 450/950 [1:02:07<1:01:13,  7.35s/it][A
 47%|████▋     | 451/950 [1:02:16<1:06:56,  8.05s/it][A
 48%|████▊     | 452/950 [1:02:23<1:02:53,  7.58s/it][A
 48%|████▊     | 453/950 [1:02:38<1:20:48,  9.76s/it][A
 48%|████▊     | 454/950 [1:02:47<1:20:47,  9.77s/it][A
 48%|████▊     | 455/950 [1:02:56<1:18:50,  9.56s/it][A
 48%|████▊     | 456/950 [1:03:05<1:15:55,  9.22s/it][A
 48%|████▊     | 457/950 [1:03:14<1:16:09,  9.27s/it][A
 48%|████▊     | 458/950 [1:03:24<1:17:08,  9.41s/it][A
 48%|████▊     | 459/950 [1:03:32<1:14:38,  9.12s/it][A
 48%|████▊     | 460/950 [1:03:39<1:07:22,  8.25s/it][A
 49%|████▊     | 461/950 [1:03:48<1:10:38,  8.67s/it][A
 49%|████▊     | 462/950 [1:03:57<1:11:21,  8.77s/it][A
 49%|████▊     | 463/950 [1:04:04<1:06:15,  8.16s/it][A
 49%|████▉     | 464/950 [1:04:

 62%|██████▏   | 592/950 [1:22:23<54:04,  9.06s/it][A
 62%|██████▏   | 593/950 [1:22:31<53:04,  8.92s/it][A
 63%|██████▎   | 594/950 [1:22:40<52:23,  8.83s/it][A
 63%|██████▎   | 595/950 [1:22:49<53:25,  9.03s/it][A
 63%|██████▎   | 596/950 [1:22:59<54:07,  9.17s/it][A
 63%|██████▎   | 597/950 [1:23:08<53:33,  9.10s/it][A
 63%|██████▎   | 598/950 [1:23:17<53:38,  9.14s/it][A
 63%|██████▎   | 599/950 [1:23:28<56:13,  9.61s/it][A
 63%|██████▎   | 600/950 [1:23:38<56:27,  9.68s/it][A
 63%|██████▎   | 601/950 [1:23:47<56:03,  9.64s/it][A
 63%|██████▎   | 602/950 [1:23:56<54:22,  9.37s/it][A
 63%|██████▎   | 603/950 [1:24:05<54:12,  9.37s/it][A
 64%|██████▎   | 604/950 [1:24:15<53:54,  9.35s/it][A
 64%|██████▎   | 605/950 [1:24:25<54:56,  9.56s/it][A
 64%|██████▍   | 606/950 [1:24:35<55:26,  9.67s/it][A
 64%|██████▍   | 607/950 [1:24:44<55:04,  9.63s/it][A
 64%|██████▍   | 608/950 [1:24:53<54:01,  9.48s/it][A
 64%|██████▍   | 609/950 [1:25:03<53:24,  9.40s/it][A
 64%|█████

 78%|███████▊  | 740/950 [1:44:09<30:20,  8.67s/it][A
 78%|███████▊  | 741/950 [1:44:17<30:08,  8.65s/it][A
 78%|███████▊  | 742/950 [1:44:26<29:41,  8.56s/it][A
 78%|███████▊  | 743/950 [1:44:34<29:22,  8.51s/it][A
 78%|███████▊  | 744/950 [1:44:44<30:30,  8.88s/it][A
 78%|███████▊  | 745/950 [1:44:53<30:58,  9.06s/it][A
 79%|███████▊  | 746/950 [1:45:02<30:00,  8.82s/it][A
 79%|███████▊  | 747/950 [1:45:11<29:52,  8.83s/it][A
 79%|███████▊  | 748/950 [1:45:19<29:32,  8.77s/it][A
 79%|███████▉  | 749/950 [1:45:28<29:10,  8.71s/it][A
 79%|███████▉  | 750/950 [1:45:36<28:56,  8.68s/it][A
 79%|███████▉  | 751/950 [1:45:45<28:33,  8.61s/it][A
 79%|███████▉  | 752/950 [1:45:54<28:34,  8.66s/it][A
 79%|███████▉  | 753/950 [1:46:03<29:36,  9.02s/it][A
 79%|███████▉  | 754/950 [1:46:12<29:01,  8.89s/it][A
 79%|███████▉  | 755/950 [1:46:18<26:13,  8.07s/it][A
 80%|███████▉  | 756/950 [1:46:28<27:46,  8.59s/it][A
 80%|███████▉  | 757/950 [1:46:36<27:20,  8.50s/it][A
 80%|█████

 93%|█████████▎| 888/950 [2:06:03<09:18,  9.00s/it][A
 94%|█████████▎| 889/950 [2:06:11<09:00,  8.87s/it][A
 94%|█████████▎| 890/950 [2:06:21<09:06,  9.11s/it][A
 94%|█████████▍| 891/950 [2:06:31<09:19,  9.48s/it][A
 94%|█████████▍| 892/950 [2:06:42<09:39,  9.99s/it][A
 94%|█████████▍| 893/950 [2:06:52<09:22,  9.86s/it][A
 94%|█████████▍| 894/950 [2:07:01<08:53,  9.53s/it][A
 94%|█████████▍| 895/950 [2:07:09<08:22,  9.13s/it][A
 94%|█████████▍| 896/950 [2:07:19<08:28,  9.41s/it][A
 94%|█████████▍| 897/950 [2:07:28<08:19,  9.43s/it][A
 95%|█████████▍| 898/950 [2:07:35<07:23,  8.54s/it][A
 95%|█████████▍| 899/950 [2:07:43<07:13,  8.50s/it][A
 95%|█████████▍| 900/950 [2:07:50<06:35,  7.92s/it][A
 95%|█████████▍| 901/950 [2:07:59<06:49,  8.36s/it][A
 95%|█████████▍| 902/950 [2:08:09<06:56,  8.69s/it][A
 95%|█████████▌| 903/950 [2:08:14<05:59,  7.65s/it][A
 95%|█████████▌| 904/950 [2:08:20<05:30,  7.19s/it][A
 95%|█████████▌| 905/950 [2:08:30<05:59,  7.99s/it][A
 95%|█████

In [117]:
print(soup_post.find(class_='mo-breadcrumbs').find('h1').string.strip())
for sibling in soup_post.find(class_='main').article.find(class_='entry-content').div.next_siblings:
    if type(sibling) is bs4.element.Tag:
        if sibling.name == 'p':
            if sibling.attrs:
                if 'class' in sibling.attrs:
                    if 'byline' not in sibling.attrs['class']:
                        print(sibling.text if sibling.text else '')
            else:
                print(sibling.text if sibling.text else '')

Siyayinqoba: Beat It! Conquers Fear of HIV Testing
Siyayinqoba Beat It!, a South African magazine show produced by the Community Health Media Trust (CMT) discusses hard-hitting topics about people living with HIV.
Not content with using a mass media to reach out on a national level, CMT goes one step further: with funding from PEPFAR through the Centers for Disease Control and Prevention the team has rolled out a comprehensive social mobilization campaign using the catchy Siyayinqoba brand at all levels of media, starting at national and reaching out to local communities through thought-provoking radio talk shows, news articles and even taxis branded with HIV prevention messaging.
CMT’s new project team, based in rural Mthatha in the Eastern Cape, works with local NGOs and community platforms to create a communication cascade that ensures the flow of HIV prevention and health information to all corners of the communities in the predominantly isiXhosa province.
CMT works closely with TB

In [172]:
for s in soup_post.find("main").article.find(class_='entry-content').div.next_siblings:
    print(s.name, s.string, type(s))
    if s.name == 'p':
        if 'class' in s.attrs:
            print('Attributes:', s.attrs['class'])
            if 'byline' not in s.attrs['class']:
                print('Hello!', s.text)
        else:
            print('Hello!', s.text)
        print()

figure None <class 'bs4.element.Tag'>
None 
 <class 'bs4.element.NavigableString'>
p None <class 'bs4.element.Tag'>
Hello! Della Mae Live in Majuro! Embassy Majuro welcomed Nashville-based bluegrass band Della Mae with traditional Marshallese songs and dances performed by Youth to Youth in Health.  Band members spent the week holding music workshops with local youth around town.  Their visit will culminate Saturday night with a free all ages concert in Delap Park at 7 pm.  Della Mae’s visit to Majuro is part of the American Music Abroad program.

None 
 <class 'bs4.element.NavigableString'>
None  AddThis Advanced Settings above via filter on the_content  <class 'bs4.element.Comment'>
None  AddThis Advanced Settings below via filter on the_content  <class 'bs4.element.Comment'>
None  AddThis Advanced Settings generic via filter on the_content  <class 'bs4.element.Comment'>
None  AddThis Share Buttons above via filter on the_content  <class 'bs4.element.Comment'>
None  AddThis Share Butt

In [171]:
'byline' in soup_post.find("main").article.find(class_='entry-content').find(class_='byline').attrs['class']

True

In [162]:
soup_post.find("main").article.find(class_='entry-content').attrs

{'class': ['entry-content']}