In [1]:
import hashlib
import json
import os
import re
import string
import tarfile
import time

import bs4
import requests
import tqdm
from bs4 import BeautifulSoup

# Retrieve All Embassy Website URLs

We first retrieve the URL of each embassy's website.

In [2]:
# Sitemap of the central embassy portal; its <loc> entries are parsed in the next cells.
us_embassy_main_url = 'https://www.usembassy.gov/post-sitemap.xml'
# Without a timeout, requests waits indefinitely when the server never answers.
us_embassy_main_request = requests.get(us_embassy_main_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page further down.
us_embassy_main_request.raise_for_status()
us_embassy_main_request

<Response [200]>

In [3]:
# Parse the raw sitemap bytes with the lxml-backed parser.
us_embassy_main_html = us_embassy_main_request.content
us_embassy_main_soup = BeautifulSoup(
    markup=us_embassy_main_html,
    features='lxml',
)

In [4]:
# Collect the text of every <loc> element found in the sitemap, then preview five.
embassy_url_list = [
    loc_tag.string
    for loc_tag in us_embassy_main_soup.find_all('loc')
]
embassy_url_list[:5]

['https://www.usembassy.gov/south-africa/',
 'https://www.usembassy.gov/indonesia/',
 'https://www.usembassy.gov/malaysia/',
 'https://www.usembassy.gov/laos/',
 'https://www.usembassy.gov/marshall-islands/']

In [95]:
def embassy_url_prefix(x):
    """Return the link target of ``x`` when it looks like a country-specific embassy site.

    A link qualifies when it starts with ``http://`` or ``https://``, its host
    does not begin with ``www``, and the text ``.usmission``, ``.usembassy`` or
    ``.usconsulate`` appears somewhere after the scheme.

    Parameters
    ----------
    x : mapping-like
        Anything exposing ``.get('href')`` -- a BeautifulSoup ``<a>`` tag or a
        plain dict.

    Returns
    -------
    str or None
        The matched URL text, or ``None`` when the element has no usable
        ``href`` or the ``href`` does not match the pattern.
    """
    # Anchors without an href (e.g. named anchors) must not abort the whole scan.
    href = x.get('href')
    if not href:
        return None
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    a = re.match(r"(http|https)://(?!www).*\.(usmission|usembassy|usconsulate).*", href)
    return a.group() if a else None

In [52]:
embassy_url_map = {}
for embassy in embassy_url_list:
    # The country slug is the path component just before the trailing slash.
    country_name = embassy.split('/')[-2]
    print(country_name)

    embassy_request = requests.get(embassy)
    embassy_html = embassy_request.content
    embassy_soup = BeautifulSoup(embassy_html, 'lxml')

    # (Earlier approach, kept for reference: take the first link inside the
    #  element with class 'cityname1' and keep its scheme + host.)
    # Keep every anchor whose href is accepted by embassy_url_prefix.
    country_website_list = [
        matched_url
        for matched_url in map(embassy_url_prefix, embassy_soup.find_all('a'))
        if matched_url
    ]

    if country_website_list:
        # Reduce the first hit to "scheme://host".
        country_website = '/'.join(country_website_list[0].split('/')[:3])
        print(country_website)
        embassy_url_map[country_name] = country_website

    # Pause between requests.
    time.sleep(2)

south-africa
https://za.usembassy.gov
indonesia
https://id.usembassy.gov
malaysia
https://my.usembassy.gov
laos
https://la.usembassy.gov
marshall-islands
https://mh.usembassy.gov
samoa
https://ws.usembassy.gov
thailand
https://th.usembassy.gov
vietnam
https://vn.usembassy.gov
austria
https://at.usembassy.gov
belarus
https://by.usembassy.gov
czech-republic
https://cz.usembassy.gov
denmark
https://dk.usembassy.gov
france
https://fr.usembassy.gov
ireland
https://ie.usembassy.gov
italy
https://it.usembassy.gov
latvia
https://lv.usembassy.gov
malta
https://mt.usembassy.gov
poland
https://pl.usembassy.gov
romania
https://ro.usembassy.gov
sweden
https://se.usembassy.gov
united-kingdom
https://uk.usembassy.gov
jordan
https://jo.usembassy.gov
saudi-arabia
https://sa.usembassy.gov
bangladesh
https://bd.usembassy.gov
kazakhstan
https://kz.usembassy.gov
pakistan
https://pk.usembassy.gov
sri-lanka
https://lk.usembassy.gov
argentina
https://ar.usembassy.gov
chile
https://cl.usembassy.gov
ecuador
htt

## Missing URLs

The `embassy_url_prefix` function does not match every embassy's website. A few embassies remain uncategorized; we retry them below and fill in the rest manually.

In [86]:
def embassy_country(embassy_url):
    """Return the country slug, i.e. the last path component of ``embassy_url``.

    Trailing slashes are ignored, so ``.../laos/`` and ``.../laos`` both give
    ``'laos'``. (Indexing ``split('/')[-2]`` returned the host name for URLs
    without a trailing slash.)

    Parameters
    ----------
    embassy_url : str
        Landing-page URL such as ``'https://www.usembassy.gov/laos/'``.

    Returns
    -------
    str
        The text after the last ``/`` once trailing slashes are removed.
    """
    return embassy_url.rstrip('/').split('/')[-1]
# Landing pages whose country slug has no entry in embassy_url_map yet.
missing_embassies = [
    landing_url
    for landing_url in embassy_url_list
    if embassy_country(landing_url) not in embassy_url_map
]
missing_embassies

['https://www.usembassy.gov/hong-kong/',
 'https://www.usembassy.gov/guatemala/',
 'https://www.usembassy.gov/bermuda/',
 'https://www.usembassy.gov/curacao/',
 'https://www.usembassy.gov/usau/',
 'https://www.usembassy.gov/taiwan-2/']

In [97]:
# Second pass over the landing pages that produced no entry in embassy_url_map.
for url in missing_embassies:
    # Use the embassy_country helper: the name `d` that was called here is not
    # defined in the visible notebook, so this cell raised NameError.
    country_name = embassy_country(url)
    url_request = requests.get(url)
    url_html = url_request.content
    url_soup = BeautifulSoup(url_html, 'lxml')
    # Keep every anchor whose href is accepted by embassy_url_prefix.
    url_links = [
        matched_url
        for matched_url in map(embassy_url_prefix, url_soup.find_all('a'))
        if matched_url
    ]
    print(country_name)
    if url_links:
        # Reduce the first hit to "scheme://host".
        country_website = '/'.join(url_links[0].split('/')[:3])
        print(country_website)
        embassy_url_map[country_name] = country_website
    # Pause between requests.
    time.sleep(2)

hong-kong
https://hk.usconsulate.gov
guatemala
http://gt.usembassy.gov
bermuda
https://bm.usconsulate.gov
curacao
https://cw.usconsulate.gov
usau
taiwan-2


In [98]:
# Country slugs that still have no website; `embassy_country` replaces the
# undefined name `d` that was used here.
set(map(embassy_country, embassy_url_list)) - set(embassy_url_map.keys())

{'taiwan-2', 'usau'}

In [99]:
# Manual entries for the two posts the automatic scan could not resolve.
# No trailing slash, matching every other value in the map: the sitemap URL is
# later built as f"{base}/post-sitemap.xml", and a trailing slash here would
# yield ".gov//post-sitemap.xml".
embassy_url_map['usau'] = 'https://www.usau.usmission.gov'
embassy_url_map['taiwan-2'] = 'https://www.ait.org.tw'

## Save/Load your Work

In [101]:
# Save the mapping where the load cell reads it from: <cwd>/data/embassy_url_map.json.
# (Writing to the bare working directory left the load cell without a file to read.)
embassy_url_map_dir = os.path.join(os.getcwd(), 'data')
os.makedirs(embassy_url_map_dir, exist_ok=True)
with open(os.path.join(embassy_url_map_dir, "embassy_url_map.json"), 'w') as f:
    json.dump(embassy_url_map, f, indent=6)

In [2]:
# Reload the saved mapping from <cwd>/data and preview its first five entries.
embassy_url_map_path = os.path.join(os.getcwd(), 'data', "embassy_url_map.json")
with open(embassy_url_map_path) as f:
    embassy_url_map = json.loads(f.read())
list(embassy_url_map.items())[:5]

[('south-africa', 'https://za.usembassy.gov'),
 ('indonesia', 'https://id.usembassy.gov'),
 ('malaysia', 'https://my.usembassy.gov'),
 ('laos', 'https://la.usembassy.gov'),
 ('marshall-islands', 'https://mh.usembassy.gov')]

# Retrieve Posts from a Specific Country

## First, we need to get all the existing posts of an embassy's website

In [42]:
def make_tarfile(output_filename, source_dir):
    """Write ``source_dir`` into a gzip-compressed tar archive.

    The directory is stored under its own base name, so extracting the archive
    recreates a single top-level folder.

    Parameters
    ----------
    output_filename : str
        Path of the ``.tar.gz`` file to create (overwritten if it exists).
    source_dir : str
        Directory to archive; a trailing path separator is tolerated.
    """
    # normpath drops a trailing separator; otherwise basename() returns '' and
    # the files would be stored at the archive root without their folder.
    archive_root = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, 'w:gz') as tar_handle:
        tar_handle.add(source_dir, arcname=archive_root)

In [9]:
target_country = 'malaysia'
# Build the URL of the chosen country's post sitemap from its base website.
sitemap_url = "{}/post-sitemap.xml".format(embassy_url_map[target_country])
sitemap_request = requests.get(sitemap_url)
sitemap_html = sitemap_request.content
sitemap_soup = BeautifulSoup(sitemap_html, 'lxml')
# One entry per <loc> element found in the sitemap.
embassy_posts = [
    post_loc.string
    for post_loc in sitemap_soup.find_all('loc')
]
embassy_posts

['https://my.usembassy.gov/video-the-dialog-with-ambassador-joseph-yun/',
 'https://my.usembassy.gov/u-s-ambassadors-remarks-at-the-reception-to-celebrate-the-238th-anniversary-of-independence-of-the-united-states-of-america/',
 'https://my.usembassy.gov/condolence-statement-from-ambassador-joseph-y-yun-on-the-death-of-karpal-singh/',
 'https://my.usembassy.gov/condolence-statement-from-ambassador-joseph-y-yun-on-the-death-of-dr-irene-fernandez/',
 'https://my.usembassy.gov/u-s-government-signs-agreement-on-quick-response-for-disaster-relief-aid-in-the-region/',
 'https://my.usembassy.gov/state-dept-on-malaysias-sedition-act-amendments/',
 'https://my.usembassy.gov/u-s-condolence-message-on-malaysias-helicopter-crash/',
 'https://my.usembassy.gov/special-presidential-envoy-john-allen-travel-to-jordan-malaysia-singapore-and-australia/',
 'https://my.usembassy.gov/u-s-embassy-statement-on-the-conviction-of-anwar-ibrahim/',
 'https://my.usembassy.gov/u-s-government-to-provide-additional-r

In [10]:
# Size of the slice embassy_posts[20:], i.e. everything after the first 20 entries.
print("Number of posts", max(0, len(embassy_posts) - 20))

Number of posts 820


## Reading all posts of an embassy's website

In [11]:
# Scrape every post of the target country into <cwd>/data/<country>/, one file
# per post, named by the SHA-1 of the punctuation-stripped title.
data_path = os.path.join(os.getcwd(), 'data', target_country)
os.makedirs(data_path, exist_ok=True)

# NOTE(review): the first 20 sitemap entries are skipped; the reason is not
# recorded in this notebook -- confirm before changing the offset.
for embassy_link in tqdm.tqdm(embassy_posts[20:]):
    try:
        # A timeout keeps one unresponsive page from stalling the whole run.
        r_post = requests.get(embassy_link, timeout=30)
        html_post = r_post.content
        soup_post = BeautifulSoup(html_post, 'lxml')

        post_title = soup_post.find(class_='mo-breadcrumbs').find('h1').string.strip()
        post_title = post_title.translate(str.maketrans('', '', string.punctuation))
        post_title_hash = hashlib.sha1(post_title.encode('utf-8')).hexdigest()

        # Gather the text first so that a parsing failure cannot leave an
        # empty or half-written file behind.
        entry_content = soup_post.find(class_='main').article.find(class_='entry-content')
        paragraphs = []
        for sibling in entry_content.div.next_siblings:
            if not isinstance(sibling, bs4.element.Tag) or sibling.name != 'p':
                continue
            # Skip only byline paragraphs. Paragraphs carrying other attributes
            # (style, id, ...) but no class used to be dropped silently.
            if 'byline' in (sibling.get('class') or []):
                continue
            paragraphs.append(f"{sibling.text} ")

        # The context manager closes the file even if a write fails; an explicit
        # encoding avoids platform-dependent failures on non-ASCII text.
        with open(os.path.join(data_path, post_title_hash), 'w', encoding='utf-8') as f:
            f.write(post_title + '\n')
            f.writelines(paragraphs)
    except Exception as exc:
        # Best-effort scraping: report the failure and move on. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt the run.
        print("Failure to scrape ", embassy_link, repr(exc))
    # Pause after every attempt, successful or not, to stay polite to the server.
    time.sleep(5)


100%|██████████| 820/820 [1:57:06<00:00,  8.57s/it]  


In [46]:
# Archive the scraped South Africa posts into south-africa.tar.gz in the working directory.
make_tarfile(
    output_filename='south-africa.tar.gz',
    source_dir=os.path.join(os.getcwd(), 'data', 'south-africa'),
)

In [34]:
# Move the working directory two levels up and display the result.
os.chdir(os.path.join(os.getcwd(), os.pardir, os.pardir))
os.getcwd()

'/home/bryan/Projects/embassy-webscrape'

In [35]:
import tarfile

def tardir(path, tar_name):
    """Add ``path`` (recursively) to a new gzip-compressed tar file ``tar_name``.

    Entries are stored under ``path`` exactly as given, so passing a relative
    path yields relative names inside the archive.
    """
    with tarfile.open(tar_name, "w:gz") as tar_handle:
        tar_handle.add(path, recursive=True)


# Archive from inside data/ so the entries are stored relative to that folder.
main_directory = os.getcwd()
os.chdir(os.path.join(main_directory, 'data'))
try:
    print(os.getcwd())
    tardir('malaysia', 'malaysia.tar.gz')
finally:
    # Always return to the starting directory, even if archiving fails;
    # otherwise later cells would resolve relative paths against data/.
    os.chdir(main_directory)
os.getcwd()

/home/bryan/Projects/embassy-webscrape/data


'/home/bryan/Projects/embassy-webscrape'

In [37]:
# Display the folder that the scraping cell built as <cwd>/data/<target_country>.
data_path

'/home/bryan/Projects/embassy-webscrape/data/malaysia'

In [39]:
def make_tarfile(output_filename, source_dir):
    """Write ``source_dir`` into a gzip-compressed tar archive.

    The directory is stored under its own base name, so extracting the archive
    recreates a single top-level folder.

    Parameters
    ----------
    output_filename : str
        Path of the ``.tar.gz`` file to create (overwritten if it exists).
    source_dir : str
        Directory to archive; a trailing path separator is tolerated.
    """
    # normpath drops a trailing separator; otherwise basename() returns '' and
    # the files would be stored at the archive root without their folder.
    archive_root = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, 'w:gz') as tar_handle:
        tar_handle.add(source_dir, arcname=archive_root)
# Archive the folder held in data_path into malaysia.tar.gz in the working directory.
make_tarfile(
    output_filename='malaysia.tar.gz',
    source_dir=data_path,
)

In [48]:
# Folder <cwd>/data; the next cell lists its entries and archives them.
data_directory = os.path.join(os.getcwd(), 'data')
data_directory

'/home/bryan/Projects/embassy-webscrape/data'

In [60]:
# Archive every entry of the data folder except the saved URL map.
for entry_name in os.listdir(data_directory):
    if 'embassy_url_map' in entry_name:
        continue
    print(entry_name)
    make_tarfile(
        '{}.tar.gz'.format(entry_name),
        os.path.join(data_directory, entry_name),
    )

samoa
laos
thailand
marshall-islands
south-africa
malaysia
indonesia


In [54]:
import subprocess
# Shell out to the system `tar` to gzip data_path; the cell displays tar's exit status.
subprocess.call(
    args=['tar', '-czf', 'malaser.tar.gz', data_path],
)

0

In [3]:
from gcloud import storage
from oauth2client.service_account import ServiceAccountCredentials
import os

In [2]:
# Service-account credential fields; only the account type is filled in here.
credentials_dict = dict(type='service_account')

Package                  Version  
------------------------ ---------
appdirs                  1.4.4    
argon2-cffi              20.1.0   
async-generator          1.10     
attrs                    21.2.0   
backcall                 0.2.0    
beautifulsoup4           4.9.3    
black                    21.6b0   
bleach                   3.3.0    
bs4                      0.0.1    
certifi                  2021.5.30
cffi                     1.14.5   
chardet                  4.0.0    
click                    8.0.1    
debugpy                  1.3.0    
decorator                5.0.9    
defusedxml               0.7.1    
entrypoints              0.3      
gcloud                   0.18.3   
googleapis-common-protos 1.53.0   
httplib2                 0.19.1   
idna                     2.10     
ipykernel                6.0.0    
ipython                  7.25.0   
ipython-genutils         0.2.0    
ipywidgets               7.6.3    
jedi                     0.1

In [13]:
# Save the mapping where the load cell reads it from: <cwd>/data/embassy_url_map.json.
# (Writing to the bare working directory left the load cell without a file to read.)
embassy_url_map_dir = os.path.join(os.getcwd(), 'data')
os.makedirs(embassy_url_map_dir, exist_ok=True)
with open(os.path.join(embassy_url_map_dir, "embassy_url_map.json"), 'w') as f:
    json.dump(embassy_url_map, f, indent=6)

In [171]:
# Scratch check on whatever page soup_post currently holds: does the element
# with class 'byline' inside the entry content list 'byline' among its classes?
# NOTE(review): this looks up the <main> tag, while the scraping loop uses
# find(class_='main') -- confirm which selector is intended.
'byline' in soup_post.find("main").article.find(class_='entry-content').find(class_='byline').attrs['class']

True

In [162]:
# Scratch inspection: show the attribute dict of the entry-content container
# of whatever page soup_post currently holds.
soup_post.find("main").article.find(class_='entry-content').attrs

{'class': ['entry-content']}