In [1]:
import pywren, json, requests, time, bs4
import pandas as pd
from re import findall
import numpy as np

In [2]:
# Load the expert roster and keep only the columns the scrapers below use.
roster_columns = ['expert_id', 'name', 'institution', 'href']
data = pd.read_csv('expert_info_v1.csv').loc[:, roster_columns]

## American Enterprise Institute

In [28]:
def find_all_work(url):
    """Collect the periodical facets and every listed work for a search URL.

    Parameters
    ----------
    url : str
        Search-results URL. ``&wpsolr_page=<n>`` is appended to it for
        paging, so it is expected to already carry a query string.

    Returns
    -------
    tuple
        ``(media_appearance, works)``.  ``media_appearance`` is a list of
        ``(data-facet-count, data-facet-query[15:])`` pairs taken from the
        ``search-facet periodical-name`` block; ``works`` is a list of
        ``(span.primary-18 text, a.news-thumbnail href)`` pairs gathered
        across all result pages.  ``(None, None)`` is returned when the
        facet block is missing from the first response.
    """
    landing = bs4.BeautifulSoup(requests.get(url).text)
    facet_box = landing.find('div', {'class': 'search-facet periodical-name'})
    if not facet_box:
        return None, None

    # The first 15 characters of the facet query are dropped; presumably a
    # fixed field prefix in front of the periodical name -- TODO confirm.
    media_appearance = [
        (entry['data-facet-count'], entry['data-facet-query'][15:])
        for entry in facet_box.findAll('li')
    ]

    works = []
    page_no = 1
    while True:
        page_url = url + f'&wpsolr_page={page_no}'
        page = bs4.BeautifulSoup(requests.get(page_url).text)
        links = page.findAll('a', {'class': 'news-thumbnail'})
        stamps = [span.text for span in page.findAll('span', {'class': 'primary-18'})]
        # A page without any primary-18 span marks the end of the results.
        if not stamps:
            break
        # zip() pairs stamps with links positionally and stops at the shorter list.
        works.extend(zip(stamps, (link['href'] for link in links)))
        page_no += 1

    return media_appearance, works

def scrape_expert_aei(user_id):
    """Scrape one AEI profile page plus the works linked from it.

    Parameters
    ----------
    user_id : str
        Profile slug, interpolated into ``https://www.aei.org/profile/<id>/``.

    Returns
    -------
    dict
        ``{user_id: expert}`` where ``expert`` has the keys ``title``,
        ``bio``, ``experience``, ``education``, ``media_appearance`` and
        ``works`` (the last two come straight from ``find_all_work``).

    Raises
    ------
    AttributeError
        If the page has no ``<h5>`` element to read the title from.
    AssertionError
        If no "View all" link points at a non-event search-results page.
    """
    expert = dict()
    href = f'https://www.aei.org/profile/{user_id}/'
    r = requests.get(href)
    soup = bs4.BeautifulSoup(r.text)

    expert['title'] = soup.find('h5').text
    raw = soup.find('div', {'class': 'minimize js-minimize wysiwyg'})
    if not raw:
        texts = []
    else:
        texts = [node.text for node in raw.findAll(['p', 'h2', 'li'])]

    # The profile text is split on the literal 'Experience' and 'Education'
    # entries.  If either one is missing, everything is kept as the bio and
    # the other two fields stay empty.
    try:
        experience_at = texts.index('Experience')
        education_at = texts.index('Education')
    except ValueError:
        bio, experience, education = texts, [], []
    else:
        bio = texts[:experience_at]
        experience = texts[experience_at + 1:education_at]
        education = texts[education_at + 1:]
    expert['bio'] = '\n'.join(bio).strip()
    expert['experience'] = '\n'.join(experience).strip()
    expert['education'] = '\n'.join(education).strip()

    view_all_links = [anchor['href'] for anchor in soup.findAll('a', {'class': 'cta'})
                      if "View all" in anchor.text]
    # The previous guard compared the whole URL with 'type:event' using !=,
    # which is always true, so event listings were never rejected.  Select
    # the first link that is a search-results page and not an event filter.
    work_links = [link for link in view_all_links
                  if 'search-results' in link and 'type:event' not in link]
    if not work_links:
        # Raised explicitly (instead of a bare assert statement) so the check
        # still runs under ``python -O`` while keeping the exception type.
        raise AssertionError(
            f'no non-event "View all" search-results link found for {user_id}')

    media_appearance, works = find_all_work(work_links[0])
    expert['media_appearance'] = media_appearance
    expert['works'] = works

    return {user_id: expert}

In [29]:
# Scrape every AEI expert into `expert_aei`.
# The accumulator is created only when it does not exist yet, so this cell
# works on a fresh kernel and can also be re-run to resume an interrupted
# scrape: ids already present are skipped (this replaces the hand-edited
# `num < 75` resume index).
if 'expert_aei' not in globals():
    expert_aei = dict()

aei_expert_ids = data[data['institution'] == 'aei']['expert_id']
for num, eid in enumerate(aei_expert_ids):
    if eid in expert_aei:
        continue
    print(num, end='\r')
    expert_aei.update(scrape_expert_aei(eid))

85

In [32]:
# Persist the AEI results; the context manager closes the file handle
# deterministically once the JSON has been written.
with open('expert_aei.json', 'w') as out_file:
    json.dump(expert_aei, out_file)

## Brookings

In [None]:
def scrape_users_brookings(user_id):
    """Scrape one Brookings expert page and that expert's article list.

    Parameters
    ----------
    user_id : str
        Expert slug, interpolated into
        ``https://www.brookings.edu/experts/<id>/``.

    Returns
    -------
    dict
        ``{user_id: expert}``.  ``expert`` holds ``title`` (``None`` when the
        ``h3.title`` element is missing), ``bio``, one list-valued entry per
        ``<dt>`` label found on the page, and ``articles`` from
        ``find_articles``.

    Raises
    ------
    AttributeError
        If the ``expert-intro-text post-body`` block is missing.
    """
    r = requests.get(f'https://www.brookings.edu/experts/{user_id}/')
    soup = bs4.BeautifulSoup(r.text)

    expert = dict()
    title_tag = soup.find('h3', {'class': 'title'})
    expert['title'] = title_tag.text if title_tag is not None else None
    # Deliberately unguarded: a page without the intro block raises
    # AttributeError, which the caller can use to skip the expert.
    expert['bio'] = soup.find('div',
                              {'class': 'expert-intro-text post-body'}).text.strip()

    # Walk the <dt>/<dd> elements in document order: each <dt> opens a new
    # label and the following <dd> values are appended under it.
    info, key = dict(), ''
    for tag in soup.findAll(['dt', 'dd']):
        if tag.name == 'dt':
            key = tag.text.strip()
            info[key] = []
        else:
            # setdefault keeps a <dd> that appears before any <dt> (stored
            # under the '' label) instead of raising KeyError.
            info.setdefault(key, []).append(tag.text.strip())
    # NOTE: labels are merged last, so a <dt> literally named 'title' or
    # 'bio' would replace the values set above.
    expert.update(info)
    expert['articles'] = find_articles(user_id)
    return {user_id: expert}

def find_articles(user_id):
    """Collect ``(href, date text)`` pairs from a Brookings author listing.

    Pages are requested as ``...&paged=0``, ``...&paged=1``, ... until a page
    whose ``<title>`` contains 'Page not found' or a page that yields no
    linked entries.

    Parameters
    ----------
    user_id : str
        Author slug used in ``https://www.brookings.edu/author/<id>/``.

    Returns
    -------
    list of tuple
        Unique ``(href, stripped <time> text)`` pairs in first-seen order.

    Raises
    ------
    IndexError
        If a response has no ``<title>`` element at all.
    """
    articles = []
    seen = set()
    page_no = 0
    while True:
        url = f'https://www.brookings.edu/author/{user_id}/?type=all&paged={page_no}'
        soup = bs4.BeautifulSoup(requests.get(url).text)
        if 'Page not found' in soup.findAll('title')[0].text:
            return articles

        # Headings and <time> elements are paired by position; headings
        # without a link are dropped together with their paired timestamp.
        rows = zip(soup.findAll('h4', {'class': 'title'}), soup.findAll('time'))
        new_added = [(heading.find('a')['href'], stamp.text.strip())
                     for heading, stamp in rows if heading.find('a')]
        if len(new_added) == 0:
            break

        # Paging starts at paged=0.  If the site serves the first page for
        # both paged=0 and paged=1 (usual WordPress behaviour -- not
        # verifiable from here), every first-page entry would be recorded
        # twice, so entries that were already collected are skipped.
        for entry in new_added:
            if entry not in seen:
                seen.add(entry)
                articles.append(entry)
        page_no += 1
        print(len(articles))
    return articles

In [None]:
# Map the Brookings scraper over the distinct Brookings expert ids via
# pywren and report how many futures were created.
is_brookings = data['institution'] == 'brookings'
tar_accounts = list(set(data.loc[is_brookings, 'expert_id']))
pwex = pywren.default_executor(job_max_runtime=500)
futures = pwex.map(scrape_users_brookings, tar_accounts)
print(len(futures))

In [None]:
# Single poll: split the futures into finished and pending and show both
# counts on one line.  NOTE(review): unlike the Heritage/CAP cells this does
# not loop, so re-run it until the second number is 0 before collecting.
wait_outcome = pywren.wait(futures, pywren.ANY_COMPLETED)
dones, not_dones = wait_outcome
print(f'{len(dones)} {len(not_dones)}', end='\r')

In [None]:
# Merge the {expert_id: profile} mapping of every finished future.
# Scrapes that failed with IndexError/AttributeError (missing page elements)
# are still skipped, but they are now counted and reported instead of
# disappearing silently.
expert_data = dict()
skipped_count = 0
for item in dones:
    try:
        expert_data.update(item.result())
    except (IndexError, AttributeError):
        skipped_count += 1
print(f'collected {len(expert_data)} experts, skipped {skipped_count} failed scrapes')

In [None]:
# Persist the Brookings results; the context manager closes the file handle
# deterministically once the JSON has been written.
with open('expert_brookings.json', 'w') as out_file:
    json.dump(expert_data, out_file)

## Heritage

In [47]:
# Site root against which article hrefs are resolved.
base = 'https://www.heritage.org/'

def scrape_expert_heritage(expert_id):
    """Scrape one Heritage staff page and all of its paginated result cards.

    Parameters
    ----------
    expert_id : str
        Staff slug, interpolated into ``https://www.heritage.org/staff/<id>``.

    Returns
    -------
    dict
        ``{expert_id: expert}`` with ``title`` and ``bio`` ('' when the
        corresponding element is missing) and ``articles``, a list of
        ``(absolute url, date text)`` pairs.
    """
    # Imported locally so the function carries its own dependency and the
    # notebook's import cell does not need to change.
    from urllib.parse import urljoin

    expert = dict()
    r = requests.get(f'https://www.heritage.org/staff/{expert_id}')
    soup = bs4.BeautifulSoup(r.text)

    title_tag = soup.find('h2', {'class':'expert-bio-card__expert-title'})
    expert['title'] = title_tag.text.strip() if title_tag else ''
    bio_tag = soup.find('div', {'class': 'expert-bio__read-more-container'})
    expert['bio'] = bio_tag.text.strip() if bio_tag else ''

    articles = []
    page_no = 0
    while True:
        url = f'https://www.heritage.org/staff/{expert_id}?page={page_no}'
        soup = bs4.BeautifulSoup(requests.get(url).text)
        # urljoin instead of string concatenation: a root-relative href no
        # longer produces 'heritage.org//...', and an already absolute href
        # is kept as is rather than being glued onto the site root.
        hrefs = [urljoin(base, card['href'])
                 for card in soup.findAll('a', {'class': 'result-card__title'})]
        # The last 10 characters of the date line are cut off; presumably a
        # fixed-width suffix after the date -- TODO confirm against the page.
        dates = [line.text[:-10].strip()
                 for line in soup.findAll('p', {'class': 'result-card__date'})]
        new_added = list(zip(hrefs, dates))
        # A page without any result card ends the pagination.
        if len(new_added) == 0:
            break
        articles.extend(new_added)
        print(page_no, end = '\r')
        page_no += 1
    expert['articles'] = articles

    return {expert_id: expert}

In [29]:
# Map the Heritage scraper over the distinct Heritage expert ids via pywren
# and report how many futures were created.
is_heritage = data['institution'] == 'heritage'
tar_accounts = list(set(data.loc[is_heritage, 'expert_id']))
pwex = pywren.default_executor(job_max_runtime=500)
futures = pwex.map(scrape_expert_heritage, tar_accounts)
print(len(futures))

123


In [None]:
# Poll until no future is pending, rewriting the done/pending counts in place.
while True:
    dones, not_dones = pywren.wait(futures, pywren.ANY_COMPLETED)
    print(len(dones), len(not_dones), end='\r')
    if len(not_dones) == 0:
        break

In [60]:
# Merge the {expert_id: profile} mapping of every finished future; any
# exception raised inside a scrape propagates from .result() here.
expert_data = {}
for finished in dones:
    expert_data.update(finished.result())

In [61]:
# Persist the Heritage results; the context manager closes the file handle
# deterministically once the JSON has been written.
with open('expert_heritage.json', 'w') as out_file:
    json.dump(expert_data, out_file)

## CAP

In [None]:
def scrape_expert_cap(expert_id):
    """Scrape one Center for American Progress person page.

    Parameters
    ----------
    expert_id : str
        Person slug, interpolated into
        ``https://www.americanprogress.org/person/<id>/``.

    Returns
    -------
    dict
        ``{expert_id: expert}`` with ``title`` and ``bio`` ('' when the
        element is missing) and ``articles``: the cells of the
        ``display responsive`` table grouped three per row, or ``[]`` when
        the page has no such table.

    Raises
    ------
    ValueError
        From ``np.reshape`` when the number of table cells is not a
        multiple of three.
    """
    page = bs4.BeautifulSoup(
        requests.get(f'https://www.americanprogress.org/person/{expert_id}/').text)

    def stripped_text(tag_name, css_class):
        # Text of the first matching element, or '' when there is none.
        try:
            return page.find(tag_name, {'class': css_class}).text.strip()
        except AttributeError:
            return ''

    expert = {}
    expert['title'] = stripped_text('div', 'col-md-12')
    expert['bio'] = stripped_text('div', 'bio-text')

    table = page.find('table', {'class': 'display responsive'})
    if not table:
        expert['articles'] = []
        return {expert_id: expert}

    # Each cell contributes its link target when it contains a link and its
    # plain text otherwise.
    cells = []
    for cell in table.findAll('td'):
        anchor = cell.find('a')
        cells.append(anchor['href'] if anchor else cell.text)

    # Regroup the flat cell list into rows of three values.
    expert['articles'] = np.reshape(cells, (-1, 3)).tolist()
    return {expert_id: expert}

In [None]:
# Map the CAP scraper over the distinct CAP expert ids via pywren and report
# how many futures were created.
is_cap = data['institution'] == 'cap'
tar_accounts = list(set(data.loc[is_cap, 'expert_id']))
pwex = pywren.default_executor(job_max_runtime=500)
futures = pwex.map(scrape_expert_cap, tar_accounts)
print(len(futures))

In [None]:
# Poll until no future is pending, rewriting the done/pending counts in place.
while True:
    dones, not_dones = pywren.wait(futures, pywren.ANY_COMPLETED)
    print(len(dones), len(not_dones), end='\r')
    if not len(not_dones):
        break

In [None]:
# Merge the {expert_id: profile} mapping of every finished future.
# Failed scrapes are still skipped (best effort), but only Exception
# subclasses are caught now, so KeyboardInterrupt/SystemExit are no longer
# swallowed, and the number of skipped results is reported.
expert_data = dict()
skipped_count = 0
for item in dones:
    try:
        expert_data.update(item.result())
    except Exception:
        skipped_count += 1
print(f'collected {len(expert_data)} experts, skipped {skipped_count} failed scrapes')

In [None]:
# Persist the CAP results; the context manager closes the file handle
# deterministically once the JSON has been written.
with open('expert_cap.json', 'w') as out_file:
    json.dump(expert_data, out_file)