In [1]:
import pywren, json, requests, time, bs4
import pandas as pd
from re import findall
import numpy as np

In [2]:
# Load the expert roster and keep only the columns used by the scrapers below.
roster_columns = ['expert_id', 'name', 'institution', 'href']
data = pd.read_csv('expert_twitters_complete.csv', encoding="ISO-8859-1")[roster_columns]

## American Enterprise Institute

In [3]:
def find_all_work(url):
    """Collect an AEI scholar's media-appearance facets and paginated works.

    Args:
        url: Search-results URL for the scholar. Later pages are requested by
            appending ``&wpsolr_page=<n>`` to it.

    Returns:
        ``(media_appearance, works)``. ``media_appearance`` is a list of
        ``(data-facet-count, data-facet-query[15:])`` tuples read from the
        ``search-facet periodical-name`` block; ``works`` is a list of
        ``(date_text, href)`` tuples gathered page by page until a page has
        no date spans. Returns ``(None, None)`` when the first page has no
        periodical facet block.
    """
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text)
    facet_block = soup.find('div', {'class': 'search-facet periodical-name'})
    if not facet_block:
        return None, None
    # [15:] drops the first 15 characters of the facet query
    # (presumably a fixed field prefix -- TODO confirm against the live markup).
    media_appearance = [(facet['data-facet-count'], facet['data-facet-query'][15:])
                        for facet in facet_block.findAll('li')]

    works = []
    page = 1
    while True:
        # Page 1 was already downloaded above; reuse that soup instead of
        # requesting the same URL a second time.
        if page > 1:
            response = requests.get(url + f'&wpsolr_page={page}')
            soup = bs4.BeautifulSoup(response.text)
        links = soup.findAll('a', {'class': 'news-thumbnail'})
        # Named `dates`, not `time`, so the imported `time` module is not shadowed.
        dates = [tag.text for tag in soup.findAll('span', {'class': 'primary-18'})]
        if not dates:
            break
        # zip() pairs dates and links by position and stops at the shorter list.
        works.extend(zip(dates, [link['href'] for link in links]))
        page += 1

    return media_appearance, works

def scrape_expert_aei(user_id):
    """Scrape one AEI scholar profile page.

    Args:
        user_id: Profile slug used in ``https://www.aei.org/profile/<user_id>/``.

    Returns:
        ``{user_id: expert}`` where ``expert`` has the keys ``title``, ``bio``,
        ``experience``, ``education``, ``media_appearance`` and ``works``.
        Returns ``{user_id: None}`` when the page has no ``<h5>`` element.
        ``media_appearance`` and ``works`` are ``None`` when the profile has
        no "View all" link (or ``find_all_work`` finds no facet block).

    Raises:
        AssertionError: if the first "View all" link is not a search-results URL.
    """
    expert = dict()
    href = f'https://www.aei.org/profile/{user_id}/'
    r = requests.get(href)
    soup = bs4.BeautifulSoup(r.text)

    # A page without an <h5> is treated as "no such profile".
    heading = soup.find('h5')
    if heading is None:
        return {user_id: None}
    expert['title'] = heading.text

    body = soup.find('div', {'class': 'minimize js-minimize wysiwyg'})
    texts = [node.text for node in body.findAll(['p', 'h2', 'li'])] if body else []

    # The biography is split on the literal 'Experience' and 'Education'
    # entries; if either is absent the whole text is kept as the bio.
    try:
        experience_at = texts.index('Experience')
        education_at = texts.index('Education')
    except ValueError:
        expert['bio'] = '\n'.join(texts).strip()
        expert['experience'] = ''
        expert['education'] = ''
    else:
        expert['bio'] = '\n'.join(texts[:experience_at]).strip()
        expert['experience'] = '\n'.join(texts[experience_at + 1:education_at]).strip()
        expert['education'] = '\n'.join(texts[education_at + 1:]).strip()

    work_links = [anchor['href'] for anchor in soup.findAll('a', {'class': 'cta'})
                  if "View all" in anchor.text]
    if not work_links:
        # Previously this indexed work_links[0] unconditionally and raised
        # IndexError, losing the profile data already collected above.
        media_appearance, works = None, None
    else:
        assert 'search-results' in work_links[0]
        # NOTE(review): the original also asserted `'type:event' != work_links[0]`,
        # which can never fail once the check above passes. It presumably meant
        # `'type:event' not in ...`; left unenforced so that no profile that
        # scraped successfully before starts failing -- confirm intent.
        media_appearance, works = find_all_work(work_links[0])
    expert['media_appearance'] = media_appearance
    expert['works'] = works

    return {user_id: expert}

In [4]:
# Scrape every AEI expert sequentially.
# The try/except wraps the scrape call itself: that is the statement that can
# raise IndexError/AttributeError on unexpected markup (dict.update cannot).
expert_data = dict()
failed_accounts = []  # ids whose scrape raised, kept for later inspection
tar_accounts = list(set(data[data['institution']=='aei']['expert_id']))
for num, account in enumerate(tar_accounts):
    print(num, end = '\r')
    try:
        add = scrape_expert_aei(account)
    except (IndexError, AttributeError):
        failed_accounts.append(account)
        continue
    if account == 'roger-bate':
        # Spot check on one known profile; tolerate a None profile or None works.
        profile = add.get(account) or {}
        print(account, len(profile.get('works') or []))
    expert_data.update(add)

roger-bate 1013
125

In [7]:
# Persist the AEI scrape; the context manager flushes and closes the file.
with open('draft_data/expert_aei_v2.json', 'w') as out_file:
    json.dump(expert_data, out_file)

## Brookings

In [37]:
def scrape_users_brookings(user_id):
    """Scrape one Brookings expert page.

    Args:
        user_id: Slug used in ``https://www.brookings.edu/experts/<user_id>/``.

    Returns:
        ``{user_id: expert}`` where ``expert`` holds ``title`` and ``bio``
        (``None`` when the element is missing), one key per ``<dt>`` label
        mapping to the list of ``<dd>`` texts that follow it, and ``articles``
        as returned by ``find_articles``.
    """
    r = requests.get(f'https://www.brookings.edu/experts/{user_id}/')
    expert = dict()
    soup = bs4.BeautifulSoup(r.text)
    try: expert['title'] = soup.find('h3', {'class': 'title'}).text
    except AttributeError: expert['title'] = None
    try: expert['bio'] = soup.find('div',
                              {'class': 'expert-intro-text post-body'}).text.strip()
    except AttributeError: expert['bio'] = None

    # The second-to-last path segment of the section-header link is used as
    # an alternative author slug. Fall back to user_id when the header, the
    # link, its href, or that path segment is missing (a missing <a> makes
    # `None['href']` raise TypeError, which was previously uncaught).
    try:
        alt_id = soup.find('div', {'class':
                                   'section-header'}).find('a')['href'].split('/')[-2]
    except (AttributeError, TypeError, KeyError, IndexError):
        alt_id = user_id

    info, key = dict(), ''
    for node in soup.findAll(['dt', 'dd']):
        if node.name == 'dt':
            key = node.text.strip()
            info[key] = []
        else:
            # setdefault keeps a <dd> that precedes any <dt> under the ''
            # key instead of raising KeyError.
            info.setdefault(key, []).append(node.text.strip())
    expert.update(info)
    expert['articles'] = find_articles(user_id, alt_id)
    return {user_id: expert}

def find_articles(user_id, alt_id):
    """List ``(href, date_text)`` pairs from a Brookings author archive.

    Pages are requested from ``paged=0`` upwards. If the very first page is
    a "Page not found" page, ``alt_id`` replaces ``user_id`` for that page
    and all later ones; if that also fails an empty list is returned.
    Paging stops at the first "Page not found" page or the first page that
    yields no linked titles.
    """
    def _load(author, page_no):
        # Download one archive page and report whether it is a 404 page.
        page_url = f'https://www.brookings.edu/author/{author}/?type=all&paged={page_no}'
        print(page_url)
        parsed = bs4.BeautifulSoup(requests.get(page_url).text)
        not_found = 'Page not found' in parsed.findAll('title')[0].text
        return parsed, not_found

    collected = []
    page_no = 0
    while True:
        soup, not_found = _load(user_id, page_no)
        if not_found:
            if page_no != 0:
                return collected
            # First page missing under the primary slug: switch to the alternative.
            user_id = alt_id
            soup, not_found = _load(alt_id, page_no)
            if not_found:
                return []
        headings = soup.findAll('h4', {'class': 'title'})
        stamps = soup.findAll('time')
        fresh = [(heading.find('a')['href'], stamp.text.strip())
                 for heading, stamp in zip(headings, stamps)
                 if heading.find('a')]
        if not fresh:
            break
        collected.extend(fresh)
        page_no += 1
        print(len(collected))
    return collected

In [39]:
# Unique Brookings expert ids; set() removes duplicates, so list order is arbitrary.
brookings_rows = data['institution'] == 'brookings'
tar_accounts = list(set(data.loc[brookings_rows, 'expert_id']))
#tar_accounts = list(set(tar_accounts)-expert_data.keys())
len(tar_accounts)

414

In [40]:
# Submit one pywren task per Brookings expert id and report how many were queued.
pwex = pywren.default_executor(job_max_runtime=500)
futures = pwex.map(scrape_users_brookings, tar_accounts)
submitted = len(futures)
print(submitted)

414


In [41]:
# Poll pywren until nothing is outstanding, rewriting a single status line.
while True:
    dones, not_dones = pywren.wait(futures, pywren.ANY_COMPLETED)
    print(len(dones), len(not_dones), end = '  \r')
    if not not_dones:
        break

414 0    

In [42]:
# Merge every finished task's {expert_id: record} dict; tasks whose result
# re-raises one of the listed errors are skipped.
expert_data = {}
for item in dones:
    try:
        expert_data.update(item.result())
    except (IndexError, AttributeError, NameError):
        continue
# Ids that were requested but produced no record.
missing_accounts = set(tar_accounts) - expert_data.keys()
list(missing_accounts)

[]

In [53]:
# Persist the Brookings scrape; the context manager flushes and closes the file.
with open('draft_data/expert_brookings_v2.json', 'w') as out_file:
    json.dump(expert_data, out_file)

## Heritage

In [47]:
base = 'https://www.heritage.org/'

def scrape_expert_heritage(expert_id):
    """Scrape a Heritage staff page plus its paginated article listing.

    Args:
        expert_id: Slug used in ``https://www.heritage.org/staff/<expert_id>``.

    Returns:
        ``{expert_id: expert}`` with ``title`` and ``bio`` (``''`` when the
        element is missing) and ``articles``, a list of ``(url, date_text)``
        tuples collected from ``?page=0`` upwards until a page yields none.
    """
    profile = bs4.BeautifulSoup(requests.get(f'https://www.heritage.org/staff/{expert_id}').text)
    title_node = profile.find('h2', {'class':'expert-bio-card__expert-title'})
    bio_node = profile.find('div', {'class': 'expert-bio__read-more-container'})
    expert = {
        'title': title_node.text.strip() if title_node else '',
        'bio': bio_node.text.strip() if bio_node else '',
    }

    articles = []
    page = 0
    while True:
        listing = bs4.BeautifulSoup(
            requests.get(f'https://www.heritage.org/staff/{expert_id}?page={page}').text)
        # NOTE(review): `base` ends with '/', so an href that starts with '/'
        # produces a '//' in the URL -- kept as-is; confirm whether intended.
        links = [base + anchor['href']
                 for anchor in listing.findAll('a', {'class': 'result-card__title'})]
        # [:-10] cuts the last ten characters off the date paragraph's text.
        dates = [stamp.text[:-10].strip()
                 for stamp in listing.findAll('p', {'class': 'result-card__date'})]
        batch = list(zip(links, dates))
        if not batch:
            break
        articles += batch
        print(page, end = '\r')
        page += 1
    expert['articles'] = articles

    return {expert_id: expert}

In [29]:
# Queue one pywren task per unique Heritage expert id.
heritage_rows = data['institution'] == 'heritage'
tar_accounts = list(set(data.loc[heritage_rows, 'expert_id']))
pwex = pywren.default_executor(job_max_runtime=500)
futures = pwex.map(scrape_expert_heritage, tar_accounts)
print(len(futures))

123


In [None]:
# Poll pywren until every Heritage task has finished, rewriting one status line.
while True:
    dones, not_dones = pywren.wait(futures, pywren.ANY_COMPLETED)
    print(len(dones), len(not_dones), end = '\r')
    if not not_dones:
        break

In [60]:
# Merge each finished task's {expert_id: record} dict; any task error propagates.
expert_data = {}
for item in dones:
    expert_data.update(item.result())

In [61]:
# Persist the Heritage scrape. The original line used a full-width comma
# (U+FF0C) between the arguments, which is a SyntaxError in Python.
with open('draft_data/expert_heritage_v2.json', 'w') as out_file:
    json.dump(expert_data, out_file)

## CAP

In [7]:
def scrape_expert_cap(expert_id):
    """Scrape one Center for American Progress person page.

    Args:
        expert_id: Slug used in ``https://www.americanprogress.org/person/<expert_id>/``.

    Returns:
        ``{expert_id: expert}`` with ``title`` and ``bio`` (``''`` when the
        element is missing) and ``articles``: a list of 3-item lists built
        from consecutive ``<td>`` cells of the ``display responsive`` table.
        A cell containing a link contributes the link's href, any other cell
        its text. ``articles`` is ``[]`` when the table is absent.
    """
    r = requests.get(f'https://www.americanprogress.org/person/{expert_id}/')
    soup = bs4.BeautifulSoup(r.text)

    expert = dict()
    try: expert['title'] = soup.find('div', {'class': 'col-md-12'}).text.strip()
    except AttributeError: expert['title'] = ''
    try: expert['bio'] = soup.find('div', {'class': 'bio-text'}).text.strip()
    except AttributeError: expert['bio'] = ''

    table = soup.find('table', {'class': 'display responsive'})
    if not table:
        expert['articles'] = []
        return {expert_id: expert}

    cells = [cell.find('a')['href'] if cell.find('a') else cell.text
             for cell in table.findAll('td')]
    # Group cells three per row in plain Python. np.reshape(..., (-1, 3))
    # raised ValueError whenever the cell count was not a multiple of 3,
    # which discarded the whole expert; a trailing incomplete row is now
    # dropped and the complete rows are kept.
    row_width = 3
    complete = len(cells) - len(cells) % row_width
    expert['articles'] = [cells[start:start + row_width]
                          for start in range(0, complete, row_width)]
    return {expert_id: expert}

In [8]:
# Queue one pywren task per unique CAP expert id.
cap_rows = data['institution'] == 'cap'
tar_accounts = list(set(data.loc[cap_rows, 'expert_id']))
pwex = pywren.default_executor(job_max_runtime=500)
futures = pwex.map(scrape_expert_cap, tar_accounts)
print(len(futures))

338


In [9]:
# Poll pywren until every CAP task has finished, rewriting one status line.
while True:
    dones, not_dones = pywren.wait(futures, pywren.ANY_COMPLETED)
    print(len(dones), len(not_dones), end = '\r')
    if not not_dones:
        break

338 048

In [23]:
# Merge each finished CAP task's {expert_id: record} dict.
# Failures are still skipped (best effort), but `except Exception` no longer
# swallows KeyboardInterrupt/SystemExit, and each error is recorded so that
# dropped experts can be inspected instead of vanishing silently.
expert_data = dict()
failed_results = []
for item in dones:
    try:
        expert_data.update(item.result())
    except Exception as err:
        failed_results.append(repr(err))

In [11]:
# Persist the CAP scrape; the context manager flushes and closes the file.
with open('draft_data/expert_cap_v2.json', 'w') as out_file:
    json.dump(expert_data, out_file)

## Data Merge and Check

In [13]:
#data = pd.read_csv('expert_info_v1.csv')
# Reload the expert roster, keeping the id, name, institution and link columns.
roster_columns = ['expert_id', 'name', 'institution', 'href']
data = pd.read_csv('expert_twitters_complete.csv', encoding="ISO-8859-1")[roster_columns]

In [14]:
# Load the four v2 scrape results. Each file is opened in a context manager
# so the handle is closed as soon as it has been parsed.
with open('draft_data/expert_brookings_v2.json') as in_file:
    brookings = json.load(in_file)
with open('draft_data/expert_heritage_v2.json') as in_file:
    heritage = json.load(in_file)
with open('draft_data/expert_aei_v2.json') as in_file:
    aei = json.load(in_file)
with open('draft_data/expert_cap_v2.json') as in_file:
    cap = json.load(in_file)

In [15]:
# Build one record per expert with the keys title / education / experience /
# articles, mapping each institution's scrape format onto that layout.
# scrape_expert_aei stores None for profiles it could not parse; such records
# are skipped here instead of raising TypeError on `None['title']`.
expert_dict = dict()
for dct in [brookings, heritage, aei, cap]:
    for expert in dct:
        if dct[expert] is None:
            continue
        expert_dict[expert] = dict()
        expert_dict[expert]['title'] = dct[expert]['title']

# CAP pages have a single bio text, used for both education and experience.
for expert in cap:
    expert_dict[expert]['education'] = cap[expert]['bio']
    expert_dict[expert]['experience'] = cap[expert]['bio']
    expert_dict[expert]['articles'] = cap[expert]['articles']

for expert in aei:
    if aei[expert] is None:
        continue
    expert_dict[expert]['education'] = aei[expert]['education']
    expert_dict[expert]['experience'] = aei[expert]['experience']
    if aei[expert]['works']:
        # AEI works are (date, href); swap so the href comes first like the others.
        expert_dict[expert]['articles'] = [(i[1], i[0]) for i in aei[expert]['works']]
    else:  expert_dict[expert]['articles'] = []

# NOTE(review): for Brookings, education/experience are lists (or '' when
# absent) while the other institutions store plain strings -- confirm that
# downstream consumers expect the mixed types.
for expert in brookings:
    if 'Education' in brookings[expert]:
        expert_dict[expert]['education'] = brookings[expert]['Education']
    else: expert_dict[expert]['education'] = ''
    experiences = []
    if 'Current Positions' in brookings[expert]: experiences.extend(brookings[expert]['Current Positions'])
    if 'Past Positions' in brookings[expert]: experiences.extend(brookings[expert]['Past Positions'])
    if brookings[expert]['bio']: experiences.append(brookings[expert]['bio'])
    expert_dict[expert]['experience'] = experiences
    expert_dict[expert]['articles'] = brookings[expert]['articles']

# Heritage pages have a single bio text, used for both education and experience.
for expert in heritage:
    expert_dict[expert]['education'] = heritage[expert]['bio']
    expert_dict[expert]['experience'] = heritage[expert]['bio']
    expert_dict[expert]['articles'] = heritage[expert]['articles']

In [16]:
# Load the earlier scrape results. This rebinds brookings / heritage / aei /
# cap, which until now held the v2 data. Each file is opened in a context
# manager so the handle is closed once parsed.
with open('draft_data/expert_brookings.json') as in_file:
    brookings = json.load(in_file)
with open('draft_data/expert_heritage.json') as in_file:
    heritage = json.load(in_file)
with open('draft_data/expert_aei.json') as in_file:
    aei = json.load(in_file)
with open('draft_data/expert_cap.json') as in_file:
    cap = json.load(in_file)

In [17]:
# Append the article lists from the most recently loaded scrape files to the
# records already in expert_dict. Every loop checks `expert in expert_dict`:
# the original CAP check was `expert in cap` (always true) and the other loops
# had no check, so an id present only in these files raised KeyError.
# Re-running this cell appends the same articles again.
for expert in cap:
    if expert in expert_dict:
        expert_dict[expert]['articles'].extend(cap[expert]['articles'])

for expert in aei:
    # An AEI record may be None (unparsed profile) or have no works.
    if expert in expert_dict and aei[expert] and aei[expert]['works']:
        # AEI works are (date, href); swap so the href comes first.
        expert_dict[expert]['articles'].extend([(i[1], i[0]) for i in aei[expert]['works']])

for expert in brookings:
    if expert in expert_dict:
        expert_dict[expert]['articles'].extend(brookings[expert]['articles'])

for expert in heritage:
    if expert in expert_dict:
        expert_dict[expert]['articles'].extend(heritage[expert]['articles'])

In [18]:
# Reduce every article entry to its first element (the href) and de-duplicate.
# Entries that are already plain strings are kept whole, so re-running this
# cell no longer truncates each href to its first character.
for expert in expert_dict:
    expert_dict[expert]['articles'] = list({
        entry if isinstance(entry, str) else entry[0]
        for entry in expert_dict[expert]['articles']
    })

In [19]:
# Persist the merged expert records; the context manager flushes and closes the file.
with open('expert_all.json', 'w') as out_file:
    json.dump(expert_dict, out_file)

## Write to Excel for Bio Coding

In [20]:
import pywren, json, requests, time, bs4
import pandas as pd
from re import findall
import numpy as np
# Reload the merged expert records and the roster (without the href column).
# The JSON file is opened in a context manager so the handle is closed.
with open('expert_all.json') as in_file:
    expert_dict = json.load(in_file)
data = pd.read_csv('expert_twitters_complete.csv', encoding = "ISO-8859-1")
data = data[['expert_id', 'name', 'institution']]

In [21]:
# Attach the scraped title / education / experience to each roster row
# (ids missing from expert_dict get ''), then export for manual bio coding.
for field in ('title', 'education', 'experience'):
    data[field] = data['expert_id'].apply(
        lambda expert_id, field=field:
            expert_dict[expert_id][field] if expert_id in expert_dict else '')
data.to_csv('expert_bio_complete.csv', index=False)

## Scrape Articles

In [22]:
# Every distinct article href across all experts.
unique_hrefs = {href
                for expert in expert_dict
                for href in expert_dict[expert]['articles']}
articles = list(unique_hrefs)
article_data = pd.DataFrame({'href': list(articles)})
len(articles)

75737

In [25]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from requests.exceptions import SSLError, MissingSchema, ConnectionError
# NLTK's English stop-word list, held as a set.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [26]:
def scrape_article(href, timeout=10, min_words=15):
    """Download one article and keep only its long paragraph lines.

    Args:
        href: Article URL.
        timeout: Seconds passed to ``requests.get`` (default 10, as before).
        min_words: A line is kept only when splitting it on single spaces
            yields more than this many pieces (default 15, as before).

    Returns:
        ``{href: text}`` where ``text`` is the kept lines, stripped and joined
        with newlines. ``{href: '-1'}`` when the request raises or the
        response object is falsy.
    """
    # `except Exception` keeps the best-effort behaviour for request errors
    # but, unlike the former bare `except:`, lets KeyboardInterrupt and
    # SystemExit propagate.
    try:
        r = requests.get(href, timeout=timeout)
    except Exception:
        return {href: '-1'}
    if not r:
        return {href: '-1'}
    soup = bs4.BeautifulSoup(r.text)
    raw = '\n'.join(paragraph.text for paragraph in soup.findAll('p'))
    kept = [line.strip() for line in raw.split('\n')
            if len(line.split(' ')) > min_words]
    return {href: '\n'.join(kept)}

In [27]:
# Load the cache of already-scraped articles (closing the file afterwards)
# and list the hrefs that still need scraping, ignoring the empty href.
with open('article_dct_full.json') as cache_file:
    article_dct = json.load(cache_file)
tar_articles = list(set(articles)-article_dct.keys()-{''})
len(tar_articles)

2

In [33]:
#article_dct = dict()
# Scrape the outstanding articles in batches, checkpointing after each batch.
folds = 3000          # hrefs submitted to pywren per batch
max_stragglers = 5    # stop waiting once at most this many tasks are unfinished
pwex = pywren.default_executor(job_max_runtime = 500)

# Step through tar_articles by batch start index. The previous
# range(int(len(tar_articles)/folds)) rounded down, so the final partial
# batch (or the only batch, when fewer than `folds` hrefs remain) was never
# submitted. Slicing clamps at the end of the list, so no bound check is
# needed (the old one also compared against `articles`, not `tar_articles`).
for start in range(0, len(tar_articles), folds):
    #if start < len(article_dct): continue
    print(start, end = '\r')
    tar_hrefs = tar_articles[start: start + folds]

    futures = pwex.map(scrape_article, tar_hrefs)
    dones, not_dones = pywren.wait(futures, pywren.ANY_COMPLETED)
    while len(not_dones) > max_stragglers:
        dones, not_dones = pywren.wait(futures, pywren.ANY_COMPLETED)

    for item in dones:
        try: article_dct.update(item.result())
        except (MissingSchema, SSLError, AttributeError, ConnectionError,
                IndexError): continue
    # Checkpoint after every batch; the context manager closes the file.
    with open('article_dct_full.json', 'w') as out_file:
        json.dump(article_dct, out_file)

12000

In [28]:
#tar_articles = set(tar_articles) - article_dct.keys()
# Submit every remaining href as its own pywren task, then poll until none
# is outstanding, rewriting a single status line after each poll.
pwex = pywren.default_executor(job_max_runtime=500)
futures = pwex.map(scrape_article, tar_articles)
while True:
    dones, not_dones = pywren.wait(futures, pywren.ANY_COMPLETED)
    print(len(dones), len(not_dones), end  = '\r')
    if not not_dones:
        break

2 0

In [29]:
# Fold the finished tasks into the article cache and write it back.
# Failures are still skipped (best effort), but `except Exception` no longer
# swallows KeyboardInterrupt/SystemExit, and each error is recorded.
failed_results = []
for item in dones:
    try:
        article_dct.update(item.result())
    except Exception as err:
        failed_results.append(repr(err))
# The context manager flushes and closes the cache file.
with open('article_dct_full.json', 'w') as out_file:
    json.dump(article_dct, out_file)