## Scrape Think Tank Experts
* expert names
* expert links
* titles and short descriptions
* Twitter accounts

In [1]:
import requests, bs4, re, json
import pandas as pd

### American Enterprise Institute

In [2]:
def get_info(url):
    """Scrape a single AEI scholar profile page.

    Parameters
    ----------
    url : str
        Address of the profile page to download.

    Returns
    -------
    tuple of str
        ``(name, accounts, short_desc, des, tag)``:

        * ``name`` - stripped text of the first ``<h1>``.
        * ``accounts`` - text of the first ``<a class="twitter-handle">``.
        * ``short_desc`` - text of the first ``<h5>`` matched by the
          ``{'class': None}`` filter (presumably: an ``<h5>`` without a
          class attribute - TODO confirm against the page markup).
        * ``des`` - text of every ``<p>`` matched by the same filter,
          joined with newlines.
        * ``tag`` - the strings returned by ``find_tags``, joined with
          newlines.

        Any of ``name``, ``accounts`` and ``short_desc`` is ``''`` when the
        corresponding element is absent from the page.
    """
    # Without a timeout a stalled server would block the crawl forever.
    r = requests.get(url, timeout=30)
    soup = bs4.BeautifulSoup(r.text, "html.parser").body

    def first_text(tag_name, attrs=None):
        # Text of the first matching element, or '' when nothing matches.
        node = soup.find(tag_name, attrs or {})
        return node.text if node is not None else ''

    accounts = first_text('a', {'class': 'twitter-handle'})
    short_desc = first_text('h5', {'class': None})
    scp = soup.findAll('p', {'class': None})
    des = '\n'.join([i.text for i in scp])
    tag = '\n'.join(find_tags(soup))
    name = first_text('h1').strip()

    return name, accounts, short_desc, des, tag

def find_tags(sp):
    """Return the text of every ``<a>`` under ``sp`` whose href contains 'tags_str'.

    Anchors that have no ``href`` attribute are ignored. The result keeps the
    order in which ``sp.findAll('a')`` yields the anchors.
    """
    def _anchors_with_href():
        # Pair each anchor with its href, skipping anchors that lack one.
        for anchor in sp.findAll('a'):
            try:
                yield anchor, anchor['href']
            except KeyError:
                pass

    return [anchor.text
            for anchor, target in _anchors_with_href()
            if 'tags_str' in target]

In [3]:
# Download the AEI scholar directory and scrape every linked profile page.
url = 'https://www.aei.org/our-scholars/'
# Without a timeout a stalled server would block the notebook forever.
r = requests.get(url, timeout=30)
soup = bs4.BeautifulSoup(r.text, "html.parser").body
# Profile links are the hrefs of the <a class="news-thumbnail"> anchors.
# Anchors without an href are skipped, and dict.fromkeys drops repeated links
# (keeping first-seen order) so that each profile is requested only once.
staff_urls = list(dict.fromkeys(
    a['href']
    for a in soup.findAll('a', {'class': 'news-thumbnail'})
    if a.has_attr('href')
))
# One (name, accounts, short_desc, des, tag) tuple per profile URL.
results = [get_info(staff_url) for staff_url in staff_urls]

In [5]:
# Turn the scraped AEI tuples into a table, tidy it and save it as CSV.
fields = ('name', 'twitter', 'short_desc', 'desc', 'tag')
columns = {field: [row[pos] for row in results]
           for pos, field in enumerate(fields)}
columns['href'] = staff_urls
user_data = pd.DataFrame(columns)


def _trim(value):
    # Remove surrounding whitespace.
    return value.strip()


def _bare_handle(value):
    # Remove surrounding whitespace and any leading/trailing '@'.
    return value.strip().strip('@').strip()


user_data['name'] = user_data['name'].apply(_trim)
user_data['twitter'] = user_data['twitter'].apply(_bare_handle)
user_data = user_data.drop_duplicates()
user_data.to_csv('draft_data/aei_twitter.csv', index=False)

### Brookings

In [2]:
def extract_info(sp):
    """Collect the experts listed on one Brookings directory page.

    The page is scanned in document order for ``<h2>``/``<h3>``/``<a>``
    elements whose class is ``title``, ``name`` or ``twitter``. A ``name``
    element starts a new expert; the ``title`` and ``twitter`` elements that
    follow are attached to the most recent name.

    Parameters
    ----------
    sp : parsed page
        Object exposing ``findAll`` (the callers pass a BeautifulSoup tag).

    Returns
    -------
    dict
        ``{name: {'href': str, 'title': list of str, 'twitter': str}}``.
        ``twitter`` is ``''`` when no handle follows the name.
    """
    lst = sp.findAll(['h2', 'h3', 'a'], {'class': ['title', 'name', 'twitter']})
    name_info = dict()
    name = None
    for i in lst:
        kind = i['class'][0]
        if kind == 'name':
            name = i.text
            # Default the handle to '' here rather than in a trailing else
            # branch, so an unrelated element can never wipe a handle that
            # has already been captured.
            name_info[name] = {
                'href': i.find('a')['href'],
                'title': [],
                'twitter': '',
            }
        elif name is None:
            # A title/twitter element before the first name has no owner.
            continue
        elif kind == 'title':
            name_info[name]['title'].append(i.text)
        elif kind == 'twitter':
            # The handle is taken from the second line of the element text;
            # fall back to the only line when the text has no newline.
            parts = i.text.split('\n')
            handle = parts[1] if len(parts) > 1 else parts[0]
            name_info[name]['twitter'] = handle.strip().strip('@')

    return name_info

In [3]:
# Walk the paginated Brookings expert directory, starting at page 0, and
# merge the experts found on each page into total_data (keyed by name).
total_data = dict()
i = 0
while True:
    url = f'https://www.brookings.edu/experts/page/{i}?'
    # Without a timeout one stalled response would hang this open-ended loop.
    r = requests.get(url, timeout=30)
    soup = bs4.BeautifulSoup(r.text, "html.parser").body
    # The first page with no <h3> element (or no <body> at all) ends the crawl.
    if soup is None or not soup.findAll('h3'):
        break
    add = extract_info(soup)
    total_data.update(add)
    i += 1
    # Progress counter, overwritten in place on the same output line.
    print(i, end='\r')

15

In [17]:
# Flatten the Brookings dictionary into a table, tidy it and save it as CSV.
name = list(total_data)
records = list(total_data.values())
# Experts without a recorded handle get an empty string.
twitter = [record.get('twitter', '') for record in records]
# Several titles for one person are joined with ';'.
title = [';'.join(record['title']) for record in records]
links = [record['href'] for record in records]
user_data = pd.DataFrame({'name': name, 'twitter': twitter, 'title': title,
                          'href': links})
for column in ('name', 'twitter'):
    user_data[column] = user_data[column].apply(lambda text: text.strip())
user_data = user_data.drop_duplicates()
user_data.to_csv('draft_data/brookings_twitter.csv', index=False)

### Heritage

In [2]:
def gen_add(sp):
    """Build a table of the staff listed on one Heritage directory page.

    Parameters
    ----------
    sp : parsed page
        Object exposing ``findAll`` (the callers pass a BeautifulSoup tag).

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``twitter``, ``title`` and ``href``, one row per
        person. ``twitter`` is ``None`` for a person with no handle.
    """
    # Name and handle elements arrive interleaved in document order.
    raw = sp.findAll(['div', 'a'],
                     {"class": ['person-list-small__name',
                                'person-list-small__twitter-handle']})
    desc_raw = sp.findAll('p', {"class": ['person-list-small__title']})

    dat = [i.text for i in raw]
    names = []
    pos = 0
    # Walk the interleaved texts with an index instead of repeatedly removing
    # the head of the list.
    while pos < len(dat):
        following = dat[pos + 1] if pos + 1 < len(dat) else None
        # The following text is treated as another person's name (so the
        # current person has no handle) when it does not start with '@' and
        # contains a space. startswith also copes with empty text.
        if following is None or (not following.startswith('@')
                                 and len(following.split(' ')) > 1):
            names.append((dat[pos], None))
            pos += 1
        else:
            names.append((dat[pos], following))
            pos += 2

    # Read the links from the page that was passed in, not from whatever a
    # module-level ``soup`` variable happens to hold.
    href = [i.find('a')['href']
            for i in sp.findAll('div', {'class': 'person-list-small__name'})]

    add = pd.DataFrame({'name': [i[0] for i in names],
                        'twitter': [i[1] for i in names],
                        'title': [i.text for i in desc_raw],
                        'href': href})
    return add

In [3]:
# Crawl the Heritage staff directory: three sections, each split into one
# page per initial letter A-Z. `types` records the section of every row, in
# the same order as the rows of `user_data`.
types = []
frames = []

for section in ('leadership', 'experts', 'other'):
    for ch in range(ord('A'), ord('Z') + 1):
        # Progress indicator, overwritten in place on the same output line.
        print(chr(ch), end='\r')
        url = f'https://www.heritage.org/about-heritage/staff/{section}/{chr(ch)}'
        # Without a timeout one stalled response would hang the whole crawl.
        r = requests.get(url, timeout=30)
        soup = bs4.BeautifulSoup(r.text, "html.parser").body
        add = gen_add(soup)
        if len(add):
            frames.append(add)
        types.extend([section] * len(add))

# DataFrame.append was removed in pandas 2.0; collect the per-page frames and
# concatenate them once instead of growing the table page by page.
if frames:
    user_data = pd.concat(frames, ignore_index=True)
else:
    user_data = pd.DataFrame(columns=['name', 'twitter', 'title', 'href'])

Z

In [4]:
# Attach the section labels, tidy the text columns and save the table as CSV.
user_data['types'] = types
user_data['name'] = user_data['name'].apply(lambda x: x.strip())
# Strip whitespace before the '@' (as the AEI clean-up does) so a handle
# padded with whitespace still loses its '@'. Missing handles (None/NaN) are
# passed through unchanged.
user_data['twitter'] = user_data['twitter'].apply(
    lambda x: x.strip().strip('@').strip() if isinstance(x, str) else x)
user_data = user_data.drop_duplicates()
user_data.to_csv('draft_data/heritage_twitter.csv', index=False)

### Center for American Progress

In [2]:
# Download the Center for American Progress staff page and keep its <body>.
url = 'https://www.americanprogress.org/about/staff/'
# Without a timeout a stalled server would block the notebook forever.
r = requests.get(url, timeout=30)
soup = bs4.BeautifulSoup(r.text, "html.parser").body

In [3]:
# Seed the result lists from the first <ul class="stafflist"> on the page,
# labelling every entry in it as "leadership".
leaders = soup.findAll('ul', {'class': 'stafflist'})[0].findAll('li')
name, href, title = [], [], []
for entry in leaders:
    link = entry.find('a')
    name.append(link.text)
    href.append(link['href'])
    # The title is whatever follows the link text plus two more characters
    # (presumably a separator - TODO confirm against the page markup).
    title.append(entry.text[len(link.text) + 2:])
types = ["leadership" for _ in name]

In [4]:
# Pair the n-th <h2> heading with the n-th <ul class="stafflist"> and append
# that list's people, labelled with the heading text.
# NOTE(review): the index starts at 0, i.e. at the same list that was already
# collected as "leadership" - confirm whether that overlap is intended.
types_lst = [heading.text for heading in soup.findAll('h2')]
staff_lists = soup.findAll('ul', {'class': 'stafflist'})
for position, section in enumerate(types_lst):
    members = staff_lists[position].findAll('li')
    for member in members:
        link = member.find('a')
        name.append(link.text)
        href.append(link['href'])
        title.append(member.text[len(link.text) + 2:])
    types.extend(section for _ in members)

In [5]:
# Assemble the CAP table, normalise names and profile links, save as CSV.
def _profile_url(raw_href):
    # Rebuild the link from the third-from-last '/'-separated piece of the href.
    slug = raw_href.split('/')[-3]
    return f"https://www.americanprogress.org/person/{slug}/"


user_data = pd.DataFrame({'name': name, 'title': title,
                          'type': types, 'href': href})
user_data['name'] = user_data['name'].apply(lambda text: text.strip())
user_data['href'] = user_data['href'].apply(_profile_url)
user_data = user_data.drop_duplicates()
user_data.to_csv('draft_data/cap_twitter.csv', index=False)

### Merge

In [2]:
# Reload the four per-institution tables written by the sections above.
aei, brookings, heritage, cap = map(pd.read_csv, (
    'draft_data/aei_twitter.csv',
    'draft_data/brookings_twitter.csv',
    'draft_data/heritage_twitter.csv',
    'draft_data/cap_twitter.csv',
))

In [3]:
# Clean the AEI names, derive an expert id from each link, rewrite the CSV.
def _before_at(text):
    # Keep only what precedes the first '@', without surrounding whitespace.
    return text.split('@')[0].strip()


def _second_to_last_piece(link):
    # Second-to-last '/'-separated piece of the link.
    return link.split('/')[-2]


aei['name'] = aei['name'].map(_before_at)
aei['expert_id'] = aei['href'].map(_second_to_last_piece)
aei.to_csv('draft_data/aei_twitter.csv', index=False)

In [5]:
# Give the four tables matching column names, an expert_id taken from the
# profile link, and an institution label.
def _link_piece(position):
    # Build a function returning the '/'-separated piece at `position`.
    return lambda link: link.split('/')[position]


aei = aei[['expert_id', 'name', 'twitter', 'short_desc', 'href']].rename(
    columns={'short_desc': 'title'})
heritage = heritage.rename(columns={'types': 'type'})

# Heritage ids are the last piece of the link; the others the second-to-last.
brookings['expert_id'] = brookings['href'].apply(_link_piece(-2))
heritage['expert_id'] = heritage['href'].apply(_link_piece(-1))
cap['expert_id'] = cap['href'].apply(_link_piece(-2))

for frame, label in ((aei, 'aei'), (brookings, 'brookings'),
                     (heritage, 'heritage'), (cap, 'cap')):
    frame['institution'] = label

In [6]:
# Stack the four institution tables into one.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order and fills columns a table lacks (e.g. 'type') with NaN.
data = pd.concat([aei, brookings, heritage, cap])
data = data[['expert_id', 'name', 'twitter', 'institution', 'type', 'title', 'href']]

In [15]:
# Load the earlier analysis table from a machine-specific notebook folder.
address = '/Users/chenliang/Desktop/twitter_expert_sna/notebooks/'
pre_data_path = address + 'data_analysis.csv'
pre_data = pd.read_csv(pre_data_path)

In [16]:
# Prefer the handle recorded in the earlier analysis table whenever an expert
# with the same (cleaned) name appears there; otherwise keep the scraped one.
pre_data['name'] = pre_data['name'].apply(lambda x: x.split('@')[0].strip())
dct = dict(zip(pre_data['name'], pre_data['twitter']))
# Number of distinct handles before the override.
print(len(set(data['twitter'])))
# A direct name -> handle lookup replaces the row-wise apply, whose positional
# x[0] / x[1] access on a label-indexed row is deprecated in recent pandas.
data['twitter'] = [dct.get(person, handle)
                   for person, handle in zip(data['name'], data['twitter'])]
# Number of distinct handles after the override.
print(len(set(data['twitter'])))

342
433


In [18]:
# Write the merged expert table to expert_twitters.csv without the index column.
data.to_csv('expert_twitters.csv', index=False)