In [1]:
import requests 
import bs4 
import pandas as pd
from collections import Counter
import numpy as np

## Get Target Links
The Hoover Institution website redirects all requests that specify a page number. Therefore I manually downloaded all pages from https://www.hoover.org/publications/policy-review to extract urls.

In [2]:
fpath = 'draft_data/policy_review/'
links = []
for i in range(1, 99):
    file = fpath + f"{i}.htm"
    soup = bs4.BeautifulSoup(open(file).read()).body.findAll('a')
    href = list(set([i.get('href') for i in soup]))
    links.extend([i for i in href if i and "/research/" in i])

In [8]:
link_count = Counter(links)
links = list(set([i for i in link_count if link_count[i] < 5]))
len(links)

886

## Get Soup

In [4]:
from requests.exceptions import MissingSchema

In [9]:
soup_dict = dict()
wrong_url = []
start = 0

for ind, url in enumerate(links):
    if ind < start: continue ## rerun when interrupted
    if ind % 10 == 0: print(ind, end = '\r')
    #if (ind % 5000 == 0) and (ind != start): break ## save every 10000 pages
    try: r = requests.get(url.split('?')[0])
    except MissingSchema:
        wrong_url.append(url)
        continue
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    soup_dict[ind] = soup

880

## Get Content

In [39]:
text = []
for ind in soup_dict:
    t = soup_dict[ind].body.findAll('p')
    t = [i.text for i in t]
    txt = []
    for i in t:
        if i == 'View the discussion thread.': break
        if len(i) == 0: continue
        if i == 'Donate now': txt = []
        else: txt.append(i)
    txt = "\n".join(txt)
    text.append(txt)

In [62]:
authors = []
for ind in soup_dict:
    
    try: t = soup_dict[ind].body.findAll('div', {'class': 
                                                 'field-name-field-research-authors'})[0]
    except IndexError: 
        authors.append('')
        continue
    t = set([i.text for i in t])
    t.remove('by ')
    authors.append('; '.join(t))

In [66]:
date = []
for ind in soup_dict:
    t = soup_dict[ind].body.findAll('span', {'class': 'date-display-single'})[0].text
    date.append(t)

In [71]:
data = pd.DataFrame({'url': links, 'text': text, 'author': authors, 'date': date})
data.head()

Unnamed: 0,url,text,author,date
0,https://www.hoover.org/research/sending-public...,The untold story of special education\nOne of ...,Jonathan Fox,"Friday, January 1, 1999"
1,https://www.hoover.org/research/uns-waste-frau...,"In 1992 and 1993, former U.S. Attorney General...",Adam Meyerson,"Saturday, April 1, 1995"
2,https://www.hoover.org/research/state-special-...,The former u.s. ambassador to the Court of St....,Robin Harris,"Saturday, June 1, 2002"
3,https://www.hoover.org/research/civic-renewal-...,"In two recent reports, elite opinion is divide...",Don Eberly,"Tuesday, September 1, 1998"
4,https://www.hoover.org/research/oblivious-voter,Thomas E. Patterson.The Vanishing Voter: Publi...,Benjamin Wallace-Wells,"Tuesday, April 1, 2003"


In [72]:
data.to_csv('policy_review_total_text.csv', index=False)