In [1]:
import ast
import re
#import urllib

import bs4
import pandas as pd
import requests
from requests.exceptions import MissingSchema

In [2]:
def getTextFromPage(soup):
    """Collect the text of a page from its class-less <p> elements.

    Every paragraph has leading/trailing newlines stripped; paragraphs that
    are at most one character long are dropped and remaining inner newlines
    are turned into spaces.  A final paragraph equal to 'Tags:' is removed.

    Returns the kept paragraphs joined by newlines, or "" when `soup` is
    falsy or no paragraph survives the filtering.
    """
    if not soup:
        return ""

    paragraphs = []
    for node in soup.findAll("p", {"class": None}):
        stripped = node.text.strip('\n')
        if len(stripped) > 1:
            paragraphs.append(stripped.replace('\n', ' '))

    if not paragraphs:
        return ""
    # The last paragraph is sometimes just the label of the tag list.
    if paragraphs[-1] == 'Tags:':
        del paragraphs[-1]
    return '\n'.join(paragraphs)

def getNameFromPage(soup):
    """Return the list of author names found on a page.

    Primary source: every <div class="post-author">, whose text is split on
    newlines and whose third line is taken as the name.  Blocks with fewer
    than three lines, or with an empty third line, are skipped.

    Fallback (used only when the primary source yields nothing): every <em>
    element whose text contains ' is'; the part before the first ' is' is
    taken as the name.

    Returns a list of strings, empty when nothing matches or when `soup`
    is falsy.
    """
    if not soup:
        return []

    authors = []
    for div in soup.findAll("div", {"class": 'post-author'}):
        fields = div.text.split('\n')
        # Guard the index: a block with fewer than three lines used to raise
        # IndexError and abort the whole extraction run.
        if len(fields) > 2 and len(fields[2]) != 0:
            authors.append(fields[2])
    if authors:
        return authors

    bylines = [em.text for em in soup.findAll("em")]
    return [line[:line.index(' is')] for line in bylines if " is" in line]

def getDateFromPage(soup):
    """Return the text of every <div class="post-date"> element, in page order."""
    dates = []
    for div in soup.findAll("div", {"class": 'post-date'}):
        dates.append(div.text)
    return dates

def getTagFromPage(soup):
    """Return the `content` value of every <meta property="article:tag"> element."""
    tag_metas = soup.findAll("meta", property="article:tag")
    return [meta['content'] for meta in tag_metas]

def getSectionFromPage(soup):
    """Return the `content` value of every <meta property="article:section"> element."""
    section_metas = soup.findAll("meta", property="article:section")
    return [meta['content'] for meta in section_metas]

## Get Target Links

In [4]:
def get_soups(url, timeout=30):
    """Fetch `url` and return every <a> element found inside its <body>.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``.  Without a timeout a stalled
        connection would block the crawl indefinitely.

    Returns
    -------
    The result of ``body.findAll('a')``, or an empty list when the request
    fails or the parsed document has no <body>.  In both failure cases the
    URL is printed so it can be retried by hand.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Treat a network failure like an unparsable page: report it and
        # move on instead of aborting a crawl of several thousand pages.
        print(url)
        return []

    body = bs4.BeautifulSoup(r.text, "html.parser").body
    if body is None:
        print(url)
        return []
    return body.findAll('a')

In [5]:
max_page = 4582 ## from https://www.aei.org/search-results
# NOTE(review): assumes `max_page` is the number of the LAST result page.
# range() excludes its stop value, so `+ 1` is needed for that page to be
# fetched as well — confirm against the site's pagination.
soups = []  # despite the name, this accumulates the <a> tags returned by get_soups
for i in range(1, max_page + 1):
    if i % 10 == 0: print(i, end = '\r')
    url = f'https://www.aei.org/search-results/?wpsolr_page={i}'
    soups.extend(get_soups(url))

4580

In [6]:
# URLs containing any of these path fragments are excluded from scraping.
not_wanted = ["/search-results/", "/multimedia/",
              "/events/", "/profile/", "/publishers/"]

# De-duplicate the link targets, dropping anchors without an href.  The set
# is sorted because iteration order of a set of strings changes from run to
# run, while the scraping section resumes by list position.
href = sorted({a.get('href') for a in soups if a.get('href')})
href = [link for link in href if "www.aei.org" in link]

cleaned_href = [link for link in href
                if not any(fragment in link for fragment in not_wanted)]

In [36]:
# One-off export of the collected links to 'url_aei.csv', the file the
# scraping section reads its URLs from.  Presumably left commented out so a
# re-run does not overwrite the saved list — uncomment after a fresh crawl.
#data = pd.DataFrame({'urls': cleaned_href})
#data.to_csv('url_aei.csv', index=False)

## Scrape Pages from URLs, Get Soup

In [3]:
# Load the saved URL list from disk and show how many links it holds.
data = pd.read_csv('url_aei.csv')
cleaned_href = data['urls'].tolist()
len(cleaned_href)

77521

In [7]:
# Create the accumulators only when they do not exist yet: a fresh kernel gets
# empty containers, while re-running this cell after an interruption keeps the
# pages already collected.  Reset them by hand before starting a new batch.
if 'soup_dict' not in globals(): soup_dict = dict()   # list position -> parsed page
if 'wrong_url' not in globals(): wrong_url = []        # URLs that could not be fetched
start = 75780
request_timeout = 30  # seconds; without it a stalled connection blocks forever

for ind, url in enumerate(cleaned_href):
    if ind < start: continue ## rerun when interrupted
    if ind % 10 == 0: print(ind, end = '\r')
    if (ind % 10000 == 0) and (ind != start): break ## save every 10000 pages
    try:
        # The query string is dropped before the page is requested.
        r = requests.get(url.split('?')[0], timeout=request_timeout)
    except requests.exceptions.RequestException:
        # RequestException is the base class of MissingSchema, so malformed
        # URLs are still recorded here, and timeouts / connection errors no
        # longer abort the batch.
        wrong_url.append(url)
        continue
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    soup_dict[ind] = soup

77520

In [8]:
# URLs the scraping loop could not fetch (collected in its `except` branch).
wrong_url

[]

## Extract Information from Soups

In [9]:
# One list per output column; all six are filled in step, one entry per page.
urls, texts, authors, date, tag, section = [], [], [], [], [], []
for idx, page in soup_dict.items():
    # The dictionary key is the page's position in `cleaned_href`.
    urls.append(cleaned_href[idx])
    texts.append(getTextFromPage(page))
    authors.append(getNameFromPage(page))
    date.append(getDateFromPage(page))
    tag.append(getTagFromPage(page))
    section.append(getSectionFromPage(page))

In [10]:
data = pd.DataFrame({'url': urls, 'text': texts, 'author': authors, 'date': date,
                     'tag': tag, 'section': section})
# Number of date strings found on each page (0 means no post-date block).
data['len'] = data['date'].map(len)

# Keep only pages that have both text and at least one date.  Filtering with
# a single mask and taking an explicit copy means the column assignment below
# writes to an independent frame instead of a slice of the unfiltered one
# (which is what triggers pandas' SettingWithCopyWarning).
has_content = (data['text'] != "") & (data['len'] != 0)
data = data.loc[has_content].copy()

# Replace the list of dates by its first element.
data['date'] = data['date'].map(lambda found: found[0])

In [11]:
# Report (rows, columns) of the filtered batch, then preview its first rows.
print(data.shape)
data.head()

(6812, 7)


Unnamed: 0,url,text,author,date,tag,section,len
0,https://www.aei.org/carpe-diem/map-of-the-gove...,The map above from the Freedom Center of Misso...,[Mark J. Perry],"August 6, 2011",[],[Carpe Diem],1
1,https://www.aei.org/carpe-diem/from-45-to-15-i...,Ten days ago there was a 45% chance of a gove...,[Mark J. Perry],"August 17, 2009",[],[Carpe Diem],1
2,https://www.aei.org/research-products/working-...,Abstract\nSince the 2009 Supervisory Capital A...,[Paul H. Kupiec],"April 2, 2019","[banking, financial stability]",[Economics],1
3,https://www.aei.org/carpe-diem/q-oil-speculato...,"Last summer, Sen. Bernie Sanders (I-VT), a mem...",[Mark J. Perry],"October 27, 2014",[oil],[Carpe Diem],1
5,https://www.aei.org/economics/october-jobs-rep...,The October jobs report contained lots of good...,[James Pethokoukis],"November 4, 2016","[jobs report, Labor force participation rate, ...",[Economics],1


In [12]:
# Select the columns to export (the helper column 'len' is left out) and
# write this batch to its CSV file without the index.
export_columns = ['url', 'text', 'author', 'date', 'tag', 'section']
data = data.loc[:, export_columns]
data.to_csv('aei_text_8.csv', index=False)

## Combine Datasets

In [26]:
import numpy as np
from collections import Counter
import math
from datetime import datetime

In [96]:
# Reset `data` to an empty frame before the per-batch CSV files are loaded
# in the next cell.
data = pd.DataFrame()

In [97]:
# Read every batch file first and concatenate once.  `DataFrame.append` was
# removed in pandas 2.0, and building the result only from the files read
# here makes the cell idempotent: re-running it no longer stacks the batches
# on top of whatever `data` already contained.
frames = []
for i in range(1, 9):
    file = f"aei_text_{i}.csv"
    print(file)
    frames.append(pd.read_csv(file))
# ignore_index=True renumbers the rows 0..n-1 across all batches.
data = pd.concat(frames, ignore_index=True, sort=False)

aei_text_1.csv
aei_text_2.csv
aei_text_3.csv
aei_text_4.csv
aei_text_5.csv
aei_text_6.csv
aei_text_7.csv
aei_text_8.csv


In [102]:
def _section_label(cell):
    """Turn a stringified list read back from CSV into a plain label.

    `cell` is expected to look like "['Economics']".  It is parsed with
    ast.literal_eval so that several elements are joined with ', ' and
    values written with double quotes lose their quotes too.  Non-string
    cells give "".  Text that is not a Python literal falls back to
    stripping brackets and single quotes from both ends.
    """
    if not isinstance(cell, str):
        return ""
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Already-cleaned or free-form text: keep the simple stripping.
        return cell.strip('[').strip(']').strip('\'')
    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(item) for item in parsed)
    return str(parsed)

sections = [_section_label(cell) for cell in data["section"]]
data["section"] = sections

In [103]:
def _to_iso_date(text):
    """Convert a date such as 'August 6, 2011' to '2011-08-06'."""
    return datetime.strptime(text, '%B %d, %Y').strftime('%Y-%m-%d')

data['date'] = data['date'].map(_to_iso_date)

In [107]:
def _author_names(cell):
    """Turn a stringified author list read back from CSV into 'A, B'.

    `cell` is expected to look like "['Mark J. Perry']".  It is parsed with
    ast.literal_eval, so names that were written with double quotes (which
    Python does for a name containing an apostrophe) come out without stray
    quotes, and mixed-quote lists are still split correctly.  Non-string
    cells give "".  Text that is not a Python literal falls back to the
    bracket/quote stripping and "', '" splitting.
    """
    if not isinstance(cell, str):
        return ""
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return ", ".join(cell.strip('[').strip(']').strip('\'').split("', '"))
    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(name) for name in parsed)
    return str(parsed)

authors = [_author_names(cell) for cell in data["author"]]
data['author'] = authors

In [113]:
# Order the rows by date, keep the final columns, and renumber the index.
final_columns = ['url', 'text', 'author', 'date', 'section']
data = data.sort_values('date')[final_columns].reset_index(drop=True)

In [118]:
# Write the combined, date-sorted dataset to disk without the index column.
data.to_csv('aei_full_text.csv', index=False)