In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import requests 
import bs4 
import nltk.data
# Sentence tokenizer loaded from the NLTK resource path below; it is used
# later to split each line of article text into sentences.
tokenizer = nltk.data.load(resource_url='tokenizers/punkt/english.pickle')

## American Enterprise Institute

In [2]:
# Read the raw scrape and trim leading/trailing whitespace from every
# article body, so the exact-match comparisons in later cells work.
data_aei = pd.read_csv('raw_data/aei_full_text.csv')
data_aei = data_aei.assign(text=data_aei['text'].map(lambda body: body.strip()))

In [3]:
# Article bodies that consist solely of one of these boilerplate strings
# carry no usable text; rows whose (stripped) text equals one of them exactly
# are removed.
drop_words = ['This statement is available here as an Adobe PDF.',
              'This statement is available here\xa0as an Adobe PDF.',
              'Watch the latest video at foxnews.com',
              'Watch the latest video at video.foxnews.com',
              'Link.',
              '4-Block World.',
              'HT: Warren Smith',
              'Read More',
              'Read the PDF.',
              'The collected papers for this conference can be found here.\n Read the PDF.',
              'Watch the latest video at video.foxbusiness.com',
              'Read More\nTags:\nby Timothy P. Carney\nFormat: Hardcover\nISBN: 9780062797100\nPublished by: HarperCollins']
# .copy() detaches the filtered frame from the frame it was sliced from, so
# adding columns to it in later cells does not raise SettingWithCopyWarning.
data_aei = data_aei[~data_aei['text'].isin(drop_words)].copy()

In [4]:
def check_target(x):
    """Replace the Carpe Diem placeholder text with text scraped from the page.

    Parameters
    ----------
    x : sequence or pandas.Series
        A row whose first item is the article URL and whose second item is
        the article text.

    Returns
    -------
    The second item of ``x`` unchanged, unless it equals
    ``'Originally posted at Carpe Diem.'``. In that case the page at the URL
    is downloaded and the text of all its ``<em>`` elements is returned,
    joined with newlines.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (including when the timeout expires).
    """
    target = 'Originally posted at Carpe Diem.'
    # Unpack by position instead of x[0] / x[1]: integer keys on a
    # label-indexed Series are a deprecated positional fallback in pandas.
    url, text = list(x)[:2]
    if text != target:
        return text
    # A timeout keeps one unresponsive page from hanging the whole apply().
    r = requests.get(url, timeout=30)
    sp = bs4.BeautifulSoup(r.text, "html.parser")
    return '\n'.join(em.text for em in sp.find_all('em'))

# Row-wise pass over (url, text). check_target must be the definition from
# this cell, because the same name is redefined in later cells.
data_aei['text_cleaned'] = data_aei.loc[:, ['url', 'text']].apply(check_target, axis=1)

In [5]:
def check_target(x):
    """Fill in an empty article text with text scraped from the page's spans.

    Parameters
    ----------
    x : sequence or pandas.Series
        A row whose first item is the article URL and whose second item is
        the article text.

    Returns
    -------
    The second item of ``x`` unchanged, unless it is the empty string. In
    that case the page at the URL is downloaded and the non-empty ``<span>``
    texts are joined with newlines (newlines inside a span become spaces).
    If the joined text contains ``'Originally posted at'``, everything from
    that marker onward is cut off and the remainder is stripped.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (including when the timeout expires).
    """
    target = ''
    # Unpack by position instead of x[0] / x[1]: integer keys on a
    # label-indexed Series are a deprecated positional fallback in pandas.
    url, text = list(x)[:2]
    if text != target:
        return text
    # A timeout keeps one unresponsive page from hanging the whole apply().
    r = requests.get(url, timeout=30)
    spans = bs4.BeautifulSoup(r.text, "html.parser").find_all('span')
    txt = "\n".join(s.text.replace('\n', ' ') for s in spans if s.text != '')
    marker = txt.find('Originally posted at')
    if marker != -1:
        txt = txt[:marker].strip()
    # txt is already one string; joining it again would put a newline
    # between every single character.
    return txt

# Row-wise pass over (url, text_cleaned). check_target must be the definition
# from this cell, because the same name is redefined in other cells.
data_aei['text_cleaned'] = data_aei.loc[:, ['url', 'text_cleaned']].apply(check_target, axis=1)

In [6]:
def check_target(x):
    """Replace a bare author-bio text with paragraphs scraped from the page.

    Parameters
    ----------
    x : sequence or pandas.Series
        A row whose first item is the article URL and whose second item is
        the article text.

    Returns
    -------
    The second item of ``x`` unchanged, unless it equals the Mark Falcoff
    bio line. In that case the page at the URL is downloaded and the text of
    its ``<p class="MsoPlainText">`` elements is returned, joined with
    newlines. If one paragraph is exactly ``'Notes'``, that paragraph and
    everything after it are left out.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (including when the timeout expires).
    """
    target = 'Mark Falcoff is a resident scholar at the American Enterprise Institute.'
    # Unpack by position instead of x[0] / x[1]: integer keys on a
    # label-indexed Series are a deprecated positional fallback in pandas.
    url, text = list(x)[:2]
    if text != target:
        return text
    # A timeout keeps one unresponsive page from hanging the whole apply().
    r = requests.get(url, timeout=30)
    sp = bs4.BeautifulSoup(r.text, "html.parser")
    txt = [p.text for p in sp.find_all('p', {'class': "MsoPlainText"})]
    if 'Notes' in txt:
        txt = txt[:txt.index('Notes')]
    return '\n'.join(txt)

# Row-wise pass over (url, text_cleaned). check_target must be the definition
# from this cell, because the same name is redefined in other cells.
data_aei['text_cleaned'] = data_aei.loc[:, ['url', 'text_cleaned']].apply(check_target, axis=1)

In [7]:
def check_target(x):
    """Replace the 'What to Do' series editor's note with the scraped article body.

    Parameters
    ----------
    x : sequence or pandas.Series
        A row whose first item is the article URL and whose second item is
        the article text.

    Returns
    -------
    The second item of ``x`` unchanged, unless it equals the series-wide
    editor's note stored in ``target``. In that case the page at the URL is
    downloaded and the text of its ``<p>`` elements with class ``pagpag1``,
    ``pagpag2`` or ``pagpag3`` is returned, joined with newlines.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (including when the timeout expires).
    """
    target = 'Editor’s note: The next president is in for a rough welcome to the Oval Office given the list of immediate crises and slow-burning policy challenges, both foreign and domestic.\xa0What should Washington do? Why should the average American care? We’ve set out to clearly define US strategic interests and provide actionable policy solutions to help the new administration build a 2017 agenda that strengthens American leadership abroad while bolstering prosperity at home.\nWhat to Do: Policy Recommendations for 2017 is an ongoing project from AEI. Click here for access to the complete series, which addresses a wide range of issues from rebuilding America’s military to higher education reform to helping people find work.'
    # Unpack by position instead of x[0] / x[1]: integer keys on a
    # label-indexed Series are a deprecated positional fallback in pandas.
    url, text = list(x)[:2]
    if text != target:
        return text
    # A timeout keeps one unresponsive page from hanging the whole apply().
    r = requests.get(url, timeout=30)
    sp = bs4.BeautifulSoup(r.text, "html.parser")
    txt = [p.text for p in sp.find_all('p', {'class': ['pagpag1', 'pagpag2', 'pagpag3']})]
    return '\n'.join(txt)

# Row-wise pass over (url, text_cleaned). check_target must be the definition
# from this cell, because the same name is redefined in other cells.
data_aei['text_cleaned'] = data_aei.loc[:, ['url', 'text_cleaned']].apply(check_target, axis=1)

In [8]:
def check_target(x):
    """Replace a book-metadata-only text with paragraphs scraped from the page.

    Parameters
    ----------
    x : sequence or pandas.Series
        A row whose first item is the article URL and whose second item is
        the article text.

    Returns
    -------
    The second item of ``x`` unchanged, unless it equals the Dalibor Rohac
    book-metadata string stored in ``target``. In that case the page at the
    URL is downloaded and the text of its ``<p class="selectionShareable">``
    elements is returned, joined with newlines.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (including when the timeout expires).
    """
    target = 'Tags:\nby Dalibor Rohac\nFormat: Hardcover\nISBN: 9781442270640'
    # Unpack by position instead of x[0] / x[1]: integer keys on a
    # label-indexed Series are a deprecated positional fallback in pandas.
    url, text = list(x)[:2]
    if text != target:
        return text
    # A timeout keeps one unresponsive page from hanging the whole apply().
    r = requests.get(url, timeout=30)
    sp = bs4.BeautifulSoup(r.text, "html.parser")
    txt = [p.text for p in sp.find_all('p', {'class': 'selectionShareable'})]
    return '\n'.join(txt)

# Row-wise pass over (url, text_cleaned). check_target must be the definition
# from this cell, because the same name is also defined in earlier cells.
data_aei['text_cleaned'] = data_aei.loc[:, ['url', 'text_cleaned']].apply(check_target, axis=1)

In [9]:
# Keep only the first row for each distinct cleaned text.
data_aei = data_aei.drop_duplicates(subset=['text_cleaned'], keep='first')

### Sentence-Based Cleaning

In [10]:
def _split_into_sentences(text):
    """Return the sentences of ``text``, tokenizing each newline-separated line on its own."""
    sentences = []
    for line in text.split('\n'):
        sentences.extend(tokenizer.tokenize(line))
    return sentences


# One list of sentences per article.
data_aei['Sent'] = data_aei['text_cleaned'].apply(_split_into_sentences)

In [13]:
# Flatten the per-article sentence lists into one list. A comprehension is
# used instead of Series.sum(): summing lists concatenates them pairwise,
# re-copying the growing result for every row, and yields 0 rather than an
# empty list when there are no rows.
sent = [sentence for doc_sentences in data_aei['Sent'] for sentence in doc_sentences]
# Number of occurrences of each distinct sentence across all articles.
count = Counter(sent)

In [14]:
# Candidate boilerplate: sentences seen more than 5 times that contain at
# least one space (i.e. split into more than one piece on ' ').
lst = [(sentence, freq) for sentence, freq in count.items()
       if freq > 5 and ' ' in sentence]
lst[:10]

[('This title is currently out of print, but online booksellers sometimes have used copies available.',
  155),
 ('See links below.', 146),
 ('Read the full PDF.', 709),
 ('Buy the book.', 568),
 ('Published by: AEI Press', 61),
 ('* * *', 180),
 ('Download the PDF', 231),
 ('Download the PDF.', 11),
 ('Click here to view the full book as an Adobe PDF.', 43),
 ('Walter Berns is a resident scholar at AEI.', 14)]

In [19]:
# Pattern-based boilerplate, collected from every distinct sentence seen.
pdf = {s for s in count if 'PDF' in s}
# NOTE(review): the 'Scholar' test is case-sensitive, so a sentence such as
# '... is a resident scholar at AEI.' is not matched - confirm this is intended.
aei = {s for s in count if 'at AEI' in s and 'Scholar' in s}
sy = {s for s in count if 'School Year:' in s}

# Exact-match boilerplate first, then everything matched by the patterns above.
drop_sent = [
    'This title is currently out of print, but online booksellers sometimes have used copies available.',
    'See links below.',
    'Read more.',
    'Buy the book.',
    'Read the full book.',
    'I.',
    '* * *',
    'Published by: AEI Press',
    'All rights reserved.',
    *pdf,
    *aei,
    *sy,
]

In [20]:
# Membership is tested once per sentence, so look it up in a set rather than
# scanning the drop_sent list each time.
drop_lookup = set(drop_sent)
# Remove boilerplate sentences from each article ...
data_aei['cleaned_sent'] = data_aei['Sent'].apply(
    lambda sentences: [s for s in sentences if s not in drop_lookup])
# ... and rebuild the article text, one remaining sentence per line
# (this overwrites the original 'text' column).
data_aei['text'] = data_aei['cleaned_sent'].apply(lambda sentences: "\n".join(sentences))

In [26]:
# Keep only the columns that are written out, in this order.
keep_columns = ['url', 'text', 'author', 'date', 'section']
data_aei = data_aei[keep_columns]

In [28]:
# Write the cleaned articles to CSV without the index column.
data_aei.to_csv(path_or_buf='aei_cleaned_text.csv', index=False)