In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import requests 
import bs4 
import nltk.data
from datetime import datetime
# Load the English Punkt sentence tokenizer shipped as an NLTK resource; it is
# used further down to split each line of article text into sentences.
# NOTE(review): presumably requires the 'punkt' resource to be downloaded
# beforehand -- confirm for fresh environments.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [2]:
# Read the scraped articles from disk.
data = pd.read_csv('raw_data/heritage_full_text.csv')

# Trim surrounding whitespace from every article body. Any value that is not
# a string is replaced by the integer 0, the sentinel that the first
# check_target function compares against to decide whether to fetch the page.
data['text'] = [
    body.strip() if isinstance(body, str) else 0
    for body in data['text']
]

In [4]:
def check_target(x, timeout=30):
    """Return the article text for one row, scraping the page when it is missing.

    NOTE: a later cell defines another function with this same name, which
    replaces this one as soon as that cell has run.

    Parameters
    ----------
    x : sequence (typically a pandas.Series row)
        ``(url, text)`` in that order. ``text`` equal to ``0`` marks a row
        whose text still has to be fetched.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up. Without a
        timeout a single unresponsive server would stall the whole ``apply``.

    Returns
    -------
    The existing ``text`` when it differs from ``0``; otherwise a str made of
    the non-empty lines found in the page's class-less ``<div>`` elements,
    joined by newline characters.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds ``timeout``.
    """
    missing = 0
    # Unpack by position instead of using integer keys: rows produced by
    # DataFrame.apply are labelled with column names, and integer keys on
    # such a Series are deprecated in recent pandas versions.
    url, text = tuple(x)[:2]
    if text != missing:
        return text

    response = requests.get(url, timeout=timeout)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    lines = [
        line
        for div in soup.body.findAll('div', {'class': None})
        for line in div.text.strip().split('\n')
        if line != ''
    ]
    try:
        # Keep only what lies between the first 'Trade' line and the first
        # 'More on This Issue' line.
        lines = lines[(lines.index('Trade') + 1):lines.index('More on This Issue')]
    except ValueError:
        # At least one marker is absent: deliberately keep every line.
        pass
    return '\n'.join(lines)

# Row-wise pass over (url, text): check_target returns the text unchanged when
# it is present and issues an HTTP request when the text is the 0 sentinel,
# so this cell needs network access for those rows.
data['text_cleaned'] = data[['urls', 'text']].apply(check_target, axis = 1)

In [5]:
def check_target(x, timeout=30, min_lines=5):
    """Re-scrape an article whose cleaned text has too few lines.

    NOTE: this re-uses the name of a function defined in an earlier cell and
    replaces it once this cell has run.

    Parameters
    ----------
    x : sequence (typically a pandas.Series row)
        ``(url, text, line_count)`` in that order.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.
    min_lines : int, optional
        Rows with more than this many lines are returned untouched.

    Returns
    -------
    The existing ``text`` when ``line_count > min_lines``; otherwise a str
    made of the page's ``<p>`` paragraphs, trimmed as described below and
    joined by newline characters.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds ``timeout``.
    """
    # Unpack by position instead of using integer keys: rows produced by
    # DataFrame.apply are labelled with column names, and integer keys on
    # such a Series are deprecated in recent pandas versions.
    url, text, line_count = tuple(x)[:3]
    if line_count > min_lines:
        return text

    response = requests.get(url, timeout=timeout)
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    kept = []
    for paragraph in (p.text for p in soup.body.findAll('p')):
        # Stop at the first paragraph that carries one of these markers.
        if ("This piece originally appeared" in paragraph) or ('min read' in paragraph):
            break
        # A paragraph that is exactly 'Economy' discards everything collected
        # before it; the paragraph itself is not kept.
        if paragraph == 'Economy':
            kept = []
        else:
            kept.append(paragraph)

    # Return the trimmed paragraphs. The previous version joined the
    # untrimmed list, which made the loop above dead code.
    return '\n'.join(kept)

# Number of newline-separated lines in each cleaned text; non-string entries
# count as 0 lines.
data['sent_len'] = [
    len(text.split('\n')) if isinstance(text, str) else 0
    for text in data['text_cleaned']
]

# Hand (url, text, line count) rows to check_target and store whatever it
# returns back into the text_cleaned column.
data["text_cleaned"] = (
    data[['urls', 'text_cleaned', 'sent_len']]
    .apply(check_target, axis=1)
)

In [6]:
# Drop rows whose cleaned text duplicates an earlier row (pandas keeps the
# first occurrence by default). The index is not reset here.
data = data.drop_duplicates(['text_cleaned'])

In [8]:
# Report the frame's (rows, columns) and write it to disk without the index
# column -- presumably a checkpoint so the scraping above need not be repeated.
print(data.shape)
data.to_csv('heritage_temp.csv', index=False)

(16655, 7)


### Sentence-Based Cleaning

In [9]:
def _split_sentences(text):
    """Tokenize every newline-separated line of ``text`` into sentences.

    Returns one flat list holding the sentences of all lines, in order.
    """
    sentences = []
    for line in text.split('\n'):
        sentences.extend(tokenizer.tokenize(line))
    return sentences


# One list of sentences per article.
data['Sent'] = data['text_cleaned'].apply(_split_sentences)

In [11]:
# Flatten the per-article sentence lists into one list. The nested
# comprehension is linear in the total number of sentences; summing a column
# of lists instead concatenates them pairwise and re-copies the growing
# result at every step.
sent = [sentence for sentences in data['Sent'] for sentence in sentences]
# Number of occurrences of every distinct sentence across all articles.
count = Counter(sent)

In [30]:
# (sentence, occurrences) pairs for every sentence seen more than 20 times.
lst = [(sentence, n) for sentence, n in count.items() if n > 20]
# Only the sentences from those pairs, in the same order.
drop_sent = [pair[0] for pair in lst]

In [31]:
# Remove the frequently repeated sentences from every article. Testing
# membership against the list would scan it for each sentence; a set gives
# the same answer with constant-time lookups.
_drop_lookup = frozenset(drop_sent)
data['cleaned_sent'] = data['Sent'].apply(
    lambda sentences: [s for s in sentences if s not in _drop_lookup]
)

In [131]:
def combine_text(lst):
    """Filter unwanted sentences out of one article and join the rest.

    Parameters
    ----------
    lst : iterable of str
        The sentences of a single article.

    Returns
    -------
    str
        The surviving sentences joined by newline characters (an empty
        string when nothing survives).
    """
    kept = []
    for sentence in lst:
        # Replace non-breaking spaces, both the real character and its
        # escaped four-character spelling. The replacements are chained so
        # that neither one discards the other's result.
        cleaned = sentence.replace(u'\xa0', ' ').replace(u'\\xa0', ' ')

        if 'http' in cleaned: continue
        if len(sentence) < 3: continue
        if ('Analyst' in sentence) and ('The Heritage Foundation' in sentence): continue
        if ('Authors:' in sentence) or ('Author:' in sentence): continue
        if ('Scholar' in sentence) and ('The Heritage Foundation' in sentence): continue
        if '.pdf' in cleaned: continue
        # Skip sentences that open with a digit unless the 3rd and 4th
        # characters are both letters.
        if sentence[0].isnumeric() and (not sentence[2:4].isalpha()): continue

        if ("[" in cleaned) and ("]" in cleaned):
            # Cut the span from the first '[' to the first ']' (and any
            # identical repeats of it). Work on the normalised text so the
            # space fixes above are kept. If ']' precedes '[' the span is
            # empty and nothing is removed.
            target = cleaned[cleaned.index('['): (cleaned.index(']') + 1)]
            cleaned = cleaned.replace(target, '')
        kept.append(cleaned)
    return '\n'.join(kept)

In [132]:
# Run the sentence-level filters on each article and collapse the surviving
# sentences back into a single newline-joined string.
data['cleaned_text_sent'] = data['cleaned_sent'].apply(combine_text)

In [133]:
# Keep only the columns needed for the final export. The explicit .copy()
# makes this an independent frame, so assigning to one of its columns later
# does not trigger pandas' SettingWithCopyWarning.
cleaned_data = data[['urls', 'author', 'date', 'category', 'cleaned_text_sent']].copy()

## Convert Date

In [150]:
def convert_date(s):
    """Convert a date such as ``'May 14th, 2014'`` into ``'2014-05-14'``.

    Parameters
    ----------
    s : str
        Date written as ``'<month> <day>[st|nd|rd|th], <year>'`` with either
        a full or an abbreviated month name. Surrounding whitespace is
        ignored. ``None`` and NaN are treated as missing.

    Returns
    -------
    str
        The date as ``'YYYY-MM-DD'``, or ``""`` when the input is missing or
        shorter than three characters after stripping.

    Raises
    ------
    ValueError
        If the text matches neither ``'%B %d, %Y'`` nor ``'%b %d, %Y'``.
    """
    import re  # local import so this cell does not depend on another cell

    # Missing values (None / NaN) are handled like an empty string.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = s.strip()
    if len(s) < 3:
        return ""

    # Remove an ordinal suffix only when it directly follows a digit, so
    # month names that contain the same letters (e.g. "August") stay intact.
    s = re.sub(r'(?<=\d)(st|nd|rd|th)\b', '', s)

    try:
        d = datetime.strptime(s, '%B %d, %Y')
    except ValueError:
        # Fall back to abbreviated month names such as 'Sep'.
        d = datetime.strptime(s, '%b %d, %Y')
    return d.strftime('%Y-%m-%d')

In [155]:
# Rewrite the date column as 'YYYY-MM-DD' strings.
# NOTE(review): the column is overwritten in place, so re-running this cell
# passes already-converted values to convert_date; those match neither
# '%B %d, %Y' nor '%b %d, %Y' and raise ValueError.
cleaned_data['date'] = cleaned_data['date'].apply(convert_date)
cleaned_data.head()

Unnamed: 0,urls,author,date,category,cleaned_text_sent
0,https://www.heritage.org/energy-economics/comm...,['Katie Tubb'],2014-05-14,energy-economics,Yesterday the Senate decided to take up legisl...
1,https://www.heritage.org/housing/commentary/no...,"['Norbert J. Michel, Ph.D.']",2018-09-13,housing,The 2008 financial crisis was a major missed o...
2,https://www.heritage.org/budget-and-spending/c...,['Stephen Moore'],2015-10-19,budget-and-spending,"It hasn't gotten much attention, but two big b..."
3,https://www.heritage.org/trade/report/trade-ad...,"['James Sherk', 'David Muhlhausen']",2011-05-16,trade,The Obama Administration and Congress recently...
4,https://www.heritage.org/education/report/prom...,['Kirk Johnson'],2005-04-06,education,"On April 5, 2005, the Georgetown Public Policy..."


In [156]:
# Write the final cleaned dataset to disk; index=False leaves the row index
# out of the file.
cleaned_data.to_csv('heritage_clean_text.csv', index=False)