In [1]:
import requests 
import bs4 
import pandas as pd
import re
import numpy as np
import json

## Get Target Links

In [2]:
# Category slugs to crawl, taken from https://www.dailysignal.com/
categories = [
    'topics-economy',
    'education',
    'energy',
    'topics-health-care',
    'international',
    'topics-law',
    'politics-topics',
    'security',
    'topics-society',
]

In [None]:
# Crawl the paginated listing of every category and collect the raw href of
# every anchor found on each listing page.
#
# Result: link_collection maps each category slug to a list of href strings.
# Duplicates are removed within a single page only; the same link may still
# appear several times across pages.
link_collection = dict()
with requests.Session() as session:  # reuse one connection for all listing pages
    for cat in categories:
        print(cat)
        link_collection[cat] = []
        page = 0  # pagination starts at /page/0, as in the original crawl

        while True:
            url = f"https://www.dailysignal.com/category/{cat}/page/{page}"
            # A timeout prevents one stalled connection from hanging the crawl.
            r = session.get(url, timeout=30)

            # A 404 marks the end of this category's pagination.
            if r.status_code == 404:
                break
            # Any other error status (403/429/5xx) is not a listing page: fail
            # loudly instead of parsing an error page or looping forever.
            r.raise_for_status()

            print(page, end='\r')
            soup = bs4.BeautifulSoup(r.text, "html.parser")
            root = soup.body if soup.body is not None else soup
            # Anchors without an href (or with an empty one) carry no link and
            # would turn into NaN after the CSV round-trip, so drop them here.
            page_links = {a.get('href') for a in root.find_all('a')} - {None, ''}
            link_collection[cat].extend(page_links)

            page += 1

In [None]:
# Flatten link_collection into two parallel lists (one entry per collected
# link, in category order) and persist them as raw_href.csv.
cat_lst = [cat for cat in categories for _ in link_collection[cat]]
link_lst = [link for cat in categories for link in link_collection[cat]]

href = pd.DataFrame({'category': cat_lst, 'urls': link_lst})
href.to_csv('raw_href.csv', index=False)

## Clean Links

In [3]:
# Load the raw category/URL pairs from raw_href.csv (the file written at the
# end of the link-collection section) instead of relying on crawl results
# still being in memory.
data = pd.read_csv('raw_href.csv')

In [4]:
# Keep only links whose fourth '/'-separated segment is numeric, e.g.
# https://www.dailysignal.com/2020/02/28/... where that segment is the year.
# Category pages, author pages and other navigation links are dropped.
#
# href maps url -> category. When the same URL was collected under several
# categories, the last category seen wins (same as the dict(zip(...)) it replaces).
href = {}
for url, category in zip(data['urls'], data['category']):
    # Empty hrefs are read back from the CSV as NaN (a float), which has no
    # .split(); such rows are not article links, so skip them.
    if not isinstance(url, str):
        continue
    parts = url.split('/')
    if len(parts) > 3 and parts[3].isnumeric():
        href[url] = category

In [5]:
# Turn the cleaned url -> category mapping into a two-column frame (one row
# per URL), save it, and preview the first rows.
data = pd.DataFrame(list(href.items()), columns=['urls', 'category'])
data.to_csv('daily_signal_urls.csv', index=False)
data.head()

Unnamed: 0,urls,category
0,https://www.dailysignal.com/2020/02/28/white-h...,topics-economy
1,https://www.dailysignal.com/2020/03/04/without...,topics-economy
2,https://www.dailysignal.com/2020/03/04/sanders...,topics-economy
3,https://www.dailysignal.com/2020/03/02/epa-nee...,topics-economy
4,https://www.dailysignal.com/2020/02/29/budget-...,topics-economy


In [6]:
# Sanity check: the number of rows equals the number of distinct URLs, so no
# URL is listed twice.
# NOTE: the cleaning step keys a dict by URL, so a URL collected under several
# categories has already been collapsed to one row (the last category seen
# wins). This check therefore cannot show whether such overlaps existed.
data.shape, len(set(data['urls']))

((30672, 2), 30672)

## Scrape Pages from URLs, Get Soup

In [8]:
from requests.exceptions import MissingSchema

# Plain Python list of the cleaned article URLs; its positions are the keys
# used for the scraped pages below. The last line displays how many there are.
urls = data['urls'].tolist()
len(urls)

30672

In [14]:
# Download every article page and keep its parsed soup.
#
#   soup_dict : position of the URL in `urls` -> BeautifulSoup of the page
#   wrong_url : URLs whose request raised an exception (no soup_dict entry)
#
# To resume after an interruption, set `start` to the last index printed and
# re-run this cell: results collected so far are kept. With start == 0 the
# cell starts from scratch, as before.
start = 0
if start == 0:
    soup_dict = dict()
    wrong_url = []

with requests.Session() as session:  # reuse one connection across requests
    for ind, url in enumerate(urls):
        # Skip everything before the resume point and pages already fetched.
        if ind < start or ind in soup_dict: continue
        if ind % 10 == 0: print(ind, end = '\r')
        try:
            # Drop the query string; the timeout keeps one stalled connection
            # from hanging the whole run.
            r = session.get(url.split('?')[0], timeout=30)
        except requests.exceptions.RequestException:
            # Covers malformed URLs (MissingSchema is a subclass) as well as
            # timeouts and connection errors: record the URL and carry on
            # rather than aborting a ~30k-request loop.
            wrong_url.append(url)
            continue
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        soup_dict[ind] = soup

30670

In [98]:
# URLs whose request raised an exception during the scrape (these have no
# entry in soup_dict). An empty list means every URL was requested.
wrong_url

[]

## Get Text from Soup

In [15]:
# Path segments 3-5 of an article URL (.../2020/02/28/...) are joined with
# '-' to form the date string, e.g. '2020-02-28'.
data['date'] = data['urls'].map(lambda link: "-".join(link.split('/')[3:6]))

In [16]:
# Extract the article text from every scraped page: the stripped text of all
# <div class="tds-content"> elements, joined by newlines, with non-breaking
# spaces replaced by ordinary spaces.
text_index, texts = [], []
for ind, soup in soup_dict.items():
    if ind % 10 == 0: print(ind, end = '\r')
    # Fall back to the whole document when the page has no <body>.
    root = soup.body if soup.body is not None else soup
    sp = root.find_all('div', {'class': 'tds-content'})
    txt = "\n".join(i.text.strip().replace(u'\xa0', u' ') for i in sp)
    text_index.append(ind)
    texts.append(txt)

# soup_dict is keyed by row position and has gaps for URLs that could not be
# fetched. Assigning a Series indexed by those positions aligns each text with
# its own row and leaves NaN for missing pages, where a positional list would
# raise on the length mismatch.
data['text'] = pd.Series(texts, index=text_index, dtype=object)

30670

In [90]:
def _author_from_soup(soup):
    """Return the author name from a page's first JSON-LD script tag, or None.

    The script content is stripped of CR/LF/TAB characters and, when a
    "description" key is present, truncated just before it and closed with
    '}' before parsing (presumably because the later fields are not always
    valid JSON - TODO confirm). The name is read from ['author']['name'].

    Returns None when the page has no such script tag, the content cannot be
    parsed, or the author name is absent.
    """
    root = soup.body if soup.body is not None else soup
    tag = root.find('script', {'type': 'application/ld+json'})
    if tag is None:
        return None
    try:
        cont = "".join(tag.contents)
        for ch in ('\r', '\n', '\t'):
            cont = cont.replace(ch, '')
        cut = cont.find('"description"')
        if cut != -1:
            cont = cont[:cut].strip().strip(',') + '}'
        return json.loads(cont)['author']['name']
    except (TypeError, ValueError, KeyError):
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover a
        # missing or differently shaped "author" entry.
        return None


# Extract the author of every scraped page. Pages without usable author
# metadata get None and are listed in missing_author, so one odd page does not
# abort the whole pass.
author_index, authors, missing_author = [], [], []
for ind, soup in soup_dict.items():
    print(ind, end = '\r')
    auth = _author_from_soup(soup)
    if auth is None:
        missing_author.append(ind)
    author_index.append(ind)
    authors.append(auth)

# Align by row position (soup_dict keys) so gaps left by unfetched URLs become
# NaN, where a positional list would raise on the length mismatch.
data['authors'] = pd.Series(authors, index=author_index, dtype=object)

30671

In [94]:
# Persist the final table (the frame as built in the cells above) without the
# index column.
data.to_csv('daily_signal_full_text.csv', index=False)