This notebook scrapes web articles from two political websites: Salon (liberal) and Townhall (conservative). The scraped text is stored in a .csv file together with the source URL of each article. The data is intended to be used as training data for a political-ideology text classifier. 

In [None]:
import requests
from bs4 import BeautifulSoup


# Salon
## A liberal website

Political articles are scraped from the list of 'Trending' articles. That listing page links to about 20 articles and also to a subsequent page with more articles. The code below iteratively scrapes the text from each of those articles, following the link to the next page each time. 


In [None]:
# Scrape Salon's 'trending' news-and-politics listing page by page, then fetch
# every article linked from each page. Results accumulate in `articles`
# (parallel lists under 'url' and 'text').
list0_url = 'https://www.salon.com/category/news-and-politics?sort=trending&type=all'

salon_max_articles = 20000  # Let's stop after a reasonable number, I guess
request_timeout = 30        # seconds; requests.get has no default timeout and could hang the cell
max_stalled_pages = 5       # give up after this many consecutive listing pages with nothing new

article_count = 0
page_count = 1
stalled_pages = 0

articles = {'url':[], 'text':[]}
salon_seen = set()  # article URLs already queued, so repeated links are scraped only once

# Start from the first listing page; list_url only advances when a link to
# the next page is actually found (otherwise the loop stops instead of
# re-scraping the same page forever).
list_url = list0_url

while article_count < salon_max_articles and stalled_pages < max_stalled_pages:
    print('scraping from %s'%list_url)
    try:
        r = requests.get(list_url, timeout=request_timeout)
        r.raise_for_status()
    except requests.RequestException as err:
        # Without the listing page the next page cannot be discovered, so stop here.
        print('could not fetch %s (%s); stopping'%(list_url, err))
        break
    soup = BeautifulSoup(r.content, 'html.parser')

    links = []
    next_url = None
    next_marker = 'pagenum=%d'%(page_count+1)
    # get links in the page
    for link in soup.find_all('a'):
        ilink = link.get('href')
        if ilink is None:
            continue
        # Article links are site-relative and start with a four-digit year
        # (/YYYY/...). Matching any year instead of only '/201' keeps the
        # scraper working for articles published from 2020 onwards.
        if ilink[:1] == '/' and ilink[1:5].isdigit() and ilink[5:6] == '/':
            url = "https://www.salon.com%s"%ilink
            if url not in salon_seen:
                salon_seen.add(url)
                links.append(url)
        # Match the page number exactly: 'pagenum=2' must not match 'pagenum=20'.
        pos = ilink.find(next_marker)
        if pos != -1 and not ilink[pos+len(next_marker):][:1].isdigit():
            next_url = "https://www.salon.com%s"%ilink # This should correspond to the next page with articles

    scraped_before = article_count
    for article in links:
        print('article from %s'%article)
        try:
            r = requests.get(article, timeout=request_timeout)
            r.raise_for_status()
        except requests.RequestException as err:
            # One bad article should not abort the whole run; report and move on.
            print('skipping %s (%s)'%(article, err))
            continue
        soup = BeautifulSoup(r.content, 'html.parser')
        # NOTE(review): this stores the bs4 result set for <article> tags
        # (markup included), not plain text -- confirm that the downstream
        # cleaning step expects this.
        articles['text'].append(soup.find_all('article'))
        articles['url'].append(article)
        article_count += 1

    # Track consecutive pages that contributed nothing new, so a site that
    # keeps serving the same listing cannot keep the loop alive indefinitely.
    if article_count == scraped_before:
        stalled_pages += 1
    else:
        stalled_pages = 0

    if next_url is None:
        print('no link to page %d found; stopping'%(page_count+1))
        break
    list_url = next_url
    page_count += 1
    
    




In [None]:
# Report how many Salon articles were collected.
n_salon = len(articles['url'])
print(n_salon)

import pandas as pd

# Turn the scraped dict (keys 'url' and 'text') into a table and write it to disk.
salon_file = 'salon_data.csv'
salon_df = pd.DataFrame.from_dict(articles)
salon_df.to_csv(salon_file)


# Townhall
## A conservative news and opinion site
Here I scrape text from columnists, sorted by date. The script scans the columnists page for a given week and goes back in time one week at a time. There are on average around 90 articles per page.

In [None]:
# Scrape Townhall columnist articles week by week, starting at x0 and going
# back seven days per iteration. Results accumulate in `townhall`
# (parallel lists under 'url' and 'text').
import datetime
x0 = datetime.datetime(2019, 7, 21) #- datetime.timedelta(days=7)

list0 = 'https://townhall.com/columnists/date/'

townhall_max_articles = 5000  # Let's stop after a reasonable number, I guess
request_timeout = 30          # seconds; requests.get has no default timeout and could hang the cell
max_empty_weeks = 8           # give up after this many consecutive weeks with no new article

article_count = 0
page_count = 1  # not used by this loop; kept so the name stays defined as before
weeks = 0
empty_weeks = 0

townhall = {'url':[], 'text':[]}
townhall_seen = set()  # article URLs already scraped, so none is stored twice

while article_count < townhall_max_articles and empty_weeks < max_empty_weeks:
    xx = x0 - datetime.timedelta(days = 7*weeks)
    list_url = "%s%s"%(list0, xx.strftime("%Y/%m/%d"))
    print('scraping from %s'%list_url)

    links = []
    try:
        r = requests.get(list_url, timeout=request_timeout)
        r.raise_for_status()
    except requests.RequestException as err:
        # A missing/failed week is treated as an empty page; the run moves on
        # to the previous week and stops only after max_empty_weeks in a row.
        print('could not fetch %s (%s)'%(list_url, err))
    else:
        soup = BeautifulSoup(r.content, 'html.parser')
        for link in soup.find_all('a'):
            ilink = link.get('href')
            # Keep only absolute links into the columnists section
            if ilink is not None and ilink.startswith('https://townhall.com/columnists'):
                links.append(ilink)

    # Every third collected link is visited, as in the original stride-3 loop.
    # NOTE(review): presumably each article is linked three times in a row on
    # the listing page -- confirm against the current page layout.
    page_articles = links[::3]
    print ('%d articles found in page'%len(page_articles))

    scraped_before = article_count
    for article in page_articles:
        if article in townhall_seen:
            continue
        townhall_seen.add(article)
        print('article from %s'%article)
        try:
            r = requests.get(article, timeout=request_timeout)
            r.raise_for_status()
        except requests.RequestException as err:
            # One bad article should not abort the whole run; report and move on.
            print('skipping %s (%s)'%(article, err))
            continue
        soup = BeautifulSoup(r.content, 'html.parser')
        # NOTE(review): this stores the bs4 result set of <p> tags (markup
        # included), not plain text -- confirm that the downstream cleaning
        # step expects this.
        townhall['text'].append(soup.find_all('p'))
        townhall['url'].append(article)
        article_count += 1

    # Without this guard the loop would keep stepping back in time forever
    # once pages stop yielding articles.
    if article_count == scraped_before:
        empty_weeks += 1
    else:
        empty_weeks = 0
    weeks += 1
    
    
    
    


In [None]:
# Imported here as well so this cell does not depend on the Salon save cell
# (the only other place pandas is imported) having been run first.
import pandas as pd

#saving data
# Turn the scraped dict (keys 'url' and 'text') into a table and write it to disk.
townhall_df = pd.DataFrame.from_dict(townhall)
townhall_file = 'townhall_data.csv'
townhall_df.to_csv(townhall_file)


In [None]:
# Number of Townhall articles collected (one URL is appended per scraped article).
len(townhall['url'])