##  Download WSJ headlines of archived articles
The date, time, category, and headline of each article are extracted using Beautiful Soup and saved as .csv files (one file per year) for the years 1998-2020.

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import matplotlib.pyplot as plt
import sys
import datetime
import numpy as np
import calendar
import time
%matplotlib inline

### If a request raises an exception, it is tried again (at most 3 attempts in total).
TODO : give_articles_nextpages() and give_articles() could be combined into one method.

In [2]:
ntries = 3  # maximum number of attempts per URL before giving up


# Read in the pages after the first page, if they exist
def give_articles_nextpages(dtgstr, npages):
    """Fetch the <article> tags from archive pages 2..npages of one date.

    Parameters
    ----------
    dtgstr : str
        Date fragment appended to the archive URL (the caller in this file
        passes it as 'YYYY/MM/DD').
    npages : int or str
        Total number of pages for that date; anything ``int()`` accepts.

    Returns
    -------
    list
        The ``article`` tags of every page that could be downloaded, in page
        order. A page that still fails after ``ntries`` attempts is reported
        on stdout and skipped. Empty when ``npages`` is less than 2.
    """
    tags_next = []

    # Built-in int()/str() are used instead of np.int/np.str: those aliases
    # were deprecated in NumPy 1.20 and removed in 1.24.
    for ipage in range(2, int(npages) + 1):
        wsj_url = 'https://www.wsj.com/news/archive/' + dtgstr + '?page=' + str(ipage)
        for tr in range(ntries):
            if tr >= 1:
                print('\tIn nextpage^^tr = ', tr)
            try:
                # An explicit timeout is required for the Timeout handling
                # below to ever trigger; without it a stalled connection
                # would block the whole scrape indefinitely.
                resp = requests.get(
                    wsj_url,
                    headers={'user-agent': 'my-app/0.0.1'},
                    timeout=30,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as err:
                # Every requests error is handled the same way (report, wait,
                # retry); only the label differs. The isinstance order mirrors
                # the precedence separate except clauses would have.
                if isinstance(err, requests.exceptions.HTTPError):
                    label = "Http Error:"
                elif isinstance(err, requests.exceptions.ConnectionError):
                    label = "Error Connecting:"
                elif isinstance(err, requests.exceptions.Timeout):
                    label = "Timeout Error:"
                else:
                    label = "Something else went wrong"
                print(label, err)
                print('nextpage tr = ', tr)
                time.sleep(5)  # brief pause before the next attempt
                continue

            soup = BeautifulSoup(resp.text)
            # Articles from every page accumulate in tags_next.
            tags_next.extend(soup.find_all('article'))
            break
        else:
            # Reached only when no attempt ended in `break`, i.e. all failed.
            print('\tTried {0} times, but to no avail'.format(ntries))

    return tags_next
    

#Read in the first page
def give_articles(dtgstr):
    """Fetch every <article> tag listed in the WSJ archive for one date.

    The first archive page is requested up to ``ntries`` times. When its page
    picker reports more than one page, the remaining pages are fetched with
    ``give_articles_nextpages`` and appended to the result.

    Parameters
    ----------
    dtgstr : str
        Date fragment appended to the archive URL (the caller in this file
        passes it as 'YYYY/MM/DD').

    Returns
    -------
    list
        ``article`` tags collected from all pages; an empty list when the
        first page could not be downloaded.

    Raises
    ------
    AssertionError
        If the page count read from the page picker is not made of digits.
    """
    tags_all = []
    wsj_url = 'https://www.wsj.com/news/archive/' + dtgstr

    for tr in range(ntries):
        if tr >= 1:
            print('\tTry again : tr = ', tr)

        try:
            # An explicit timeout is required for the Timeout handling below
            # to ever trigger; without it a stalled connection would block
            # the whole scrape indefinitely.
            resp = requests.get(
                wsj_url,
                headers={'user-agent': 'my-app/0.0.1'},
                timeout=30,
            )
            resp.raise_for_status()
        except requests.exceptions.RequestException as err:
            # Every requests error is handled the same way (report, wait,
            # retry); only the label differs. The isinstance order mirrors
            # the precedence separate except clauses would have.
            if isinstance(err, requests.exceptions.HTTPError):
                label = "Http Error:"
            elif isinstance(err, requests.exceptions.ConnectionError):
                label = "Error Connecting:"
            elif isinstance(err, requests.exceptions.Timeout):
                label = "Timeout Error:"
            else:
                label = "Something else went wrong"
            print(label, err)
            print('**tr = ', tr)
            time.sleep(5)  # brief pause before the next attempt
            continue

        soup = BeautifulSoup(resp.text)
        tags_all = soup.find_all('article')

        # Find the total number of pages for this date from the dropdown list.
        # NOTE(review): the trailing '*' applies only to the final 'l'; the
        # pattern still matches the intended class name, so it is left as is.
        tags_npage = soup.find_all('span', class_=re.compile('WSJTheme--pagepicker-total*'))
        if tags_npage:
            # The page count is taken as the second space-separated token of
            # the span text (presumably something like "of N" - confirm).
            npages = tags_npage[0].text.split(' ')[1]
            assert npages.isdigit(), "npages is not a digit : npages = {0}, dtgstr = {1}".format(npages, dtgstr)
            # Built-in int() instead of np.int, which was removed in NumPy 1.24.
            if int(npages) > 1:
                tags_all.extend(give_articles_nextpages(dtgstr, npages))
        break
    else:
        # Reached only when no attempt ended in `break`, i.e. all failed.
        print('\tTried {0} times, but to no avail'.format(ntries))

    return tags_all

In [3]:
def give_articles_year(yr):
    """Scrape every day of year ``yr`` and write the headlines to a CSV file.

    For each calendar day the archive is fetched with ``give_articles``. Every
    returned tag that has a ``p``, a ``span`` and an ``h2`` child contributes
    one row ``[date, time, category, headline]``. All rows are written to
    ``data_new/WSJ_headlines_<yr>.csv`` (header only if nothing was scraped).

    Parameters
    ----------
    yr : int
        Calendar year to scrape (a NumPy integer works as well).

    Returns
    -------
    None
    """
    import os  # local import: only needed to create the output directory

    columns = ['date', 'time', 'category', 'headline']

    # Create the output directory up front so a missing folder is not
    # discovered only after a whole year has been downloaded.
    os.makedirs('data_new', exist_ok=True)

    articles_data = []

    # Built-in int()/str() instead of np.int/np.str (removed in NumPy 1.24).
    dtgobj = datetime.datetime(int(yr), 1, 1)
    ndays = 366 if calendar.isleap(yr) else 365

    for _ in range(ndays):
        dtgstr = dtgobj.strftime('%Y/%m/%d')

        # Progress marker on the 1st, 16th and 31st of each month.
        if dtgobj.day % 15 == 1:
            print('\n--', dtgstr)

        for tt in give_articles(dtgstr):
            # Skip tags that lack any of the three fields.
            if (tt.p is not None) and (tt.span is not None) and (tt.h2 is not None):
                #                     date,   time,      category,     headline
                articles_data.append([dtgstr, tt.p.text, tt.span.text, tt.h2.text])

        dtgobj += datetime.timedelta(days=1)

    print('\nDone scraping : ', yr)
    if articles_data:
        # Guarded: indexing an empty list would abort before the CSV is written.
        print('first = ', articles_data[0])
    print('# of articles = ', len(articles_data))
    articlesdf = pd.DataFrame(articles_data, columns=columns)
    articlesdf.to_csv('data_new/WSJ_headlines_' + str(yr) + '.csv', index=False)
    print('Done writing')

In [4]:
# Scrape the archive one year at a time, 1998 through 2020 inclusive;
# each call writes that year's CSV before the next year starts.
for year in range(1998, 2021):
    print('\nProcessing year ', year)
    give_articles_year(year)



Processing year  2011

-- 2011/01/01

-- 2011/01/16

-- 2011/01/31

-- 2011/02/01

-- 2011/02/16

-- 2011/03/01

-- 2011/03/16

-- 2011/03/31

-- 2011/04/01

-- 2011/04/16

-- 2011/05/01

-- 2011/05/16

-- 2011/05/31

-- 2011/06/01

-- 2011/06/16

-- 2011/07/01

-- 2011/07/16

-- 2011/07/31

-- 2011/08/01

-- 2011/08/16

-- 2011/08/31

-- 2011/09/01

-- 2011/09/16

-- 2011/10/01

-- 2011/10/16

-- 2011/10/31

-- 2011/11/01

-- 2011/11/16

-- 2011/12/01

-- 2011/12/16
Http Error: 404 Client Error: Not Found for url: https://www.wsj.com/news/archive/2011/12/20
**tr =  0
	Try again : tr =  1
Http Error: 404 Client Error: Not Found for url: https://www.wsj.com/news/archive/2011/12/23?page=5
nextpage tr =  0
	In nextpage^^tr =  1

-- 2011/12/31

Done scraping :  2011
first =  ['2011/01/01', '11:39 PM ET', 'Asia Business', 'Kan Vows to Push for Free-Trade Deal']
# of articles =  71258
Done writing

Processing year  2012

-- 2012/01/01

-- 2012/01/16
Http Error: 404 Client Error: Not Found f


-- 2016/10/01

-- 2016/10/16

-- 2016/10/31

-- 2016/11/01

-- 2016/11/16

-- 2016/12/01

-- 2016/12/16

-- 2016/12/31

Done scraping :  2016
first =  ['2016/01/01', '11:59 PM ET', 'Style & Fashion', '5 Fashion Resolutions for 2016']
# of articles =  78628
Done writing

Processing year  2017

-- 2017/01/01

-- 2017/01/16
Http Error: 404 Client Error: Not Found for url: https://www.wsj.com/news/archive/2017/01/17?page=5
nextpage tr =  0
	In nextpage^^tr =  1

-- 2017/01/31

-- 2017/02/01

-- 2017/02/16

-- 2017/03/01

-- 2017/03/16

-- 2017/03/31

-- 2017/04/01

-- 2017/04/16

-- 2017/05/01
Http Error: 404 Client Error: Not Found for url: https://www.wsj.com/news/archive/2017/05/11
**tr =  0
	Try again : tr =  1

-- 2017/05/16

-- 2017/05/31

-- 2017/06/01

-- 2017/06/16

-- 2017/07/01

-- 2017/07/16

-- 2017/07/31

-- 2017/08/01

-- 2017/08/16

-- 2017/08/31

-- 2017/09/01

-- 2017/09/16

-- 2017/10/01

-- 2017/10/16

-- 2017/10/31

-- 2017/11/01

-- 2017/11/16

-- 2017/12/01

-- 2017