# In [2]:
#script for the Reuters scraper. Uses the Reuters News Archive (now defunct)
import time
import urllib
import urllib.request
from datetime import datetime
from datetime import timedelta

import feather
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup



#function to get the links from https://www.reuters.com/resources/archive/us/DATE.html
#DATE is in YYYYMMDD format
#get two dates from user:start date and end date. Get all links for articles between these dates
def get_links(start_date=None, end_date=None):
    """Collect article links from the Reuters archive for a range of days.

    One archive page is fetched per calendar day from
    https://www.reuters.com/resources/archive/us/YYYYMMDD.html, and the
    ``href`` of every ``headlineMed`` element inside the first
    ``div.module`` of that page is collected.

    Args:
        start_date: First day to scrape, as a ``datetime`` or a
            ``'YYYYMMDD'`` string. When ``None`` the user is prompted.
        end_date: Last day to scrape (inclusive), same formats as
            ``start_date``. When ``None`` the user is prompted.

    Returns:
        A list of link strings in page order, day by day. The list is empty
        when ``end_date`` is earlier than ``start_date``.

    Raises:
        ValueError: If a date string does not match ``'%Y%m%d'``.
        urllib.error.URLError: If an archive page cannot be downloaded.
    """
    start = _parse_archive_date(start_date, "Please enter the start date:")
    end = _parse_archive_date(end_date, "Please enter the end date:")

    # Every calendar day from start to end, both endpoints included.
    days = [start + timedelta(offset) for offset in range((end - start).days + 1)]

    link_list = []
    for day in days:
        url = ('https://www.reuters.com/resources/archive/us/'
               + day.strftime('%Y%m%d') + '.html')
        # The context manager closes the HTTP response as soon as it is read.
        with urllib.request.urlopen(url) as response:
            page = response.read()
        soup = BeautifulSoup(page, 'lxml')
        modules = soup.find_all("div", class_="module")
        if not modules:
            # No headline container on this day's page: nothing to collect.
            continue
        for headline in modules[0].find_all(class_='headlineMed'):
            anchor = headline.a
            # Skip headlines that carry no usable link instead of crashing.
            if anchor is not None and anchor.get('href'):
                link_list.append(anchor['href'])
    return link_list


def _parse_archive_date(value, prompt):
    """Return *value* as a datetime, asking the user via *prompt* when it is None."""
    if value is None:
        value = input(prompt)
    if isinstance(value, datetime):
        return value
    return datetime.strptime(value, '%Y%m%d')

#function to get the dates of each article
#use the link list created by the "get_links" function and get timestamps of each article
def get_dates(links):
    """Return one timestamp entry per link, in the same order as *links*.

    Each link is downloaded and parsed; the entry recorded for it is:

    * the first 23 characters of the text of the first
      ``div`` with class ``date_V9eGk``, when such a div exists;
    * the string ``"Na"`` when the page has no such div;
    * the exception object itself when the download fails.

    Args:
        links: Iterable of article URLs.

    Returns:
        A list with exactly one entry per link.
    """
    timestamps = []
    for url in links:
        try:
            raw_html = urllib.request.urlopen(url).read()
        except Exception as err:
            # Keep the error in this article's slot so the output stays
            # lined up with the input links.
            timestamps.append(err)
        else:
            page = BeautifulSoup(raw_html, 'lxml')
            stamp_divs = page.find_all("div", class_="date_V9eGk")
            timestamps.append(
                stamp_divs[0].get_text()[0:23] if len(stamp_divs) >= 1 else "Na"
            )
    return timestamps

#function to get the keywords of each article
#use the link list created by the "get_links" function and get keywords of each article
def get_keywords(links):
    """Return one keyword entry per link, in the same order as *links*.

    Each link is downloaded and parsed; the entry recorded for it is:

    * the ``content`` of the first ``<meta name="keywords">`` tag,
      lower-cased and split on ``','`` into a list of strings;
    * the string ``"Na"`` when the page has no such meta tag;
    * the exception object itself when the download fails.

    Args:
        links: Iterable of article URLs.

    Returns:
        A list with exactly one entry per link.
    """
    keyword_list = []
    for link in links:
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(link) as response:
                page = response.read()
        except Exception as e:
            # Keep the error in this article's slot so the output stays
            # lined up with the input links.
            keyword_list.append(e)
            continue
        soup = BeautifulSoup(page, 'lxml')
        keyword_tags = soup.find_all('meta', attrs={'name': 'keywords'})
        if keyword_tags:
            keyword_list.append(keyword_tags[0]['content'].lower().split(','))
        else:
            # Page has no keywords meta tag.
            keyword_list.append("Na")
    return keyword_list


#function to get the articles 
#use the link list created by the "get_links" function and get the body text of each article
def get_articles(links):
    """Return one article-body entry per link, in the same order as *links*.

    Each link is downloaded and parsed; the entry recorded for it is:

    * the text of the first ``div`` with class ``body_1gnLA``, when such a
      div exists (the time spent extracting the text is printed);
    * the string ``'Na'`` when the page has no such div;
    * the exception object itself when the download fails (the error is
      also printed).

    Args:
        links: Iterable of article URLs.

    Returns:
        A list with exactly one entry per link, so it can be combined
        column-wise with the other per-link lists.
    """
    article_list = []
    for link in links:
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(link) as response:
                page = response.read()
        except Exception as e:
            print(e)
            # Record the failure in this article's slot; skipping it would
            # leave the result shorter than ``links``.
            article_list.append(e)
            continue

        soup = BeautifulSoup(page, 'lxml')
        bodies = soup.find_all("div", class_="body_1gnLA")
        if bodies:
            start_time = time.time()
            article_list.append(bodies[0].get_text())
            print("--- %s seconds ---" % (time.time() - start_time))
        else:
            # Page has no article body container.
            article_list.append('Na')
    return article_list
        
#run the functions
links=get_links()
keywords=get_keywords(links)
dates=get_dates(links)
articles=get_articles(links)

#put everything in a n * 4 dataframe where n is the number of articles and the columns are
#date, link, keywords and articles. The frame is built from a dict of columns so that each
#article's keyword list stays in a single cell no matter how many keywords it has.
df_1 = pd.DataFrame({'date': dates, 'link': links, 'keywords': keywords, 'articles': articles})
#a date entry may be a non-string object (e.g. an exception from a failed download), so cast
#to str first and then keep only the first 23 characters.
df_1['date'] = df_1['date'].astype(str).str[0:23]
#column order written to the csv: link, keywords, articles, date
df_1 = df_1[['link', 'keywords', 'articles', 'date']]

#put it in a csv
df_1.to_csv('reuters_all_articles.csv')

