# In [None]:
#Script to get the links, titles, articles and keywords for NYT's front page
#example link for the front page: https://www.nytimes.com/issue/todayspaper/2018/12/15/todays-new-york-times
import urllib
import urllib.parse
import urllib.request
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


#Dates are given in YYYYMMDD format.
#Take a start date and an end date, then collect all front-page links between these dates (inclusive).
def get_links_nyt(start_date=None, end_date=None):
    """Collect article links from the NYT "Today's Paper" pages of a date range.

    Args:
        start_date: First day to scrape, either a ``YYYYMMDD`` string or a
            ``datetime``. When ``None`` the user is prompted for it.
        end_date: Last day to scrape (inclusive), same formats. When ``None``
            the user is prompted for it.

    Returns:
        A list with one entry per headline link found, in page order. When a
        day's page cannot be downloaded, the caught exception object is
        appended instead of that day's links, so callers must expect
        non-string entries. An end date before the start date gives ``[]``.

    Raises:
        ValueError: If a date string is not in ``YYYYMMDD`` format.
    """
    # Prompt only for what the caller did not supply (start first, then end).
    if start_date is None:
        start_date = input("Please enter the start date:")
    if end_date is None:
        end_date = input("Please enter the end date:")
    if isinstance(start_date, str):
        start_date = datetime.strptime(start_date, '%Y%m%d')
    if isinstance(end_date, str):
        end_date = datetime.strptime(end_date, '%Y%m%d')

    # Every day from start to end, both ends included.
    day_count = (end_date - start_date).days + 1
    issue_dates = [start_date + timedelta(days=offset) for offset in range(day_count)]

    link_list = []
    for issue_date in issue_dates:
        page_url = ('https://www.nytimes.com/issue/todayspaper/'
                    + issue_date.strftime('%Y/%m/%d/')
                    + 'todays-new-york-times')
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(page_url) as response:
                page = response.read()
        except Exception as e:
            # Best effort: record the failure and move on to the next day.
            link_list.append(e)
            continue

        soup = BeautifulSoup(page, 'lxml')
        headlines = soup.find_all("h2", class_="headline")
        print(len(headlines))
        for headline in headlines:
            anchor = headline.a
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # A headline without a usable link has nothing to collect.
                continue
            # Resolve relative hrefs against the page; absolute ones pass through.
            link_list.append(urllib.parse.urljoin(page_url, href))

    return link_list

#get the date of each article, put them in a list 
def get_dates_nyt(links):
    """Look up the publication date of every article in *links*.

    Args:
        links: Iterable of article URLs.

    Returns:
        A list with exactly one entry per link, in the same order:
        the date portion of the page's ``datePublished`` meta tag as a
        string, ``"Na"`` when the page has no such tag (or the tag has no
        content), or the caught exception object when the page could not be
        downloaded.
    """
    dates_list = []
    for link in links:
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(link) as response:
                html = response.read()
        except Exception as e:
            # Best effort: keep the row aligned with its link and carry on.
            dates_list.append(e)
            continue

        soup = BeautifulSoup(html, 'lxml')
        a_dates = soup.find_all('meta', attrs={'itemprop': 'datePublished'})
        print(a_dates)
        published = a_dates[0].get('content') if a_dates else None
        if published:
            # Split on the 'T' separator BEFORE any case folding: lower-casing
            # first turns it into 't' and the split silently does nothing.
            # Assumes an ISO 8601 style timestamp (date, 'T', time) -- TODO confirm.
            dates_list.append(published.strip().split('T', 1)[0].lower())
        else:
            dates_list.append("Na")
    return dates_list

#get the keywords of each article, put them in a list 
def get_keywords_nyt(links):
    """Collect the ``news_keywords`` meta value of every article in *links*.

    Args:
        links: Iterable of article URLs.

    Returns:
        A list with exactly one entry per link, in the same order:
        the lower-cased keyword string, ``"Na"`` when the page has no
        ``news_keywords`` meta tag (or the tag has no content), or the caught
        exception object when the page could not be downloaded.
    """
    keywords_list = []
    for link in links:
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(link) as response:
                html = response.read()
        except Exception as e:
            # Best effort: keep the row aligned with its link and carry on.
            keywords_list.append(e)
            continue

        soup = BeautifulSoup(html, 'lxml')
        a_keywords = soup.find_all('meta', attrs={'name': 'news_keywords'})
        # .get() avoids a KeyError on a tag that lacks a content attribute.
        content = a_keywords[0].get('content') if a_keywords else None
        keywords_list.append(content.lower() if content is not None else "Na")
    return keywords_list

#get the bodies of each article. put them on a list 
def get_articles_nyt(links):
    """Extract the body text of every article in *links*.

    Args:
        links: Iterable of article URLs.

    Returns:
        A list with exactly one entry per link, in the same order:
        the text of all matching story-body ``div`` elements joined with a
        single space, ``'Na'`` when the page has no matching element, or the
        caught exception object when the page could not be downloaded.
    """
    article_list = []
    for link in links:
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(link) as response:
                html = response.read()
        except Exception as e:
            # Best effort: keep the row aligned with its link and carry on.
            article_list.append(e)
            continue

        soup = BeautifulSoup(html, 'lxml')
        # NOTE(review): 'css-18sbwfn' looks like a generated class name that
        # may change between site builds -- confirm this selector still matches.
        columns = soup.find_all("div", class_="css-18sbwfn StoryBodyCompanionColumn")
        if columns:
            article_list.append(' '.join(column.get_text() for column in columns))
        else:
            article_list.append('Na')
    return article_list

#get the titles for each article. put them in a list 
def get_titles_nyt(links):
    """Collect the page title of every article in *links*.

    Args:
        links: Iterable of article URLs.

    Returns:
        A list with exactly one entry per link, in the same order:
        the ``<title>`` text with the trailing site name removed, ``"Na"``
        when the page has no ``<title>``, or the caught exception object when
        the page could not be downloaded.
    """
    site_name = "The New York Times"
    # Number of trailing characters to drop: len(" - The New York Times").
    suffix_length = 21

    title_list = []
    for link in links:
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(link) as response:
                html = response.read()
        except Exception as e:
            # Best effort: keep the row aligned with its link and carry on.
            title_list.append(e)
            continue

        soup = BeautifulSoup(html, 'lxml')
        a_titles = soup.find_all("title")
        if a_titles:
            text = a_titles[0].get_text()
            # Only trim when the site name is really there; slicing blindly
            # would cut 21 characters off titles that lack the suffix.
            if text.endswith(site_name):
                text = text[:-suffix_length]
            title_list.append(text)
        else:
            title_list.append("Na")
    return title_list



# Run the whole pipeline. get_links_nyt() goes first because every other
# helper takes its result as input.
links=get_links_nyt()
articles=get_articles_nyt(links)
keywords=get_keywords_nyt(links)
titles=get_titles_nyt(links)
dates=get_dates_nyt(links)

# Put all elements in a pandas dataframe as separate columns.
# NOTE(review): np.column_stack needs the five lists to line up row for row,
# which presumably relies on each helper returning one entry per link -- confirm.
df_nyt = pd.DataFrame(np.column_stack([dates, links, keywords,articles,titles]),columns=['date', 'link', 'keywords','article','title']) 

# Write the dataframe to a CSV file.
# NOTE(review): the file name is hard-coded and does not follow the dates
# entered at the prompt.
df_nyt.to_csv("20180516-20180621_nyt.csv")