## The Hindu archive scraper

TheHindu.com allows scraping of its archive pages at `https://www.thehindu.com/archive/web/year/month/day` (from August 2009 onwards). A `/print/` version of the archive is also available.

This notebook scrapes the archived articles one year at a time. Note that the code below builds its URLs from the `/print/` archive rather than `/web/`.

In [1]:
# Step 1 - Build the archive URL of every calendar day in one year

from calendar import monthrange

yr = 2016
urls = {}  # month number -> {day number -> archive URL for that date}

url_template = 'https://www.thehindu.com/archive/print/{}/{}/{}'
two_digits = '{:02d}'.format  # zero-pads a month/day number, e.g. 5 -> '05'

for month in range(1, 13):
    # monthrange() returns (weekday of the 1st, number of days in the month)
    _, days_in_month = monthrange(yr, month)
    urls[month] = {
        day: url_template.format(yr, two_digits(month), two_digits(day))
        for day in range(1, days_in_month + 1)
    }

print(len(urls.keys()))  # 12, total months
print(urls[5])           # print May links for reference

12
{1: 'https://www.thehindu.com/archive/print/2016/05/01', 2: 'https://www.thehindu.com/archive/print/2016/05/02', 3: 'https://www.thehindu.com/archive/print/2016/05/03', 4: 'https://www.thehindu.com/archive/print/2016/05/04', 5: 'https://www.thehindu.com/archive/print/2016/05/05', 6: 'https://www.thehindu.com/archive/print/2016/05/06', 7: 'https://www.thehindu.com/archive/print/2016/05/07', 8: 'https://www.thehindu.com/archive/print/2016/05/08', 9: 'https://www.thehindu.com/archive/print/2016/05/09', 10: 'https://www.thehindu.com/archive/print/2016/05/10', 11: 'https://www.thehindu.com/archive/print/2016/05/11', 12: 'https://www.thehindu.com/archive/print/2016/05/12', 13: 'https://www.thehindu.com/archive/print/2016/05/13', 14: 'https://www.thehindu.com/archive/print/2016/05/14', 15: 'https://www.thehindu.com/archive/print/2016/05/15', 16: 'https://www.thehindu.com/archive/print/2016/05/16', 17: 'https://www.thehindu.com/archive/print/2016/05/17', 18: 'https://www.thehindu.com/archiv

In [2]:
# import libraries

from bs4 import BeautifulSoup as bs
import os
import pandas as pd
import bs4
import requests
import time
import glob

In [None]:
# Download each day's archive page and write it to disk as YYYY-MM-DD.html
# (relative to the current working directory).

# urls[1][3] is the archive URL of the 3rd day of January (of year `yr`)
errors = []  # [yr, month, day, url] for every page that could not be saved
for month in range(1, 2):  # January only; widen the range to fetch more months
    print(yr, month)
    for day, url in urls[month].items():
        try:
            # A timeout keeps one stalled connection from hanging the whole run
            r = requests.get(url, timeout=30)
            if r.status_code == 200:
                # Explicit UTF-8 so the saved page does not depend on the
                # platform's locale encoding
                with open('{}-{:02d}-{:02d}.html'.format(yr, month, day), 'w',
                          encoding='utf-8') as out_file:
                    out_file.write(r.text)
            else:
                errors.append([yr, month, day, url])
        except Exception as exc:
            # `Exception` rather than a bare except, so Ctrl-C still stops the
            # run; the failure is recorded so it shows up in the final report
            print("Exception for", url, exc)
            errors.append([yr, month, day, url])
        # after each day (successful or not), sleep for 3 seconds and go easy
        # on the server
        time.sleep(3)
    # after each month, sleep for 5 seconds and go easy on the server
    time.sleep(5)
# prints every day that could not be fetched or saved
print(errors)

2016 1


In [5]:
# Store all links found in one saved archive page in a directory named after
# that page,
# ex: all links from https://www.thehindu.com/archive/print/2018/03/01/
# (saved as 2018-03-01.html) are stored in 2018-03-01/links.csv
# Alternative approach would be store it in a sqlite file

import glob

# Output directories are created under the current working directory
base_dir = os.getcwd()

# run below on each of the above fetched archive pages
for in_file in glob.glob("{}-*.html".format(yr)):
    day_name = os.path.splitext(in_file)[0]  # e.g. "2016-05-26"
    print(in_file, "\t\t", day_name)

    # Read the raw bytes and let BeautifulSoup work out the encoding; the
    # `with` block makes sure the file handle is closed again
    with open(in_file, 'rb') as html_file:
        soup = bs(html_file, "html.parser")

    directory = os.path.join(base_dir, day_name)
    os.makedirs(directory, exist_ok=True)

    # Collect [href, text] for every tag that sits two levels below a
    # <ul class="archive-list"> (ul -> child tag -> tag carrying the link)
    links = []
    for archive_list in soup.find_all('ul', {'class': 'archive-list'}):
        for item in archive_list.children:
            if not isinstance(item, bs4.element.Tag):
                continue  # whitespace / text between the list items
            for a in item.children:
                # skip text nodes and tags without an href instead of
                # aborting the whole run with a KeyError
                if isinstance(a, bs4.element.Tag) and a.has_attr('href'):
                    links.append([a['href'], a.get_text()])

    df = pd.DataFrame.from_records(links, columns=['link', 'title'])
    df.to_csv(os.path.join(directory, 'links.csv'), index=False)

2016-11-12.html 		 2016-11-12
2016-05-26.html 		 2016-05-26
2016-09-19.html 		 2016-09-19
2016-02-05.html 		 2016-02-05
2016-06-20.html 		 2016-06-20
2016-03-21.html 		 2016-03-21
2016-09-08.html 		 2016-09-08
2016-02-13.html 		 2016-02-13
2016-12-20.html 		 2016-12-20


KeyboardInterrupt: 

In [3]:
# For every link in day/links.csv, get the content and write to day/links-out.csv

def fetch_data(row):
    """Download one article page and return its ``div.article`` markup.

    Args:
        row: URL of the article to fetch (one value of the ``link`` column).

    Returns:
        The string form of the list of ``<div class="article">`` tags found
        in the page, or ``None`` when the response status is not 200 or when
        fetching/parsing raised an exception.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run
        content = requests.get(row, timeout=30)
        if content.status_code != 200:
            print("\t nope", row)
            return None
        soup = bs(content.text, "lxml")
        div_article = soup.find_all('div', {'class':'article'})
        return '{}'.format(div_article)
    except Exception as exc:
        # Best effort: report the failure and carry on with the next link.
        # `Exception` rather than a bare except, so Ctrl-C still stops the run.
        print("Exception for ", row, exc)
        return None

# Add a `content` column (the fetched article markup) to each matching day's
# links.csv and save the result next to it as links-out.csv
for links_csv in glob.glob('{}-06-3*/links.csv'.format(yr)):
    day_dir = os.path.dirname(links_csv)
    out_f = os.path.splitext(links_csv)[0]  # path without the ".csv" extension

    # abspath() of the literal name "___file___" resolves against the current
    # working directory, so its dirname is the working directory itself;
    # `directory` is only used for the progress line printed below
    directory = os.path.join(os.path.dirname(os.path.abspath("___file___")), out_f)
    print(links_csv, ",", day_dir, ",", os.path.join(directory, day_dir), "\t", os.path.basename(links_csv))

    df = pd.read_csv(links_csv)
    df['content'] = df['link'].apply(fetch_data)
    print("\t\t", out_f)
    # quoting=3 is csv.QUOTE_NONE: fields are never quoted, special characters
    # are escaped with a backslash instead
    df.to_csv('{}-out.csv'.format(out_f), index=False, quoting=3, escapechar='\\')
    # after every day, sleep for 60 seconds and go easy on the server
    time.sleep(60)


2018-06-30/links.csv , 2018-06-30 , /home/gramener/Documents/personal/iodc-2018/thehindu/2018-06-30/links/2018-06-30 	 links.csv
Exception for  https://www.thehindu.com/todays-paper/tp-features/tp-weekend/surf-and-sustainability/article24296611.ece
		 2018-06-30/links


In [41]:
# To read a written file back, use the command below. `quoting` and
# `escapechar` must match the values passed to `to_csv` when the file was
# written. (`DataFrame.from_csv` was deprecated in pandas 0.21 and removed in
# pandas 1.0; `pd.read_csv` is its replacement.)
pd.read_csv('{}-out.csv'.format(out_f), quoting=3, escapechar='\\')

(826, 3) 2018-01-09/links
2018-01-09 2018-01-09/links
