# Blueprint: download and interpret robots.txt

In [None]:
import urllib.robotparser
# Build the parser directly for the site's robots.txt, then download and
# parse the file so that can_fetch() can answer queries.
rp = urllib.robotparser.RobotFileParser(url="https://www.reuters.com/robots.txt")
rp.read()

In [None]:
rp.can_fetch("*", "https://www.reuters.com/sitemap.xml")

In [None]:
rp.can_fetch("*", "https://www.reuters.com/finance/stocks/option")

# Blueprint: finding URLs from sitemap.xml

In [None]:
!pip install xmltodict

In [None]:
# might need to install xmltodict
import xmltodict
import requests

# Download the sitemap. Fail loudly on an HTTP error instead of trying to
# parse an error page as XML.
sitemap_response = requests.get('https://www.reuters.com/sitemap_news_index1.xml')
sitemap_response.raise_for_status()

# Parse the raw bytes so the XML parser honours the encoding declared in the
# document itself; `.text` relies on a charset guessed from the HTTP headers.
# force_list keeps sitemap["urlset"]["url"] a list even when the sitemap
# contains only a single <url> element.
sitemap = xmltodict.parse(sitemap_response.content, force_list=("url",))

In [None]:
# Collect the <loc> value of every sitemap entry and preview the first three.
urls = [entry["loc"] for entry in sitemap["urlset"]["url"]]
print("\n".join(urls[:3]))

# Blueprint: finding URLs from RSS

In [None]:
!pip install feedparser

In [None]:
# might need to install feedparser
import feedparser
feed = feedparser.parse('http://feeds.reuters.com/Reuters/worldNews')

In [None]:
[(e.title, e.link) for e in feed.entries]

In [None]:
[e.id for e in feed.entries]

# Example: Downloading HTML pages with Python

In [None]:
%%time
# Reuse one HTTP session for all requests and close it (and its pooled
# connections) once the loop is done.
with requests.Session() as s:
    for url in urls[0:10]:
        # Use the part after the last "/" of the URL as filename. A trailing
        # "/" is stripped first so the name can never be empty.
        file = url.rstrip("/").split("/")[-1]

        r = s.get(url)
        # The decoded page text is stored UTF-8 encoded.
        with open(file, "w+b") as f:
            f.write(r.text.encode('utf-8'))

In [None]:
# Persist the complete URL list, one URL per line, as UTF-8 encoded bytes.
with open("urls.txt", "w+b") as f:
    f.write(bytes("\n".join(urls), 'utf-8'))

# Blueprint: Extraction with regular expressions

We first have to download a single article.

In [None]:
url = 'https://www.reuters.com/article/us-health-vaping-marijuana-idUSKBN1WG4KT'

# Name the local copy after the last URL segment.
file = url.split("/")[-1] + ".html"

r = requests.get(url)
# Do not save an HTTP error page as if it were the article.
r.raise_for_status()

# Write with an explicit encoding: the platform default (e.g. a legacy code
# page) may be unable to represent characters that occur in the page.
with open(file, "w", encoding="utf-8") as f:
    f.write(r.text)

In [None]:
import re
# Read the saved article back in. The encoding is given explicitly so the
# result does not depend on the platform default.
with open(file, "r", encoding="utf-8") as f:
    html = f.read()

# Non-greedy match: with DOTALL a greedy `.*` runs up to the *last* </title>
# in the document (e.g. one inside an inline SVG) instead of stopping at the
# first. re.MULTILINE is dropped because the pattern has no ^ or $ anchors.
g = re.search(r'<title>(.*?)</title>', html, re.DOTALL)
if g:
    print(g.group(1))

# Using an HTML parser for extraction

In [None]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
soup.select("h1.ArticleHeader_headline")

## Blueprint: extracting the title/headline

In [None]:
soup.h1

In [None]:
soup.h1.text

In [None]:
soup.title.text

In [None]:
soup.title.text.strip()

## Blueprint: extracting the article text

In [None]:
soup.select_one("div.StandardArticleBody_body").text

## Blueprint: extracting image captions

In [None]:
soup.select("div.StandardArticleBody_body figure")

Variant

In [None]:
soup.select("div.StandardArticleBody_body figure img")

In [None]:
soup.select("div.StandardArticleBody_body figcaption")

## Blueprint: extracting the URL

In [None]:
soup.find("link", {'rel': 'canonical'})['href']

In [None]:
soup.select_one("link[rel=canonical]")['href']

## Blueprint: extracting list information (authors)

In [None]:
soup.find("meta", {'name': 'Author'})['content']

Variant

In [None]:
sel = "div.BylineBar_first-container.ArticleHeader_byline-bar div.BylineBar_byline span"
soup.select(sel)

In [None]:
[a.text for a in soup.select(sel)]

## Blueprint: Extracting text of links (section)


In [None]:
soup.select_one("div.ArticleHeader_channel a").text

## Blueprint: Extracting reading time

In [None]:
soup.select_one("p.BylineBar_reading-time").text

## Blueprint: extracting attributes (id)

In [None]:
soup.select_one("div.StandardArticle_inner-container")['id']

Alternative: URL

## Blueprint: Extracting Attribution

In [None]:
soup.select_one("p.Attribution_content").text

## Blueprint: Extracting Timestamp

In [None]:
ptime = soup.find("meta", { 'property': "og:article:published_time"})['content']
print(ptime)

In [None]:
from dateutil import parser
parser.parse(ptime)

In [None]:
parser.parse(soup.find("meta", { 'property': "og:article:modified_time"})['content'])

# Blueprint: Spidering

In [None]:
import requests
from bs4 import BeautifulSoup
import os.path
from dateutil import parser

def download_archive_page(page):
    """Download one page of the Reuters news archive unless it is cached.

    The page is stored in the current working directory as
    ``page-NNNNNN.html`` (page number zero-padded to six digits). If that
    file already exists, nothing is requested and nothing is written.

    Args:
        page: Integer page number of the archive listing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Raising here keeps error pages from being written to disk,
            where the existence check would treat them as valid forever.
    """
    filename = "page-%06d.html" % page
    if os.path.isfile(filename):
        # Already downloaded earlier; treat the file as a cache hit.
        return

    url = "https://www.reuters.com/news/archive/?view=page&page=%d&pageSize=10" % page
    r = requests.get(url)
    r.raise_for_status()
    # Explicit encoding so the write does not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(r.text)

def parse_archive_page(page_file):
    """Extract the article URLs linked from a saved archive page.

    Args:
        page_file: Path of an HTML file containing an archive listing.

    Returns:
        List of absolute article URLs, in document order, one per anchor
        matching ``article.story div.story-content a`` that has an
        ``href`` attribute.
    """
    # Local import: keeps this function self-contained within the notebook.
    from urllib.parse import urljoin

    with open(page_file, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    # `a[href]` skips anchors without a target instead of raising KeyError.
    anchors = soup.select("article.story div.story-content a[href]")
    # urljoin gives the same result as plain concatenation for root-relative
    # links ("/article/...") and also copes with absolute or relative hrefs.
    return [urljoin("https://www.reuters.com", a['href']) for a in anchors]

def download_article(url):
    """Download an article page unless a local copy already exists.

    The local filename is the last "/"-separated segment of ``url`` plus
    ``.html``, in the current working directory. If that file already
    exists, nothing is requested and nothing is written.

    Args:
        url: Absolute URL of the article.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Raising here keeps error pages from being written to disk,
            where the existence check would treat them as valid forever.
    """
    filename = url.split("/")[-1] + ".html"
    if os.path.isfile(filename):
        # Article already there; do not download it again.
        return

    r = requests.get(url)
    r.raise_for_status()
    # Explicit encoding so the write does not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(r.text)

def parse_article(article_file):
    """Parse a saved Reuters article page into a dict of its main fields.

    Args:
        article_file: Path of the HTML file to parse.

    Returns:
        dict with the keys ``id``, ``url``, ``headline``, ``section``,
        ``text``, ``authors`` (list of strings) and ``time`` (the raw
        ``content`` value of the published-time meta tag, not parsed).

    Raises:
        TypeError, AttributeError: If one of the expected elements is not
            present in the page, because the lookup then yields ``None``.
    """
    # Explicit encoding so the read does not depend on the platform default.
    with open(article_file, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Adjacent string literals instead of a backslash continuation, which
    # would embed the next line's indentation in the selector.
    byline_selector = ("div.BylineBar_first-container.ArticleHeader_byline-bar "
                       "div.BylineBar_byline span")

    article = {}
    article['id'] = soup.select_one("div.StandardArticle_inner-container")['id']
    article['url'] = soup.find("link", {'rel': 'canonical'})['href']
    article['headline'] = soup.h1.text
    article['section'] = soup.select_one("div.ArticleHeader_channel a").text
    article['text'] = soup.select_one("div.StandardArticleBody_body").text
    article['authors'] = [a.text for a in soup.select(byline_selector)]
    article['time'] = soup.find("meta", {'property': "og:article:published_time"})['content']
    return article

In [None]:
# download 10 pages of archive (pages 1..10; range's upper bound is exclusive)
for p in range(1, 11):
    download_archive_page(p)

In [None]:
# parse archive and add to article_urls
import glob

# Flatten the links found on every downloaded archive page into one list.
article_urls = [
    link
    for archive_file in glob.glob("page-*.html")
    for link in parse_archive_page(archive_file)
]

In [None]:
# download articles
for url in article_urls:
    download_article(url)

In [None]:
# arrange in pandas DataFrame
import pandas as pd

# Parse every downloaded article and build the frame in a single step.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame
# on each call, so collecting the records first is both portable and faster.
records = [parse_article(article_file)
           for article_file in glob.glob("*-id???????????.html")]
df = pd.DataFrame(records)
# Convert the raw timestamp strings into pandas datetimes.
df['time'] = pd.to_datetime(df['time'])

In [None]:
df

In [None]:
df.sort_values("time")

In [None]:
%matplotlib inline
df[df["time"]>='2020-01-01'].set_index("time").resample('D').agg({'id': 'count'}).plot.bar()

# Blueprint: Density extraction

In [None]:
!pip install readability-lxml

In [None]:
# might need to install readability-lxml
from readability import Document

doc = Document(html)
doc.title()

In [None]:
doc.short_title()

In [None]:
doc.summary()

In [None]:
doc.url

In [None]:
density_soup = BeautifulSoup(doc.summary(), 'html.parser')
density_soup.body.text

# Blueprint: Scrapy

In [None]:
!pip install scrapy

In [None]:
# might need to install scrapy
import scrapy
import logging


class ReutersArchiveSpider(scrapy.Spider):
    """Spider that walks the Reuters news archive and scrapes linked articles.

    Starting from the archive listing, every linked article is fetched and
    yielded as a dict with the keys ``title``, ``section`` and ``text``.
    Items are exported as JSON to ``reuters-archive.json``.
    """
    name = 'reuters-archive'

    # NOTE(review): newer Scrapy releases reportedly replace FEED_FORMAT /
    # FEED_URI with the FEEDS setting - confirm against the installed version.
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEED_FORMAT': 'json',
        'FEED_URI': 'reuters-archive.json'
    }

    start_urls = [
        'https://www.reuters.com/news/archive/',
    ]

    def parse(self, response):
        """Schedule all articles of an archive page and possibly the next page.

        Args:
            response: Scrapy response of an archive listing page.

        Yields:
            Requests for each linked article (handled by ``parse_article``)
            and, if allowed by the condition below, for the next page.
        """
        # Selecting the attribute directly only returns anchors that have an
        # href, so no None value is ever passed to response.follow.
        article_links = response.css("article.story div.story-content a::attr(href)")
        for href in article_links.extract():
            yield response.follow(href, self.parse_article)

        next_page_url = response.css('a.control-nav-next::attr(href)').extract_first()
        # `and` short-circuits; the previous bitwise `&` evaluated the `in`
        # test even for None and raised TypeError on a page with no next link.
        # The 'page=2' check keeps the crawl from going past the first page.
        if next_page_url is not None and 'page=2' not in next_page_url:
            yield response.follow(next_page_url, self.parse)

    def parse_article(self, response):
        """Extract title, section and body text from an article page.

        Args:
            response: Scrapy response of an article page.

        Yields:
            One dict with the keys ``title``, ``section`` and ``text``.
            A missing title or section yields an empty string instead of
            raising AttributeError on ``None.strip()``.
        """
        yield {
          'title': response.css('h1::text').extract_first(default='').strip(),
          'section': response.css('div.ArticleHeader_channel a::text').extract_first(default='').strip(),
          'text': "\n".join(response.css('div.StandardArticleBody_body p::text').extract())
        }

In [None]:
# this can be run only once from a Jupyter notebook due to Twisted
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()

process.crawl(ReutersArchiveSpider)
process.start()

In [None]:
glob.glob("*.json")

In [None]:
!cat 'reuters-archive.json'