In [25]:
import os
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

# Headless Chrome session for a local machine; `driver` is the module-level
# browser that get_html() uses when Selenium is requested.
# NOTE(review): the chromedriver path below is machine-specific — adjust it
# before running anywhere else.
chrome_options = Options()
for chrome_flag in ("--headless", "--disable-gpu"):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(
    executable_path='/Users/alex/chromedriver',
    options=chrome_options,
)

In [28]:
## make chromedriver run on GCE
# NOTE(review): 'webdriver.chrome.driver' looks like the property name used by
# other language bindings; the driver binary actually passed to Selenium here is
# the executable_path argument below — confirm this variable is still needed.
os.environ['webdriver.chrome.driver'] = "/usr/bin/chromedriver"

# Flags for running Chrome without a display on a server. Each flag is listed
# exactly once, in the order it is handed to Chrome.
chrome_options = Options()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    "start-maximized",
    "disable-infobars",
    "--disable-extensions",
    "--disable-dev-shm-usage",
    "--no-sandbox",
):
    chrome_options.add_argument(chrome_flag)

# Rebinds the module-level `driver` that get_html() reads.
driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=chrome_options)

In [2]:
def get_html(path, use_selenium):
    """Fetch `path` and return the page markup as a string.

    Parameters
    ----------
    path : str
        URL to fetch.
    use_selenium : bool
        When truthy, the page is loaded through the module-level Selenium
        ``driver`` and ``driver.page_source`` is returned. Otherwise a plain
        ``requests.get`` is issued and the response's ``.text`` is returned.

    Returns
    -------
    str
        The HTML of the page.
    """
    if not use_selenium:
        return requests.get(path).text

    driver.get(path)
    return driver.page_source

def parse_table(soup):
    """Flatten every ``<tr>`` found in `soup` into a five-column DataFrame.

    For each ``<td>`` the cell text is recorded (its stripped strings joined
    with no separator). When the cell contains an ``<a>``, that link's ``href``
    is recorded immediately before the text. Rows that produce no values at all
    (for example rows without any ``<td>``) are dropped.

    Parameters
    ----------
    soup : object exposing ``find_all`` (a parsed BeautifulSoup document)

    Returns
    -------
    pandas.DataFrame
        Columns: newssniff_link, headline, version, outlet, last_updated.
        Construction fails if a surviving row does not have exactly five values.
    """
    column_names = ['newssniff_link', 'headline', 'version', 'outlet', 'last_updated']

    parsed_rows = []
    for tr in soup.find_all('tr'):
        values = []
        for td in tr.find_all('td'):
            if td.find('a'):
                # Link target goes first, then the visible text of the same cell.
                values.append(td.a['href'])
            values.append(''.join(td.stripped_strings))
        if len(values) != 0:
            parsed_rows.append(values)

    return pd.DataFrame(parsed_rows, columns=column_names)

def get_search_result(page, use_selenium=True):
    """Fetch one page of the NewsSniffer versions index and parse its table.

    Parameters
    ----------
    page : int
        Page number substituted into the ``versions?page=`` query string.
    use_selenium : bool, default True
        Forwarded to ``get_html`` to choose between the Selenium driver and a
        plain ``requests`` fetch.

    Returns
    -------
    tuple
        ``(page, page_df)`` where ``page_df`` is the frame built by
        ``parse_table`` with an extra ``search_page`` column set to `page`.
    """
    url = 'https://www.newssniffer.co.uk/versions?page=%s' % page
    # get_html() returns the markup as a plain string in both modes, so it is
    # handed to BeautifulSoup directly (a str has no `.text` attribute).
    search_page = get_html(url, use_selenium=use_selenium)
    soup = BeautifulSoup(search_page)
    page_df = parse_table(soup)
    page_df['search_page'] = page
    return page, page_df


def parse_article(html):
    """Flatten the first table of an article diff page into a DataFrame.

    The text of the first two ``<cite>`` tags is taken as the source name and
    the article URL respectively. The first table that ``pandas.read_html``
    finds is then collapsed column by column: each column's non-null cells are
    joined with ``'</p><p>'``, giving one output row per table column (per the
    original author's description, one row per article version).

    Parameters
    ----------
    html : str
        Markup of the diff page.

    Returns
    -------
    pandas.DataFrame
        The flattened table with ``url`` and ``source`` columns appended.
        NOTE(review): the rename to version/title/time assumes the table has a
        three-level column header — confirm against a live page.
    """
    ## get url
    cite_tags = BeautifulSoup(html).find_all('cite')
    source = cite_tags[0].text
    url = cite_tags[1].text

    ## get the rest of the article
    first_table = pd.read_html(html)[0]
    joined_columns = first_table.apply(
        lambda column: '</p><p>'.join(column.dropna().tolist())
    )
    version_table_flat = joined_columns.reset_index().rename(
        columns={'level_0': 'version', 'level_1': 'title', 'level_2': 'time', 0: 'text'}
    )

    ## merge
    version_table_flat['url'] = url
    version_table_flat['source'] = source

    return version_table_flat

In [None]:
# Crawl index pages in [start_page, last_page) on a 10-thread pool. Parsed pages
# are buffered in `all_search_output`; once the buffer holds more than 1000
# frames it is written to a CSV named after the page that just completed and
# then emptied. Any exception from a page is printed and the crawl continues.
# NOTE(review): get_search_result defaults to the Selenium path, which means all
# worker threads drive the single module-level `driver` — confirm that is safe.
last_page = 239179
start_page = 50503
all_search_output = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
    result_futures = [
        pool.submit(get_search_result, page_number)
        for page_number in range(start_page, last_page)
    ]
    completed = concurrent.futures.as_completed(result_futures)
    for future in tqdm(completed, total=last_page - start_page):
        try:
            page_idx, page_df = future.result()
            all_search_output.append(page_df)
            if len(all_search_output) <= 1000:
                continue
            # Buffer is full: persist it and start a fresh one.
            all_search_output_df = pd.concat(all_search_output)
            all_search_output_df.to_csv('cache/newssniffer-index-%s.csv' % page_idx)
            all_search_output = []
        except Exception as e:
            print('e is', e, type(e))

In [7]:
# Write whatever is still buffered in `all_search_output` to one more index CSV.
# `page_idx` is whichever page number was last unpacked from a finished future,
# so the file name reflects completion order, not the highest page fetched.
pd.concat(all_search_output).to_csv('cache/newssniffer-index-%s.csv' % page_idx)

In [53]:
# Collect the `search_page` column of every index CSV into a single Series.
# NOTE(review): `search_files` is not assigned above this cell in file order (a
# glob is only assigned further down) — confirm which file list is intended
# when running on a fresh kernel.
# read_csv's `squeeze=True` keyword is deprecated in newer pandas; squeezing the
# one-column frame along the column axis yields the same Series.
search_pages = [
    pd.read_csv(csv_path, usecols=['search_page']).squeeze('columns')
    for csv_path in search_files
]
searched_df = pd.concat(search_pages)

In [65]:
# Every index page number in [start_page, last_page), as a Series.
start_page, last_page = 200, 239295
all_search_pages = pd.Series(range(start_page, last_page))

In [78]:
ls ../scraping/scrapy-cloud/newssniffer_scrape/output_dir/

file-ids-scraped.csv      search-pages-scraped.csv


In [77]:
# Persist the distinct page numbers already scraped. to_csv is called with its
# defaults, so the Series index is written alongside the values.
searched_df.drop_duplicates().to_csv('../scraping/scrapy-cloud/newssniffer_scrape/output_dir/search-pages-scraped.csv')

In [90]:
# Load every scraped index CSV matched by the glob, one DataFrame per file,
# using the first CSV column as the index.
search_files = glob.glob('../scraping/scrapy-cloud/output_dir/newssniffer*')
search_pages = [pd.read_csv(csv_path, index_col=0) for csv_path in search_files]

In [91]:
# Stack the per-file frames for a quick look; each file's own row index is kept,
# so index values repeat across files.
pd.concat(search_pages)

Unnamed: 0,newssniff_link,headline,version,outlet,last_updated,search_page
0,https://www.newssniffer.co.uk/articles/1709190...,Fourth Soldier Dies After Coordinated Bomb Bla...,1,nytimes,"04 Dec 2018, 05:50",52505
1,https://www.newssniffer.co.uk/articles/1709318...,Arrests over India policeman killed by 'cow sl...,1,bbc,"04 Dec 2018, 05:50",52505
2,https://www.newssniffer.co.uk/articles/1695611...,Left behind: families face uncertain future wh...,2,guardian,"04 Dec 2018, 05:47",52505
3,https://www.newssniffer.co.uk/articles/1709331...,Country diary: thatching is a job to take your...,0,guardian,"04 Dec 2018, 05:45",52505
4,https://www.newssniffer.co.uk/articles/1689299...,Angela Merkel’s successor could be bad news fo...,1,guardian,"04 Dec 2018, 05:45",52505
...,...,...,...,...,...,...
11,https://www.newssniffer.co.uk/articles/1680196...,Elizabeth Warren Releases DNA Results on Nativ...,0,nytimes,"15 Oct 2018, 13:55",56508
12,https://www.newssniffer.co.uk/articles/1680150...,Fracking starts at landmark Lancashire site,2,bbc,"15 Oct 2018, 13:55",56508
13,https://www.newssniffer.co.uk/articles/1680175...,"Cologne hostage taker ‘under control,’ one wom...",1,rtcom,"15 Oct 2018, 13:55",56508
14,https://www.newssniffer.co.uk/articles/1679806...,Among the Ruins of Mexico Beach Stands One Hou...,3,nytimes,"15 Oct 2018, 13:55",56508


# See output so far

In [6]:
import glob
# Read every cached index CSV (first column used as the index).
files = [
    pd.read_csv(index_csv, index_col=0)
    for index_csv in glob.glob('cache/newssniffer-index-*')
]

# One long frame; `file_id` is the fifth '/'-separated piece of the article link.
files_df = (
    pd.concat(files)
    .reset_index(drop=True)
    .assign(file_id=lambda frame: frame['newssniff_link'].str.split('/').str.get(4))
)

# Highest version number seen for each file_id.
versions_as_int = files_df[['version', 'file_id']].assign(
    version=lambda frame: frame['version'].astype(int)
)
max_diff_per_file_id = versions_as_int.groupby('file_id')['version'].max()

# Attach that maximum to every row as `num_diffs_per_file`.
num_diffs_frame = max_diff_per_file_id.astype(int).to_frame('num_diffs_per_file')
files_df_versions = files_df.merge(
    num_diffs_frame,
    left_on='file_id',
    right_index=True,
)

In [None]:
# Exploratory summaries of the index. In a notebook only the final expression
# of the cell is rendered; the earlier bare expressions are evaluated and
# discarded.
files_df['file_id'].value_counts().head()
files_df.drop_duplicates('file_id')['outlet'].value_counts()

# Adds a parsed timestamp column to files_df; unparseable values become NaT.
files_df['last_updated_time'] = pd.to_datetime(files_df['last_updated'], errors='coerce')
files_df['last_updated_time'].dropna().sort_values().min()

outlet_groups = files_df_versions.groupby('outlet')
outlet_groups['num_diffs_per_file'].median()
files_df_versions.drop_duplicates('file_id')['outlet'].value_counts()

# Test Running the File fetcher

In [8]:
# 1st and 99th percentile of the per-file diff count, taken over one row per
# file_id and ignoring files whose count is zero.
diffs_per_file = files_df_versions.drop_duplicates('file_id')['num_diffs_per_file']
nonzero_diffs = diffs_per_file.loc[diffs_per_file != 0]
min_cut, max_cut = nonzero_diffs.quantile([.01, .99])

# Keep files whose diff count lies within both cuts (inclusive), one row per
# (file_id, num_diffs_per_file) pair.
diff_counts = files_df_versions['num_diffs_per_file']
within_cuts = (diff_counts >= min_cut) & (diff_counts <= max_cut)  ## 1, 30
to_scrape = (
    files_df_versions
    .loc[within_cuts][['file_id', 'num_diffs_per_file']]
    .drop_duplicates()
)

In [24]:
import time

In [None]:
# Fetch every consecutive-version diff page for each file in `to_scrape`,
# resuming from row `start_at`.
#   * Parsed pages accumulate in `article_dfs`; every 500 files the buffer is
#     written to cache/article-versions-<row>.csv and emptied.
#   * Each diff page gets up to `num_tries` attempts. A ConnectionError restarts
#     the browser; any other ordinary exception is printed and followed by a
#     5 second pause. A page that fails every attempt is skipped.
#   * KeyboardInterrupt / SystemExit are deliberately NOT caught, so the cell
#     can be interrupted; `article_dfs` then still holds the unsaved rows.
article_dfs = []
num_tries = 5
start_at = 11194
for loop_idx, (file_id, num_diffs) in tqdm(
    enumerate(to_scrape.iloc[start_at:].itertuples(index=False)),
    total=len(to_scrape.iloc[start_at:])
):
    if (loop_idx % 500 == 0) and len(article_dfs) > 0:
        to_disk = pd.concat(article_dfs)
        to_disk.to_csv('cache/article-versions-%s.csv' % (loop_idx + start_at))
        article_dfs = []

    # Consecutive version pairs: (0, 1), (1, 2), ..., (num_diffs - 1, num_diffs).
    v = list(range(num_diffs+1))
    for s, e in list(zip(v[:-1], v[1:])):
        for i in range(num_tries):
            try:
                url = 'https://www.newssniffer.co.uk/articles/%s/diff/%s/%s' % (file_id, s, e)
                html = get_html(url, use_selenium=True)
                article_df = parse_article(html)
                article_df['file_id'] = file_id
                article_dfs.append(article_df)
                break
            except ConnectionError:
                print('driver failed... re-initializing...')
                driver.quit()
                # NOTE(review): this restarts Chrome from the local-machine
                # chromedriver path — confirm it matches the environment in use.
                driver = webdriver.Chrome(executable_path='/Users/alex/chromedriver', options=chrome_options)
            except Exception as exc:
                # Show what went wrong instead of hiding it behind a generic message.
                print('failed, sleeping...', repr(exc))
                time.sleep(5)

# Write the final partial batch so the last (<500) files are not left only in
# memory. The suffix is the row index at which a later run could resume.
if len(article_dfs) > 0:
    pd.concat(article_dfs).to_csv('cache/article-versions-%s.csv' % (loop_idx + start_at + 1))
    article_dfs = []

# Article-Level Parsing

In [None]:
import pandas as pd 
# Quick probe of a single diff page. Note that `html` here is the list of
# DataFrames returned by pd.read_html (one per table found), not raw markup.
html = pd.read_html('https://www.newssniffer.co.uk/articles/2057481/diff/10/11')
html[0]

In [73]:
import glob
import re
import os
import concurrent.futures

In [None]:
import concurrent.futures
# Scratch experiment (presumably to see how an exception raised inside a worker
# surfaces through a future): f fails for exactly one of the inputs.
iterable = [1,2,3,4,6,7,8,9,10]

def f(x):
    """Return `x` unchanged, except that ``x == 2`` raises ``Exception('x')``."""
    if x == 2:
        raise Exception('x')
    return x

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Same submit / as_completed pattern as the index crawl: the exception is
    # re-raised by future.result(), so it is handled per future here.
    futures = [executor.submit(f, x) for x in iterable]
    for future in concurrent.futures.as_completed(futures):
        try:
            print(future.result())
        except Exception as e:
            print('e is', e, type(e))


In [72]:
import time
# Executor.map yields results in input order; time.sleep returns None, so this
# prints None ten times. The executor is referenced through the
# concurrent.futures module (the only form this notebook imports) and is shut
# down when the `with` block exits.
s = range(10)
with concurrent.futures.ThreadPoolExecutor(4) as e:
    for i in e.map(time.sleep, s):
        print(i)

None
None
None
None
None
None
None
None
None
None


# Docker

In [None]:
import scrapy
from selenium import webdriver

class DemoSpider(scrapy.Spider):
    """Scrape quotes from a JavaScript-rendered page using headless Chrome.

    Scrapy schedules the requests and handles pagination, while each page is
    re-loaded in a Selenium-driven Chrome so that quote elements can be read
    from the browser's DOM.
    """
    name = 'demo'
    start_urls = ['http://quotes.toscrape.com/js']

    def __init__(self, *args, **kwargs):
        """Create the spider and start the headless Chrome it will drive."""
        super(DemoSpider, self).__init__(*args, **kwargs)

        options = webdriver.ChromeOptions()
        options.add_argument("--disable-extensions")
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        # `options=` is the keyword the rest of this notebook uses;
        # `chrome_options=` is its deprecated spelling.
        self.driver = webdriver.Chrome(options=options, executable_path='/usr/bin/chromedriver')

    def closed(self, reason):
        """Quit Chrome when the spider closes so no browser process is left behind.

        Scrapy invokes ``closed(reason)`` as a shortcut for the ``spider_closed``
        signal.
        """
        self.driver.quit()

    def parse(self, response):
        """Yield one ``{'quote', 'author'}`` dict per quote, then follow "next".

        The quotes are read from the Selenium-rendered page; the next-page link
        is taken from Scrapy's own `response`.
        """
        self.driver.get(response.url)
        for quote in self.driver.find_elements_by_css_selector('div.quote'):
            yield {
                'quote': quote.find_element_by_css_selector('span').text,
                'author': quote.find_element_by_css_selector('small').text,
            }
        next_page_url = response.css('nav li.next a ::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url))