In [1]:
import requests, json, numpy, datetime, dataset, re, pywren
import matplotlib.pyplot as plt
from scipy.interpolate import UnivariateSpline
import pywren
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin, urlparse
from scraper import *

## Question 1: Scraping

In [2]:
# Root of the site being scraped, and the local SQLite file that holds the scraped rows.
base_url = "http://books.toscrape.com/"
db = dataset.connect("sqlite:///books.db")

In [3]:
## Scrape the pages in the catalogue
# Start at the landing page and follow the "next" link until there is none;
# scrape_books is handed each parsed page together with its URL and the db.
url = base_url
while True:
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors: an error page has no "next" link, so parsing
    # it would end the crawl early without any indication.
    r.raise_for_status()
    # Force UTF-8, as the per-book cells do, rather than relying on the
    # encoding requests guesses from the response headers.
    r.encoding = 'utf-8'
    html_soup = BeautifulSoup(r.text, 'html.parser')
    scrape_books(html_soup, url, db)
    # Is there a next page?
    next_a = html_soup.select('li.next > a')
    if not next_a or not next_a[0].get('href'):
        break
    # The href is relative to the current page, so resolve it against `url`.
    url = urljoin(url, next_a[0].get('href'))

In [14]:
# Serial baseline: fetch and scrape every book page one after another, timing the run.
# The rows are read into a list up front so the loop does not iterate a live
# query result over the same table it upserts into on every pass.
books = list(db['books'].find(order_by=['last_seen']))

t0 = datetime.now()
for book in books:
    book_id = book['book_id']
    book_url = base_url + 'catalogue/{}'.format(book_id)
    # Timeout: do not let one stalled request block the remaining books.
    r = requests.get(book_url, timeout=30)
    # Stop on HTTP errors rather than handing an error page to scrape_book.
    r.raise_for_status()
    r.encoding = 'utf-8'
    html_soup = BeautifulSoup(r.text, 'html.parser')
    scrape_book(html_soup, book_id, db)
    # Record when this book was last visited.
    db['books'].upsert({'book_id' : book_id,
                        'last_seen' : datetime.now()
                        }, ['book_id'])

print('serial time spent:', datetime.now()-t0)

serial time spent: 0:09:55.448328


In [11]:
def scrape_book_parallel(book_id, base_url='http://books.toscrape.com/', timeout=30):
    """Fetch one book's detail page and return its fields as a plain dict.

    Nothing is written to the database here; the caller receives the record
    and decides what to store.

    Parameters
    ----------
    book_id : str
        Path of the book below ``catalogue/`` on the site.
    base_url : str, optional
        Site root, ending in ``/``. Defaults to the books.toscrape.com root.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the worker.

    Returns
    -------
    dict
        Keys ``book_id``, ``title``, ``price``, ``stock``, ``rating``, ``img``
        and ``description`` ('' when the page has none), plus one entry per
        row of the "Product Information" table, keyed by the row header with
        every run of non-letter characters replaced by ``_``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    book_url = base_url + 'catalogue/{}'.format(book_id)

    r = requests.get(book_url, timeout=timeout)
    # Fail on error responses instead of trying to parse an error page.
    r.raise_for_status()
    r.encoding = 'utf-8'
    html_soup = BeautifulSoup(r.text, 'html.parser')
    main = html_soup.find(class_='product_main')

    book = {}
    book['book_id'] = book_id
    book['title'] = main.find('h1').get_text(strip=True)
    book['price'] = main.find(class_='price_color').get_text(strip=True)
    book['stock'] = main.find(class_='availability').get_text(strip=True)
    # The rating is whatever remains of the element's class list once the
    # generic 'star-rating' class is removed.
    rating_classes = main.find(class_='star-rating').get('class')
    book['rating'] = ' '.join(rating_classes).replace('star-rating', '').strip()
    book['img'] = html_soup.find(class_='thumbnail').find('img').get('src')

    # The description is optional: it is the <p> sibling following the
    # #product_description element, when both are present.
    desc = html_soup.find(id='product_description')
    desc_paragraph = desc.find_next_sibling('p') if desc else None
    book['description'] = desc_paragraph.get_text(strip=True) if desc_paragraph else ''

    # `string=` is the current name of the argument formerly called `text=`.
    book_product_table = html_soup.find(string='Product Information').find_next('table')
    for row in book_product_table.find_all('tr'):
        header = row.find('th').get_text(strip=True)
        header = re.sub('[^a-zA-Z]+', '_', header)
        value = row.find('td').get_text(strip=True)
        book[header] = value
    return book

In [3]:
# Executor used to submit the scraping work through pywren.
pwex = pywren.default_executor()
# Collect the id of every row currently in the books table ...
book_ids = [row['book_id'] for row in db['books']]
# ... and chunk them: 50 urls for each lambda instance.
split = split_list(book_ids, 50)

In [13]:
def scrape_helper(id_lst):
    """Scrape each book id in ``id_lst`` in order and return the list of results."""
    scraped = []
    for book_id in id_lst:
        scraped.append(scrape_book_parallel(book_id))
    return scraped

# Time the parallel run: map scrape_helper over the id chunks, then collect
# the results of every chunk.
t0 = datetime.now()
futures = pwex.map(scrape_helper, split)
result = pywren.get_all_results(futures)
elapsed = datetime.now() - t0
print('parallel time spent:', elapsed)

parallel time spent: 0:00:28.147050


In [14]:
t0 = datetime.now()
# `result` arrives as one list of book dicts per chunk; flatten it into a
# single list of dicts. Entries that are already dicts are kept as they are,
# so running this cell again (after `result` has been flattened once) leaves
# it unchanged instead of iterating over each dict's keys.
result = [book
          for chunk in result
          for book in ([chunk] if isinstance(chunk, dict) else chunk)]
# NOTE(review): only last_seen is refreshed here; the other scraped fields in
# `book` are not written to the table in this cell — confirm this is intended.
for book in result:
    db['books'].upsert({'book_id' : book['book_id'],
                        'last_seen' : datetime.now()}, ['book_id'])
print('last transfer time:', datetime.now()-t0)

last transfer time: 0:00:01.812117


* It takes the original serial version about 10 mins to run, but it takes PyWren only 28+2 = 30 secs. 
* There is still a significant serial part at the beginning (before we start to scrape), taking about 12.26 secs to transfer the data to each instance.
* **The logic behind using a relational database:** Traditional databases were flat, meaning that the information was stored in one long text file (e.g. a tab-delimited file). A text file makes it difficult to search for specific information or to create reports that include only certain fields from each record. [[1]](https://computer.howstuffworks.com/question599.htm) In this scraping solution we use a relational database because SQL-type databases make it faster and safer to add, update, or delete rows of data, to retrieve subsets of data for transaction processing and analytics applications, and to manage all aspects of the database. [[2]](https://aws.amazon.com/relational-database/)
* **We would want to scale up to an AWS large-scale database** 1) if the information for each book keeps changing and we want to stay up to date with these changes. In this case, we would want a distributed data storage method that maintains availability (every request receives a non-error response) and partition tolerance (the system continues to operate despite network failures). 2) If we want to make the data publicly available: S3 provides cost-efficient methods for us to store and publish a large-scale database.

## Question 2: MRJOB

In [18]:
# Store each scraped description in the `desc` column and refresh last_seen.
for book in result:
    row = {
        'book_id': book['book_id'],
        'desc': book['description'],
        'last_seen': datetime.now(),
    }
    db['books'].upsert(row, ['book_id'])

In [19]:
# Dump every stored description to a text file, one book per line.
desc = [book['desc'] for book in db['books']]
# An explicit encoding keeps the file independent of the platform default.
with open("book_description.tsv", "w", encoding="utf-8") as out_file:
    for text in desc:
        # Rows without a stored description yield None; write an empty line
        # for them. Collapsing whitespace keeps each description on one line
        # even if it contains line breaks.
        out_file.write(' '.join((text or '').split()) + '\n')
# Shell escape: run mrjob_pb2.py on the descriptions file written above.
! python mrjob_pb2.py book_description.tsv

Using configs in /Users/chen.liang/.mrjob.conf
No configs specified for inline runner
Creating temp directory /var/folders/1_/vb8z7w9d62s_hq9m0yd097z00000gn/T/mrjob_pb2.chen.liang.20200516.071749.095448
Running step 1 of 2...
Running step 2 of 2...
job output is in /var/folders/1_/vb8z7w9d62s_hq9m0yd097z00000gn/T/mrjob_pb2.chen.liang.20200516.071749.095448/output
Streaming final output from /var/folders/1_/vb8z7w9d62s_hq9m0yd097z00000gn/T/mrjob_pb2.chen.liang.20200516.071749.095448/output...
[[2002, "with"], [2147, "that"], [2513, "her"], [3136, "is"], [4348, "in"], [6096, "to"], [7088, "a"], [7882, "of"], [8705, "and"], [13156, "the"]]	null
Removing temp directory /var/folders/1_/vb8z7w9d62s_hq9m0yd097z00000gn/T/mrjob_pb2.chen.liang.20200516.071749.095448...


## Question 3: SNS Notifications

See the separate `problem3 jupyter notebook`

## Question 4: Propose a Final Project Topic
* I propose a thesis-related political science research topic that primarily uses PyWren to scrape Twitter data and potentially analyze tweets as well.
* The general idea is that I want to construct an expert-based social network for each targeted policy think tank, and try to prove that 1) twitter follower/following network can imply the level of power centralization in an organization like think tanks. 2) Experts' control over information flow, i.e. betweenness centrality, is strongly correlated with their roles within the organization. (See the graph below)
<img src="heritage.png" width="600" align="center" />
* To collect data, I need to automatically scrape over 600 Twitter handles and obtain not only their profile information but also the full list of their followers and the accounts they follow. Scraping all follower accounts is particularly computationally expensive because we need to sequentially scrape each page, which gives 20 follower accounts as well as a cursor that indicates the URL of the next page. I am using PyWren to parallelize the scraping process; that is, I can scrape the followers of multiple accounts at the same time. By using AWS services, I can also circumvent the IP issues.
* Likewise, I will also use PyWren to scrape the profile information and historical tweets for each targeted account. Potentially (if time permits), I plan to analyze the political stances implied by the historical tweets with a semantic topic modeling approach. If the dataset is too large, I might use RCC and MPI to parallelize tokenization.