Topic: Project 2    
Subject: Scraping Roger Ebert's Reviews  
Date: 10/06/2017  
Name: Zach Heick

In [1]:
import requests
import pandas as pd
import numpy as np
import time
import os
import pickle
import random
from bs4 import BeautifulSoup
from pprint import pprint
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from collections import OrderedDict

In [None]:
# Location of the ChromeDriver executable that is handed to Selenium below.
chromedriver = '/Applications/chromedriver'
# Record the same path in this process's environment under 'webdriver.chrome.driver'.
os.environ['webdriver.chrome.driver'] = chromedriver
# Start a Chrome session controlled through that driver; `driver` is reused by later cells.
driver = webdriver.Chrome(chromedriver)

In [None]:
# Reviews listing page; load it in the Selenium-controlled browser.
url = 'http://www.rogerebert.com/reviews'
driver.get(url)

I use Selenium to filter the listing down to only the movies reviewed by Roger Ebert.

In [None]:
# Click the element with class 'search-field'.
# NOTE(review): WebElement.click() returns None, so `reviewer_form` holds None,
# not the element. The find_element_by_* helpers used in this cell were removed
# in Selenium 4, so this cell needs an older Selenium release.
reviewer_form = driver.find_element_by_class_name('search-field').click()
# NOTE(review): `reviewer_list` is not used below; the loop scans every <li> on the page.
reviewer_list = driver.find_element_by_class_name('chosen-results')
# Click the first <li> whose visible text is exactly 'Roger Ebert', then stop looking.
for review in driver.find_elements_by_tag_name('li'):
    if review.text == 'Roger Ebert':
        review.click()
        break
        
# Click the element with id 'no_stars'; as above, `check_box` ends up as None.
check_box = driver.find_element_by_id('no_stars').click()

The tricky part about scraping Ebert's website is that it uses "infinite scrolling": the only way to view more of his movie reviews is to scroll down. Before I can scrape anything, I need to scroll to the bottom of the page so that all movie titles are loaded on one page. Selenium has a nice way to automate scrolling.

In [None]:
# Document height before any scrolling has happened.
last_height = driver.execute_script("return document.body.scrollHeight")

keep_scrolling = True
while keep_scrolling:
    # Jump to the current bottom of the document.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Give the page time to load whatever the scroll triggered.
    time.sleep(3)

    # Re-measure the document; an unchanged height ends the loop.
    new_height = driver.execute_script("return document.body.scrollHeight")
    keep_scrolling = new_height != last_height
    last_height = new_height

In [None]:
# Parse the HTML currently loaded in the browser using Python's built-in 'html.parser'.
soup = BeautifulSoup(driver.page_source, 'html.parser')

Once I've scrolled to the bottom of the page, I can now scrape each individual movie title.

In [None]:
def get_movie_details(movie_url):
    """
    Gets data from a movie review page on rogerebert.com.

    Any field that is absent from the page, or whose text is not in the
    expected form, is returned as an empty string instead of raising.

    :param movie_url: site-relative url for the movie review; it is appended
        to 'http://www.rogerebert.com'
    :return: tuple of the movie's genre, sub-genre, mpaa rating, and runtime
        (runtime is an int when it could be parsed, otherwise '')
    """
    # Wait 0 or 1 seconds (picked at random) before every request.
    sleep_interval = random.randint(0,1)
    time.sleep(sleep_interval)
    url = 'http://www.rogerebert.com' + movie_url
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,'html5lib')

    # Defaults: every field stays '' unless the page provides a usable value.
    details_d = OrderedDict()
    details_d['genre'] = ''
    details_d['sub-genre'] = ''
    details_d['mpaa-rating'] = ''
    details_d['running-time'] = ''

    # soup.find returns None when no element has the 'more-details' class;
    # treat that as "no details" rather than failing with AttributeError.
    more_details = soup.find(class_='more-details')
    paragraphs = more_details.findChildren('p') if more_details is not None else []

    for detail in paragraphs:
        classes = detail.get('class')
        detail_text = [item for item in detail.text.split('\n') if item != '']

        # Skip paragraphs without a class attribute or without any text;
        # indexing them would raise KeyError / IndexError.
        if not classes or not detail_text:
            continue

        detail_class = classes[0]
        first_line = detail_text[0]

        if detail_class == 'genres':
            # First listed genre is the genre, second (if any) the sub-genre.
            genres = first_line.split(', ')
            details_d['genre'] = genres[0]
            if len(genres) > 1:
                details_d['sub-genre'] = genres[1]
        elif detail_class == 'mpaa-rating':
            # The rating is the second space-separated word
            # (presumably text like 'Rated R' -- confirm against the site).
            words = first_line.split(' ')
            if len(words) > 1:
                details_d[detail_class] = words[1]
        elif detail_class == 'running-time':
            # The runtime is the leading number (presumably 'NN minutes').
            try:
                details_d[detail_class] = int(first_line.split(' ')[0])
            except ValueError:
                # Not a number: keep the '' default for this field.
                pass

    genre = details_d['genre']
    sub_genre = details_d['sub-genre']
    mpaa_rating = details_d['mpaa-rating']
    running_time = details_d['running-time']

    return genre, sub_genre, mpaa_rating, running_time

Roger Ebert's rating scale ranges from zero to four stars, incrementing in halves.

In [None]:
def calculate_numeric_rating(star_ratings):
    """
    Adds up the numeric worth of each rating icon.
    :param star_ratings: list of icon class-name strings, one per icon
    :return: numeric star score value
    """
    # Worth of each recognised icon class; any other name raises KeyError.
    icon_values = {
        'icon-thumbsdown': 0,
        'icon-star-half': 0.5,
        'icon-star-full': 1,
    }

    return sum(icon_values[icon] for icon in star_ratings)

With some helper functions defined, I iterate through each movie review listed on the long page that I scrolled through earlier. I store each row as a tuple in a list `reviews`.

In [None]:
# One tuple per review found on the parsed listing page.
reviews = []
for review in soup.find_all(class_='movie review'):
    # Non-empty lines of the entry's text.
    movie_info = [line for line in review.text.split('\n') if line]

    # Pad two-line entries so that index 2 can always be read.
    if len(movie_info) == 2:
        movie_info.append('')

    movie_title = movie_info[0]
    movie_year = movie_info[2]
    if movie_year:
        # Remove the parentheses before converting the year to an int.
        movie_year = int(movie_year.replace('(', '').replace(')', ''))

    # First class name of every child of the 'star-rating' element.
    star_ratings = [
        icon['class'][0]
        for icon in review.find(class_='star-rating').findChildren()
    ]
    numeric_rating = calculate_numeric_rating(star_ratings)

    # The poster link's href is passed on to fetch the per-movie details.
    review_link = review.find('a', class_='poster')['href']
    genre, sub_genre, mpaa_rating, running_time = get_movie_details(review_link)

    reviews.append(
        (movie_title, movie_year, numeric_rating, genre, sub_genre, mpaa_rating, running_time)
    )

In [None]:
# Column names, in the same order as the fields of each tuple in `reviews`.
column_names = ['Title', 'Year', 'Star_Score', 'Genre', 'Sub-genre', 'Rating', 'Runtime']
df = pd.DataFrame(reviews, columns=column_names)

In [3]:
# Preview the first 10 rows of the DataFrame.
df.head(10)

Unnamed: 0,Title,Year,Star_Score,Genre,Sub-genre,Rating,Runtime
0,The Spectacular Now,2013,4.0,,,R,99
1,Computer Chess,2013,2.0,Comedy,,,91
2,At Any Price,2012,4.0,Drama,,R,105
3,Blancanieves,2012,4.0,Drama,Fantasy,PG-13,104
4,Deceptive Practice: The Mysteries and Mentors ...,2013,3.0,,,NR,88
5,To the Wonder,2013,3.5,Drama,Romance,R,112
6,From Up on Poppy Hill,2013,2.5,Animation,Drama,PG,91
7,The Host,2013,2.5,Science Fiction,Thriller,PG-13,125
8,Ginger and Rosa,2013,3.0,Drama,,PG-13,89
9,On the Road,2013,2.0,Adventure,Drama,R,137


After all that scraping, I make sure to save the dataframe for future work by pickling. The data still needs to be cleaned!

In [None]:
# Serialize the DataFrame to 'roger.pickle' (path is relative to the working directory).
df.to_pickle('roger.pickle')