# Regression Open-Ended Project

-----

# Define Question / Hypothesis

**Use publicly available data to generate Roger Ebert-esque ratings for recent movies.**

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Obtain Data via Scraping and APIs

- time: to add in a sleep delay when scraping
- tqdm: a nifty tool to show progress bar

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import tqdm
import pickle
import re
import datetime

### Scraping

- Manual - download website files locally
- Procedural - find AJAX script
- Pythonic - headless browser with Selenium

In [3]:
def scrape_eberts_review(num_pages=100):
    """
    Parses through webpage with list of movies and returns DataFrame.
    :num_pages = Number of pages to go through
    """
    url = "http://www.rogerebert.com/reviews?great_movies=0&no_stars=0&title=Cabin+in+the+Woods&filtersgreat_movies%5D%5B%5D=&filters%5Bno_stars%5D%5B%5D=&filters%5Bno_stars%5D%5B%5D=1&filters%5Btitle%5D=&filters%5Breviewers%5D=&filters%5Bgenres%5D=&page={}&sort%5Border%5D=newest"
    pages = list(range(1, num_pages))
    links = [url.format(i) for i in pages]
    
    review_list = list()
    
    for link in links:
        webpage = requests.get(link).text
        soup = BeautifulSoup(webpage, 'lxml')
        all_movies = soup('figure', {'class':'movie review'})
    
        for movie in all_movies:
            url = movie.a.get('href')
            title = movie.find_all('a')[1].text
            stars = len(movie.find_all('i', {'class':'icon-star-full'})) + 0.5 * len(movie.find_all('i', {'class':'icon-star-half'}))

            try:
                year = movie.find('span', {'class':'release-year'}).text[1:-1]
            except:
                year = ''

            review_list.append([title, stars, year, url])
    
    df = pd.DataFrame(review_list, columns = ['Title', 'EbertStars', 'Year', 'URL'])
    return df

In [4]:
review_df = scrape_eberts_review(num_pages=400)

In [5]:
print(review_df.shape)
print(review_df.dtypes)
print(review_df.head())
print(review_df.tail())

(9212, 4)
Title          object
EbertStars    float64
Year           object
URL            object
dtype: object
        Title  EbertStars  Year                       URL
0  All Saints         3.0  2017  /reviews/all-saints-2017
1       Leap!         1.5  2017        /reviews/leap-2017
2  Beach Rats         3.5  2017  /reviews/beach-rats-2017
3  Death Note         1.0  2017  /reviews/death-note-2017
4    Bushwick         1.0  2017    /reviews/bushwick-2017
                                          Title  EbertStars  Year  \
9207                                    The Wiz         3.0  1978   
9208  Who Is Killing the Great Chefs of Europe?         3.0  1978   
9209                           Midnight Express         3.0  1978   
9210                Somebody Killed Her Husband         2.0  1978   
9211                                    Slithis         0.5  1978   

                                                    URL  
9207                              /reviews/the-wiz-1978  
9208  /re

In [6]:
def scrape_webpage(link):
    """
    Parses each individual review page and returns list of key attributes.
    :link = URL for review
    """
    full_link = "http://www.rogerebert.com" + link
    webpage = requests.get(full_link).text
    soup = BeautifulSoup(webpage, 'lxml')

    try:
        mpaa = soup.find('p', {'class':'mpaa-rating'}).strong.text[6:]
    except:
        mpaa = ''

    try: 
        runningtime = int(soup.find('p', {'class':'running-time'}).strong.text[:3].strip())
    except:
        runningtime = ''

    try:
        genres = soup.find('p', {'class':'genres'}).strong.text.replace(',', '').split()
    except:
        genres = []

    try:
        reviewbody = ' '.join([paragraph.text for paragraph in soup.find('div', {'itemprop':'reviewBody'}).find_all('p')])
    except:
        reviewbody = ''
    
    return [link, mpaa, runningtime, reviewbody]

In [7]:
scraped_list = list()

for movie in tqdm.tqdm(review_df['URL']):
    scraped_list.append(scrape_webpage(movie))
    time.sleep(0.5)

review_content = pd.DataFrame(scraped_list, columns = ['URL', 'Rating', 'Runtime', 'Review'])

100%|██████████| 9212/9212 [1:54:54<00:00,  1.30it/s]


In [8]:
review_content.head()

Unnamed: 0,URL,Rating,Runtime,Review
0,http://www.rogerebert.com/reviews/all-saints-2017,PG,108,“All Saints” feels like a bit of a miracle. It...
1,http://www.rogerebert.com/reviews/leap-2017,PG,89,The emotional and visual power of dance is so ...
2,http://www.rogerebert.com/reviews/beach-rats-2017,,95,"""Beach Rats,"" writer/director Eliza Hittman's ..."
3,http://www.rogerebert.com/reviews/death-note-2017,NR,101,I don’t subscribe to the theory that adaptatio...
4,http://www.rogerebert.com/reviews/bushwick-2017,NR,94,This movie will lose the goodwill of any longt...


In [17]:
review_content['URL'] = review_content['URL'].apply(lambda x: x.replace("http://www.rogerebert.com", ""))

In [18]:
combined_df = pd.merge(review_df, review_content, how='left', on='URL')
combined_df.head()

Unnamed: 0,Title,EbertStars,Year,URL,Rating,Runtime,Review
0,All Saints,3.0,2017,/reviews/all-saints-2017,PG,108,“All Saints” feels like a bit of a miracle. It...
1,Leap!,1.5,2017,/reviews/leap-2017,PG,89,The emotional and visual power of dance is so ...
2,Beach Rats,3.5,2017,/reviews/beach-rats-2017,,95,"""Beach Rats,"" writer/director Eliza Hittman's ..."
3,Death Note,1.0,2017,/reviews/death-note-2017,NR,101,I don’t subscribe to the theory that adaptatio...
4,Bushwick,1.0,2017,/reviews/bushwick-2017,NR,94,This movie will lose the goodwill of any longt...


In [19]:
# pickle.dump(combined_df, open('../data/raw/ebert.pkl', 'wb'))
combined_df = pickle.load(open('../data/raw/ebert.pkl', 'rb'))

In [20]:
def scrape_imdb_listing(df):
    """
    Searches IMDB, parses results and returns DataFrame.
    :df = DataFrame with movie titles
    """
    movie_list = list()

    for movie in tqdm.tqdm(df['Title']):
        base_url = 'http://www.imdb.com/find?q='
        url = base_url + movie +'&s=all'
        webpage = requests.get(url).text
        soup = BeautifulSoup(webpage, 'lxml')

        try:
            results = soup('table', {'class':'findList'})[0]
        except:
            continue
            
        title = results.find_all('tr')[0]
        link = title.find('a', href=True)['href']

        url = 'http://www.imdb.com' + link
        webpage = requests.get(url).text
        soup = BeautifulSoup(webpage, 'lxml')

        movie_title = soup.find('title')
        
        try: 
            rate = soup.find('span', itemprop='ratingValue').text
        except:
            rate = ''
        
        try: 
            count = soup.find('span', itemprop='ratingCount').text
        except:
            count = ''
        
        try: 
            des = soup.find('meta',{'name':'description'})['content']
        except:
            des = ''
        
        try: 
            metascore = soup.find('div', class_='metacriticScore').text
        except:
            metascore = ''

        try: 
            reviews_count = soup.find('div', class_='titleReviewbarItemBorder')
            u_reviews = reviews_count.find_all('a')[0].text.split(' ')[0]
            c_reviews = reviews_count.find_all('a')[1].text.split(' ')[0]
        except:
            u_reviews = []
            c_review = []

        try: 
            genre_items = soup.find_all('span', itemprop='genre')
            genre_list = [item.text for item in genre_items]
        except:
            genre_list = []

        try: 
            stars_items = soup.find_all('span', itemprop='actors')
            stars_list = [item.text.strip() for item in stars_items]
        except:
            stars_list = []

        try: 
            director = soup.find('span', itemprop='name').text
        except:
            director = ''

        try: 
            country = soup.find('div', class_='subtext').find_all('a', title=True)[-1].text.split(' ')[-1]
            country = re.sub('[\(\)\{\}<>]', '', country)
        except:
            country = ''
            
        try:
            rel_date = (', ').join(soup.find('div', class_='subtext').find_all('a', 
                                            title=True)[-1].text.split(' ')[:-1])
        except:
            rel_date = ''
    
        movie_list.append([movie, rate, count, des, metascore, u_reviews, c_reviews, 
                       genre_list, stars_list, director, country, rel_date])
        
        time.sleep(0.5)

    
    df = pd.DataFrame(movie_list, columns = ['Title', 'IMDB_Rating', 'Rating_Count', 
        'Description', 'Metascore', 'User_Review_Count', 'Critic_Review_Count',
        'Genre_List', 'Stars_List', 'Director', 'Country', 'Release_Date'])
    return df, movie_list

In [24]:
imdb_listings, temp_list = scrape_imdb_listing(review_df)

In [25]:
# pickle.dump(imdb_listings, open('../data/raw/imdb.pkl', 'wb'))
imdb_listings = pickle.load(open('../data/raw/imdb.pkl', 'rb'))

In [26]:
imdb_listings.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date
0,The Emoji Movie,1.5,2334,"Directed by Tony Leondis. With T.J. Miller, J...",\n9\n,55,15,"[Animation, Adventure, Comedy]","[T.J. Miller,, James Corden,, Anna Faris]",Tony Leondis,USA\n\n,"28, July, 2017"
1,Menashe,6.1,164,Directed by Joshua Z Weinstein. With Menashe ...,\n81\n,1,21,[Drama],"[Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk]",Joshua Z Weinstein,USA\n\n,"28, July, 2017"
2,Detroit,7.9,100,Directed by Kathryn Bigelow. With John Boyega...,\n86\n,1,13,"[Crime, Drama, History]","[John Boyega,, Anthony Mackie,, Algee Smith]",Kathryn Bigelow,USA\n\n,"4, August, 2017"
3,Brigsby Bear,7.6,316,"Directed by Dave McCary. With Mark Hamill, Cl...",\n69\n,4,27,"[Comedy, Drama]","[Mark Hamill,, Claire Danes,, Kyle Mooney]",Dave McCary,Australia\n\n,"21, September, 2017"
4,The Incredible Jessica James,6.4,178,Directed by Jim Strouse. With Lakeith Stanfie...,\n71\n,[],27,[Comedy],"[Lakeith Stanfield,, Chris O'Dowd,, Noël Wells]",Jim Strouse,USA\n\n,"28, July, 2017"


In [27]:
imdb_listings.dtypes

Title                  object
IMDB_Rating            object
Rating_Count           object
Description            object
Metascore              object
User_Review_Count      object
Critic_Review_Count    object
Genre_List             object
Stars_List             object
Director               object
Country                object
Release_Date           object
dtype: object

# Process and Clean Data

In [28]:
print(combined_df.shape)
print(combined_df.dtypes)
combined_df.head()

(9212, 7)
Title          object
EbertStars    float64
Year           object
URL            object
Rating         object
Runtime        object
Review         object
dtype: object


Unnamed: 0,Title,EbertStars,Year,URL,Rating,Runtime,Review
0,All Saints,3.0,2017,/reviews/all-saints-2017,PG,108,“All Saints” feels like a bit of a miracle. It...
1,Leap!,1.5,2017,/reviews/leap-2017,PG,89,The emotional and visual power of dance is so ...
2,Beach Rats,3.5,2017,/reviews/beach-rats-2017,,95,"""Beach Rats,"" writer/director Eliza Hittman's ..."
3,Death Note,1.0,2017,/reviews/death-note-2017,NR,101,I don’t subscribe to the theory that adaptatio...
4,Bushwick,1.0,2017,/reviews/bushwick-2017,NR,94,This movie will lose the goodwill of any longt...


In [21]:
def convert_year(row):
    try:
        year = int(row['Year'])
        return year
    except:
        return np.nan
    
def convert_runtime(row):
    try:
        runtime = int(row['Runtime'])
        return runtime
    except:
        return np.nan

In [None]:
combined_df['Year'] = combined_df.apply(lambda x: convert_year(x), 1)
combined_df['Runtime'] = combined_df.apply(lambda x: convert_runtime(x), 1)

In [28]:
combined_df.head()

Unnamed: 0,URL,Rating,Runtime,Review
0,reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...
1,reviews/menashe-2017,,82.0,Like all great films that nudge the world towa...
2,reviews/detroit-2017,R,143.0,"Watching ""Detroit,"" the latest film directed b..."
3,reviews/brigsby-bear-2017,PG-13,100.0,"Released in the wake of Comic-Con, it’s imposs..."
4,reviews/the-incredible-jessica-james-2017,,85.0,"In the opening credits sequence of ""The Incred..."


In [31]:
imdb_listings.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date
0,The Emoji Movie,1.5,2334,"Directed by Tony Leondis. With T.J. Miller, J...",\n9\n,55,15,"[Animation, Adventure, Comedy]","[T.J. Miller,, James Corden,, Anna Faris]",Tony Leondis,USA\n\n,"28, July, 2017"
1,Menashe,6.1,164,Directed by Joshua Z Weinstein. With Menashe ...,\n81\n,1,21,[Drama],"[Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk]",Joshua Z Weinstein,USA\n\n,"28, July, 2017"
2,Detroit,7.9,100,Directed by Kathryn Bigelow. With John Boyega...,\n86\n,1,13,"[Crime, Drama, History]","[John Boyega,, Anthony Mackie,, Algee Smith]",Kathryn Bigelow,USA\n\n,"4, August, 2017"
3,Brigsby Bear,7.6,316,"Directed by Dave McCary. With Mark Hamill, Cl...",\n69\n,4,27,"[Comedy, Drama]","[Mark Hamill,, Claire Danes,, Kyle Mooney]",Dave McCary,Australia\n\n,"21, September, 2017"
4,The Incredible Jessica James,6.4,178,Directed by Jim Strouse. With Lakeith Stanfie...,\n71\n,[],27,[Comedy],"[Lakeith Stanfield,, Chris O'Dowd,, Noël Wells]",Jim Strouse,USA\n\n,"28, July, 2017"


In [29]:
imdb_listings.dtypes

Title                  object
IMDB_Rating            object
Rating_Count           object
Description            object
Metascore              object
User_Review_Count      object
Critic_Review_Count    object
Genre_List             object
Stars_List             object
Director               object
Country                object
Release_Date           object
dtype: object

In [30]:
def convert_imdb_rating(row):
    try:
        rating = float(row['IMDB_Rating'])
        return rating
    except:
        return np.nan

def convert_rating_count(row):
    try:
        count = float(row['Rating_Count'].replace(',', ''))
        return count
    except:
        return np.nan
    
def user_review_count(row):
    try:
        count = float(row['User_Review_Count'].replace(',', ''))
        return count
    except:
        return np.nan

def critic_review_count(row):
    try:
        count = float(row['Critic_Review_Count'].replace(',', ''))
        return count
    except:
        return np.nan

def convert_metascore(row):
    try:
        score = float(row['Metascore'].strip())
        return score
    except:
        return np.nan
    
def convert_country(row):
    try:
        country = row['Country'].strip()
        return country
    except:
        return np.nan
    
def convert_release_date(row):
    try:
        rel_date = row['Release_Date'].strip()

        if 'TV' in rel_date:
            return np.nan
        else:
            try:
                rel_date = datetime.datetime.strptime(rel_date, "%d, %B, %Y")
                return rel_date
            except:
                return np.nan
            
    except:
        return np.nan
    
def convert_genre(row):
    try:
        genres = ', '.join(row['Genre_List'])
        return genres
    except:
        return np.nan
    
def convert_actors(row):
    try:
        actors = ', '.join(row['Stars_List'])
        return actors
    except:
        return np.nan

In [31]:
imdb_listings['IMDB_Rating'] = imdb_listings.apply(lambda x: convert_imdb_rating(x), 1)
imdb_listings['Rating_Count'] = imdb_listings.apply(lambda x: convert_rating_count(x), 1)
imdb_listings['User_Review_Count'] = imdb_listings.apply(lambda x: user_review_count(x), 1)
imdb_listings['Critic_Review_Count'] = imdb_listings.apply(lambda x: critic_review_count(x), 1)
imdb_listings['Metascore'] = imdb_listings.apply(lambda x: convert_metascore(x), 1)
imdb_listings['Country'] = imdb_listings.apply(lambda x: convert_country(x), 1)
imdb_listings['Release_Date'] = imdb_listings.apply(lambda x: convert_release_date(x), 1)
imdb_listings['Genre_List'] = imdb_listings.apply(lambda x: convert_genre(x), 1)
imdb_listings['Stars_List'] = imdb_listings.apply(lambda x: convert_actors(x), 1)

In [32]:
imdb_listings.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"Animation, Adventure, Comedy","T.J. Miller,, James Corden,, Anna Faris",Tony Leondis,USA,2017-07-28
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,Drama,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",Joshua Z Weinstein,USA,2017-07-28
2,Detroit,7.9,100.0,Directed by Kathryn Bigelow. With John Boyega...,86.0,1.0,13.0,"Crime, Drama, History","John Boyega,, Anthony Mackie,, Algee Smith",Kathryn Bigelow,USA,2017-08-04
3,Brigsby Bear,7.6,316.0,"Directed by Dave McCary. With Mark Hamill, Cl...",69.0,4.0,27.0,"Comedy, Drama","Mark Hamill,, Claire Danes,, Kyle Mooney",Dave McCary,Australia,2017-09-21
4,The Incredible Jessica James,6.4,178.0,Directed by Jim Strouse. With Lakeith Stanfie...,71.0,,27.0,Comedy,"Lakeith Stanfield,, Chris O'Dowd,, Noël Wells",Jim Strouse,USA,2017-07-28


In [33]:
ebert_imdb_df = pd.merge(imdb_listings, combined_df, how='left', on='Title')
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review
0,The Emoji Movie,1.5,2334.0,"Directed by Tony Leondis. With T.J. Miller, J...",9.0,55.0,15.0,"Animation, Adventure, Comedy","T.J. Miller,, James Corden,, Anna Faris",Tony Leondis,USA,2017-07-28,0.5,2017,/reviews/the-emoji-movie-2017,PG,,Since “Toy Story” became an enormous box-offic...
1,Menashe,6.1,164.0,Directed by Joshua Z Weinstein. With Menashe ...,81.0,1.0,21.0,Drama,"Menashe Lustig,, Yoel Falkowitz,, Ruben Niborsk",Joshua Z Weinstein,USA,2017-07-28,4.0,2017,/reviews/menashe-2017,,82.0,Like all great films that nudge the world towa...
2,Detroit,7.9,100.0,Directed by Kathryn Bigelow. With John Boyega...,86.0,1.0,13.0,"Crime, Drama, History","John Boyega,, Anthony Mackie,, Algee Smith",Kathryn Bigelow,USA,2017-08-04,2.0,2017,/reviews/detroit-2017,R,143.0,"Watching ""Detroit,"" the latest film directed b..."
3,Brigsby Bear,7.6,316.0,"Directed by Dave McCary. With Mark Hamill, Cl...",69.0,4.0,27.0,"Comedy, Drama","Mark Hamill,, Claire Danes,, Kyle Mooney",Dave McCary,Australia,2017-09-21,2.5,2017,/reviews/brigsby-bear-2017,PG-13,100.0,"Released in the wake of Comic-Con, it’s imposs..."
4,The Incredible Jessica James,6.4,178.0,Directed by Jim Strouse. With Lakeith Stanfie...,71.0,,27.0,Comedy,"Lakeith Stanfield,, Chris O'Dowd,, Noël Wells",Jim Strouse,USA,2017-07-28,3.0,2017,/reviews/the-incredible-jessica-james-2017,,85.0,"In the opening credits sequence of ""The Incred..."


In [34]:
pickle.dump(ebert_imdb_df, open('../data/interim/ebert_imdb_df_v1.pkl', 'wb'))

# Plan for Following Notebooks

- Exploratory Data Analysis
- Feature Engineering
- Making predictions
- Final analysis