# Regression Open-Ended Project

-----

# Define Question / Hypothesis

**Use publicly available data to generate Roger Ebert-esque ratings for recent movies.**

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Obtain Data via Scraping and APIs

- time: to add in a sleep delay when scraping
- tqdm: a nifty tool to show progress bar

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import tqdm
import pickle
import re
import datetime

### Scraping

- Manual - download website files locally
- Procedural - find AJAX script
- Pythonic - headless browser with Selenium

In [3]:
def scrape_eberts_review(num_pages=100):
    """
    Parses through webpage with list of movies and returns DataFrame.
    :num_pages = Number of pages to go through
    """
    url = "http://www.rogerebert.com/reviews?great_movies=0&no_stars=0&title=Cabin+in+the+Woods&filtersgreat_movies%5D%5B%5D=&filters%5Bno_stars%5D%5B%5D=&filters%5Bno_stars%5D%5B%5D=1&filters%5Btitle%5D=&filters%5Breviewers%5D=&filters%5Bgenres%5D=&page={}&sort%5Border%5D=newest"
    pages = list(range(1, num_pages))
    links = [url.format(i) for i in pages]
    
    review_list = list()
    
    for link in links:
        webpage = requests.get(link).text
        soup = BeautifulSoup(webpage, 'lxml')
        all_movies = soup('figure', {'class':'movie review'})
    
        for movie in all_movies:
            url = movie.a.get('href')
            title = movie.find_all('a')[1].text
            stars = len(movie.find_all('i', {'class':'icon-star-full'})) + 0.5 * len(movie.find_all('i', {'class':'icon-star-half'}))

            try:
                year = movie.find('span', {'class':'release-year'}).text[1:-1]
            except:
                year = ''

            review_list.append([title, stars, year, url])
    
    df = pd.DataFrame(review_list, columns = ['Title', 'EbertStars', 'Year', 'URL'])
    return df

In [4]:
review_df = scrape_eberts_review(num_pages=400)

In [5]:
print(review_df.shape)
print(review_df.dtypes)
print(review_df.head())
print(review_df.tail())

(9212, 4)
Title          object
EbertStars    float64
Year           object
URL            object
dtype: object
                  Title  EbertStars  Year                                 URL
0  Call Me by Your Name         4.0  2017  /reviews/call-me-by-your-name-2017
1              Mudbound         4.0  2017              /reviews/mudbound-2017
2        Justice League         3.0  2017        /reviews/justice-league-2017
3                Wonder         3.0  2017                /reviews/wonder-2017
4         Mr. Roosevelt         3.0  2017          /reviews/mr-roosevelt-2017
                  Title  EbertStars  Year                              URL
9207          Hopscotch         3.0  1980          /reviews/hopscotch-1980
9208   Why Would I Lie?         1.0  1980    /reviews/why-would-i-lie-1980
9209       Terror Train         1.0  1980       /reviews/terror-train-1980
9210     Coast to Coast         2.0  1980     /reviews/coast-to-coast-1980
9211  Somewhere in Time         2.0  1980  /r

In [6]:
# pickle.dump(review_df, open('../data/raw/ebert_reviews.pkl', 'wb'))
review_df = pickle.load(open('../data/raw/ebert_reviews.pkl', 'rb'))

In [7]:
def scrape_movie_info(link):
    """
    Parses each individual review page and returns list of key attributes.
    :link = URL for review
    """
    full_link = "http://www.rogerebert.com" + link
    webpage = requests.get(full_link).text
    soup = BeautifulSoup(webpage, 'lxml')

    try:
        mpaa = soup.find('p', {'class':'mpaa-rating'}).strong.text[6:]
    except:
        mpaa = ''

    try: 
        runtime = int(soup.find('p', {'class':'running-time'}).strong.text[:3].strip())
    except:
        runtime = ''

    try:
        review = ' '.join([paragraph.text for paragraph in soup.find('div', {'itemprop':'reviewBody'}).find_all('p')])
    except:
        review = ''
    
    return [link, mpaa, runtime, review]

In [8]:
scraped_list = list()

for movie in tqdm.tqdm(review_df['URL']):
    scraped_list.append(scrape_movie_info(movie))
    time.sleep(0.5)

review_content_df = pd.DataFrame(scraped_list, columns = ['URL', 'Rating', 'Runtime', 'Review'])

100%|██████████| 9212/9212 [2:10:09<00:00,  1.19it/s]


In [9]:
review_content_df.head()

Unnamed: 0,URL,Rating,Runtime,Review
0,/reviews/call-me-by-your-name-2017,R,130,Luca Guadagnino’s films are all about the tran...
1,/reviews/mudbound-2017,NR,134,“Mudbound” is all about perception. How it can...
2,/reviews/justice-league-2017,PG-13,120,For a film about a band of heroes trying to st...
3,/reviews/wonder-2017,PG,113,Based on the R.J. Palacio novel of the same na...
4,/reviews/mr-roosevelt-2017,NR,90,Emily Martin (Noël Wells) doesn't quite know h...


In [10]:
# pickle.dump(review_content_df, open('../data/raw/review_content_df.pkl', 'wb'))
review_content_df = pickle.load(open('../data/raw/review_content_df.pkl', 'rb'))

In [11]:
ebert_df = pd.merge(review_df, review_content_df, how='left', on='URL')
ebert_df.head()

Unnamed: 0,Title,EbertStars,Year,URL,Rating,Runtime,Review
0,Call Me by Your Name,4.0,2017,/reviews/call-me-by-your-name-2017,R,130,Luca Guadagnino’s films are all about the tran...
1,Mudbound,4.0,2017,/reviews/mudbound-2017,NR,134,“Mudbound” is all about perception. How it can...
2,Justice League,3.0,2017,/reviews/justice-league-2017,PG-13,120,For a film about a band of heroes trying to st...
3,Wonder,3.0,2017,/reviews/wonder-2017,PG,113,Based on the R.J. Palacio novel of the same na...
4,Mr. Roosevelt,3.0,2017,/reviews/mr-roosevelt-2017,NR,90,Emily Martin (Noël Wells) doesn't quite know h...


In [12]:
# pickle.dump(ebert_df, open('../data/raw/ebert_df.pkl', 'wb'))
ebert_df = pickle.load(open('../data/raw/ebert_df.pkl', 'rb'))

In [13]:
def scrape_imdb_listing(df):
    """
    Searches IMDB, parses results and returns DataFrame.
    :df = DataFrame with movie titles
    """
    movie_list = list()

    for movie in tqdm.tqdm(df['Title']):
        base_url = 'http://www.imdb.com/find?q='
        url = base_url + movie +'&s=all'
        webpage = requests.get(url).text
        soup = BeautifulSoup(webpage, 'lxml')

        try:
            results = soup('table', {'class':'findList'})[0]
        except:
            continue
            
        title = results.find_all('tr')[0]
        link = title.find('a', href=True)['href']

        url = 'http://www.imdb.com' + link
        webpage = requests.get(url).text
        soup = BeautifulSoup(webpage, 'lxml')

        movie_title = soup.find('title')
        
        try: 
            rate = soup.find('span', itemprop='ratingValue').text
        except:
            rate = ''
        
        try: 
            count = soup.find('span', itemprop='ratingCount').text
        except:
            count = ''
        
        try: 
            des = soup.find('meta',{'name':'description'})['content']
        except:
            des = ''
        
        try: 
            metascore = soup.find('div', class_='metacriticScore').text
        except:
            metascore = ''

        try: 
            reviews_count = soup.find('div', class_='titleReviewbarItemBorder')
            u_reviews = reviews_count.find_all('a')[0].text.split(' ')[0]
            c_reviews = reviews_count.find_all('a')[1].text.split(' ')[0]
        except:
            u_reviews = []
            c_review = []

        try: 
            genre_items = soup.find_all('span', itemprop='genre')
            genre_list = [item.text for item in genre_items]
        except:
            genre_list = []

        try: 
            stars_items = soup.find_all('span', itemprop='actors')
            stars_list = [item.text.strip() for item in stars_items]
        except:
            stars_list = []

        try: 
            director = soup.find('span', itemprop='name').text
        except:
            director = ''

        try: 
            country = soup.find('div', class_='subtext').find_all('a', title=True)[-1].text.split(' ')[-1]
            country = re.sub('[\(\)\{\}<>]', '', country)
        except:
            country = ''
            
        try:
            rel_date = (', ').join(soup.find('div', class_='subtext').find_all('a', 
                                            title=True)[-1].text.split(' ')[:-1])
        except:
            rel_date = ''
    
        movie_list.append([movie, rate, count, des, metascore, u_reviews, c_reviews, 
                       genre_list, stars_list, director, country, rel_date])
        
        time.sleep(0.5)

    
    df = pd.DataFrame(movie_list, columns = ['Title', 'IMDB_Rating', 'Rating_Count', 
        'Description', 'Metascore', 'User_Review_Count', 'Critic_Review_Count',
        'Genre_List', 'Stars_List', 'Director', 'Country', 'Release_Date'])

    return df, movie_list

In [15]:
imdb_df, temp_list = scrape_imdb_listing(ebert_df)


  0%|          | 0/9212 [00:00<?, ?it/s][A
  0%|          | 1/9212 [00:01<3:34:03,  1.39s/it][A
  0%|          | 2/9212 [00:02<3:30:40,  1.37s/it][A
  0%|          | 3/9212 [00:04<3:38:12,  1.42s/it][A
  0%|          | 4/9212 [00:05<3:34:17,  1.40s/it][A
  0%|          | 5/9212 [00:06<3:33:45,  1.39s/it][A
100%|██████████| 9212/9212 [4:13:13<00:00,  1.63s/it]


In [16]:
imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date
0,Call Me by Your Name,8.4,4766,Directed by Luca Guadagnino. With Armie Hamme...,\n93\n,39,107,"[Drama, Romance]","[Armie Hammer,, Timothée Chalamet,, Michael St...",Luca Guadagnino,USA\n\n,"24, November, 2017"
1,Mudbound,7.4,2666,"Directed by Dee Rees. With Carey Mulligan, Ga...",\n86\n,22,61,[Drama],"[Carey Mulligan,, Garrett Hedlund,, Jason Clarke]",Dee Rees,USA\n\n,"17, November, 2017"
2,Justice League,7.4,78007,"Directed by Zack Snyder. With Ben Affleck, Ga...",\n46\n,709,286,"[Action, Adventure, Fantasy]","[Ben Affleck,, Gal Gadot,, Jason Momoa]",Zack Snyder,USA\n\n,"17, November, 2017"
3,Wonder,8.0,1579,Directed by Stephen Chbosky. With Jacob Tremb...,\n67\n,22,55,[Drama],"[Jacob Tremblay,, Owen Wilson,, Izabela Vidovic]",Stephen Chbosky,USA\n\n,"17, November, 2017"
4,Mr. Roosevelt,6.8,116,"Directed by Noël Wells. With Noël Wells, Nick...",\n73\n,[],55,[Comedy],"[Noël Wells,, Nick Thune,, Britt Lower]",Noël Wells,USA\n\n,"22, November, 2017"


In [17]:
# pickle.dump(imdb_df, open('../data/raw/imdb_df.pkl', 'wb'))
imdb_df = pickle.load(open('../data/raw/imdb_df.pkl', 'rb'))

# Process and Clean Data

In [18]:
print(ebert_df.shape)
print(ebert_df.dtypes)
ebert_df.head()

(9212, 7)
Title          object
EbertStars    float64
Year           object
URL            object
Rating         object
Runtime        object
Review         object
dtype: object


Unnamed: 0,Title,EbertStars,Year,URL,Rating,Runtime,Review
0,Call Me by Your Name,4.0,2017,/reviews/call-me-by-your-name-2017,R,130,Luca Guadagnino’s films are all about the tran...
1,Mudbound,4.0,2017,/reviews/mudbound-2017,NR,134,“Mudbound” is all about perception. How it can...
2,Justice League,3.0,2017,/reviews/justice-league-2017,PG-13,120,For a film about a band of heroes trying to st...
3,Wonder,3.0,2017,/reviews/wonder-2017,PG,113,Based on the R.J. Palacio novel of the same na...
4,Mr. Roosevelt,3.0,2017,/reviews/mr-roosevelt-2017,NR,90,Emily Martin (Noël Wells) doesn't quite know h...


In [19]:
def convert_year(row):
    try:
        year = int(row['Year'])
        return year
    except:
        return np.nan
    
def convert_runtime(row):
    try:
        runtime = int(row['Runtime'])
        return runtime
    except:
        return np.nan

In [20]:
ebert_df['Year'] = ebert_df.apply(lambda x: convert_year(x), 1)
ebert_df['Runtime'] = ebert_df.apply(lambda x: convert_runtime(x), 1)

In [21]:
ebert_df.head()

Unnamed: 0,Title,EbertStars,Year,URL,Rating,Runtime,Review
0,Call Me by Your Name,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...
1,Mudbound,4.0,2017.0,/reviews/mudbound-2017,NR,134.0,“Mudbound” is all about perception. How it can...
2,Justice League,3.0,2017.0,/reviews/justice-league-2017,PG-13,120.0,For a film about a band of heroes trying to st...
3,Wonder,3.0,2017.0,/reviews/wonder-2017,PG,113.0,Based on the R.J. Palacio novel of the same na...
4,Mr. Roosevelt,3.0,2017.0,/reviews/mr-roosevelt-2017,NR,90.0,Emily Martin (Noël Wells) doesn't quite know h...


In [22]:
imdb_df.dtypes

Title                  object
IMDB_Rating            object
Rating_Count           object
Description            object
Metascore              object
User_Review_Count      object
Critic_Review_Count    object
Genre_List             object
Stars_List             object
Director               object
Country                object
Release_Date           object
dtype: object

In [23]:
def convert_imdb_rating(row):
    try:
        rating = float(row['IMDB_Rating'])
        return rating
    except:
        return np.nan

def convert_rating_count(row):
    try:
        count = float(row['Rating_Count'].replace(',', ''))
        return count
    except:
        return np.nan
    
def user_review_count(row):
    try:
        count = float(row['User_Review_Count'].replace(',', ''))
        return count
    except:
        return np.nan

def critic_review_count(row):
    try:
        count = float(row['Critic_Review_Count'].replace(',', ''))
        return count
    except:
        return np.nan

def convert_metascore(row):
    try:
        score = float(row['Metascore'].strip())
        return score
    except:
        return np.nan
    
def convert_country(row):
    try:
        country = row['Country'].strip()
        return country
    except:
        return np.nan
    
def convert_release_date(row):
    try:
        rel_date = row['Release_Date'].strip()

        if 'TV' in rel_date:
            return np.nan
        else:
            try:
                rel_date = datetime.datetime.strptime(rel_date, "%d, %B, %Y")
                return rel_date
            except:
                return np.nan
            
    except:
        return np.nan
    
def convert_genre(row):
    try:
        genres = ', '.join(row['Genre_List'])
        return genres
    except:
        return np.nan
    
def convert_actors(row):
    try:
        actors = ', '.join(row['Stars_List'])
        return actors
    except:
        return np.nan

In [24]:
imdb_df['IMDB_Rating'] = imdb_df.apply(lambda x: convert_imdb_rating(x), 1)
imdb_df['Rating_Count'] = imdb_df.apply(lambda x: convert_rating_count(x), 1)
imdb_df['User_Review_Count'] = imdb_df.apply(lambda x: user_review_count(x), 1)
imdb_df['Critic_Review_Count'] = imdb_df.apply(lambda x: critic_review_count(x), 1)
imdb_df['Metascore'] = imdb_df.apply(lambda x: convert_metascore(x), 1)
imdb_df['Country'] = imdb_df.apply(lambda x: convert_country(x), 1)
imdb_df['Release_Date'] = imdb_df.apply(lambda x: convert_release_date(x), 1)
imdb_df['Genre_List'] = imdb_df.apply(lambda x: convert_genre(x), 1)
imdb_df['Stars_List'] = imdb_df.apply(lambda x: convert_actors(x), 1)

In [25]:
imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Drama, Romance","Armie Hammer,, Timothée Chalamet,, Michael Stu...",Luca Guadagnino,USA,2017-11-24
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,Drama,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",Dee Rees,USA,2017-11-17
2,Justice League,7.4,78007.0,"Directed by Zack Snyder. With Ben Affleck, Ga...",46.0,709.0,286.0,"Action, Adventure, Fantasy","Ben Affleck,, Gal Gadot,, Jason Momoa",Zack Snyder,USA,2017-11-17
3,Wonder,8.0,1579.0,Directed by Stephen Chbosky. With Jacob Tremb...,67.0,22.0,55.0,Drama,"Jacob Tremblay,, Owen Wilson,, Izabela Vidovic",Stephen Chbosky,USA,2017-11-17
4,Mr. Roosevelt,6.8,116.0,"Directed by Noël Wells. With Noël Wells, Nick...",73.0,,55.0,Comedy,"Noël Wells,, Nick Thune,, Britt Lower",Noël Wells,USA,2017-11-22


In [26]:
ebert_imdb_df = pd.merge(imdb_df, ebert_df, how='left', on='Title')
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review
0,Call Me by Your Name,8.4,4766.0,Directed by Luca Guadagnino. With Armie Hamme...,93.0,39.0,107.0,"Drama, Romance","Armie Hammer,, Timothée Chalamet,, Michael Stu...",Luca Guadagnino,USA,2017-11-24,4.0,2017.0,/reviews/call-me-by-your-name-2017,R,130.0,Luca Guadagnino’s films are all about the tran...
1,Mudbound,7.4,2666.0,"Directed by Dee Rees. With Carey Mulligan, Ga...",86.0,22.0,61.0,Drama,"Carey Mulligan,, Garrett Hedlund,, Jason Clarke",Dee Rees,USA,2017-11-17,4.0,2017.0,/reviews/mudbound-2017,NR,134.0,“Mudbound” is all about perception. How it can...
2,Justice League,7.4,78007.0,"Directed by Zack Snyder. With Ben Affleck, Ga...",46.0,709.0,286.0,"Action, Adventure, Fantasy","Ben Affleck,, Gal Gadot,, Jason Momoa",Zack Snyder,USA,2017-11-17,3.0,2017.0,/reviews/justice-league-2017,PG-13,120.0,For a film about a band of heroes trying to st...
3,Wonder,8.0,1579.0,Directed by Stephen Chbosky. With Jacob Tremb...,67.0,22.0,55.0,Drama,"Jacob Tremblay,, Owen Wilson,, Izabela Vidovic",Stephen Chbosky,USA,2017-11-17,3.0,2017.0,/reviews/wonder-2017,PG,113.0,Based on the R.J. Palacio novel of the same na...
4,Mr. Roosevelt,6.8,116.0,"Directed by Noël Wells. With Noël Wells, Nick...",73.0,,55.0,Comedy,"Noël Wells,, Nick Thune,, Britt Lower",Noël Wells,USA,2017-11-22,3.0,2017.0,/reviews/mr-roosevelt-2017,NR,90.0,Emily Martin (Noël Wells) doesn't quite know h...


In [27]:
pickle.dump(ebert_imdb_df, open('../data/interim/ebert_imdb_df.pkl', 'wb'))

# Plan for Following Notebooks

- Exploratory Data Analysis
- Feature Engineering
- Making predictions
- Final analysis