# Predicting Comedy Movie Rating

Box Office Mojo has a database of 876 comedy movies spread across 17 different comedy subcategories.


In [96]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import json
import urllib
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline
from IPython.display import Image

Fetch the genres page from Box Office Mojo. This page lists movie subgenres (comedy - spoof, comedy - road trip, comedy - spy, etc.).

In [58]:
# Download the Box Office Mojo genre index page.
url = 'http://www.boxofficemojo.com/genres/'
response = requests.get(url)

# Parse the returned HTML with the lxml parser; `soup` is reused by the cells below.
page = response.text
soup = BeautifulSoup(page,"lxml")

#print(soup.prettify())


In [57]:
def comedy_links(soup):
    '''Return the href of every <a> tag in `soup` whose href matches "comedy".

    The hrefs are returned exactly as they appear in the page, in document order.
    '''
    comedy_pattern = re.compile('comedy')
    return [anchor['href'] for anchor in soup.find_all('a', href=comedy_pattern)]

#print comedy_links(soup)



In [56]:
def link_join(first_half, second_half):
    "Return a list in which every entry of second_half is prefixed with first_half."
    return [first_half + tail for tail in second_half]
    

# Full URLs of the comedy subgenre pages linked from the genre index.
genre_links = link_join("http://www.boxofficemojo.com/genres/", comedy_links(soup))
#print genre_links

In [55]:
def open_genre_links(links):
    '''Fetch each subgenre page and collect the relative links to its movie pages.

    Parameters
    ----------
    links : iterable of str
        Absolute URLs of the subgenre pages.

    Returns
    -------
    list of str
        The href of every matching <a> tag, in page order, across all pages.
        Duplicates are kept.
    '''
    # Movie links look like "/movies/?id=meninblack.htm". The "?" must be escaped:
    # unescaped it is a regex quantifier, which made the old pattern match every
    # href containing "/movies/" (navigation links included).
    movie_href = re.compile(r'/movies/\?id=')
    movie_links = []
    for link in links:
        response = requests.get(link)
        soup = BeautifulSoup(response.text, "lxml")
        movie_links.extend(
            elem['href'] for elem in soup.find_all('a', href=movie_href)
        )
    return movie_links


"""<a href="/movies/?id=meninblack.htm"><b>Men in Black</b></a>"""
# Relative movie hrefs collected from every comedy subgenre page (one HTTP request per page).
second_half_movie_links = open_genre_links(genre_links)
#print second_half_movie_links

In [6]:
def link_join_movie_names(first_half, second_half):
    "Return a list of full movie-page links: first_half prepended to each entry of second_half."
    return [first_half + movie_path for movie_path in second_half]

# Build the absolute movie URLs; wrapping them in set() removes duplicate URLs.
movie_links = set(
    link_join_movie_names("http://www.boxofficemojo.com", second_half_movie_links)
)


There should be about 925 movie links stored in `movie_links`. Each link goes to the page of a comedy movie, from which each feature will be extracted.

In [7]:
def tester_links(links):
    '''creates smaller list of movie_links to test the build_df function below'''
    links = list(links)
    test_links = []
    for i in range(10):
        test_links.append(links[i])
    return test_links

# NOTE(review): this rebinds the name `tester_links` from the function to its result,
# so the function is only callable again after this cell's `def` has been re-run.
tester_links = tester_links(movie_links)


In [8]:
def get_movie_value(soup, field_name):
    '''Look up a labelled value in boxofficemojo HTML.

    Searches `soup` for text matching `field_name` (compiled as a regular
    expression) and returns the `.text` of the element found by
    `findNextSibling()` on that match.
    Returns None when no matching text or no following sibling is found.
    '''
    label = soup.find(text=re.compile(field_name))
    sibling = label.findNextSibling() if label else None
    return sibling.text if sibling else None

In [30]:
import dateutil.parser

def to_date(datestring):
    """Parse `datestring` with dateutil's parser and return the result."""
    return dateutil.parser.parse(datestring)

def money_to_int(moneystring):
    """Convert a money string such as '$1,234,567' to an int.

    Dollar signs and thousands separators are stripped before conversion.
    Returns None when the input is not a string (e.g. None for a missing
    field) or the remaining text is not a plain integer.
    """
    try:
        return int(moneystring.replace('$', '').replace(',', ''))
    # Catch only the expected failures; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit during a long scrape.
    except (AttributeError, TypeError, ValueError):
        return None

def runtime_to_minutes(runtimestring):
    """Convert a runtime such as '1 hrs. 38 min.' to total minutes.

    The first whitespace-separated token is read as hours and the third as
    minutes. Returns None when the input is not a string (e.g. None for a
    missing field) or does not have that shape.
    """
    try:
        # split() lives inside the try so a None input yields None instead of
        # raising AttributeError.
        parts = runtimestring.split()
        return int(parts[0]) * 60 + int(parts[2])
    except (AttributeError, IndexError, ValueError):
        return None

In [43]:
def build_df(links):
    """Scrape every movie page in `links` and return the data as one DataFrame.

    Parameters
    ----------
    links : iterable of str
        Absolute URLs of Box Office Mojo movie pages.

    Returns
    -------
    pandas.DataFrame
        One row per link with the columns runtime, title,
        domestic_total_gross, rating, release_date, distributor, budget and
        genre. Only domestic_total_gross is converted (via money_to_int);
        the other fields are stored as scraped, or None when not found.
    """
    # Collect plain dicts and build the frame once at the end. Appending to a
    # DataFrame inside the loop copies the frame on every iteration, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for link in links:
        response = requests.get(link)
        soup = BeautifulSoup(response.text, "lxml")
        # Keep only the part of the page <title> before the first "(".
        title = soup.find('title').text.split('(')[0].strip()
        # NOTE(review): the "Worldwide: " value used to be scraped here but was
        # never stored in the frame; the dead lookup has been removed.
        records.append({
            "runtime": get_movie_value(soup, 'Runtime'),
            "title": title,
            "domestic_total_gross": money_to_int(get_movie_value(soup, 'Domestic Total')),
            "rating": get_movie_value(soup, "MPAA Rating"),
            "release_date": get_movie_value(soup, 'Release Date'),
            "distributor": get_movie_value(soup, "Distributor"),
            "budget": get_movie_value(soup, "Production Budget"),
            "genre": get_movie_value(soup, "Genre:"),
        })
    return pd.DataFrame(records)
        #break 
        

# Scrape every comedy movie page (one HTTP request per movie, so this cell is slow).
# For a quick smoke test, pass the ten-link sample `tester_links` instead of `movie_links`.
df = build_df(movie_links)


In [102]:
#store df in pickle file
# Serialise the scraped frame to df_box_office_mojo.pkl in the working directory;
# the file is opened in binary mode as pickle requires.
with open('df_box_office_mojo.pkl', 'wb') as picklefile:
    pickle.dump(df, picklefile)

In [1]:
#df.budget.value_counts()

In [48]:
#df.release_date = str(df['release_date'])
#df.release_date = pd.to_datetime(df['release_date'])

Now that a dataframe of Box Office Mojo data has been scraped and organized, we can add additional data from IMDb and Rotten Tomatoes. An API can be used to find data for each movie. This will then be stored and added to the larger data frame.

In [61]:
def movie_titles(df):
    """Return the titles in `df` URL-encoded for use in an API query string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'title' column.

    Returns
    -------
    list of str
        One entry per row, each title passed through str() and quote_plus
        (spaces become '+', reserved characters are percent-encoded).
    """
    # quote_plus lives in urllib.parse on Python 3 and in urllib on Python 2;
    # the bare `urllib.quote_plus` attribute only exists on Python 2.
    try:
        from urllib.parse import quote_plus
    except ImportError:
        from urllib import quote_plus
    return [quote_plus(str(title)) for title in df['title'].tolist()]

# NOTE(review): this rebinds `movie_titles` from the function to the list it returns;
# the cells below rely on `movie_titles` being that list.
movie_titles = movie_titles(df)
#print movie_titles

In [93]:
def api_link(movie_titles):
    """Return one omdbapi.com title-lookup link per entry of `movie_titles`."""
    base_url = "http://www.omdbapi.com/?t="
    return [base_url + title for title in movie_titles]

# One API lookup link per scraped movie title, in the same order as `movie_titles`.
api_links = api_link(movie_titles)
        

In [103]:
def api_tester_links(links, n=10):
    '''Return a small sample of api_links for testing the API function.

    Parameters
    ----------
    links : iterable
        The API links to sample from.
    n : int, optional
        Maximum number of links to return (default 10).

    Returns
    -------
    list
        The first `n` links, or all of them when fewer than `n` are available
        (the previous index loop raised IndexError in that case).
    '''
    return list(links)[:n]

# NOTE(review): this rebinds `api_tester_links` from the function to its result (a short
# sample of API links), so the function is shadowed until its `def` is re-run.
api_tester_links = api_tester_links(api_link(movie_titles))
#print api_tester_links


### Take the list of links for the api and open them to pull dictionaries for each movie. 

In [94]:
# Query-string parameters sent with every API request; `tomatoes` asks for the
# Rotten Tomatoes rating to be included.
parameters = {"tomatoes": True}

def imdb_api(links, parameters):
    """Request each API link and return the decoded JSON responses.

    Parameters
    ----------
    links : iterable of str
        API URLs to request.
    parameters : dict
        Extra query-string parameters sent with every request.

    Returns
    -------
    list
        One decoded JSON object per link, in the same order as `links`.
        (The caller turns this list into a DataFrame.)
    """
    results = []
    for url in links:
        # A single request per URL: the earlier parameter-less request was
        # discarded immediately and only doubled the network traffic.
        response = requests.get(url=url, params=parameters)
        results.append(json.loads(response.text))
    return results


# Query the API for every movie (one HTTP request per link); `d` is a list of decoded responses.
d = imdb_api(api_links, parameters)
#turn list of dictionaries into dataframe
df_api = pd.DataFrame(d)






In [97]:
#store df_api in pickle file
# Serialise the API frame to df_api.pkl in the working directory (binary mode).
with open('df_api.pkl', 'wb') as picklefile:
    pickle.dump(df_api, picklefile)

### Merge the box office mojo and imdb data frames

Rename the `Title` column in `df_api` to `title` so that it can be merged with the `title` column of the Box Office Mojo `df`.

In [104]:
# Align the join key: the API frame uses 'Title', the Box Office Mojo frame uses 'title'.
# NOTE(review): `df_api` is overwritten in place, so re-running this cell is a no-op.
df_api = df_api.rename(columns = {'Title':'title'})

merge data frames

In [105]:
# Join the two frames on exact title equality. pd.merge defaults to an inner join,
# so movies whose title is absent or spelled differently in either frame are dropped.
df_merge = pd.merge(df, df_api, on='title')

In [115]:
#store merged df in pickle file
# Serialise the merged frame to df_merge.pkl in the working directory (binary mode).
with open('df_merge.pkl', 'wb') as picklefile:
    pickle.dump(df_merge, picklefile)

### Testing subplots on data already scraped 

In [70]:

"""plt.figure(figsize=(15,10))
plt.plot_date(df.release_date, df.domestic_total_gross)
plt.xlabel('Runtime')
plt.ylabel('Domestic total gross')
plt.title('Domestic total gross by runtime')"""

"plt.figure(figsize=(15,10))\nplt.plot_date(df.release_date, df.domestic_total_gross)\nplt.xlabel('Runtime')\nplt.ylabel('Domestic total gross')\nplt.title('Domestic total gross by runtime')"