In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_formats = ['svg']  # or svg
%matplotlib inline

sns.set()

Grab the links of movies from the 2000 to 2019

In [2]:
url = 'https://www.boxofficemojo.com/year/{}/?ref_=bo_yl_table_3'
urls = []
for year in range(2000,2019):
    urls.append(url.format(str(year)))
print(urls)

['https://www.boxofficemojo.com/year/2000/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2001/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2002/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2003/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2004/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2005/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2006/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2007/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2008/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2009/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2010/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2011/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2012/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2013/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2014/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2015/?ref_=bo_yl_t

  Scrapping Domestic Movies for year 2000 - 2019

In [3]:
def get_all_rows(urls):
    rows = []
    for url in urls:
        response = requests.get(url)
        page = response.text
        Soup = BeautifulSoup(page,'lxml')
        table = Soup.find('table')
        for row in table.find_all('tr')[1:]:
            rows.append(row)
    return rows
    
rows = get_all_rows(urls)

In [4]:
len(rows)

3800

Get Table From Page / Scrape Table

In [5]:
def money_to_int(moneystring):
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def generate_movies_dict(rows):
    movies = {}
    for row in rows:
        items = row.find_all('td')
        link = items[1].find('a')
        title, url = link.text, link['href']
        
        # avoid repeted titles 
        if title in movies.keys():
            continue
        else:
            domestic_gross = items[5].text
            domestic_gross = money_to_int(domestic_gross)
            distributor = items[9].text[:-2]
            movies[title] = [url,domestic_gross, distributor]
    return movies
movies = generate_movies_dict(rows)

In [6]:
movies_df = pd.DataFrame(movies).T  #transpose
movies_df.columns =['link', 
                    'domestic_gross','distributor']

movies_df.head()

Unnamed: 0,link,domestic_gross,distributor
How the Grinch Stole Christmas,/release/rl3059189249/?ref_=bo_yld_table_1,251628705,Universal Pictures
Mission: Impossible II,/release/rl1600292353/?ref_=bo_yld_table_2,215409889,Paramount Pictures
Gladiator,/release/rl2136245761/?ref_=bo_yld_table_3,186610052,DreamWorks Distribution
The Perfect Storm,/release/rl661161473/?ref_=bo_yld_table_4,182618434,Warner Bros.
Meet the Parents,/release/rl677545473/?ref_=bo_yld_table_5,161146255,Universal Pictures


In [7]:
import dateutil.parser
import re

def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
        
    if next_element:
        if field_name is 'IMDbPro':
            return next_element.find_all('a')[0].get('href')
        else:
            return next_element.text 
    else:
        return None
    


def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None


def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic_gross
        - total gross
        - budget
        - runtime 
        - Genres
        - MPAA rating
        -distributor
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['movie_title', 'domestic_gross', 'total_gross','budget', 
               'runtime_minutes', 'genres','rating', 'imd_url','distributor']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()

    #Get World Wide gross
    head = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money')
    
    #Domestic Gross
    #print(len(head))
    raw_domestic_gross = (head[0].text)
    domestic_gross = money_to_int(raw_domestic_gross)
    
    #Gross Total
    if len(head) == 3:
        raw_worldwide_total_gross = (head[2].text
                               )
        total_gross = money_to_int(raw_worldwide_total_gross)
    elif (len(head) == 2) and (head[0].txt == head[1].txt):
        raw_worldwide_total_gross = (head[1].txt)
        total_gross = money_to_int(raw_worldwide_total_gross)
    else:
        total_gross = None
        
    
    # Get Budget
    raw_budget = get_movie_value(soup, 'Budget')
    if raw_budget is not None:
        budget = money_to_int(raw_budget)
    else:
        budget = raw_budget
    #Get runtime
    raw_runtime = get_movie_value(soup,'Running')
    runtime = runtime_to_minutes(raw_runtime)
    # Considering the first Genres
    genres = get_movie_value(soup, 'Genres')
    genres = genres.split('\n')[0]
    #Get rating
    rating = get_movie_value(soup,'MPAA')
    imd_url = get_movie_value(soup, 'IMDbPro')
    #print(f'print link {next_link}')
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [title, domestic_gross, total_gross, budget, runtime, genres, rating, imd_url]))

    return movie_dict


In [8]:
def scrape_imdbpro(url):
    response = requests.get(url)
    page = response.text
    imd_soup = BeautifulSoup(page,"lxml")
    
    head = imd_soup.find_all('div', id='const_page_summary_section')
    imd_dict = {}
    spans = head[0].find_all('span')
    for i, span in enumerate(spans):
        #print(f'{i}  {span.text}')
        if 'Director' in span.text:
            if 'director' not in imd_dict:
                imd_dict['director'] = spans[i+1].text
        if 'Cinematographer' in span.text:
             imd_dict['cinematographer'] = spans[i+1].text
        if 'Producer' in span.text:
            imd_dict['producers'] = spans[i+1].text        
    return imd_dict

def scrape_opening(url):
    response = requests.get(url)
    page = response.text
    imd_soup = BeautifulSoup(page,"lxml")
    head = imd_soup.find_all('span', class_='a-color-secondary')
    return head
    
    

In [9]:
movies_info_list = []
for i, link in enumerate(movies_df.link):
    try:
        movie_dict = get_movie_dict(link)
        #print(movie_dict)
        imd_dict = scrape_imdbpro(movie_dict['imd_url'])
        imd_dict['distributor'] = movies_df.iloc[i,2]
        #print(imd_dict)
        movies_info_list.append( dict(movie_dict, **imd_dict))
        #total_gross = movie_dict['total_gross']
        #print(f' The total Gross is {total_gross}')
    except:
        print(f'Error in {link}')

Error in /release/rl1768326657/?ref_=bo_yld_table_66
Error in /release/rl1533511169/?ref_=bo_yld_table_105
Error in /release/rl508593665/?ref_=bo_yld_table_133
Error in /release/rl3611526657/?ref_=bo_yld_table_134
Error in /release/rl912164353/?ref_=bo_yld_table_139
Error in /release/rl3598419457/?ref_=bo_yld_table_140
Error in /release/rl3176433153/?ref_=bo_yld_table_143
Error in /release/rl2168555009/?ref_=bo_yld_table_144
Error in /release/rl2253489665/?ref_=bo_yld_table_146
Error in /release/rl4131816961/?ref_=bo_yld_table_148
Error in /release/rl309822977/?ref_=bo_yld_table_161
Error in /release/rl2572912129/?ref_=bo_yld_table_164
Error in /release/rl945391105/?ref_=bo_yld_table_172
Error in /release/rl1011975681/?ref_=bo_yld_table_175
Error in /release/rl1146783233/?ref_=bo_yld_table_184
Error in /release/rl1499170305/?ref_=bo_yld_table_188
Error in /release/rl1717274113/?ref_=bo_yld_table_197
Error in /release/rl1129612801/?ref_=bo_yld_table_198
Error in /release/rl3296429569/?r

In [10]:
movies_info_list[0]

{'movie_title': 'How the Grinch Stole Christmas',
 'domestic_gross': 260044825,
 'total_gross': 345141403,
 'budget': 123000000,
 'runtime_minutes': 104,
 'genres': 'Comedy',
 'rating': 'PG',
 'imd_url': 'https://pro.imdb.com/title/tt0170016?ref_=mojo_rl_summary&rf=mojo_rl_summary',
 'director': 'Ron Howard',
 'producers': 'Brian Grazer',
 'cinematographer': 'Donald Peterman',
 'distributor': 'Universal Pictures'}

# EDA

In [1390]:
movies = pd.DataFrame(movies_info_list)

In [1391]:
movies.head()

Unnamed: 0,movie_title,domestic_gross,total_gross,budget,runtime_minutes,genres,rating,imd_url,director,producers,cinematographer,distributor
0,How the Grinch Stole Christmas,260044825,345141403,123000000.0,104.0,Comedy,PG,https://pro.imdb.com/title/tt0170016?ref_=mojo...,Ron Howard,Brian Grazer,Donald Peterman,Universal Pictures
1,Mission: Impossible II,215409889,546388108,125000000.0,123.0,Action,PG-13,https://pro.imdb.com/title/tt0120755?ref_=mojo...,John Woo,Tom Cruise,Jeffrey L. Kimball,Paramount Pictures
2,Gladiator,187705427,460583960,103000000.0,155.0,Action,R,https://pro.imdb.com/title/tt0172495?ref_=mojo...,Ridley Scott,David Franzoni,John Mathieson,DreamWorks Distribution
3,The Perfect Storm,182618434,328718434,140000000.0,130.0,Action,PG-13,https://pro.imdb.com/title/tt0177971?ref_=mojo...,Wolfgang Petersen,Gail Katz,John Seale,Warner Bros.
4,Meet the Parents,166244045,330444045,55000000.0,108.0,Comedy,PG-13,https://pro.imdb.com/title/tt0212338?ref_=mojo...,Jay Roach,Robert De Niro,Peter James,Universal Pictures


Deleting the columns with no longer use

In [1392]:
del movies['imd_url']
del movies['domestic_gross']

In [1393]:
movies.rename(columns = {'total_gross':'revenue'}, inplace = True)

In [1394]:
movies.head()

Unnamed: 0,movie_title,revenue,budget,runtime_minutes,genres,rating,director,producers,cinematographer,distributor
0,How the Grinch Stole Christmas,345141403,123000000.0,104.0,Comedy,PG,Ron Howard,Brian Grazer,Donald Peterman,Universal Pictures
1,Mission: Impossible II,546388108,125000000.0,123.0,Action,PG-13,John Woo,Tom Cruise,Jeffrey L. Kimball,Paramount Pictures
2,Gladiator,460583960,103000000.0,155.0,Action,R,Ridley Scott,David Franzoni,John Mathieson,DreamWorks Distribution
3,The Perfect Storm,328718434,140000000.0,130.0,Action,PG-13,Wolfgang Petersen,Gail Katz,John Seale,Warner Bros.
4,Meet the Parents,330444045,55000000.0,108.0,Comedy,PG-13,Jay Roach,Robert De Niro,Peter James,Universal Pictures


In [1395]:
movies.shape

(3288, 10)

Handling missing values

In [1396]:

movies.isna().sum()


movie_title           0
revenue               0
budget             1039
runtime_minutes      80
genres                0
rating              147
director              2
producers             9
cinematographer     135
distributor           0
dtype: int64

In [1397]:
#Drop nan values
movies = movies.dropna()

In [1398]:
movies.shape

(2087, 10)

In [None]:
#visualize the relationship between numeric variables
sns.pairplot(movies, height=2, plot_kws={'s': 5});

Checking the values in each feature

In [1150]:
movies['genres'].value_counts()

Action         692
Comedy         552
Drama          285
Adventure      171
Biography      118
Crime          116
Horror         110
Animation       14
Fantasy          7
Documentary      7
Mystery          5
Thriller         4
Sci-Fi           2
Family           2
Music            1
Romance          1
Name: genres, dtype: int64

In [1151]:
movies.genres.unique()

array(['Comedy', 'Action', 'Drama', 'Adventure', 'Biography', 'Horror',
       'Crime', 'Fantasy', 'Mystery', 'Animation', 'Family',
       'Documentary', 'Thriller', 'Sci-Fi', 'Romance', 'Music'],
      dtype=object)

In [1152]:
#selecting the most popular genres and replacing 'other_genres' in place of the restof the genres

In [1399]:
movies.replace({'genres' : { 'Fantasy' : 'other_genres', 'Mystery' :'other_genres', 'Family' : 'other_genres', 'Documentary' : 'other_genres', 'Thriller' : 'other_genres','Sci-Fi' : 'other_genres','Romance': 'other_genres', 'Music' : 'other_genres'}}, inplace= True)

In [1400]:
movies.genres.unique()

array(['Comedy', 'Action', 'Drama', 'Adventure', 'Biography', 'Horror',
       'Crime', 'other_genres', 'Animation'], dtype=object)

In [1155]:
#2 Rating

In [1156]:
movies['rating'].value_counts()

PG-13    964
R        832
PG       282
G          8
NC-17      1
Name: rating, dtype: int64

In [1401]:
#Replacing the rare ratings with a similar one 
movies['rating'] = movies['rating'].replace(['G'],'PG')
movies['rating'] = movies['rating'].replace(['NC-17'],'PG')


In [1402]:
movies['rating'].value_counts()

PG-13    964
R        832
PG       291
Name: rating, dtype: int64

In [1159]:
#3 director, producer, cinematographer

In [1160]:
movies['director'].value_counts()


Steven Spielberg    16
Clint Eastwood      16
Ridley Scott        13
Ron Howard          11
Lasse Hallström     10
                    ..
Brian Koppelman      1
Harmony Korine       1
Gary McKendry        1
Mark Piznarski       1
Gregory Poirier      1
Name: director, Length: 997, dtype: int64

In [1161]:
movies_df['producers'].value_counts()


In [1162]:
movies_df['cinematographer'].value_counts()


In [1163]:
#4 distributor

In [1164]:
movies['distributor'].value_counts()



Warner Bros.                         293
Universal Pictures                   249
Twentieth Century Fox                221
Sony Pictures Entertainment (SPE)    179
Paramount Pictures                   177
                                    ... 
Annapurna Pictures                     1
Magnolia Pictures                      1
R.S. Entertainment                     1
Aviron Pictures                        1
Vivendi Entertainment                  1
Name: distributor, Length: 67, dtype: int64

In [1165]:
movies.distributor.unique()

array(['Universal Pictures', 'Paramount Pictures',
       'DreamWorks Distribution', 'Warner Bros.', 'Twentieth Century Fox',
       'Miramax', 'Walt Disney Studios Motion Pictures',
       'Sony Pictures Entertainment (SPE)', 'Dimension Films',
       'New Line Cinema', 'USA Films', 'Metro-Goldwyn-Mayer (MGM)',
       'Artisan Entertainment', 'Focus Features', 'Destination Films',
       'Lionsgate', 'Fine Line Features', 'Fox Searchlight Pictures',
       'Paramount Classics', 'Revolution Studios', 'United Artists',
       'Screen Gems', 'Newmarket Films', 'IFC Films',
       'R.S. Entertainment', 'Rogue Pictures', 'IDP Distribution',
       'Warner Independent Pictures (WIP)', 'The Weinstein Company',
       'Sony Pictures Classics', '', 'DreamWorks',
       'Yari Film Group Releasing', 'Freestyle Releasing',
       '8x Entertainment', 'Rocky Mountain Pictures', 'Fox Atomic',
       'Magnolia Pictures', 'Picturehouse', 'Paramount Vantage',
       'Roadside Attractions', 'Chicago Pic

In [1403]:
movies['distributor'] = movies['distributor'].replace(['DreamWorks Distribution','Miramax','Dimension Films',
       'New Line Cinema', 'USA Films', 'Metro-Goldwyn-Mayer (MGM)',
       'Artisan Entertainment', 'Focus Features', 'Destination Films',
       'Lionsgate', 'Fine Line Features', 'Fox Searchlight Pictures',
       'Paramount Classics', 'Revolution Studios', 'United Artists',
       'Screen Gems', 'Newmarket Films', 'IFC Films',
       'R.S. Entertainment', 'Rogue Pictures', 'IDP Distribution',
       'Warner Independent Pictures (WIP)', 'The Weinstein Company',
       'Sony Pictures Classics', '', 'DreamWorks',
       'Yari Film Group Releasing', 'Freestyle Releasing',
       '8x Entertainment', 'Rocky Mountain Pictures', 'Fox Atomic',
       'Magnolia Pictures', 'Picturehouse', 'Paramount Vantage',
       'Roadside Attractions', 'Chicago Pictures', 'Summit Entertainment',
       'Overture Films', 'The Samuel Goldwyn Company', 'TriStar Pictures',
       'Apparition', 'CBS Films', 'Music Box Films', 'Anchor Bay Films',
       'Relativity Media', 'Vivendi Entertainment', 'FilmDistrict',
       'Open Road Films (II)', 'A24', 'STX Entertainment', 'BH Tilt',
       'Bleecker Street Media', 'EuropaCorp', 'Broad Green Pictures',
       'Entertainment Studios Motion Pictures', 'Aviron Pictures',
       'Annapurna Pictures', 'Affirm Films', 'Pantelion Films',
       'Studio 8', 'Orion Pictures'],'other_distributor')


In [1404]:
movies.distributor.unique()

array(['Universal Pictures', 'Paramount Pictures', 'other_distributor',
       'Warner Bros.', 'Twentieth Century Fox',
       'Walt Disney Studios Motion Pictures',
       'Sony Pictures Entertainment (SPE)'], dtype=object)

Dropping columns that doesn't seem to have signifcant relation with the target

In [1405]:
del movies['director']
del movies['producers']
del movies['cinematographer']

In [1406]:
movies.head()

Unnamed: 0,movie_title,revenue,budget,runtime_minutes,genres,rating,distributor
0,How the Grinch Stole Christmas,345141403,123000000.0,104.0,Comedy,PG,Universal Pictures
1,Mission: Impossible II,546388108,125000000.0,123.0,Action,PG-13,Paramount Pictures
2,Gladiator,460583960,103000000.0,155.0,Action,R,other_distributor
3,The Perfect Storm,328718434,140000000.0,130.0,Action,PG-13,Warner Bros.
4,Meet the Parents,330444045,55000000.0,108.0,Comedy,PG-13,Universal Pictures


Converting the categorical features into dummy varables

In [1407]:
del movies['movie_title']

In [1408]:
movies = pd.get_dummies(movies, drop_first = True)

In [1409]:
movies.head()

Unnamed: 0,revenue,budget,runtime_minutes,genres_Adventure,genres_Animation,genres_Biography,genres_Comedy,genres_Crime,genres_Drama,genres_Horror,genres_other_genres,rating_PG-13,rating_R,distributor_Sony Pictures Entertainment (SPE),distributor_Twentieth Century Fox,distributor_Universal Pictures,distributor_Walt Disney Studios Motion Pictures,distributor_Warner Bros.,distributor_other_distributor
0,345141403,123000000.0,104.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
1,546388108,125000000.0,123.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,460583960,103000000.0,155.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,328718434,140000000.0,130.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
4,330444045,55000000.0,108.0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0


In [1410]:
movies.columns = movies.columns.map(str.strip)
movies.columns

Index(['revenue', 'budget', 'runtime_minutes', 'genres_Adventure',
       'genres_Animation', 'genres_Biography', 'genres_Comedy', 'genres_Crime',
       'genres_Drama', 'genres_Horror', 'genres_other_genres', 'rating_PG-13',
       'rating_R', 'distributor_Sony Pictures Entertainment (SPE)',
       'distributor_Twentieth Century Fox', 'distributor_Universal Pictures',
       'distributor_Walt Disney Studios Motion Pictures',
       'distributor_Warner Bros.', 'distributor_other_distributor'],
      dtype='object')

In [1174]:
movies.corr()

Unnamed: 0,revenue,budget,runtime_minutes,genres_Adventure,genres_Animation,genres_Biography,genres_Comedy,genres_Crime,genres_Drama,genres_Horror,genres_other_genres,rating_PG-13,rating_R,distributor_Sony Pictures Entertainment (SPE),distributor_Twentieth Century Fox,distributor_Universal Pictures,distributor_Walt Disney Studios Motion Pictures,distributor_Warner Bros.,distributor_other_distributor
revenue,1.0,0.727755,0.38541,0.101949,-0.010495,-0.062868,-0.181314,-0.075178,-0.109314,-0.067701,-0.024327,0.18897,-0.222191,0.049695,0.084475,0.035275,0.185472,0.068603,-0.257539
budget,0.727755,1.0,0.422915,0.126852,0.015652,-0.091852,-0.239621,-0.09508,-0.149237,-0.168265,-0.051495,0.229998,-0.294354,0.094752,0.063213,0.039132,0.223929,0.122142,-0.360152
runtime_minutes,0.38541,0.422915,1.0,-0.072703,-0.067884,0.186107,-0.235469,0.070286,0.106116,-0.167108,-0.044928,0.110452,0.016935,0.022813,-0.027907,0.037083,0.09646,0.07637,-0.138823
genres_Adventure,0.101949,0.126852,-0.072703,1.0,-0.024551,-0.073134,-0.17915,-0.072475,-0.118808,-0.070468,-0.035463,-0.157644,-0.136198,0.002081,0.107265,-0.056062,0.083062,0.010022,-0.075263
genres_Animation,-0.010495,0.015652,-0.067884,-0.024551,1.0,-0.020118,-0.049281,-0.019937,-0.032682,-0.019385,-0.009755,-0.064366,-0.054923,-0.004209,0.105256,-0.030248,0.049323,-0.033211,-0.042863
genres_Biography,-0.062868,-0.091852,0.186107,-0.073134,-0.020118,1.0,-0.146803,-0.059389,-0.097356,-0.057745,-0.02906,-0.031231,0.016772,0.013923,-0.03031,0.031498,0.002127,-0.039214,0.025491
genres_Comedy,-0.181314,-0.239621,-0.235469,-0.17915,-0.049281,-0.146803,1.0,-0.145479,-0.238485,-0.141452,-0.071186,0.037104,-0.011226,0.018061,-0.019253,0.040689,-0.005421,-0.054716,0.029305
genres_Crime,-0.075178,-0.09508,0.070286,-0.072475,-0.019937,-0.059389,-0.145479,1.0,-0.096479,-0.057224,-0.028798,-0.119895,0.186896,-0.014558,-0.015521,-0.011871,-0.055858,0.004301,0.05046
genres_Drama,-0.109314,-0.149237,0.106116,-0.118808,-0.032682,-0.097356,-0.238485,-0.096479,1.0,-0.093808,-0.047209,0.076569,-0.01031,-0.062012,-0.059769,-0.051672,-0.020896,0.024053,0.104099
genres_Horror,-0.067701,-0.168265,-0.167108,-0.070468,-0.019385,-0.057745,-0.141452,-0.057224,-0.093808,1.0,-0.028001,-0.0637,0.132036,-0.033959,-0.046332,-0.000821,-0.062522,0.00961,0.084108


In [1175]:
# budget have a high positive correlation with the revenue
#There is no significant correlation between feature therefore no multicollinearity

# Building a baseline model using linear regression 

In [1411]:
#use budget and distributor as initial features
X1, y1 = movies.drop(['revenue', 'runtime_minutes',
       'genres_Adventure', 'genres_Animation', 'genres_Biography',
       'genres_Comedy', 'genres_Crime', 'genres_Drama', 'genres_Horror',
       'genres_other_genres', 'rating_PG-13', 'rating_R'],axis=1), movies['revenue']
X1.head()
y1.head()

0    345141403
1    546388108
2    460583960
3    328718434
4    330444045
Name: revenue, dtype: int64

In [1412]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (cross_val_score, train_test_split, 
                                     KFold, GridSearchCV)
# hold out 20% of the data for final testing
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=.2, random_state=10)

# Spliting the train data into 5 folds for cross validaion
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

#Build a baseline model with linear regression
lm1 = LinearRegression()
scores = cross_val_score(lm1, X1_train, y1_train, cv=kfold)
print(scores)
print("Linear Reg Mean Score: ", np.mean(scores))

#Fitting the model
lm1.fit(X1_train, y1_train)


[0.52331035 0.45021161 0.51984926 0.57065188 0.5284636 ]
Linear Reg Mean Score:  0.5184973411554872


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [1413]:
lm1.score(X1_test,y1_test)

0.5600767726368105

In [1179]:
#The model performed better on the test set than the training set

Model Evaluation

In [1414]:
# Fitted vs. Actual on training data
#y1_pred = lm1.predict(X1_train)

[plt.scatter(y1_train, y1_pred, alpha=0.5)
plt.plot([0, 800000000], [0, 1000000000])
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title("Actual revenue vs Predicted revenue")]

SyntaxError: invalid syntax (<ipython-input-1414-e5ceb9f1171f>, line 5)

In [1181]:
# Fitted vs. Actual on test data
[y_pred_test = lm1.predict(X1_test)

plt.scatter(y1_test, y_pred_test,alpha=0.5)
plt.plot([0, 1000000000], [0, 1000000000])
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title("Actual vs Predicted on test_data_set")]

SyntaxError: invalid syntax (<ipython-input-1181-5cbafd5998f8>, line 2)

In [1182]:
# Plot Residuals vs. predicted

[lin_reg_residuals = y1_train - y_pred

plt.scatter(y1_pred, lin_reg_residuals,alpha=0.5)
plt.plot([-100000000,1000000000], [0,0])
plt.title("Residuals vs. Predictions")

SyntaxError: invalid syntax (<ipython-input-1182-f25fb4495418>, line 3)

# Further data adjustment and feature engineering

In [1183]:
# get rid of outliers

In [1184]:
column = movies['revenue']
print(column.max())
print(column.min())
print(column.mean())

2743577587
2530394
154344908.50359368


In [1185]:
column = movies['budget']
print(column.max())
print(column.min())
print(column.mean())

317000000.0
15000.0
52098565.88404408


In [1415]:
movies.nlargest(25, ['revenue'])

Unnamed: 0,revenue,budget,runtime_minutes,genres_Adventure,genres_Animation,genres_Biography,genres_Comedy,genres_Crime,genres_Drama,genres_Horror,genres_other_genres,rating_PG-13,rating_R,distributor_Sony Pictures Entertainment (SPE),distributor_Twentieth Century Fox,distributor_Universal Pictures,distributor_Walt Disney Studios Motion Pictures,distributor_Warner Bros.,distributor_other_distributor
1571,2743577587,237000000.0,162.0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
2598,2068223624,245000000.0,138.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2597,1670400637,150000000.0,124.0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
2081,1518812988,220000000.0,143.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2601,1515047671,190000000.0,137.0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
2599,1402805868,250000000.0,141.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2946,1332539889,317000000.0,152.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
3118,1308467944,170000000.0,128.0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
2954,1236005118,250000000.0,136.0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
2251,1214811252,200000000.0,130.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


In [1187]:
movies.nsmallest(10, ['revenue'])

Unnamed: 0,revenue,budget,runtime_minutes,genres_Adventure,genres_Animation,genres_Biography,genres_Comedy,genres_Crime,genres_Drama,genres_Horror,genres_other_genres,rating_PG-13,rating_R,distributor_Sony Pictures Entertainment (SPE),distributor_Twentieth Century Fox,distributor_Universal Pictures,distributor_Walt Disney Studios Motion Pictures,distributor_Warner Bros.,distributor_other_distributor
2076,2530394,10000000.0,97.0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0
2074,2820490,5000000.0,97.0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
1903,2908893,3000000.0,110.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2415,2940411,2000000.0,80.0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
175,3003296,44000000.0,95.0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
171,3169930,14000000.0,112.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1
2239,3332854,18000000.0,115.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2582,3485383,6000000.0,107.0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0
167,3548556,18000000.0,118.0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1
170,3728888,7000000.0,90.0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1


In [None]:
#Check the distribution of the data  

In [1188]:

def get_iqr_values(df_in, col_name):
    median = df_in[col_name].median()
    q1 = df_in[col_name].quantile(0.25) # 25th percentile / 1st quartile
    q3 = df_in[col_name].quantile(0.75) # 7th percentile / 3rd quartile
    iqr = q3-q1 #Interquartile range
    minimum  = q1-iqr # The minimum value or the |- marker in the box plot
    maximum = q3 + iqr # The maximum value or the -| marker in the box plot
    return median, q1, q3, iqr, minimum, maximum

def get_iqr_text(df_in, col_name):
    median, q1, q3, iqr, minimum, maximum = get_iqr_values(df_in, col_name)
    text = f"median={median:.2f}, q1={q1:.2f}, q3={q3:.2f}, iqr={iqr:.2f}, suggested_minimum={minimum:.2f}, sudgested_maximum={maximum:.2f}"
    return text

def remove_outliers(df_in, col_name, minimum, maxmimum):
    df_out = df_in.loc[(df_in[col_name] > minimum) & (df_in[col_name] < maximum)]
    return df_out

def count_outliers(df_in, col_name):
    _, _, _, _, minimum, maximum = get_iqr_values(df_in, col_name)
    df_outliers = df_in.loc[(df_in[col_name] <= minimum) | (df_in[col_name] >= maximum)]
    return df_outliers.shape[0]

def box_and_whisker(df_in, col_name):
    title = get_iqr_text(df_in, col_name)
    sns.boxplot(df_in[col_name])
    plt.title(title)
    plt.show()

In [1189]:
#box_and_whisker(movies, 'revenue')
text = get_iqr_text(movies, 'revenue')
print(text)


median=80547866.00, q1=35301209.50, q3=178783345.50, iqr=143482136.00, suggested_minimum=-108180926.50, sudgested_maximum=322265481.50


In [1416]:
new_movies = movies.loc[movies['revenue'] < 1500000000, :]
new_movies = new_movies.loc[new_movies['revenue'] > 10000000, :]
new_movies.shape

(1999, 19)

In [1417]:
#check for skewness
from scipy.stats import skew
print(skew(new_movies['revenue']))

#plt.figure()
#sns.distplot(movies['revenue'])
#plt.show()

2.6192363083322303


In [1418]:

#very high posoive skewness
#lets correct it by applying a log transformation
y = np.log(new_movies['revenue'])
skew(y)

0.13341215861490488

In [1366]:
# Add a new feature - Running_time

In [1419]:
X2 = new_movies.drop(['revenue','genres_Adventure', 'genres_Animation', 'genres_Biography',
       'genres_Comedy', 'genres_Crime', 'genres_Drama', 'genres_Horror',
       'genres_other_genres', 'rating_PG-13', 'rating_R'],axis=1)
X2.head()

Unnamed: 0,budget,runtime_minutes,distributor_Sony Pictures Entertainment (SPE),distributor_Twentieth Century Fox,distributor_Universal Pictures,distributor_Walt Disney Studios Motion Pictures,distributor_Warner Bros.,distributor_other_distributor
0,123000000.0,104.0,0,0,1,0,0,0
1,125000000.0,123.0,0,0,0,0,0,0
2,103000000.0,155.0,0,0,0,0,0,1
3,140000000.0,130.0,0,0,0,0,1,0
4,55000000.0,108.0,0,0,1,0,0,0


In [1420]:
# hold out 20% of the data for final testing
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=.2, random_state=10)

# Spliting the train data into 5 folds for cross validaion
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

#Building a linear regression model
lm2 = LinearRegression()
scores = cross_val_score(lm2, X2_train, y2_train, cv=kfold)
print(scores)
print("Linear Reg Mean Score: ", np.mean(scores))
lm2.fit(X2_train, y2_train)

[0.43520689 0.49743458 0.51738597 0.4464944  0.46487202]
Linear Reg Mean Score:  0.47227877234679844


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [1421]:
lm2.score(X2_test,y2_test)

0.4617678837692991

In [1255]:
# Fitted vs. Actual on training data
[y2_pred = lm2.predict(X2_train)

plt.scatter(y2_train, y2_pred, alpha=0.5)
plt.plot([0, 800000000], [0, 1000000000])
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title("Actual revenue vs Predicted revenue")]

SyntaxError: invalid syntax (<ipython-input-1255-f880ca8fbb9d>, line 2)

In [1256]:
# Plot Residuals vs. predicted
[lin_reg_residuals = y2_train - y2_pred

plt.scatter(y2_pred, lin_reg_residuals,alpha=0.5)
plt.plot([-100000000,1000000000], [0,0])
plt.title("Residuals vs. Predictions")]

SyntaxError: invalid syntax (<ipython-input-1256-1ebeac44c86f>, line 2)

In [1257]:
# Add a new feature - genres  'rating_PG-13', 'rating_R'

In [1422]:
X3 = new_movies.drop(['revenue','rating_PG-13', 'rating_R',],axis=1)

X3.head()

Unnamed: 0,budget,runtime_minutes,genres_Adventure,genres_Animation,genres_Biography,genres_Comedy,genres_Crime,genres_Drama,genres_Horror,genres_other_genres,distributor_Sony Pictures Entertainment (SPE),distributor_Twentieth Century Fox,distributor_Universal Pictures,distributor_Walt Disney Studios Motion Pictures,distributor_Warner Bros.,distributor_other_distributor
0,123000000.0,104.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
1,125000000.0,123.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,103000000.0,155.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,140000000.0,130.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,55000000.0,108.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0


In [1423]:
# hold out 20% of the data for final testing
X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=.2, random_state=10)

# Spliting the train data into 5 folds for cross validaion
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

#Building a linear regression model
lm3 = LinearRegression()
scores = cross_val_score(lm3, X3_train, y_train, cv=kfold)
print(scores)
print("Linear Reg Mean Score: ", np.mean(scores))
lm3.fit(X3_train, y_train)

[0.44705116 0.50090584 0.52074614 0.45699997 0.47072483]
Linear Reg Mean Score:  0.4792855889036124


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [1424]:
lm3.score(X3_test,y_test)

0.4605816454270679

In [1261]:
#Add a new feature - rating

In [1425]:
X = new_movies.drop(['revenue'],axis=1)
X.head()


Unnamed: 0,budget,runtime_minutes,genres_Adventure,genres_Animation,genres_Biography,genres_Comedy,genres_Crime,genres_Drama,genres_Horror,genres_other_genres,rating_PG-13,rating_R,distributor_Sony Pictures Entertainment (SPE),distributor_Twentieth Century Fox,distributor_Universal Pictures,distributor_Walt Disney Studios Motion Pictures,distributor_Warner Bros.,distributor_other_distributor
0,123000000.0,104.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
1,125000000.0,123.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,103000000.0,155.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,140000000.0,130.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
4,55000000.0,108.0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0


In [1427]:
# hold out 20% of the data for final testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)

# Spliting the train data into 5 folds for cross validaion
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

#Building a linear regression model
lm = LinearRegression()
scores = cross_val_score(lm, X_train, y_train, cv=kfold)
print(scores)
print("Linear Reg Mean Score: ", np.mean(scores))
lm.fit(X_train, y_train)

[0.44492242 0.50293925 0.52373109 0.46253557 0.46794896]
Linear Reg Mean Score:  0.48041545795052054


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [1428]:
lm.score(X_test,y_test)

0.4582384264764612

In [1429]:
y_preds = lm.predict(X_test)

In [1430]:
mean_absolute_error(y_test,y_preds)

0.6765341356506701

In [1267]:
# Fitted vs. Actual on training data
[y_pred = lm.predict(X_train)

plt.scatter(y_train, y_pred, alpha=0.5)
#plt.plot([0, 800000000], [0, 1000000000])
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title("Actual revenue vs Predicted revenue")]

SyntaxError: invalid syntax (<ipython-input-1267-5988c09e1ff6>, line 2)

In [1268]:
# Fitted vs. Actual on test data
#y_pred_test = lm.predict(X_test)

plt.scatter(y_test, y_pred_test,alpha=0.5)
plt.plot([0, 1000000000], [0, 1000000000])
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title("Actual vs Predicted on test_data_set")

SyntaxError: invalid syntax (<ipython-input-1268-6c5927ccf899>, line 8)

In [693]:
# Plot Residuals vs. predicted
#y_pred_test = lm.predict(X_test)
lin_reg_residuals = y_test - y_pred_test

plt.scatter(y_pred_test, lin_reg_residuals,alpha=0.5)
plt.plot([17,21], [0,0])
plt.title("Residuals vs. Predictions")
plt.xlabel('Actual')
plt.ylabel('Predicted')

SyntaxError: invalid syntax (<ipython-input-693-871ac52fd912>, line 7)

Evaluation 

#The model's r2 on the test set increased with the new feature added  
#The model is still overfitting
#To improve that we will do a regularizaion

# Regularization

In [1431]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
pd.options.display.max_columns = None
%matplotlib inline
from IPython.display import display


In [1432]:
#Scalling the data
scaler = StandardScaler()

x_train_scaled= scaler.fit_transform(X_train.values)
X_test_scaled= scaler.transform(X_test.values)

In [1435]:
x_train = pd.DataFrame(x_train_scaled)

In [1437]:
x_train.shape


(1599, 18)

In [1438]:
y_train.shape

(1599,)

In [1439]:
alphavec = 10**np.linspace(-2,2,200)
lasso_model = LassoCV(alphas = alphavec, cv=5)
lasso_model.fit(X_train, y_train)

LassoCV(alphas=array([1.00000000e-02, 1.04737090e-02, 1.09698580e-02, 1.14895100e-02,
       1.20337784e-02, 1.26038293e-02, 1.32008840e-02, 1.38262217e-02,
       1.44811823e-02, 1.51671689e-02, 1.58856513e-02, 1.66381689e-02,
       1.74263339e-02, 1.82518349e-02, 1.91164408e-02, 2.00220037e-02,
       2.09704640e-02, 2.19638537e-02, 2.30043012e-02, 2.40940356e-02,
       2.52353917e-02, 2.64308149e-0...
       6.01027678e+01, 6.29498899e+01, 6.59318827e+01, 6.90551352e+01,
       7.23263390e+01, 7.57525026e+01, 7.93409667e+01, 8.30994195e+01,
       8.70359136e+01, 9.11588830e+01, 9.54771611e+01, 1.00000000e+02]),
        copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=1000,
        n_alphas=100, n_jobs=None, normalize=False, positive=False,
        precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
        verbose=False)

In [1440]:
list(zip(X_train.columns, lasso_model.coef_))

[('budget', 1.3122391622849242e-08),
 ('runtime_minutes', 0.006112890214918336),
 ('genres_Adventure', 0.0),
 ('genres_Animation', 0.0),
 ('genres_Biography', -0.0),
 ('genres_Comedy', -0.0),
 ('genres_Crime', -0.0),
 ('genres_Drama', -0.04882957741617074),
 ('genres_Horror', 0.11188683457428869),
 ('genres_other_genres', 0.0),
 ('rating_PG-13', 0.0),
 ('rating_R', -0.06458184403168224),
 ('distributor_Sony Pictures Entertainment (SPE)', 0.0),
 ('distributor_Twentieth Century Fox', 0.12024343328666441),
 ('distributor_Universal Pictures', 0.1168351677662735),
 ('distributor_Walt Disney Studios Motion Pictures', -0.0),
 ('distributor_Warner Bros.', -0.0),
 ('distributor_other_distributor', -0.19538263298719966)]

In [1443]:
#intrerpreting the coefficients(This is needed because the target variable had a log transformaion)
import math
budget = (math.exp(1.3122391622849242e-08) - 1) * 100 
print(budget)
runtime_minutes = (math.exp(0.006112890214918336) - 1) * 100 
print(runtime_minutes)
genres_Drama = (math.exp(-0.04882957741617074) - 1) * 100 
print(genres_Drama)
genres_Horror = (math.exp(0.11188683457428869) - 1) * 100
print(genres_Horror)

1.312239161777029e-06
0.6131612057055813
-4.76565833048066
11.838629082280573


In [1445]:
scores = cross_val_score(lasso_model, X_train, y_train, cv=kfold)

In [1446]:
print(scores)
print(scores.mean())

[0.44393893 0.49915659 0.52311633 0.44678714 0.46888241]
0.47637628063523324


In [1455]:
lasso_model.score(X_test,y_test)

0.47355932010070134

In [1449]:
from sklearn.metrics import mean_absolute_error

In [1452]:
y_pred = lasso_model.predict(X_test)

In [1453]:
MAE = mean_absolute_error(y_test,y_pred)
MAE

0.662922488657275

In [1360]:
#Actual vs predicted on test set
#plt.scatter(y_pred, y_test)
#plt.plot([17,21], [16,21])
#plt.title("Residuals vs. Predictions")

In [1350]:
# Plot Residuals vs. predicted

lasso_reg_residuals = y_test - y_pred

#plt.scatter(y_pred, lasso_reg_residuals,alpha=0.5)
#plt.plot([17,21], [0,0])
#plt.title("Residuals vs. Predictions")


In [1467]:
#ridge regularization
ridge_model = Ridge(alpha = 0.5)
ridge_model.fit(X_train, y_train)

list(zip(X_train.columns, lr_model_ridge.coef_))

  overwrite_a=True).T


[('budget', 1.2383014219349912e-08),
 ('runtime_minutes', 0.008202548719288301),
 ('genres_Adventure', 0.08033324999658602),
 ('genres_Animation', 0.11626809269287862),
 ('genres_Biography', -0.1492537069548378),
 ('genres_Comedy', -0.04183590917840805),
 ('genres_Crime', -0.15037963485407363),
 ('genres_Drama', -0.15959395078978886),
 ('genres_Horror', 0.3005665449249468),
 ('genres_other_genres', 0.20349852683993563),
 ('rating_PG-13', -0.023724492910749573),
 ('rating_R', -0.1398707898160156),
 ('distributor_Sony Pictures Entertainment (SPE)', 0.13621184901458802),
 ('distributor_Twentieth Century Fox', 0.2779273564212247),
 ('distributor_Universal Pictures', 0.2887598906995394),
 ('distributor_Walt Disney Studios Motion Pictures', -0.026422754278726507),
 ('distributor_Warner Bros.', 0.08218013715820525),
 ('distributor_other_distributor', -0.1347313985422708)]

In [1468]:
scores = cross_val_score(lr_model_ridge, X_train, y_train, cv=kfold)
print(scores)
print("Ridge Mean Score: ", np.mean(scores))
lr_model_ridge.fit(X_train, y_train)

[0.44517049 0.50296041 0.52382726 0.46250299 0.46808911]
Ridge Mean Score:  0.4805100535720168


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [1469]:
ridge_model.score(X_test,y_test)

0.4584712631412101

In [725]:
#The model is overfiting 