# Imdb and budgets join

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from helper import *

In [2]:
# Load the two cleaned tables that this notebook joins together.
imdb = pd.read_pickle('pickle/imdb_cleaned.p')
budgets = pd.read_pickle('pickle/budget_cleaned.p')

Let's put the data together.

Note that we have some duplicate titles in the budgets dataframe, so we cannot use only the movie title to join the datasets. The duplicates are movies with the same title but from different years, so we will also use the year to perform the join.

In [3]:
# Titles that occur more than once cannot serve as a join key on their own.
budgets['movie'].value_counts().head()

King Kong                     3
Home                          3
Ben-Hur                       3
The Last House on the Left    2
Fantastic Four                2
Name: movie, dtype: int64

In [4]:
# Example: one title shared by more than one row of the budgets table.
budgets.loc[budgets['movie'] == 'Brothers']

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_year,avg_cumulative_inflation,domestic_ROI,worldwide_ROI,adj_production_budget,adj_domestic_gross,adj_worldwide_gross,budget_type,release_month_day
1991,2009-12-04,Brothers,26000000.0,28544157.0,45043870.0,2009,1.118721,0.097852,0.732457,29086740.0,31932940.0,50391510.0,normal,2016-12-04
3095,2015-08-14,Brothers,13000000.0,656688.0,17856688.0,2015,1.012615,-0.949486,0.373591,13164000.0,664972.2,18081950.0,normal,2016-08-14


With this join we don't duplicate the rows on the imdb dataframe but we have several missing values:

In [5]:
# Left join on (title, year): every imdb row is kept, matched or not.
temp = imdb.merge(
    budgets,
    how='left',
    left_on=['movie_title', 'title_year'],
    right_on=['movie', 'release_year'],
)
# budget_type comes from the budgets side, so a null there marks an imdb row
# that found no budget record.
len(temp.loc[temp['budget_type'].isnull(), 'movie_title'])

898

By looking at the missing data we can distinguish three cases:

1. *Different dates*: on the imdb dataframe we have production dates, while on the budgets dataframe we have release dates; these dates aren't always in the same year even if we are looking at the same movie.
2. *Different titles*: some movies have the titles spelled in different ways, or with different punctuation and so on.
3. *Different languages*: on the imdb dataframe most of the titles are in English, while on the budgets dataframe they are mostly in the original language.

In [6]:
# Case 1 - same title, but the year differs between the two tables.
print(budgets.loc[budgets['movie'].str.contains('El Mariachi'), ['movie', 'release_year']])
print(imdb.loc[imdb['movie_title'].str.contains('El Mariachi'), ['movie_title', 'title_year']])

            movie  release_year
5394  El Mariachi          1993
      movie_title  title_year
5035  El Mariachi      1992.0


In [7]:
# Case 2 - same movie, but the title is punctuated differently in each table.
print(budgets.loc[budgets['movie'].str.contains('WALL-E'), ['movie', 'release_year']])
print(imdb.loc[imdb['movie_title'].str.contains('WALL'), ['movie_title', 'title_year']])

     movie  release_year
74  WALL-E          2008
   movie_title  title_year
58      WALL·E      2008.0


In [8]:
# Case 3 - same movie, listed under its original-language title in budgets.
print(budgets.loc[budgets['movie'].str.contains('La grande bellezza'), ['movie', 'release_year']])
print(imdb.loc[imdb['movie_title'].str.contains('The Great Beauty'), ['movie_title', 'title_year']])

                   movie  release_year
3205  La grande bellezza          2013
           movie_title  title_year
2992  The Great Beauty      2013.0


For the first item we define a function that joins the datasets on title and year and, if there is no match, we soften the condition on the year by asking that the production year is at most 10 years earlier than the release year.

We also decide to add only the budget index to the imdb dataset and perform the join in one step after we treat all the cases identified above.

In [9]:
def match_movies(movie, year, max_year_gap=10):
    """Find the ``budgets`` index label of the row matching an imdb movie.

    The lookup is done against the notebook-level ``budgets`` dataframe in two
    passes:

    1. exact match on title and ``release_year == year``;
    2. same title, with a release year between ``year`` and
       ``year + max_year_gap`` inclusive (the imdb year is a production year,
       which can precede the release year).

    When several rows qualify in a pass, the first one in ``budgets`` row order
    is returned.

    Parameters
    ----------
    movie : str
        Title to look up in ``budgets.movie``.
    year : int or float
        Year from the imdb table. A NaN year never matches.
    max_year_gap : int, default 10
        Largest allowed number of years by which the release may follow
        ``year`` in the relaxed pass.

    Returns
    -------
    The index label of the matching ``budgets`` row, or ``-1`` if none matches.
    """
    same_title = budgets.movie == movie
    if not same_title.any():
        # No row carries this title, so neither pass can succeed.
        return -1

    release_year = budgets.release_year

    # The label is read from budgets.index with a positional boolean array, so
    # no per-call copy of the frame is needed and the lookup does not depend on
    # budgets having a default 0..n-1 index.
    exact = same_title & (release_year == year)
    if exact.any():
        return budgets.index[exact.values][0]

    relaxed = same_title & (year <= release_year) & (year >= release_year - max_year_gap)
    if relaxed.any():
        return budgets.index[relaxed.values][0]

    return -1

In [10]:
# Look up a budgets index label for every imdb row; -1 marks "no match found".
imdb['budget_index'] = [
    match_movies(title, year)
    for title, year in zip(imdb['movie_title'], imdb['title_year'])
]

For the second and third items we have to match the titles manually, so we saved the titles for which the budget index is missing and analyzed them in Excel.

In [11]:
# Export the titles that are still unmatched (budget_index == -1) for manual review.
imdb.loc[imdb['budget_index'] == -1, 'movie_title'].to_csv('data/unmatched.csv')

In [12]:
# Export every budgets title (with its index label) to match against by hand.
budgets['movie'].to_csv('data/budget_titles.csv')

After matching the data manually we can import the result and add the missing indexes to the dataframe:

In [13]:
# Semicolon-separated file holding the result of the manual matching.
missing_indexes = pd.read_csv('data/missing_indexes.csv', delimiter=';')

In [14]:
# Attach the manually matched budget indexes to the imdb rows they belong to.
# The merge is done index-on-index so that imdb keeps its own index. Merging
# the imdb index against the 'index' *column* (left_index=True,
# right_on='index') builds the result's index from missing_indexes instead,
# which leaves NaN labels on every row that has no manual match.
# The column present in both frames is suffixed by merge: budget_index_x is
# the automatic match, budget_index_y the manual one.
imdb = imdb.merge(
    missing_indexes.set_index('index'),
    how='left',
    left_index=True,
    right_index=True,
)

In [15]:
# Where the automatic match failed (-1), fall back to the manually matched
# index; otherwise keep the automatic one. Selection is positional, row by row.
imdb['budget_index_x'] = np.where(
    imdb['budget_index_x'] == -1,
    imdb['budget_index_y'],
    imdb['budget_index_x'],
)

In [16]:
# The manual-match column has been folded into budget_index_x; remove it.
imdb.drop(['budget_index_y'], axis='columns', inplace=True)

(We still have some missing values, which we can add manually column by column, but for these movies I wasn't able to find the budget data, so we'll simply ignore them for the time being.)

In [17]:
# dateparse = lambda x: pd.datetime.strptime(x, '%m/%d/%Y')

# missing_data = pd.read_csv('data/missing_data.csv', sep=';', parse_dates=['Release Date'], date_parser=dateparse)
# missing_data.columns = ['movie', 'domestic_gross', 'worldwide_gross', 'release_date']

In [18]:
# inflation = pd.read_csv('data/us_dollar_inflation.csv', sep=';')

# missing_data['release_year'] = missing_data.release_date.dt.year

# missing_data = missing_data.merge(inflation[['year', 'avg_cumulative_inflation']],
#                                 left_on='release_year',
#                                 right_on='year',
#                                 how='left').drop('year', axis=1)

# missing_data['domestic_ROI'] = (missing_data.domestic_gross - missing_data.production_budget) / missing_data.production_budget
# missing_data['worldwide_ROI'] = (missing_data.worldwide_gross - missing_data.production_budget) / missing_data.production_budget
# missing_data['adj_production_budget'] = missing_data.production_budget * missing_data.avg_cumulative_inflation
# missing_data['adj_domestic_gross'] = missing_data.domestic_gross * missing_data.avg_cumulative_inflation
# missing_data['adj_worldwide_gross'] = missing_data.worldwide_gross * missing_data.avg_cumulative_inflation
# missing_data['budget_type'] = missing_data.adj_production_budget.apply(budget_type)
# missing_data['release_month_day'] = missing_data.apply(lambda x: pd.to_datetime('2016-' + str(x.release_date.month) + '-' + str(x.release_date.day)), axis=1)

Finally, we join the imdb and budget datasets and save them in a pickle to analyze them.

In [19]:
# Pull in the budget columns by looking up each row's budget_index_x among the
# budgets index labels. how='left' keeps the imdb rows that have no match.
# axis=1 is required on drop(): without it pandas searches the row labels for
# 'movie' and 'budget_index_x' and raises KeyError.
imdb = (
    imdb.merge(budgets, left_on='budget_index_x', right_index=True, how='left')
        .drop(['movie', 'budget_index_x'], axis=1)
)

In [20]:
# Persist the joined table for the analysis notebooks.
imdb.to_pickle(path='pickle/imdb_full.p')