### TMDB Budget Exploration Project

In [15]:
import pandas as pd
import numpy as np
import ast
import plotly.graph_objs as go
import plotly.offline as py
# Switch plotly's offline module into notebook mode before the py.iplot calls further down.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather than
# embedding it in the notebook - confirm against the plotly docs.
py.init_notebook_mode(connected=True)

In [2]:
# Read both splits once and keep the raw frames untouched for reference.
data_train_raw, data_test_raw = map(
    pd.read_csv,
    ('data/tmdb_train.csv', 'data/tmdb_test.csv'),
)

# All later feature engineering works on independent deep copies of the raw frames.
data_train = data_train_raw.copy(deep=True)
data_test = data_test_raw.copy(deep=True)

In [5]:
# Columns whose cells hold Python literals serialised as text.
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']


def _parse_literal(value):
    """Convert one raw cell to a Python object; a missing cell becomes ``{}``."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    if isinstance(value, (list, dict)):
        # Already parsed (for example the cell was executed twice): keep as is
        # instead of crashing in pd.isna / ast.literal_eval.
        return value
    if pd.isna(value):
        return {}
    # Anything else is unexpected; let literal_eval raise as it did before.
    return ast.literal_eval(value)


def text_to_dict(df, columns=None):
    """
    Parse the stringified literal columns of ``df`` into Python objects.

    The frame is modified in place and also returned. Missing cells are
    replaced by an empty dict ``{}``. Calling the function again on an
    already converted frame is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column that should be converted.
    columns : iterable of str, optional
        Columns to convert; defaults to the module-level ``dict_columns``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    KeyError
        If one of the requested columns is not present in ``df``.
    ValueError, SyntaxError
        If a cell contains text that is not a valid Python literal.
    """
    for column in (dict_columns if columns is None else columns):
        df[column] = df[column].apply(_parse_literal)
    return df
        
# Parse the stringified columns of both splits (train first, then test).
data_train, data_test = (text_to_dict(frame) for frame in (data_train, data_test))

production_companies

In [6]:
# Count of production-company entries per film; the empty {} placeholder has length 0.
data_train['num_companies'] = data_train['production_companies'].map(len)

Release Date

In [7]:
def fix_date(x, pivot=19):
    """
    Expand the two-digit year of a ``month/day/yy`` string to four digits.

    Years from ``00`` up to and including ``pivot`` are mapped to 20xx, all
    larger two-digit years to 19xx. Values that are not strings (e.g. NaN for
    a missing date) and dates whose year part is not exactly two characters
    long (e.g. already four digits) are returned unchanged.

    Parameters
    ----------
    x : str or missing value
        Date text with ``/`` separators and the year as third component.
    pivot : int, optional
        Largest two-digit year still interpreted as 20xx (default 19).

    Returns
    -------
    The date string with a four-digit year, or ``x`` itself when nothing
    needs to be (or can be) expanded.

    Raises
    ------
    IndexError
        If the string has fewer than three ``/``-separated parts.
    ValueError
        If the two-character year part is not numeric.
    """
    if not isinstance(x, str):
        # Missing dates show up as float NaN / None; pass them through.
        return x
    year = x.split('/')[2]
    if len(year) != 2:
        # Splicing a century into a non two-digit year would corrupt the date.
        return x
    century = '20' if int(year) <= pivot else '19'
    return x[:-2] + century + year

In [8]:
# Step 1: let fix_date expand the two-digit year of every release date string.
data_train['release_date'] = data_train['release_date'].map(fix_date)
# Step 2: turn the expanded strings into pandas datetime values.
data_train['release_date'] = pd.to_datetime(data_train['release_date'])

In [9]:
# creating features based on dates
def process_date(df):
    """
    Add integer calendar features derived from ``df['release_date']``.

    For each of year, weekday, month, weekofyear, day and quarter a column
    named ``release_date_<part>`` is written. The frame is modified in place
    and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``release_date`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the six new columns.

    Notes
    -----
    NOTE(review): the ``astype(int)`` cast presumably fails when the column
    contains missing dates - confirm there are none before calling.
    """
    date_parts = ["year", "weekday", "month", 'weekofyear', 'day', 'quarter']
    dates = df['release_date'].dt
    for part in date_parts:
        part_col = 'release_date' + "_" + part
        if part == 'weekofyear' and hasattr(dates, 'isocalendar'):
            # .dt.weekofyear was removed in pandas 2.0; the ISO calendar week
            # is its documented replacement. Older pandas without
            # isocalendar() falls through to the attribute lookup below.
            values = dates.isocalendar().week
        else:
            values = getattr(dates, part)
        df[part_col] = values.astype(int)

    return df

# Attach the release_date_* calendar columns to the training frame.
data_train = data_train.pipe(process_date)

crew

In [17]:
# Count of crew entries per film; the empty {} placeholder has length 0.
data_train['num_crew'] = data_train['crew'].map(len)

#### Plot 1. Average company count per film vs average budget per film per company each year

In [16]:
# Yearly means of the company count and of the budget.
d1 = data_train.groupby(['release_date_year'])['num_companies'].mean()
d2 = data_train.groupby(['release_date_year'])['budget'].mean()
# Mean budget divided by mean company count for each year. d1 and d2 come from
# the same grouping, so they are reused instead of running both groupbys again.
d3 = d2.divide(d1)

# The company count uses the left axis; both budget series use the right axis (y2).
data = [go.Scatter(x=d1.index, y=d1.values, name='average num companies per film'),
        go.Scatter(x=d2.index, y=d2.values, name='average budget per film', yaxis='y2'),
        go.Scatter(x=d3.index, y=d3.values, name='average budget per company per film', yaxis='y2')]
layout = go.Layout(dict(title="Average number of companies and average budget per company each year",
                        xaxis=dict(title='Year'),
                        yaxis=dict(title='Average num companies per film'),
                        # Secondary axis drawn on top of the first one, on the right-hand side.
                        yaxis2=dict(title='Average budget per company', overlaying='y', side='right')
                        ),
                   legend=dict(orientation="v"))
py.iplot(dict(data=data, layout=layout))

#### Plot 2. Average crew count per film vs average budget per film each year

In [18]:
# Yearly means of crew count (d1) and budget (d2).
d1, d2 = (
    data_train.groupby(['release_date_year'])[col].mean()
    for col in ('num_crew', 'budget')
)

# Crew count goes on the left axis, budget on the secondary right-hand axis (y2).
data = [
    go.Scatter(name='average num crews per film', x=d1.index, y=d1.values),
    go.Scatter(name='average budget per film', x=d2.index, y=d2.values, yaxis='y2'),
]

layout = go.Layout(
    title="Average number of crews and average budget per film each year",
    xaxis={'title': 'Year'},
    yaxis={'title': 'Average num crews'},
    yaxis2={'title': 'Average budget per film', 'overlaying': 'y', 'side': 'right'},
    legend={'orientation': "v"},
)

py.iplot({'data': data, 'layout': layout})