# Predicting Movie Revenue

## Features  

**Variable to Predict:** Revenue


- Budget  
- Runtime  
- Genre  
- Release Date  
 - Season  
 - Holiday   
- Production Companies  
 - Number of companies involved  
 - Number of movies a company has made (previously)  
- Crew
 - Number of crewmembers  
- Cast
 - Gender Ratio for top 2/5/10/25
 - Number of previous movies top 10 cast have been in
- Collection
 - Is sequel (order)  


In [1]:
import numpy as np
import pandas as pd
import sqlite3
import holidays
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta, date
from scipy import stats

pd.set_option('display.float_format','{:,.2f}'.format)

In [2]:
df = pd.read_csv(r'../../../Data Science Data/Unit 3/movie_data.csv')

In [3]:
df['complete_budget'] = df['budget'].replace(0,np.nan).fillna(df['imdb_budget'])
df['complete_revenue'] = df['revenue'].replace(0,np.nan).fillna(df['imdb_revenue'])

df['complete_budget'] = pd.to_numeric(df['complete_budget'],errors='coerce',downcast='integer')
df['complete_revenue'] = pd.to_numeric(df['complete_revenue'],errors='coerce',downcast='integer')

df['runtime'] = pd.to_numeric(df['runtime'],downcast='integer')
df['gender'] = pd.to_numeric(df['gender'],downcast='integer')

In [4]:
df['is_sequel'] = df['order_in_collection'] > 1
df['profit'] = df['complete_revenue'] - df['complete_budget']
df['ROI'] = df['profit'] / df['complete_budget']
df['gross_margin'] = df['complete_revenue'] / df['complete_budget']
df['log_revenue'] = df['complete_revenue'].apply(np.log)
df['log_budget'] = df['complete_budget'].apply(np.log)

In [5]:
cols_to_drop = ['budget','revenue','imdb_budget','imdb_revenue','collection','order_in_collection']
df.drop(cols_to_drop,axis=1,inplace=True)
df.dropna(inplace=True)
df.drop(df[df['release_date'].dt.year < 1965].index,inplace=True)
df.info();

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
# Top 10 box office grossing movies
df.sort_values('complete_revenue',ascending=False,inplace=True)
df.head(n=10)

In [None]:
# The Hobbit animated movie?? Really? Let's remove... 
df.drop(df[df['title'] == 'The Hobbit'].index, inplace=True)

In [None]:
# Look at the lowest quantile to check for outliers
df[df['complete_revenue'] < df['complete_revenue'].quantile(q=.0075)]

# Notes
# Trojan War: Only released in one theater in the US over one weekend. Not a good representation of what we are trying to predict
# Sunrise: Foreign movie released in theaters in the US 2 years after originally released in France
# Skin Trade: Terrible movie that starred Dolph Lungren. With the exception of Rocky, his movies have been terrible. Something to model in the future?
# Next Time I'll Aim for the Heart: French movie. Grossed 4mil Euros, but did very little in the USA. 

In [None]:
# Remove movies with a budget less than 100,000
df.drop(df[df['complete_budget'] < 100000].index,inplace=True)

upper_quantile_rev = df['complete_revenue'].quantile(q=.9)
bottom_quantile_rev = df['complete_revenue'].quantile(q=.1)
df.drop(df[df['complete_revenue'] < bottom_quantile_rev].index,inplace=True)
df.drop(df[df['complete_revenue'] > upper_quantile_rev].index,inplace=True)

df.describe()

In [None]:
sns.boxplot(x=df['complete_revenue'])
plt.xlim(0,500000000)
plt.show()

In [None]:
sns.regplot(y=df['complete_revenue'],x=df['complete_budget'])
plt.show() 

sns.regplot(y=df['ROI'],x=df['complete_budget'])
plt.show()

In [None]:
g = sns.FacetGrid(df[['is_holiday','complete_revenue','complete_budget']],row='is_holiday')
g.map(sns.boxplot, 'complete_revenue')
plt.show()

stats.kruskal(df[df['is_holiday'] == True]['complete_revenue'], df[df['is_holiday'] == False]['complete_revenue'])

In [None]:
genres = list(genre_df.groupby('genre_name').count().index)

genre_count = {}

for genre in genres:
    genre_count[genre] = df[df[genre] == 1]['complete_revenue'].count()

genre_list = [key for key,values in genre_count.items() if values > 200]

genre_mean = {}
genre_ROI = {}

for genre in genre_list:
    genre_mean[genre] = df[df[genre] == 1]['complete_revenue'].mean()
    genre_ROI[genre] = df[df[genre] == 1]['ROI'].mean()
    
sns.barplot(x=list(genre_mean.keys()),y=[value for value in genre_mean.values()])
plt.title('Revenue by Genre')
plt.xticks(rotation=33, horizontalalignment='right')
plt.show()

sns.barplot(x=list(genre_mean.keys()),y=[value for value in genre_ROI.values()])
plt.title('ROI by Genre')
plt.xticks(rotation=33, horizontalalignment='right')
plt.show()

In [None]:
sns.distplot(df['gender'])
plt.show()

sns.distplot(df['movies_produced'])
plt.show()

In [None]:
cols = ['runtime','prod_company_count','crewmember_count','gender','movie_experience','movies_produced','complete_budget','complete_revenue']

g = sns.PairGrid(df[cols], diag_sharey=False)
g.map_upper(plt.scatter)
g.map_lower(sns.regplot)
g.map_diag(sns.kdeplot)

plt.show()

In [None]:
df.corr()[['complete_revenue','complete_budget','ROI','gross_margin']]

In [None]:
drop_genres = set(genres) - set(genre_list)

model_data = df.drop(list(drop_genres),axis=1)
model_data = model_data.drop(['profit','ROI','release_date','title','log_revenue','gross_margin'],axis=1)

In [None]:
df_dum = pd.get_dummies(model_data)

x = df_dum.loc[:,df_dum.columns != 'complete_revenue']
y = df_dum['complete_revenue']

x_train = x.sample(frac=.8, random_state=30)
y_train = y[x_train.index]

x_test = x.drop(x_train.index)
y_test = y.drop(y_train.index)

x.columns

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import median_absolute_error


dt = DecisionTreeRegressor(max_depth=10,max_features=None)

dt.fit(x_train,y_train)
y_predict = dt.predict(x_test)
dt.score(x_train,y_train)

In [None]:
percent_error = (y_predict - y_test) / y_test
plt.hist(percent_error, bins=np.arange(-10,10,1))
plt.show()

np.mean((percent_error < .1) & (percent_error > -.1))

In [None]:
x_test['y_actual'] = y_test
x_test['y_predict'] = y_predict
x_test['y_delta'] = (y_predict - y_test) / y_test
x_test.sort_values('y_delta', ascending=True);

In [None]:
df_dum = pd.get_dummies(model_data)

x = df_dum.loc[:,df_dum.columns != 'complete_revenue']
y = df_dum['complete_revenue']

x_train = x.sample(frac=.8, random_state=30)
y_train = y[x_train.index]

x_test = x.drop(x_train.index)
y_test = y.drop(y_train.index)

x.columns

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(max_depth=12,max_features=None)

rf.fit(x_train,y_train)
y_predict = dt.predict(x_test)
rf.score(x_train,y_train)

In [None]:
percent_error = (y_predict - y_test) / y_test
plt.hist(percent_error, bins=np.arange(-10,10,1))
plt.show()

np.mean((percent_error < .1) & (percent_error > -.1))

In [None]:
x_test['y_actual'] = y_test
x_test['y_predict'] = y_predict
x_test['y_delta'] = (y_predict - y_test) / y_test
x_test.sort_values('y_delta', ascending=True);

In [None]:
accurate_range = x_test.loc[(x_test['y_delta'] > -.1) & (x_test['y_delta'] < .1)]
accurate_range = accurate_range.join(df[['title','release_date']])

for column in accurate_range.columns:
    print(column)
    print(accurate_range[column].describe())
    print('-'*100)

In [None]:
accurate_range.groupby(accurate_range['release_date'].dt.year).count()

In [None]:
genre_p = accurate_range[genre_list].sum() / accurate_range.shape[0]
genre_t = df[genre_list].sum() / df.shape[0]

pd.concat([genre_p,genre_t], axis=1)

In [None]:
drop_genres = set(genres) - set(genre_list)

model_data = df.drop(list(drop_genres),axis=1)
model_data.drop(['profit','ROI','title','log_revenue','gross_margin'],axis=1,inplace=True)
model_data.drop(model_data[model_data['release_date'].dt.year < 1998].index,inplace=True)
model_data.drop('release_date',axis=1,inplace=True)

In [None]:
df_dum = pd.get_dummies(model_data)

df_dum

x = df_dum.loc[:,df_dum.columns != 'complete_revenue']
y = df_dum['complete_revenue']

x_train = x.sample(frac=.8, random_state=30)
y_train = y[x_train.index]

x_test = x.drop(x_train.index)
y_test = y.drop(y_train.index)

x.columns

In [None]:
rf.fit(x_train,y_train)
y_predict = dt.predict(x_test)
rf.score(x_train,y_train)

In [None]:
percent_error = (y_predict - y_test) / y_test
plt.hist(percent_error, bins=np.arange(-10,10,1))
plt.show()

np.mean((percent_error < .1) & (percent_error > -.1))

- Try tpot
- Look at feature importance
- What movies is this model good at predicting
 - Group by genre, group by decade etc...
- Separate categorical variables and then try PCA
- Predicting based on decade
- Visualizations around the accuracy. X is actual revenue and y is predicted with a straight line to see the difference. Look at log as well. Create a function which does this.