# Box Office Revenue Predictor

##### *Eilert Skram, Torbjørn Moen, 18.11.2022*

In [None]:
%matplotlib inline
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from pathlib import Path
import math
from functools import reduce


# Root folder of the competition data (absolute Kaggle input path).
DATA = Path('/kaggle/input/tmdb-box-office-prediction')
# Lists the folder contents; not the last expression of the cell, so the result is discarded.
list(DATA.iterdir())

# Training set, test set and the sample submission file.
train = pd.read_csv(DATA/'train.csv')
test = pd.read_csv(DATA/'test.csv')
sampleSubmission = pd.read_csv(DATA/'sample_submission.csv')



Imports, naming of variables

## Data

In [None]:
# Disable column truncation so every column is visible when a frame is displayed.
pd.set_option('display.max_columns', None)
train.head()

In [None]:
# Summary statistics of the training set.
train.describe()

Many columns are non-numerical and will have to be handled. There are also trivial columns, e.g. the poster path.

In [None]:
# Column dtypes and non-null counts, to see which columns are non-numeric or have missing values.
train.info()

In [None]:
# Histograms (50 bins each) of the training-set columns.
train.hist(bins=50, figsize=(15, 8))

In [None]:
# Scatter plot: budget (x) against revenue (y).
train.plot(kind="scatter", x="budget", y="revenue", grid=True)

Seems that if higher budget, likely higher revenue

In [None]:
# Scatter plot: runtime (x) against budget (y).
train.plot(kind="scatter", x="runtime", y="budget", grid=True)

In [None]:
# Scatter plot: runtime (x) against revenue (y).
train.plot(kind="scatter", x="runtime", y="revenue", grid=True)

It seems most movies are within 90–150 min; beyond 200 min the effect on revenue looks strictly negative. We guess this is due either to older movies (revenue not adjusted for inflation) or to a large number of documentaries.

In [None]:
# Scatter plot: popularity (x) against revenue (y).
train.plot(kind="scatter", x="popularity", y="revenue", grid=True)

## Label

In [None]:
# Separate the target from the features.
# NOTE: this cell rebinds `train` without the 'revenue' column, so running it a
# second time on the same kernel raises a KeyError.
train_label = train['revenue'].copy()
train = train.drop('revenue', axis = 1)

## Top + genre 

In [None]:
# Production companies treated as major studios; the trailing comment on each
# entry names its parent company. CustomTransformer collapses membership in
# this list into the single 0/1 column 'top_movie_companies'.
largest_movie_companies = [
    'Warner Bros.',                             # Warner Bros. Entertainment
    'Universal Pictures',                       # NBCUniversal
    'Paramount Pictures',                       # Paramount Global
    'Twentieth Century Fox Film Corporation',   # Walt Disney Studios
    'Columbia Pictures',                        # Sony Pictures
    'Metro-Goldwyn-Mayer (MGM)',                # Amazon
    'New Line Cinema',                          # Warner Bros. Entertainment
    'Touchstone Pictures',                      # Walt Disney Studios
    'Walt Disney Pictures',                     # Walt Disney Studios
    'Columbia Pictures Corporation',            # Sony Pictures
    'TriStar Pictures'                          # Sony Pictures
]

# Spoken-language names matched against the 'name' key of 'spoken_languages';
# CustomTransformer collapses membership into the 0/1 column 'top5_spoken_languages'.
# Presumably the five most frequent values in the data (not derived in this notebook).
# NOTE(review): entries are compared by exact string equality, so they must match the
# dataset spelling character for character (including the first letter of the Russian entry).
top5_spoken_languages = [
    'English',
    'Français',
    'Español',
    'Deutsch',
    'Pусский'
]

# Values of 'original_language' that get their own 0/1 column ('org_<code>')
# in CustomTransformer; every other language ends up with all flags at 0.
top5_original_languages = [
    'en',
    'fr',
    'ru',
    'es',
    'hi'
]

# Production-country names matched against the 'name' key of 'production_countries';
# CustomTransformer creates one 0/1 column per entry.
top5_production_countries = [
    'United States of America', 
    'United Kingdom',
    'France',
    'Germany',
    'Canada'
]

# Genre names matched against the 'name' key of the 'genres' column;
# CustomTransformer creates one 0/1 column per entry.
genres = [
    'Crime',
    'History',
    'Family',
    'Horror',
    'Thriller',
    'Foreign',
    'Fantasy',
    'Music',
    'Action',
    'Romance',
    'Documentary',
    'Comedy',
    'TV Movie',
    'War',
    'Animation',
    'Drama',
    'Science Fiction',
    'Western',
    'Adventure',
    'Mystery'
]

## Dataprocessing pipeline

In [None]:
# failsafe eval
def m_eval(x):
    """Parse a stringified Python literal, returning ``{}`` when that is not possible.

    The dataset stores list-of-dict columns as text such as
    ``"[{'id': 35, 'name': 'Comedy'}]"``. ``ast.literal_eval`` is used instead of
    ``eval`` so that only literals are accepted and no code embedded in the data
    can ever be executed.

    Parameters
    ----------
    x : object
        Usually a string holding a Python literal; anything else (for example
        the float NaN pandas uses for missing cells) is treated as unparsable.

    Returns
    -------
    object
        The parsed literal, or an empty dict when ``x`` cannot be parsed.
        Iterating over the empty dict yields nothing, which is what callers rely on.
    """
    import ast  # local import keeps this helper self-contained

    try:
        return ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Missing values, non-strings and malformed text all map to "no entries".
        return {}

# uniques
def unique_values(dataset, column, value):
    """Collect the distinct values stored under one key of a stringified dict-list column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Column whose cells are parsed with ``m_eval`` into an iterable of dicts.
    value : str
        Dictionary key to extract from every dict in a cell.

    Returns
    -------
    tuple
        ``(distinct, per_row, count)`` where ``distinct`` is a list of the distinct
        extracted values (order is arbitrary), ``per_row`` is a Series holding the
        list of extracted values for each row, and ``count`` is ``len(distinct)``.
    """
    per_row = dataset[column].map(lambda cell: [entry[value] for entry in m_eval(cell)])

    # Flatten straight into a set; repeatedly concatenating lists would copy the
    # growing accumulator once per row.
    distinct = list({item for row_values in per_row for item in row_values})

    return distinct, per_row, len(distinct)




Creating a custom encoder, essentially works like a one hot encoder

In [None]:
def dictEncode(dataframe, column, key, categories, oneHot=True, destColumn=None):
    """Encode a stringified dict-list column as 0/1 indicator columns, in place.

    Every cell of ``column`` is parsed with ``m_eval`` and the values stored under
    ``key`` are compared with ``categories``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to extend. It is modified in place; new columns are appended at the end.
    column : str
        Column holding the stringified list of dicts.
    key : str
        Dictionary key whose values are matched against ``categories``.
    categories : list
        Values of interest.
    oneHot : bool, default True
        If true, one column per category is added (named after the category).
        Otherwise a single column ``destColumn`` is added that is 1 when the row
        contains at least one of the categories.
    destColumn : str, optional
        Name of the single indicator column; required when ``oneHot`` is false.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``oneHot`` is false and no ``destColumn`` is given, or (raised by
        ``DataFrame.insert``) if a column to be added already exists.
    """
    if not oneHot and destColumn is None:
        raise ValueError('You must have a destination column if oneHot is set to False.')

    # Parse every cell exactly once and keep the values found under `key`.
    found = dataframe[column].map(lambda cell: [entry[key] for entry in m_eval(cell)])

    if oneHot:
        for category in categories:
            flags = found.map(lambda names, wanted=category: int(wanted in names))
            dataframe.insert(len(dataframe.columns), category, flags.astype('int64').to_numpy())
    else:
        flags = found.map(lambda names: int(any(name in categories for name in names)))
        dataframe.insert(len(dataframe.columns), destColumn, flags.astype('int64').to_numpy())

Simple function to replace NaN with 0 and any value with 1. Contain or not = True/False

In [None]:
def containOrNot(df, columnName, addedName=''):
    """Turn a column into a 0/1 "has a value" flag, modifying ``df`` in place.

    The flag is 1 where ``columnName`` is not null and 0 where it is null. It is
    written to the column ``addedName + columnName``. With a non-empty prefix the
    original column is removed afterwards; with the default empty prefix the flag
    simply overwrites the original column.
    """
    flagColumn = addedName + columnName
    hasValue = df[columnName].notna()
    df[flagColumn] = hasValue.astype(int)
    if addedName:
        del df[columnName]

Function to split the release date into a numeric year and the quarter in which the movie was released

In [None]:
def rdConversion(dataframe):
    """Return a copy of ``dataframe`` with year and quarter features from 'release_date'.

    Missing release dates are replaced by the placeholder ``'01/01/01'`` before
    parsing. The returned copy has 'release_date' converted to datetime and five
    new columns appended: 'year' plus the 0/1 indicators 'quarter_1' ... 'quarter_4'.
    The input frame itself is not modified, so callers must use the return value.

    NOTE(review): the placeholder suggests dates with two-digit years; how those
    are mapped to a century is decided by ``pd.to_datetime`` — verify that old
    releases do not end up with a year in the future.
    """
    data = dataframe.copy()
    released = pd.to_datetime(data['release_date'].fillna('01/01/01'))
    data['release_date'] = released
    data['year'] = released.dt.year

    # One vectorised comparison per quarter instead of a Python loop over the rows.
    quarter = released.dt.quarter
    for i in range(1, 5):
        data.insert(len(data.columns), f'quarter_{i}', (quarter == i).astype('int64').to_numpy())

    return data
            
    

Custom transformer:

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Stateless feature-engineering step for the movie data.

    ``transform`` works on a copy of its input, so the caller's frame is left
    untouched and the step can be applied repeatedly to the same raw frame.
    It drops free-text/identifier columns, encodes the stringified dict-list
    columns and the original language as 0/1 flags, turns some columns into
    "has a value" flags, and derives year/quarter features from the release date.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X`` (a pandas DataFrame)."""
        # Non-inplace drop returns a new frame, which protects the caller's data.
        # errors='ignore' drops whichever of these columns are present, so text
        # columns are never left behind just because one of them is missing.
        X = X.drop(
            columns=['status', 'crew', 'cast', 'overview', 'poster_path', 'imdb_id', 'title', 'original_title'],
            errors='ignore',
        )

        dictEncode(X, 'genres', 'name', genres)
        dictEncode(X, 'production_countries', 'name', top5_production_countries)
        dictEncode(X, 'production_companies', 'name', largest_movie_companies, oneHot=False, destColumn='top_movie_companies')
        dictEncode(X, 'spoken_languages', 'name', top5_spoken_languages, oneHot=False, destColumn='top5_spoken_languages')

        # One 0/1 flag per frequent original language; other languages get all zeros.
        for org in top5_original_languages:
            flags = (X['original_language'] == org).astype('int64').to_numpy()
            X.insert(len(X.columns), 'org_' + org, flags)

        containOrNot(X, 'homepage', 'has_')
        containOrNot(X, 'belongs_to_collection')
        containOrNot(X, 'tagline', 'has_')
        containOrNot(X, 'Keywords', 'has_')

        # rdConversion returns a new frame; its result must be kept, otherwise the
        # year and quarter features are silently lost.
        X = rdConversion(X)

        # NOTE(review): any remaining identifier column (e.g. 'id') is passed on
        # as a feature — confirm that this is intended.
        X = X.drop(columns=['release_date', 'genres', 'production_countries', 'production_companies',
                            'spoken_languages', 'original_language'])

        return X

- The unused columns are dropped only if they are still present (i.e. not already dropped).
- Encoding the date as a year, then splitting it into the quarter of release.
- Using the custom encoder to map which genres the movie belongs to.
- Using the `containOrNot` function on homepage, belongs-to-collection, tagline and keywords.

A lot of columns are dropped in this first iteration of the model. At a later stage we would like to explore how they could be used.

[Source of guide how to do custom transformers](https://www.andrewvillazon.com/custom-scikit-learn-transformers/)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

#test = train[['homepage', 'tagline']]
#num_attri = list(train[['budget','popularity','runtime']])

#numPipeline= Pipeline([
#('imputer', SimpleImputer(strategy="median")),
#('std_scaler', StandardScaler()),
#])

                  
# Preprocessing chain: feature engineering, then median imputation, then standardisation.
customPipe = Pipeline(
    steps=[
        ("custom", CustomTransformer()),
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)
# Column names of the training frame as they are before the pipeline runs.
custom_attri = train.columns.tolist()

# Fit the chain on the training features and keep the transformed matrix.
processed = customPipe.fit_transform(train)
#fullPipeline = ColumnTransformer([("num", numPipeline, num_attri)])
# ,('std_scaler', StandardScaler())
#,('imputer', SimpleImputer(strategy="median")),
#('std_scaler', StandardScaler())

## Modelling

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor


In [None]:
# Fit one instance of every candidate model on the processed training data.
# A fixed seed makes the stochastic learners (and the comparison that follows)
# repeatable; 42 matches the random_state used by the random-forest search in
# the tuning section. LinearRegression is deterministic and needs no seed.
xgb = XGBRegressor(random_state=42)
xgb.fit(processed, train_label)
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(processed, train_label)
rfr = RandomForestRegressor(random_state=42)
rfr.fit(processed, train_label)
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(processed, train_label)
lr = LinearRegression()
lr.fit(processed, train_label)

In [None]:


from sklearn.model_selection import cross_val_score

# Candidate models keyed by the label printed in the report.
mms = {
    "XGBRegressor": xgb,
    "Gradient Boosting Regressor": gbr,
    "Random Forest Regressor": rfr,
    "Decision Tree Regressor": dtr,
    "Linear Regression": lr
}

# 10-fold cross-validated RMSE for every candidate.
for model_name, model in mms.items():
    score = cross_val_score(
        model,
        processed,
        train_label,
        scoring="neg_mean_squared_error",
        cv=10,
    )
    # The scorer returns negated MSE per fold; flip the sign before the square root.
    rmse_score = np.sqrt(-score)
    print(model_name)
    print('Mean:', rmse_score.mean())
    print('Standard deviation:',rmse_score.std())
    print("-----------------------------------------")

Random forest is performing best

## Param tuning

In [None]:
# Show the current hyper-parameters of the random forest as a starting point for tuning.
rfr.get_params()

In [None]:
from sklearn.model_selection import RandomizedSearchCV


In [None]:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for forest regressors;
# 'auto' itself is deprecated/removed in recent scikit-learn releases and would
# make every sampled configuration that uses it fail.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid: parameter name -> list of candidate values to sample from
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

Used in assignment 1, [Source](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74)



rf_random = RandomizedSearchCV(estimator = rfr, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)  # random search over random_grid: 100 sampled settings, 3-fold CV, seeded
rf_random.fit(processed, train_label)
best_random = rf_random.best_estimator_  # best forest found by the search
score = cross_val_score(best_random, processed, train_label, scoring="neg_mean_squared_error", cv=10)  # re-score the winner with the same 10-fold setup as the model comparison
rmse_score = np.sqrt(-score)  # negated MSE per fold -> RMSE per fold
print("Best Random")
print('Mean:', rmse_score.mean())
print('Standard deviation:',rmse_score.std())

from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

parameters = {"learning_rate": sp_randFloat(),  # scipy `uniform` with its default arguments
                  "subsample"    : sp_randFloat(),  # same default uniform distribution
                  "n_estimators" : sp_randInt(100, 1000),
                  "max_depth"    : sp_randInt(4, 10)
                 }
gbr_random = RandomizedSearchCV(estimator=gbr, param_distributions = parameters,
                               cv = 2, n_iter = 10, n_jobs=-1)  # NOTE(review): no random_state here (unlike rf_random), so the sampled settings differ between runs
gbr_random.fit(processed, train_label)

best_random = gbr_random.best_estimator_  # overwrites the random-forest winner stored under the same name
score = cross_val_score(best_random, processed, train_label, scoring="neg_mean_squared_error", cv=10)
rmse_score = np.sqrt(-score)  # negated MSE per fold -> RMSE per fold
print("Best Random")
print('Mean:', rmse_score.mean())
print('Standard deviation:',rmse_score.std())

In [None]:
# Keep the ids for the submission file before the frame goes through the pipeline.
test_id = test.id
# Apply the already-fitted preprocessing pipeline (transform only, no refit).
test_processed = customPipe.transform(test)
# Predict with `rfr`, the random forest fitted in the modelling section.
prediction=rfr.predict(test_processed)

In [None]:




# Two-column submission frame ('id', 'revenue'), written to disk without the index.
submission = pd.DataFrame(
    {'id': test_id, 'revenue': prediction},
    columns=['id', 'revenue'],
)

submission.to_csv("submission.csv", index=False)

## Saving model

In [None]:
import pickle

# save the model to disk
filename = 'finalized_model.sav'
# The context manager guarantees the file is flushed and closed, even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(rfr, model_file)
