# Regression Model For Predicting Movie Box Office Gross

Group Members: Ying Wu (A20370189), Yingjuan Wu (A20326320), Sahand Zeinali (A20318383)

Project Description: 
In this project, we will explore the relationship between a movie’s theatrical revenue and other key features. We will use worldwide box-office gross (numerical) as the target variable, and use relevant information that are available prior a movie's release as input variables, including general information like number of critic reviews, duration of movie (in mins), face number in poster, genres, budget, country, content-rating, imdb score, as well as social media factors like number of director facebook likes, number of cast total facebook likes, etc. 

The objective of this project is to build a regression model to predict movie box office gross. Categorical input variables include genres, country, and content-rating; numerical input variables are number of critic reviews, duration, face number in poster, budget, imdb score, number of director facebook likes, and number of cast total facebook likes. 

In [9]:
import numpy as np 
import pandas as pd
from sklearn import linear_model 
from sklearn import preprocessing
from sklearn import model_selection

movies = pd.read_csv("Processed_Data.csv", header = 0)
original_headers = list(movies.columns.values)#save headers in a list
movies.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,gross,cast_total_facebook_likes,facenumber_in_poster,budget,imdb_score,Action,Adventure,...,PG-13,PG,G,R,Not Rated,NC-17,Approved,M,GP,X
0,302.0,169.0,563.0,309000000.0,48400.0,0.0,300000000.0,7.1,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,813.0,164.0,22000.0,448000000.0,107000.0,0.0,250000000.0,8.5,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,462.0,132.0,475.0,73100000.0,1870.0,1.0,264000000.0,6.6,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,324.0,100.0,15.0,201000000.0,2040.0,1.0,260000000.0,7.8,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,375.0,153.0,282.0,302000000.0,58800.0,3.0,250000000.0,7.5,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
movies.shape

(3321, 87)

In [11]:
# Scale continuous features to have 0 mean and 1 variance

# num_critic_for_reviews
movies['num_critic_for_reviews'] = preprocessing.scale(movies['num_critic_for_reviews'])

# duration
movies['duration'] = preprocessing.scale(movies['duration']) 

# director_facebook_likes
movies['director_facebook_likes'] = preprocessing.scale(movies['director_facebook_likes']) 

# gross
movies['gross'] = preprocessing.scale(movies['gross']) 

# cast_total_facebook_likes
movies['cast_total_facebook_likes'] = preprocessing.scale(movies['cast_total_facebook_likes']) 

# facenumber_in_poster
movies['facenumber_in_poster'] = preprocessing.scale(movies['facenumber_in_poster']) 

# budget
movies['budget'] = preprocessing.scale(movies['budget']) 

# imdb_score
movies['imdb_score'] = preprocessing.scale(movies['imdb_score']) 

# Check the shape of data
movies.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,gross,cast_total_facebook_likes,facenumber_in_poster,budget,imdb_score,Action,Adventure,...,PG-13,PG,G,R,Not Rated,NC-17,Approved,M,GP,X
0,1.33346,3.055853,-0.107198,4.272333,1.966068,-0.69516,3.015063,0.678332,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.844645,2.806854,6.49821,6.526978,5.016847,-0.69516,2.434369,2.029763,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.745964,1.213263,-0.134314,0.445926,-0.456334,-0.186635,2.596963,0.195678,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.527679,-0.380328,-0.276054,2.520523,-0.447483,-0.186635,2.550507,1.354047,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.977915,2.259057,-0.193783,4.15879,2.507503,0.830414,2.434369,1.064455,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#Define target variable (gross) as target, define independent variables as data
movies_array = movies.as_matrix()
target = movies_array[:, 3]
data = movies_array[:, list(range(0,3))+list(range(4,len(movies_array[0])))]

In [42]:
##Ordinary Least Squares Regression Model
#Build model with default settings
reg = linear_model.LinearRegression()
reg.fit(data, target)
#Use 10 fold cross-validation to calculate MSE
MSE_array = model_selection.cross_val_score(reg, data, target, cv=10, scoring = 'neg_mean_squared_error')
MSE = np.absolute(np.mean(MSE_array))
print(MSE)
#Modify parameter fit_intercept
reg1 = linear_model.LinearRegression(fit_intercept=False)
reg1.fit(data, target)
MSE1_array = model_selection.cross_val_score(reg1, data, target, cv=10, scoring = 'neg_mean_squared_error')
MSE1 = np.absolute(np.mean(MSE1_array))
print(MSE1)
#Modify parameter normalize
reg2 = linear_model.LinearRegression(normalize=True)
reg2.fit(data, target)
MSE2_array = model_selection.cross_val_score(reg2, data, target, cv=10, scoring = 'neg_mean_squared_error')
MSE2 = np.absolute(np.mean(MSE2_array))
print(MSE2)
#Modify parameter copy_X
reg3 = linear_model.LinearRegression(copy_X=False)
reg3.fit(data, target)
MSE3_array = model_selection.cross_val_score(reg3, data, target, cv=10, scoring = 'neg_mean_squared_error')
MSE3 = np.absolute(np.mean(MSE3_array))
print(MSE3)
#Modify parameter n_jobs
reg4 = linear_model.LinearRegression(n_jobs=-1)
reg4.fit(data, target)
MSE4_array = model_selection.cross_val_score(reg4, data, target, cv=10, scoring = 'neg_mean_squared_error')
MSE4 = np.absolute(np.mean(MSE4_array))
print(MSE4)

8.06698448183e+24
9.97498584381e+22
1.90405187299e+57
6.93796399644e+24
6.93560531941e+24
