# Movie Revenue Prediction

### What's the best model: Linear, Polynomial, Ridge, Lasso regression. 

### Goal: Select best features and get the smallest rMSE and greatest r2

In [4]:
#First round of imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
import statsmodels.formula.api as smf
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [5]:
dt = pd.read_csv("movie_metadata.csv")
dt.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [6]:
dt.isnull().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

In [7]:
dt.shape

(5043, 28)

In [8]:
dt.dropna(how="any", inplace = True)

In [9]:
dt.isnull().sum()

color                        0
director_name                0
num_critic_for_reviews       0
duration                     0
director_facebook_likes      0
actor_3_facebook_likes       0
actor_2_name                 0
actor_1_facebook_likes       0
gross                        0
genres                       0
actor_1_name                 0
movie_title                  0
num_voted_users              0
cast_total_facebook_likes    0
actor_3_name                 0
facenumber_in_poster         0
plot_keywords                0
movie_imdb_link              0
num_user_for_reviews         0
language                     0
country                      0
content_rating               0
budget                       0
title_year                   0
actor_2_facebook_likes       0
imdb_score                   0
aspect_ratio                 0
movie_facebook_likes         0
dtype: int64

In [10]:
dt.shape

(3756, 28)

In [11]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3756 entries, 0 to 5042
Data columns (total 28 columns):
color                        3756 non-null object
director_name                3756 non-null object
num_critic_for_reviews       3756 non-null float64
duration                     3756 non-null float64
director_facebook_likes      3756 non-null float64
actor_3_facebook_likes       3756 non-null float64
actor_2_name                 3756 non-null object
actor_1_facebook_likes       3756 non-null float64
gross                        3756 non-null float64
genres                       3756 non-null object
actor_1_name                 3756 non-null object
movie_title                  3756 non-null object
num_voted_users              3756 non-null int64
cast_total_facebook_likes    3756 non-null int64
actor_3_name                 3756 non-null object
facenumber_in_poster         3756 non-null float64
plot_keywords                3756 non-null object
movie_imdb_link              3756 non-

In [12]:
dr =['color','director_name','genres','actor_2_name', 'actor_1_name','actor_3_name','movie_title', 'plot_keywords', 
     'movie_imdb_link','language','country', 'content_rating']

In [13]:
dt.drop(dr, axis = 1, inplace = True)

In [14]:
dt.shape

(3756, 16)

In [15]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3756 entries, 0 to 5042
Data columns (total 16 columns):
num_critic_for_reviews       3756 non-null float64
duration                     3756 non-null float64
director_facebook_likes      3756 non-null float64
actor_3_facebook_likes       3756 non-null float64
actor_1_facebook_likes       3756 non-null float64
gross                        3756 non-null float64
num_voted_users              3756 non-null int64
cast_total_facebook_likes    3756 non-null int64
facenumber_in_poster         3756 non-null float64
num_user_for_reviews         3756 non-null float64
budget                       3756 non-null float64
title_year                   3756 non-null float64
actor_2_facebook_likes       3756 non-null float64
imdb_score                   3756 non-null float64
aspect_ratio                 3756 non-null float64
movie_facebook_likes         3756 non-null int64
dtypes: float64(13), int64(3)
memory usage: 498.8 KB


In [16]:
lista = [elem for elem in dt.columns if elem != 'gross']
feature_cols = lista

X = dt[feature_cols]
ss = StandardScaler()
Xs = ss.fit_transform(X)

y = dt.gross

In [17]:
# Polynomial Regression

pipe = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
pipe.fit(Xs, y)
preds = pipe.predict(Xs)
print r2_score(y, preds)
scores = cross_val_score(pipe, Xs,y,cv=5, scoring='mean_squared_error')
print 'Polynomial rMSE :',  np.sqrt(abs(scores)).mean()


#Linear
lr = LinearRegression()
lr.fit(Xs, y)
preds = lr.predict(Xs)
print r2_score(y, preds)
scores = cross_val_score(lr, Xs,y,cv=5, scoring='mean_squared_error')
print 'Linear rMSE :',  np.sqrt(abs(scores)).mean()

#Ridge
ridge = Ridge(alpha=0.4)
ridge.fit(Xs, y)
preds = ridge.predict(Xs)
print r2_score(y, preds)
scores = cross_val_score(ridge, Xs,y,cv=5, scoring='mean_squared_error')
print 'Ridge rMSE :',  np.sqrt(abs(scores)).mean()
       
#Lasso
lasso = Lasso(alpha=0.4)
lasso.fit(Xs, y)
preds = lasso.predict(Xs)
print r2_score(y, preds)
scores = cross_val_score(lasso, Xs,y,cv=5, scoring='mean_squared_error')
print 'Lasso rMSE :',  np.sqrt(abs(scores)).mean()

0.721896385721
Polynomial rMSE : 171664817.803
0.470727337733
Linear rMSE : 56241732.3608
0.470650269305
Ridge rMSE : 56302594.2543
0.470710765573
Lasso rMSE : 56289009.8856


#### Selecting Features

In [18]:
coef = lasso.coef_
coef

array([  1.18152011e+07,   2.84393892e+06,  -3.97654198e+06,
        -2.16289721e+07,  -1.59756045e+08,   3.47167800e+07,
         1.96404618e+08,  -1.87862479e+06,   4.74067629e+06,
         2.95046605e+06,  -3.59678710e+06,  -4.44742638e+07,
        -7.64082931e+06,  -6.91208328e+05,  -2.36365861e+06])

In [19]:
sorted(dict(zip(dt.columns, coef)).items(), key = lambda x:abs(x[1]), reverse = True)

[('num_voted_users', 196404617.76095155),
 ('actor_1_facebook_likes', -159756045.37294361),
 ('title_year', -44474263.787847109),
 ('gross', 34716779.961256228),
 ('actor_3_facebook_likes', -21628972.14421089),
 ('num_critic_for_reviews', 11815201.090267843),
 ('actor_2_facebook_likes', -7640829.3135258071),
 ('facenumber_in_poster', 4740676.2883446999),
 ('director_facebook_likes', -3976541.9761272506),
 ('budget', -3596787.0982643422),
 ('num_user_for_reviews', 2950466.0481312717),
 ('duration', 2843938.9205829548),
 ('aspect_ratio', -2363658.61497417),
 ('cast_total_facebook_likes', -1878624.7887219372),
 ('imdb_score', -691208.32761315221)]

In [20]:
lista2 = [elem for elem in dt.columns 
         if  elem != 'gross' and
             elem != 'imdb_score' and
             elem != 'cast_total_facebook_likes' and
             elem != 'aspect_ratio' and
             elem != 'duration' and
             elem != 'num_user_for_reviews' and
             elem != 'budget' and
             elem != 'director_facebook_likes' and
             elem != 'facenumber_in_poster' and
             elem != 'actor_2_facebook_likes' #and
             #elem != 'num_critic_for_reviews' and
             #elem != 'actor_3_facebook_likes' and
             #elem != 'title_year' and
             #elem != 'actor_1_facebook_likes'
                                            ]
         
feature_cols = lista2

X = dt[feature_cols]
ss = StandardScaler()
Xs = ss.fit_transform(X)

y = dt.gross

In [23]:
lista2

['num_critic_for_reviews',
 'actor_3_facebook_likes',
 'actor_1_facebook_likes',
 'num_voted_users',
 'title_year',
 'movie_facebook_likes']

In [21]:
X.shape

(3756, 6)

In [22]:
# Polynomial Regression

pipe = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
pipe.fit(Xs, y)
preds = pipe.predict(Xs)
print r2_score(y, preds)
scores = cross_val_score(pipe, Xs,y,cv=5, scoring='mean_squared_error')
print 'Polynomial rMSE :',  np.sqrt(abs(scores)).mean()


#Linear
lr = LinearRegression()
lr.fit(Xs, y)
preds = lr.predict(Xs)
print r2_score(y, preds)
scores = cross_val_score(lr, Xs,y,cv=5, scoring='mean_squared_error')
print 'Linear rMSE :',  np.sqrt(abs(scores)).mean()

#Ridge
ridge = Ridge(alpha=0.4)
ridge.fit(Xs, y)
preds = ridge.predict(Xs)
print r2_score(y, preds)
scores = cross_val_score(ridge, Xs,y,cv=5, scoring='mean_squared_error')
print 'Ridge rMSE :',  np.sqrt(abs(scores)).mean()
       
#Lasso
lasso = Lasso(alpha=0.4)
lasso.fit(Xs, y)
preds = lasso.predict(Xs)
print r2_score(y, preds)
scores = cross_val_score(lasso, Xs,y,cv=5, scoring='mean_squared_error')
print 'Lasso rMSE :',  np.sqrt(abs(scores)).mean()

0.519227170532
Polynomial rMSE : 53361792.2418
0.421565261508
Linear rMSE : 55530561.7827
0.421565256797
Ridge rMSE : 55530690.125
0.421565261508
Lasso rMSE : 55530561.7503


### Conclusion: the best model is Polynomial: with 6 features I got the smallest rMSE and greatest r2!