In [1]:
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn import metrics
warnings.filterwarnings('ignore')


from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

In [2]:
movie_df = pd.read_csv(r'data/data_regression.csv')

In [3]:
print(movie_df.shape)
movie_df.dtypes

(3878, 50)


imdb_id                       object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
movie_facebook_likes           int64
Genre_Group1                   int64
Genre_Group2                   int64
G

In [4]:
movie_df = movie_df[movie_df.gross_budget_ratio<15]

In [5]:
movie_df.head()

Unnamed: 0,imdb_id,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,Mystery_Thriller_Horror,Sci-Fi_Fantasy,Family_Animation,Action_Adventure,History_War,Others,director_rank,actor1_rank,actor2_rank,actor3_rank
0,tt0035423,James Mangold,125.0,123.0,446.0,821.0,Natasha Lyonne,20000.0,47095453.0,Comedy|Fantasy|Romance,...,0,1,0,0,0,0,0,6,3,4
2,tt0080453,Randal Kleiser,36.0,104.0,116.0,83.0,Christopher Atkins,1000.0,58853106.0,Adventure|Drama|Romance,...,0,0,0,1,0,0,0,3,3,3
3,tt0080455,John Landis,125.0,148.0,644.0,326.0,Aretha Franklin,1000.0,54200000.0,Action|Comedy|Crime|Music,...,0,0,0,1,0,1,0,3,3,4
4,tt0080487,Harold Ramis,71.0,98.0,11000.0,484.0,Rodney Dangerfield,13000.0,39800000.0,Comedy|Sport,...,0,0,0,0,0,1,0,7,3,5
5,tt0080492,Nancy Walker,26.0,124.0,42.0,161.0,Randy Jones,801.0,2000000.0,Biography|Comedy|Musical,...,0,0,0,0,0,1,0,3,2,3


# Profitability ratio prediction


## Data preparation

In [6]:
data = movie_df.drop(['imdb_id','director_name','num_critic_for_reviews','director_facebook_likes',
'actor_3_facebook_likes',
'actor_2_name',
'actor_1_facebook_likes',
'gross',
'genres',
'actor_1_name',
'movie_title',
'num_voted_users',
'actor_3_name',
'num_user_for_reviews',
'title_year',
'actor_2_facebook_likes',
'imdb_score',
'movie_facebook_likes',
'Genre_Group1',
'Genre_Group2',
'Genre_Group3',
'Genre_Group4',
'Genre_Group5',
'Genre_Group6',
'Genre_Group7',
'Genre_Group8',
'Genre_Group9',
'gross_budget_ratio',
'country',
'language'],axis = 1)

In [7]:
y = movie_df['gross_budget_ratio']

In [8]:
data_dummies = pd.get_dummies(data, columns=['content_rating'])

In [9]:
data_dummies.head()

Unnamed: 0,duration,cast_total_facebook_likes,facenumber_in_poster,budget,Biography,Comedy,Crime,Drama,Romance,Mystery_Thriller_Horror,...,actor1_rank,actor2_rank,actor3_rank,content_rating_G,content_rating_NC-17,content_rating_Not Rated,content_rating_PG,content_rating_PG-13,content_rating_R,content_rating_X
0,123.0,22209,1.0,48000000.0,0,1,0,0,1,0,...,6,3,4,0,0,0,0,1,0,0
2,104.0,1662,0.0,4500000.0,0,0,0,1,1,0,...,3,3,3,0,0,0,0,0,1,0
3,148.0,2566,1.0,27000000.0,0,1,1,0,0,0,...,3,3,4,0,0,0,0,0,1,0
4,98.0,14921,3.0,6000000.0,0,1,0,0,0,0,...,7,3,5,0,0,0,0,0,1,0
5,124.0,1462,0.0,20000000.0,1,1,0,0,0,0,...,3,2,3,0,0,0,1,0,0,0


In [10]:
print(data_dummies.shape)
data_dummies.dtypes

(3807, 26)


duration                     float64
cast_total_facebook_likes      int64
facenumber_in_poster         float64
budget                       float64
Biography                      int64
Comedy                         int64
Crime                          int64
Drama                          int64
Romance                        int64
Mystery_Thriller_Horror        int64
Sci-Fi_Fantasy                 int64
Family_Animation               int64
Action_Adventure               int64
History_War                    int64
Others                         int64
director_rank                  int64
actor1_rank                    int64
actor2_rank                    int64
actor3_rank                    int64
content_rating_G               uint8
content_rating_NC-17           uint8
content_rating_Not Rated       uint8
content_rating_PG              uint8
content_rating_PG-13           uint8
content_rating_R               uint8
content_rating_X               uint8
dtype: object

## Ridge

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data_dummies, y, test_size=0.30)

# Scale data 
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)

In [12]:
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = linear_model.Ridge()
gridridge = GridSearchCV(ridge, parameters, scoring ='neg_mean_squared_error')
gridridge.fit(X_train_scaled, y_train)

y_pred = gridridge.predict(X_test_scaled)

print("ridge best parameters:", gridridge.best_params_)
print("ridge score:", gridridge.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, gridridge.predict(X_test_scaled)))
print("ridge best estimator coef:", gridridge.best_estimator_.coef_)


ridge best parameters: {'alpha': 1.3000000000000003}
ridge score: -4.394812983362963
ridge MSE: 4.394812983362963
ridge best estimator coef: [-0.39590432 -0.1298624  -1.32503168 -3.46577471 -0.04598472  0.00567089
 -0.53435562 -0.28773791 -0.05708807 -0.11906485 -0.40975532 -0.21873153
 -0.30902048 -0.60891316  0.07286958  2.4731354  -0.08022986  0.28880673
 -0.21611833  0.35770729 -0.92955383 -1.28388202  0.05436123 -0.31713626
 -0.31850168  2.43700528]


In [13]:
df1 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df1

Unnamed: 0,Actual,Predicted
2164,0.562663,1.512893
3034,0.866744,1.864650
3708,1.314898,2.503721
1498,1.000577,1.025627
1076,1.129367,1.019291
...,...,...
3001,0.008227,1.948167
714,1.356606,1.739165
1722,0.726677,1.746037
3224,0.214136,1.194248


## Lasso

In [14]:
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
lasso = linear_model.Lasso()
gridlasso = GridSearchCV(lasso, parameters, scoring ='neg_mean_squared_error')
gridlasso.fit(X_train_scaled, y_train)

y_pred = gridlasso.predict(X_test_scaled)

print("lasso best parameters:", gridlasso.best_params_)
print("lasso score:", gridlasso.score(X_test_scaled, y_test))
print("lasso MSE:", mean_squared_error(y_test, gridlasso.predict(X_test_scaled)))
print("lasso best estimator coef:", gridlasso.best_estimator_.coef_)

lasso best parameters: {'alpha': 0.01}
lasso score: -4.379859521810947
lasso MSE: 4.379859521810947
lasso best estimator coef: [-0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  1.66854235e-04 -4.92181774e-01 -2.48185252e-01
 -0.00000000e+00 -4.67214929e-02 -2.89477021e-01  0.00000000e+00
 -2.83986158e-01 -4.50077142e-01  3.65357535e-02  1.12211501e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00  7.84486978e-02
 -0.00000000e+00 -6.37579825e-01  1.40051780e-01 -1.47958626e-02
 -0.00000000e+00  0.00000000e+00]


In [15]:
df2 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df2

Unnamed: 0,Actual,Predicted
2164,0.562663,1.580688
3034,0.866744,1.870165
3708,1.314898,1.920774
1498,1.000577,1.176414
1076,1.129367,1.176247
...,...,...
3001,0.008227,1.967211
714,1.356606,1.519085
1722,0.726677,1.787654
3224,0.214136,1.393748


## ElasticNet

In [16]:
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
EN = linear_model.ElasticNet()
gridEN = GridSearchCV(EN, parameters, scoring ='neg_mean_squared_error')
gridEN.fit(X_train_scaled, y_train)

y_pred = gridEN.predict(X_test_scaled)

print("ElasticNet best parameters:", gridEN.best_params_)
print("ElasticNet score:", gridEN.score(X_test_scaled, y_test))
print("ElasticNet MSE:", mean_squared_error(y_test, gridEN.predict(X_test_scaled)))
print("ElasticNet best estimator coef:", gridEN.best_estimator_.coef_)

ElasticNet best parameters: {'alpha': 0.1}
ElasticNet score: -4.480569884399375
ElasticNet MSE: 4.480569884399375
ElasticNet best estimator coef: [-0.          0.         -0.         -0.         -0.          0.01457362
 -0.15044937 -0.00230544  0.         -0.         -0.          0.
 -0.0750678  -0.          0.          0.         -0.          0.
  0.          0.         -0.         -0.          0.         -0.
 -0.          0.        ]


In [17]:
df1 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df1

Unnamed: 0,Actual,Predicted
2164,0.562663,1.618016
3034,0.866744,1.618016
3708,1.314898,1.618016
1498,1.000577,1.467567
1076,1.129367,1.452993
...,...,...
3001,0.008227,1.678510
714,1.356606,1.618016
1722,0.726677,1.690778
3224,0.214136,1.603442
