In [14]:
import numpy as np
import pandas as pd

# load data
data = pd.read_csv("movie_data.csv")
data.head()

Unnamed: 0,runtime,vote_average,release_month,Comedy,Crime,Drama,Action,Thriller,Romance,Adventure,...,Documentary,Western,Empty,TVMovie,Foreign,PC_class,log_budget,log_revenue,log_popularity,log_vote_count
0,105.0,8.2,5,1,1,0,0,0,0,0,...,0,0,0,0,0,2,14.616391,17.661131,2.289601,8.460199
1,111.0,8.3,9,0,1,1,1,0,0,0,...,0,0,0,0,0,3,17.181426,18.221813,3.572093,9.278279
2,81.0,7.3,3,0,0,1,1,1,0,0,...,0,0,0,0,0,3,14.741554,16.29971,2.411977,7.303843
3,104.0,7.8,1,1,1,0,0,0,0,0,...,0,0,0,0,0,1,16.562782,18.685736,2.897458,8.78263
4,119.0,7.4,3,1,0,0,0,0,1,0,...,0,0,0,0,0,1,17.172408,20.671077,3.953932,8.633375


In [15]:
# show information of data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4473 entries, 0 to 4472
Data columns (total 29 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   runtime         4473 non-null   float64
 1   vote_average    4473 non-null   float64
 2   release_month   4473 non-null   int64  
 3   Comedy          4473 non-null   int64  
 4   Crime           4473 non-null   int64  
 5   Drama           4473 non-null   int64  
 6   Action          4473 non-null   int64  
 7   Thriller        4473 non-null   int64  
 8   Romance         4473 non-null   int64  
 9   Adventure       4473 non-null   int64  
 10  Family          4473 non-null   int64  
 11  Fantasy         4473 non-null   int64  
 12  Animation       4473 non-null   int64  
 13  Mystery         4473 non-null   int64  
 14  ScienceFiction  4473 non-null   int64  
 15  Horror          4473 non-null   int64  
 16  History         4473 non-null   int64  
 17  War             4473 non-null   i

In [16]:
# convert numeric data into categorical data
data['release_month']=data['release_month'].astype("category")

# show information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4473 entries, 0 to 4472
Data columns (total 29 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   runtime         4473 non-null   float64 
 1   vote_average    4473 non-null   float64 
 2   release_month   4473 non-null   category
 3   Comedy          4473 non-null   int64   
 4   Crime           4473 non-null   int64   
 5   Drama           4473 non-null   int64   
 6   Action          4473 non-null   int64   
 7   Thriller        4473 non-null   int64   
 8   Romance         4473 non-null   int64   
 9   Adventure       4473 non-null   int64   
 10  Family          4473 non-null   int64   
 11  Fantasy         4473 non-null   int64   
 12  Animation       4473 non-null   int64   
 13  Mystery         4473 non-null   int64   
 14  ScienceFiction  4473 non-null   int64   
 15  Horror          4473 non-null   int64   
 16  History         4473 non-null   int64   
 17  War           

In [17]:
# set inputs
X = pd.concat([data['runtime'],data['vote_average'],data['release_month'],data['Comedy'],data['Crime'],data['Drama'],data['Action'],data['Thriller']
    ,data['Romance'],data['Adventure'],data['Family'],data['ScienceFiction'],data['Fantasy'],data['Horror'],
    data['Mystery'],data['Animation'],data['History'],data['War'],data['Music'],data['Western'],data['Documentary'],data['Empty'],data['TVMovie'],data['Foreign'],
              data['PC_class'],data['log_budget'],data['log_popularity'],data['log_vote_count']],axis=1)
# set target value
Y = data['log_revenue']

In [18]:
from sklearn.model_selection import train_test_split

#split data into train dataset and test data set randomly
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)

####  Build 3 different models and see which one performs best.<br><br>1. Linear Regression Model <br>2. Ridge Regression Model  <br>3. LASSO Regression Model

### Linear Regression Model

In [19]:
from sklearn.linear_model import LinearRegression

# build model
model_LR = LinearRegression()

# fit model
model_LR.fit(X_train,y_train)

# print the coefficients
coeff_parameter = pd.DataFrame(model_LR.coef_,X.columns,columns=['Coefficient'])
print(coeff_parameter)

                Coefficient
runtime            0.011427
vote_average       0.061129
release_month      0.001057
Comedy             0.129893
Crime             -0.201047
Drama             -0.274428
Action             0.166781
Thriller          -0.063536
Romance           -0.001079
Adventure         -0.030449
Family             0.358029
ScienceFiction    -0.337089
Fantasy           -0.156302
Horror             0.053585
Mystery            0.024176
Animation          0.137266
History           -0.205167
War                0.012163
Music              0.132228
Western           -0.466179
Documentary        1.002023
Empty              2.242427
TVMovie            2.494095
Foreign           -1.216886
PC_class          -0.100673
log_budget         0.501587
log_popularity    -0.033913
log_vote_count     0.563470


In [20]:
# make a prediction
prediction_LR_test = model_LR.predict(X_test)
prediction_LR_train = model_LR.predict(X_train)

from sklearn.metrics import mean_squared_error

# print the Mean Squared Error (MSE) on Train Data and compare to the training
print("Performance on Train Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_train, prediction_LR_train))
print("Explained Variance (R^2) on test \t:", model_LR.score(X_train, y_train))

# print the Mean Squared Error (MSE) on Test Data and compare to the training
print("Performance on Test Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_test, prediction_LR_test))
print("Explained Variance (R^2) on test \t:", model_LR.score(X_test, y_test))

Performance on Train Data:
Mean Squared Error (MSE) on test 	: 1.4951434028166748
Explained Variance (R^2) on test 	: 0.6354464504980601
Performance on Test Data:
Mean Squared Error (MSE) on test 	: 1.5230025677291887
Explained Variance (R^2) on test 	: 0.6774463555841578


### Ridge Regression Model

In [8]:
from sklearn.linear_model import Ridge

# define model
model_RR = Ridge(alpha=1.0)

# fit model
model_RR.fit(X_train, y_train)

# print the coefficients
coeff_parameter = pd.DataFrame(model_RR.coef_,X.columns,columns=['Coefficient'])
print(coeff_parameter)

                Coefficient
runtime            0.010520
vote_average       0.137152
release_month     -0.002024
Comedy             0.083483
Crime             -0.152564
Drama             -0.295993
Action             0.066235
Thriller          -0.037294
Romance            0.065115
Adventure         -0.001352
Family             0.369403
ScienceFiction    -0.397771
Fantasy           -0.219172
Horror             0.176917
Mystery           -0.039181
Animation         -0.042450
History           -0.145870
War               -0.502502
Music              0.213291
Western           -0.710167
Documentary        0.780495
Empty              1.455418
TVMovie            2.128055
Foreign           -1.416758
PC_class          -0.060976
log_budget         0.609979
log_popularity    -0.090984
log_vote_count     0.530805


In [9]:
# make a prediction
prediction_RR_test = model_RR.predict(X_test)
prediction_RR_train = model_RR.predict(X_train)

# print the Mean Squared Error (MSE) on Train Data and compare to the training
print("Performance on Train Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_train, prediction_RR_train))
print("Explained Variance (R^2) on test \t:", model_RR.score(X_train, y_train))

# print the Mean Squared Error (MSE) on Test Data and compare to the training
print("Performance on Test Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_test, prediction_RR_test))
print("Explained Variance (R^2) on test \t:", model_RR.score(X_test, y_test))

Performance on Train Data:
Mean Squared Error (MSE) on test 	: 1.3542780891723525
Explained Variance (R^2) on test 	: 0.650045887027092
Performance on Test Data:
Mean Squared Error (MSE) on test 	: 1.7310927088019243
Explained Variance (R^2) on test 	: 0.6575060940630808


### LASSO Regression Model

In [10]:
from sklearn.linear_model import Lasso

# define model
model_LAR = Lasso(alpha=1.0)

# fit model
model_LAR.fit(X_train, y_train)

# print the coefficients
coeff_parameter = pd.DataFrame(model_LAR.coef_,X.columns,columns=['Coefficient'])
print(coeff_parameter)

                Coefficient
runtime            0.010149
vote_average       0.000000
release_month      0.000000
Comedy             0.000000
Crime             -0.000000
Drama             -0.000000
Action             0.000000
Thriller          -0.000000
Romance           -0.000000
Adventure          0.000000
Family             0.000000
ScienceFiction     0.000000
Fantasy            0.000000
Horror            -0.000000
Mystery           -0.000000
Animation          0.000000
History           -0.000000
War               -0.000000
Music              0.000000
Western           -0.000000
Documentary        0.000000
Empty              0.000000
TVMovie            0.000000
Foreign           -0.000000
PC_class          -0.000000
log_budget         0.262507
log_popularity     0.000000
log_vote_count     0.306852


In [11]:
# make a prediction
prediction_LAR_test = model_LAR.predict(X_test)
prediction_LAR_train = model_LAR.predict(X_train)

# print the Mean Squared Error (MSE) on Train Data and compare to the training
print("Performance on Train Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_train, prediction_LAR_train))
print("Explained Variance (R^2) on test \t:", model_LAR.score(X_train, y_train))

# print the Mean Squared Error (MSE) on Test Data and compare to the training
print("Performance on Test Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_test, prediction_LAR_test))
print("Explained Variance (R^2) on test \t:", model_LAR.score(X_test, y_test))

Performance on Train Data:
Mean Squared Error (MSE) on test 	: 2.0261168284094127
Explained Variance (R^2) on test 	: 0.47643846331529893
Performance on Test Data:
Mean Squared Error (MSE) on test 	: 2.547596559934137
Explained Variance (R^2) on test 	: 0.49596212142377016


### Summary

In [12]:
print("Prediction performance --- Linear Regression Model:")
# print the Mean Squared Error (MSE) on Train Data and compare to the training
print("Performance on Train Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_train, prediction_LR_train))
print("Explained Variance (R^2) on test \t:", model_LR.score(X_train, y_train))

# print the Mean Squared Error (MSE) on Test Data and compare to the training
print("Performance on Test Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_test, prediction_LR_test))
print("Explained Variance (R^2) on test \t:", model_LR.score(X_test, y_test))

print()

print("Prediction performance --- Ridge Regression Model:")
# print the Mean Squared Error (MSE) on Train Data and compare to the training
print("Performance on Train Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_train, prediction_RR_train))
print("Explained Variance (R^2) on test \t:", model_RR.score(X_train, y_train))

# print the Mean Squared Error (MSE) on Test Data and compare to the training
print("Performance on Test Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_test, prediction_RR_test))
print("Explained Variance (R^2) on test \t:", model_RR.score(X_test, y_test))

print()

print("Prediction performance --- LASSO Regression Model:")
# print the Mean Squared Error (MSE) on Train Data and compare to the training
print("Performance on Train Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_train, prediction_LAR_train))
print("Explained Variance (R^2) on test \t:", model_LAR.score(X_train, y_train))

# print the Mean Squared Error (MSE) on Test Data and compare to the training
print("Performance on Test Data:")
print("Mean Squared Error (MSE) on test \t:", mean_squared_error(y_test, prediction_LAR_test))
print("Explained Variance (R^2) on test \t:", model_LAR.score(X_test, y_test))

Prediction performance --- Linear Regression Model:
Performance on Train Data:
Mean Squared Error (MSE) on test 	: 1.3527436007576856
Explained Variance (R^2) on test 	: 0.6504424086398357
Performance on Test Data:
Mean Squared Error (MSE) on test 	: 1.731637263684772
Explained Variance (R^2) on test 	: 0.6573983547560667

Prediction performance --- Ridge Regression Model:
Performance on Train Data:
Mean Squared Error (MSE) on test 	: 1.3542780891723525
Explained Variance (R^2) on test 	: 0.650045887027092
Performance on Test Data:
Mean Squared Error (MSE) on test 	: 1.7310927088019243
Explained Variance (R^2) on test 	: 0.6575060940630808

Prediction performance --- LASSO Regression Model:
Performance on Train Data:
Mean Squared Error (MSE) on test 	: 2.0261168284094127
Explained Variance (R^2) on test 	: 0.47643846331529893
Performance on Test Data:
Mean Squared Error (MSE) on test 	: 2.547596559934137
Explained Variance (R^2) on test 	: 0.49596212142377016


### <font color=green> According to the above results, we find the <font size=5>Ridge Regression Model</font> most useful in our case. And we will use this model for prediction use. </font>