In [66]:
from collections import Counter

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_validate

movie_df = pd.read_csv(r'data/data_regression.csv')

In [67]:
print(movie_df.shape)
movie_df.dtypes

(3368, 38)


imdb_id                       object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
movie_facebook_likes           int64
revenue                      float64
release_year                 float64
p

In [68]:
movie_df.head()

Unnamed: 0,imdb_id,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,Genre_Group1,Genre_Group2,Genre_Group3,Genre_Group4,Genre_Group5,Genre_Group6,Genre_Group7,Genre_Group8,Genre_Group9,gross_budget_ratio
0,tt0035423,James Mangold,125.0,123.0,446.0,821.0,Natasha Lyonne,20000.0,47095453.0,Comedy|Fantasy|Romance,...,0,1,0,0,0,0,1,1,0,0.981155
1,tt0080339,Jim Abrahams,134.0,88.0,104.0,318.0,Lloyd Bridges,628.0,83400000.0,Comedy,...,0,1,0,0,0,0,0,0,0,23.828571
2,tt0080453,Randal Kleiser,36.0,104.0,116.0,83.0,Christopher Atkins,1000.0,58853106.0,Adventure|Drama|Romance,...,1,0,1,0,0,0,0,1,0,13.078468
3,tt0080455,John Landis,125.0,148.0,644.0,326.0,Aretha Franklin,1000.0,54200000.0,Action|Comedy|Crime|Music,...,1,1,0,0,0,1,0,0,1,2.007407
4,tt0080487,Harold Ramis,71.0,98.0,11000.0,484.0,Rodney Dangerfield,13000.0,39800000.0,Comedy|Sport,...,0,1,0,0,0,0,0,0,1,6.633333


In [69]:
y = movie_df.gross
x_test = (x_test[['budget','Genre_Group1','Genre_Group2','Genre_Group3','Genre_Group4', 
              'Genre_Group5','Genre_Group6','Genre_Group7','Genre_Group8','Genre_Group9',
               'director_facebook_likes','actor_1_facebook_likes']]) # example only to try
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=20) 

# Min max scaler: divide evertything by the difference between max and min value
# because we want that our features are all on the same dimensionality order (for eample budget is in million and likes in thousends/hundreds)
# MinMAx is better than StandardScaler because we cannot suppose that the distribution is normal!

# if some variables are normal (we can check from the previous analysis) we can use StandardScaler for them. we can fix it later when the analysis part is completed

MM = MinMaxScaler(feature_range=(0, 1), copy=True)

x_train_MM = MM.fit_transform(x_train) # standardize X_train
x_test_MM = MM.transform(x_test) # standardize X_test

# rebuild the DataFrame:
x_train_scaled = pd.DataFrame(data=x_train_MM,  columns=x_train.columns)
x_test_scaled = pd.DataFrame(data=x_test_MM,  columns=x_test.columns)

print(f"Dimensionality of x_train_scaled: {x_train_scaled.shape}")
print(f"Dimensionality of x_test_scaled : {x_test_scaled.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")


Dimensionality of x_train_scaled: (2357, 12)
Dimensionality of x_test_scaled : (1011, 12)
Dimensionality of y_train: (2357,)
Dimensionality of y_test : (1011,)


## Decision Tree Regressor

In [70]:
DTOptimalAcc=0
DTSplitList = np.logspace(-8,0,10)
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
param_dictionary = {"criterion": ['mse'], "min_samples_split": DTSplitList, "min_samples_leaf": DTLeafList}

DecTree = DecisionTreeRegressor()
clf = GridSearchCV(DecTree, param_dictionary)

clf.fit(x_train_scaled, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=None,
                                             splitter='best'),
             iid='deprecated', n_jobs=...
             param_grid={'criterion': ['mse'],
                         'min_samples_leaf': [0.0005, 0.005, 0.025, 0.05, 0.1,
                            

In [71]:
clf.best_params_

{'criterion': 'mse', 'min_samples_leaf': 0.05, 'min_samples_split': 1e-08}

In [72]:
clf.best_score_ # loss

0.36116996672738794

The best model will be the one with the lowest loss

## Random Forest

In [73]:
DTOptimalAcc=0
DTSplitList = np.logspace(-2,0,5)
DTLeafList = np.logspace(-5,-3,5)
param_dictionary = {"criterion": ['mse'], "min_samples_split": DTSplitList, "min_samples_leaf": DTLeafList}

RanFor = RandomForestRegressor()
clf = GridSearchCV(RanFor, param_dictionary)

clf.fit(x_train_scaled, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, r...state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [74]:
clf.best_params_

{'criterion': 'mse', 'min_samples_leaf': 0.001, 'min_samples_split': 0.01}

In [75]:
clf.best_score_ # loss

0.4003215504899675