In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier

In [3]:
columns = [
    'movie_id', 'title', 'release_date', 'movie_popularity', 
    'vote_average', 'vote_count', 'budget', 'revenue', 'ratings', 'genres',
    'studios', 'actor_popularity_mean', 'director_popularity_mean'
]

target = ['revenue']

In [4]:
# Load the data
file_path = Path('../Tables/meta_ml.csv')
df = pd.read_csv(file_path)[:-2]
# , index_col=0
# df = df.loc[:, columns].copy()

# # Drop the null columns where all values are null
# df = df.dropna(axis='columns', how='all')

# # Drop the null rows
# df = df.dropna()

# Encoded Age Ratings
ratings_enc = {
    'G': 1,
    'PG': 2,
    'PG-13': 3,
    'R': 4,
    'NC-17': 5,
    'NR':6
}

df['ratings_enc'] = df['ratings'].apply(lambda x: ratings_enc[x])

df['release_month'] = pd.DatetimeIndex(df['release_date']).month
df.head()

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,studios,genres,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.993965,2.566,2,11
1,557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,"[''Other'', '' Columbia Pictures'', '' Sony Pi...","[''Fantasy'', ''Action'']",2.387022,2.914,3,5
2,672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.696712,2.566,2,11
3,673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,"[''Warner Bros. Pictures'', '' Other'', '' Hey...","[''Adventure'', ''Fantasy'']",3.523069,3.333,2,5
4,674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'', ''Family'']",3.234944,2.695,3,11


In [5]:
df = df.drop(['release_date', 'title', 'ratings'], axis=1)
df.head()

Unnamed: 0,movie_id,movie_popularity,vote_average,vote_count,budget,revenue,studios,genres,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month
0,671,268.472,7.9,21429,125000000,976475550,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.993965,2.566,2,11
1,557,480.954,7.2,14421,139000000,821708551,"[''Other'', '' Columbia Pictures'', '' Sony Pi...","[''Fantasy'', ''Action'']",2.387022,2.914,3,5
2,672,246.027,7.7,17294,100000000,876688482,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.696712,2.566,2,11
3,673,225.882,8.0,17001,130000000,789804554,"[''Warner Bros. Pictures'', '' Other'', '' Hey...","[''Adventure'', ''Fantasy'']",3.523069,3.333,2,5
4,674,244.428,7.8,16341,150000000,895921036,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'', ''Family'']",3.234944,2.695,3,11


In [6]:
# Create our features
X = pd.get_dummies(df.drop(columns='revenue'))


# Create our target
y = pd.get_dummies(df['revenue'])

In [7]:
X.describe()

Unnamed: 0,movie_id,movie_popularity,vote_average,vote_count,budget,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month,"studios_[''Columbia Pictures'', '' Other'', '' Marvel Studios'']",...,"genres_[''Horror'', ''Action'', ''Thriller'']","genres_[''Horror'', ''Mystery'', ''Thriller'']","genres_[''Horror'', ''Thriller'']","genres_[''Romance'', ''Drama'']","genres_[''Science Fiction'', ''Action'', ''Adventure'']","genres_[''Science Fiction'', ''Action'']","genres_[''Science Fiction'', ''Adventure'']","genres_[''Science Fiction'', ''Thriller'', ''Horror'']","genres_[''Thriller'', ''Horror'']",genres_['Horror']
count,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,...,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0
mean,373058.625,641.271958,7.427778,6849.861111,114752800.0,3.763272,3.496674,3.027778,6.972222,0.013889,...,0.013889,0.041667,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.027778,0.013889
std,231905.622188,1006.99416,0.629604,6819.869858,84837080.0,1.819889,2.548206,0.768614,2.973935,0.117851,...,0.117851,0.201229,0.165489,0.117851,0.165489,0.117851,0.117851,0.117851,0.165489,0.117851
min,557.0,177.734,6.0,71.0,1000000.0,1.318528,0.6,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,250246.0,233.73375,6.975,1565.5,39750000.0,2.680786,1.76525,3.0,4.75,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,437800.0,305.3115,7.6,3982.5,105000000.0,3.208295,2.8185,3.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,554498.0,624.7645,7.9,10683.75,185000000.0,4.285281,4.24825,3.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,791373.0,5884.885,8.4,25738.0,356000000.0,11.495923,13.329,6.0,12.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Check the balance of our target values
y = df['revenue']
y.value_counts()

19000000     2
132000000    1
217000000    1
155446362    1
355692760    1
            ..
757930663    1
167381210    1
25814306     1
22039969     1
45000000     1
Name: revenue, Length: 71, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({136384442: 1,
         367000000: 1,
         217000000: 1,
         19000000: 2,
         36964325: 1,
         2797800564: 1,
         76981630: 1,
         17635215: 1,
         430238384: 1,
         331096766: 1,
         880166924: 1,
         734000000: 1,
         789804554: 1,
         103966489: 1,
         297372261: 1,
         132000000: 1,
         3310000: 1,
         31478826: 1,
         155446362: 1,
         708962323: 1,
         976475550: 1,
         721077945: 1,
         629443428: 1,
         127000000: 1,
         199166992: 1,
         1341511219: 1,
         800526015: 1,
         25814306: 1,
         503063688: 1,
         30763855: 1,
         128000000: 1,
         2046239637: 1,
         895921036: 1,
         77389310: 1,
         355692760: 1,
         4700000: 1,
         1131927996: 1,
         83601013: 1,
         2847246203: 1,
         45000000: 1,
         16500000: 1,
         336000000: 1,
         175302354: 1,
         49010641: 1,

# Balanced Random Forest Classifier

In [10]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [11]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.0

In [12]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0,

In [13]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     344931       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   22039969       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   61768190       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   90112510       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  148810604       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  153000000       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  165160005       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  167381210       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  168285000       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  233274812       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  375540831       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  3

In [14]:
# List the features sorted in descending order by feature importance
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)

[(0.060609804551206174, 'movie_id'),
 (0.05885864435046919, 'actor_popularity_mean'),
 (0.057635508152823096, 'movie_popularity'),
 (0.056678740828718786, 'vote_average'),
 (0.05565082415514239, 'vote_count'),
 (0.05524433474150893, 'budget'),
 (0.05436511800003308, 'director_popularity_mean'),
 (0.0468107457738519, 'release_month'),
 (0.028771625080955775, 'ratings_enc'),
 (0.013340736887027907, "genres_[''Action'', ''Adventure'', ''Fantasy'']"),
 (0.013138434919365883, "genres_[''Adventure'', ''Fantasy'']"),
 (0.01281829601057753, "studios_['Marvel Studios']"),
 (0.012780812572451632, "genres_[''Horror'', ''Mystery'', ''Thriller'']"),
 (0.01271656486052767,
  "studios_[''Other'', '' Other'', '' Other'', '' Other'']"),
 (0.012042648278939878, "genres_[''Action'', ''Crime'', ''Thriller'']"),
 (0.011208595262456801,
  "studios_[''Warner Bros. Pictures'', '' Heyday Films'']"),
 (0.010059975488326302,
  "genres_[''Action'', ''Adventure'', ''Science Fiction'']"),
 (0.01005049314902336,
  "

# Easy Ensemble AdaBoost Classifier

In [15]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [16]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.0

In [17]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0

In [18]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     344931       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   22039969       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   61768190       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   90112510       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  127000000       0.00      0.00      0.94      0.00      0.00      0.00       0.0
  148810604       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  153000000       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  165160005       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  167381210       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  168285000       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  233274812       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  3