In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
columns = [
    'movie_id', 'title', 'release_date', 'movie_popularity', 
    'vote_average', 'vote_count', 'budget', 'revenue', 'ratings', 
    'actor_id', 'actor_gender', 'character', 'actor_popularity', 'name', 'director_id', 
    'director_gender', 'director_popularity', 'studio_id', 'studio_name', 'origin_country'
]

target = ['revenue']

In [5]:
# Load the data
file_path = Path('meta.csv')
df = pd.read_csv(file_path)[:-2]
# df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_id,actor_gender,character,actor_popularity,name,director_id,director_gender,director_popularity,studio_id,studio_name,origin_country
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,10990.0,1.0,Hermione Granger,16.930,Chris Columbus,10965,2,2.589,436,1492 Pictures,US
1,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,10990.0,1.0,Hermione Granger,16.930,Chris Columbus,10965,2,2.589,174,Warner Bros. Pictures,US
2,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,10993.0,2.0,Draco Malfoy,10.918,Chris Columbus,10965,2,2.589,436,1492 Pictures,US
3,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,10993.0,2.0,Draco Malfoy,10.918,Chris Columbus,10965,2,2.589,437,Heyday Films,GB
4,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,10993.0,2.0,Draco Malfoy,10.918,Chris Columbus,10965,2,2.589,174,Warner Bros. Pictures,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,1532.0,2.0,Dr. Peter Venkman,14.048,Jason Reitman,52443,2,4.105,5,Columbia Pictures,US
1263,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,1532.0,2.0,Dr. Peter Venkman,14.048,Jason Reitman,52443,2,4.105,2364,The Montecito Picture Company,US
1264,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,1532.0,2.0,Dr. Peter Venkman,14.048,Jason Reitman,52443,2,4.105,13240,Bron Studios,CA
1265,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,1308445.0,1.0,Callie Spengler,11.375,Jason Reitman,52443,2,4.105,34,Sony Pictures,US


In [6]:
# Create our features
X = pd.get_dummies(df.drop(columns='revenue'))

# Create our target
y = pd.get_dummies(df['revenue'])

In [7]:
X.describe()

Unnamed: 0,movie_id,movie_popularity,vote_average,vote_count,budget,actor_id,actor_gender,actor_popularity,director_id,director_gender,...,studio_name_Zero Gravity Management,studio_name_ufotable,origin_country_AU,origin_country_CA,origin_country_CN,origin_country_DE,origin_country_FR,origin_country_GB,origin_country_JP,origin_country_US
count,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,...,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0
mean,366383.858369,779.779928,7.608927,9050.109013,137777800.0,176638.0,1.55794,18.331684,228153.9,1.945064,...,0.001717,0.001717,0.00515,0.008584,0.0103,0.00515,0.016309,0.060086,0.008584,0.885837
std,241007.374882,1209.165937,0.643609,7240.485878,90166380.0,412865.7,0.496845,9.934214,445189.1,0.256336,...,0.041416,0.041416,0.071611,0.092289,0.10101,0.071611,0.126715,0.237748,0.092289,0.318146
min,557.0,173.026,6.0,40.0,10000000.0,63.0,1.0,10.005,956.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,271110.0,256.019,7.2,2347.0,70000000.0,5469.0,1.0,11.672,12891.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,370172.0,391.146,7.7,6850.0,116000000.0,17605.0,2.0,14.904,19272.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,550988.0,672.641,8.3,15236.0,200000000.0,73968.0,2.0,21.049,87257.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,791373.0,8304.971,8.4,25788.0,356000000.0,2979464.0,2.0,60.132,1932178.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Check the balance of our target values
y = df['revenue']
y.value_counts()

657000000     104
375540831      84
2797800564     56
2046239637     42
708000000      40
             ... 
61779301        4
148810604       3
112000000       2
467863133       2
136384442       2
Name: revenue, Length: 67, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({378328978: 3,
         167381210: 31,
         375540831: 59,
         168285000: 6,
         130000000: 11,
         895921036: 9,
         880166924: 27,
         657000000: 83,
         354264482: 8,
         976475550: 7,
         2046239637: 34,
         2797800564: 40,
         1341511219: 14,
         783766341: 18,
         4700000: 18,
         708000000: 33,
         757930663: 6,
         442000000: 24,
         25814306: 6,
         175302354: 10,
         938212738: 11,
         153000000: 12,
         331096766: 19,
         430238384: 8,
         821708551: 15,
         89500000: 5,
         348319861: 15,
         789804554: 15,
         16000000: 23,
         127000000: 9,
         629443428: 18,
         503063688: 7,
         155446362: 6,
         103966489: 6,
         34560577: 15,
         297372261: 4,
         721077945: 16,
         933959197: 7,
         1153296293: 12,
         708962323: 11,
         1518815515: 6,
         855013954: 12,
         

# Balanced Random Forest Classifier

In [10]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [11]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.046875

In [12]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     344931       0.00      0.00      1.00      0.00      0.00      0.00         6
    4700000       0.00      0.00      1.00      0.00      0.00      0.00         6
   16000000       0.00      0.00      1.00      0.00      0.00      0.00        10
   17635215       0.00      0.00      1.00      0.00      0.00      0.00         2
   19000000       0.00      0.00      1.00      0.00      0.00      0.00         3
   22039969       0.00      0.00      1.00      0.00      0.00      0.00         7
   24640084       0.00      0.00      1.00      0.00      0.00      0.00         1
   25814306       0.00      0.00      1.00      0.00      0.00      0.00         1
   34560577       0.00      0.00      1.00      0.00      0.00      0.00         3
   36964325       0.00      0.00      1.00      0.00      0.00      0.00         4
   45000000       0.00      0.00      1.00      0.00      0.00      0.00         1
   

In [14]:
# List the features sorted in descending order by feature importance
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)

[(0.029703102934743465, 'vote_count'),
 (0.027870936126212442, 'movie_id'),
 (0.02521749257730073, 'budget'),
 (0.024518324596515508, 'actor_id'),
 (0.02324835838298816, 'vote_average'),
 (0.023012895805820326, 'director_popularity'),
 (0.022662430982058075, 'movie_popularity'),
 (0.021667203577739713, 'director_id'),
 (0.020939458868685108, 'studio_id'),
 (0.017892540050142614, 'actor_popularity'),
 (0.012505243243600344, 'actor_gender'),
 (0.009198410617247976, 'ratings_PG-13'),
 (0.009143123093834176, 'ratings_PG'),
 (0.008994538897057048, 'director_gender'),
 (0.007946033054820471, 'origin_country_US'),
 (0.007542294315502437, 'studio_name_Marvel Studios'),
 (0.007487386231998015, 'studio_name_Warner Bros. Pictures'),
 (0.007243065999759247, 'ratings_R'),
 (0.006530438597806123, 'studio_name_Columbia Pictures'),
 (0.005837007092035716, 'release_date_2021-09-01'),
 (0.005759453607493995, 'name_Sam Raimi'),
 (0.00528164209102674, 'release_date_2021-05-25'),
 (0.005175596668698626, 's

# Easy Ensemble AdaBoost Classifier

In [15]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [16]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7612276785714286

In [17]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  3,  0, ...,  0,  0,  0],
       [ 0,  0, 10, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  8,  0,  0],
       [ 0,  0,  0, ...,  0, 16,  0],
       [ 0,  0,  0, ...,  0,  0,  6]], dtype=int64)

In [18]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     344931       0.00      0.00      1.00      0.00      0.00      0.00         6
    4700000       0.75      0.50      1.00      0.60      0.71      0.47         6
   16000000       1.00      1.00      1.00      1.00      1.00      1.00        10
   17635215       1.00      1.00      1.00      1.00      1.00      1.00         2
   19000000       1.00      1.00      1.00      1.00      1.00      1.00         3
   22039969       1.00      0.43      1.00      0.60      0.65      0.40         7
   24640084       0.50      1.00      1.00      0.67      1.00      1.00         1
   25814306       1.00      1.00      1.00      1.00      1.00      1.00         1
   34560577       1.00      1.00      1.00      1.00      1.00      1.00         3
   36964325       1.00      1.00      1.00      1.00      1.00      1.00         4
   45000000       1.00      1.00      1.00      1.00      1.00      1.00         1
   