In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier

In [3]:
# Column names of interest in the movie table, one per line.
columns = [
    'movie_id',
    'title',
    'release_date',
    'movie_popularity',
    'vote_average',
    'vote_count',
    'budget',
    'revenue',
    'ratings',
    'genres',
    'studios',
    'actor_popularity_mean',
    'director_popularity_mean',
]

# Name of the column to be predicted, kept as a one-element list.
target = ['revenue']

In [4]:
# Read the movie table from disk and discard its last two rows.
# NOTE(review): the reason for trimming two rows is not recorded here — confirm.
file_path = Path('../Tables/meta_ml.csv')
df = pd.read_csv(file_path).iloc[:-2]

# Integer codes for the age ratings (G=1 ... NR=6).
ratings_enc = dict(zip(['G', 'PG', 'PG-13', 'R', 'NC-17', 'NR'], range(1, 7)))

# A rating missing from the mapping raises KeyError rather than passing through.
df['ratings_enc'] = df['ratings'].apply(ratings_enc.__getitem__)

# Keep only the calendar month (1-12) of the release date as a feature.
df['release_month'] = pd.DatetimeIndex(df['release_date']).month
df.head()

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,studios,genres,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.993965,2.566,2,11
1,557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,"[''Other'', '' Columbia Pictures'', '' Sony Pi...","[''Fantasy'', ''Action'']",2.387022,2.914,3,5
2,672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.696712,2.566,2,11
3,673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,"[''Warner Bros. Pictures'', '' Other'', '' Hey...","[''Adventure'', ''Fantasy'']",3.523069,3.333,2,5
4,674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'', ''Family'']",3.234944,2.695,3,11


In [5]:
# Remove the raw text columns: release_date and ratings have already been
# turned into release_month / ratings_enc, and title is not used as a feature.
# errors='ignore' keeps this cell re-runnable: df is rebound to the result, so
# a second execution would otherwise raise KeyError for the missing columns.
df = df.drop(columns=['release_date', 'title', 'ratings'], errors='ignore')
df.head()

Unnamed: 0,movie_id,movie_popularity,vote_average,vote_count,budget,revenue,studios,genres,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month
0,671,268.472,7.9,21429,125000000,976475550,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.993965,2.566,2,11
1,557,480.954,7.2,14421,139000000,821708551,"[''Other'', '' Columbia Pictures'', '' Sony Pi...","[''Fantasy'', ''Action'']",2.387022,2.914,3,5
2,672,246.027,7.7,17294,100000000,876688482,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.696712,2.566,2,11
3,673,225.882,8.0,17001,130000000,789804554,"[''Warner Bros. Pictures'', '' Other'', '' Hey...","[''Adventure'', ''Fantasy'']",3.523069,3.333,2,5
4,674,244.428,7.8,16341,150000000,895921036,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'', ''Family'']",3.234944,2.695,3,11


In [6]:
# Show the dtype of every remaining column; in the recorded output only
# studios and genres are still object (string) columns.
df.dtypes

movie_id                      int64
movie_popularity            float64
vote_average                float64
vote_count                    int64
budget                        int64
revenue                       int64
studios                      object
genres                       object
actor_popularity_mean       float64
director_popularity_mean    float64
ratings_enc                   int64
release_month                 int64
dtype: object

In [7]:
# Create our features: every column except the target. get_dummies expands the
# object columns (studios, genres) into indicator columns and leaves the numeric
# columns unchanged.
# NOTE(review): each distinct list-string becomes its own indicator column, and
# movie_id is still included as a numeric feature — confirm both are intended.
X = pd.get_dummies(df.drop(columns='revenue'))

# Create our target: the numeric revenue column as a Series. One-hot encoding it
# would only produce one indicator column per distinct revenue value.
y = df['revenue']

In [8]:
# Summary statistics for every feature column, including the indicator columns
# created from studios and genres.
X.describe()

Unnamed: 0,movie_id,movie_popularity,vote_average,vote_count,budget,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month,"studios_[''Columbia Pictures'', '' Other'', '' Marvel Studios'']",...,"genres_[''Horror'', ''Action'', ''Thriller'']","genres_[''Horror'', ''Mystery'', ''Thriller'']","genres_[''Horror'', ''Thriller'']","genres_[''Romance'', ''Drama'']","genres_[''Science Fiction'', ''Action'', ''Adventure'']","genres_[''Science Fiction'', ''Action'']","genres_[''Science Fiction'', ''Adventure'']","genres_[''Science Fiction'', ''Thriller'', ''Horror'']","genres_[''Thriller'', ''Horror'']",genres_['Horror']
count,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,...,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0
mean,373058.625,641.271958,7.427778,6849.861111,114752800.0,3.763272,3.496674,3.027778,6.972222,0.013889,...,0.013889,0.041667,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.027778,0.013889
std,231905.622188,1006.99416,0.629604,6819.869858,84837080.0,1.819889,2.548206,0.768614,2.973935,0.117851,...,0.117851,0.201229,0.165489,0.117851,0.165489,0.117851,0.117851,0.117851,0.165489,0.117851
min,557.0,177.734,6.0,71.0,1000000.0,1.318528,0.6,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,250246.0,233.73375,6.975,1565.5,39750000.0,2.680786,1.76525,3.0,4.75,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,437800.0,305.3115,7.6,3982.5,105000000.0,3.208295,2.8185,3.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,554498.0,624.7645,7.9,10683.75,185000000.0,4.285281,4.24825,3.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,791373.0,5884.885,8.4,25738.0,356000000.0,11.495923,13.329,6.0,12.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Take revenue as the target and count how often each distinct value occurs.
# NOTE(review): in the recorded output nearly every value occurs exactly once,
# so revenue behaves as a continuous quantity rather than a set of classes.
y = df.loc[:, 'revenue']
y.value_counts()

19000000     2
132000000    1
217000000    1
155446362    1
355692760    1
            ..
757930663    1
167381210    1
25814306     1
22039969     1
45000000     1
Name: revenue, Length: 71, dtype: int64

In [10]:
# Split features and target into training and test sets with a fixed seed
# (train_test_split is already imported in the imports cell at the top).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

# Tally how many training rows carry each target value.
Counter(y_train)

Counter({136384442: 1,
         367000000: 1,
         217000000: 1,
         19000000: 2,
         36964325: 1,
         2797800564: 1,
         76981630: 1,
         17635215: 1,
         430238384: 1,
         331096766: 1,
         880166924: 1,
         734000000: 1,
         789804554: 1,
         103966489: 1,
         297372261: 1,
         132000000: 1,
         3310000: 1,
         31478826: 1,
         155446362: 1,
         708962323: 1,
         976475550: 1,
         721077945: 1,
         629443428: 1,
         127000000: 1,
         199166992: 1,
         1341511219: 1,
         800526015: 1,
         25814306: 1,
         503063688: 1,
         30763855: 1,
         128000000: 1,
         2046239637: 1,
         895921036: 1,
         77389310: 1,
         355692760: 1,
         4700000: 1,
         1131927996: 1,
         83601013: 1,
         2847246203: 1,
         45000000: 1,
         16500000: 1,
         336000000: 1,
         175302354: 1,
         49010641: 1,

# Naive Random Oversampling

In [11]:
# Oversample the training data with RandomOverSampler (imported in the imports
# cell at the top) using a fixed seed.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(
    X_train,
    y_train,
)

# Tally the target values after resampling.
Counter(y_resampled)

Counter({136384442: 2,
         367000000: 2,
         217000000: 2,
         19000000: 2,
         36964325: 2,
         2797800564: 2,
         76981630: 2,
         17635215: 2,
         430238384: 2,
         331096766: 2,
         880166924: 2,
         734000000: 2,
         789804554: 2,
         103966489: 2,
         297372261: 2,
         132000000: 2,
         3310000: 2,
         31478826: 2,
         155446362: 2,
         708962323: 2,
         976475550: 2,
         721077945: 2,
         629443428: 2,
         127000000: 2,
         199166992: 2,
         1341511219: 2,
         800526015: 2,
         25814306: 2,
         503063688: 2,
         30763855: 2,
         128000000: 2,
         2046239637: 2,
         895921036: 2,
         77389310: 2,
         355692760: 2,
         4700000: 2,
         1131927996: 2,
         83601013: 2,
         2847246203: 2,
         45000000: 2,
         16500000: 2,
         336000000: 2,
         175302354: 2,
         49010641: 2,

In [12]:
# Fit a logistic regression classifier on the oversampled training data.
# NOTE(review): this treats each distinct revenue value as its own class label.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construct-and-fit can be chained.
model = LogisticRegression(solver='lbfgs', random_state=1).fit(X_resampled, y_resampled)
model

LogisticRegression(random_state=1)

In [13]:
# Predict on the held-out set and display the confusion matrix
# (confusion_matrix is already imported in the imports cell at the top).
y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [14]:
# Calculate the balanced accuracy score for the current predictions
# (balanced_accuracy_score is already imported in the imports cell at the top).
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.0

In [15]:
# Print the per-class imbalanced classification report for the current
# predictions (the report function is imported in the imports cell at the top).
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     344931       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   19000000       0.00      0.00      0.83      0.00      0.00      0.00       0.0
   22039969       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   61768190       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   90112510       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  103966489       0.00      0.00      0.94      0.00      0.00      0.00       0.0
  148810604       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  153000000       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  165160005       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  167381210       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  168285000       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  2

# Undersampling

In [16]:
# Undersample the training data with the ClusterCentroids resampler.
# The recorded describe() output shows 72 rows in total, so the table is small.
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(
    X_train,
    y_train,
)

# Tally the target values after resampling.
Counter(y_resampled)

Counter({3310000: 1,
         4700000: 1,
         16500000: 1,
         17635215: 1,
         19000000: 1,
         24640084: 1,
         25814306: 1,
         30763855: 1,
         31478826: 1,
         34560577: 1,
         36964325: 1,
         45000000: 1,
         49010641: 1,
         76981630: 1,
         77389310: 1,
         83601013: 1,
         103966489: 1,
         127000000: 1,
         128000000: 1,
         130000000: 1,
         132000000: 1,
         136384442: 1,
         155446362: 1,
         175302354: 1,
         199166992: 1,
         217000000: 1,
         297372261: 1,
         331096766: 1,
         336000000: 1,
         355692760: 1,
         367000000: 1,
         430238384: 1,
         503063688: 1,
         629443428: 1,
         708962323: 1,
         721077945: 1,
         734000000: 1,
         757930663: 1,
         789804554: 1,
         800526015: 1,
         821708551: 1,
         855013954: 1,
         880166924: 1,
         890871626: 1,
      

In [17]:
# Fit a fresh logistic regression classifier on the undersampled training data.
# fit() returns the estimator itself, so construct-and-fit can be chained.
model = LogisticRegression(solver='lbfgs', random_state=1).fit(X_resampled, y_resampled)
model

LogisticRegression(random_state=1)

In [18]:
# Calculate the balanced accuracy score for the model trained on the
# undersampled data. Predictions are recomputed here first: without this line
# the score would be taken from a y_pred produced by the earlier oversampling
# model, because this cell runs before the next prediction step.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.0

In [19]:
# Predict on the held-out set with the current model and display the
# confusion matrix.
y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [20]:
# Print the per-class imbalanced classification report for the current
# predictions.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     344931       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   19000000       0.00      0.00      0.83      0.00      0.00      0.00       0.0
   22039969       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   61768190       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   90112510       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  103966489       0.00      0.00      0.94      0.00      0.00      0.00       0.0
  148810604       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  153000000       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  165160005       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  167381210       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  168285000       0.00      0.00      1.00      0.00      0.00      0.00       1.0
  2