In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
columns = [
    'movie_id', 'title', 'release_date', 'movie_popularity', 
    'vote_average', 'vote_count', 'budget', 'revenue', 'ratings', 'actor_name',
    'actor_id', 'character', 'actor_gender', 'actor_popularity', 'director_name', 'director_id', 
    'director_gender', 'director_popularity', 'studio_id', 'studio_name', 'origin_country'
]

target = ['revenue']

In [5]:
# Load the data
file_path = Path('meta.csv')
df = pd.read_csv(file_path)[:-2]
# , index_col=0
# df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df.actor_gender[df.actor_gender == 2] = 'male'
df.actor_gender[df.actor_gender == 1] = 'female'

df.director_gender[df.director_gender == 2] = 'male'
df.director_gender[df.director_gender == 1] = 'female'

df

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_name,...,actor_gender,character,actor_popularity,director_name,director_id,director_gender,director_popularity,studio_id,studio_name,origin_country
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,...,female,Hermione Granger,16.930,Chris Columbus,10965,male,2.589,436,1492 Pictures,US
1,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,...,female,Hermione Granger,16.930,Chris Columbus,10965,male,2.589,174,Warner Bros. Pictures,US
2,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,...,male,Draco Malfoy,10.918,Chris Columbus,10965,male,2.589,436,1492 Pictures,US
3,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,...,male,Draco Malfoy,10.918,Chris Columbus,10965,male,2.589,437,Heyday Films,GB
4,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,...,male,Draco Malfoy,10.918,Chris Columbus,10965,male,2.589,174,Warner Bros. Pictures,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,...,male,Dr. Peter Venkman,14.048,Jason Reitman,52443,male,4.105,5,Columbia Pictures,US
1263,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,...,male,Dr. Peter Venkman,14.048,Jason Reitman,52443,male,4.105,2364,The Montecito Picture Company,US
1264,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,...,male,Dr. Peter Venkman,14.048,Jason Reitman,52443,male,4.105,13240,Bron Studios,CA
1265,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Carrie Coon,...,female,Callie Spengler,11.375,Jason Reitman,52443,male,4.105,34,Sony Pictures,US


In [6]:
# More data cleaning
noID_df = df.drop(columns=['movie_id', 'actor_id', 'director_id', 'studio_id'])

noID_df

Unnamed: 0,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_name,actor_gender,character,actor_popularity,director_name,director_gender,director_popularity,studio_name,origin_country
0,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,female,Hermione Granger,16.930,Chris Columbus,male,2.589,1492 Pictures,US
1,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,female,Hermione Granger,16.930,Chris Columbus,male,2.589,Warner Bros. Pictures,US
2,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,1492 Pictures,US
3,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,Heyday Films,GB
4,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,Warner Bros. Pictures,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,Columbia Pictures,US
1263,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,The Montecito Picture Company,US
1264,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,Bron Studios,CA
1265,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Carrie Coon,female,Callie Spengler,11.375,Jason Reitman,male,4.105,Sony Pictures,US


In [7]:
# More data cleaning
noID_df = df.drop(columns=['movie_id', 'actor_id', 'director_id', 'studio_id'])

noID_df

Unnamed: 0,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_name,actor_gender,character,actor_popularity,director_name,director_gender,director_popularity,studio_name,origin_country
0,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,female,Hermione Granger,16.930,Chris Columbus,male,2.589,1492 Pictures,US
1,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,female,Hermione Granger,16.930,Chris Columbus,male,2.589,Warner Bros. Pictures,US
2,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,1492 Pictures,US
3,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,Heyday Films,GB
4,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,Warner Bros. Pictures,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,Columbia Pictures,US
1263,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,The Montecito Picture Company,US
1264,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,Bron Studios,CA
1265,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Carrie Coon,female,Callie Spengler,11.375,Jason Reitman,male,4.105,Sony Pictures,US


In [8]:
# Changing to date and time
from datetime import datetime

noID_df['release_date'] = pd.to_datetime(noID_df['release_date'])

noID_df

Unnamed: 0,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_name,actor_gender,character,actor_popularity,director_name,director_gender,director_popularity,studio_name,origin_country
0,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,female,Hermione Granger,16.930,Chris Columbus,male,2.589,1492 Pictures,US
1,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,female,Hermione Granger,16.930,Chris Columbus,male,2.589,Warner Bros. Pictures,US
2,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,1492 Pictures,US
3,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,Heyday Films,GB
4,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,Warner Bros. Pictures,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,Columbia Pictures,US
1263,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,The Montecito Picture Company,US
1264,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,Bron Studios,CA
1265,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Carrie Coon,female,Callie Spengler,11.375,Jason Reitman,male,4.105,Sony Pictures,US


In [9]:
# trying out grouping
grouped_df = noID_df.groupby('title')

grouped_lists = grouped_df['studio_name'].agg(lambda column: ", ".join(column))

grouped_lists = grouped_lists.reset_index(name='studio_name')

grouped_lists

Unnamed: 0,title,studio_name
0,A Quiet Place Part II,"Paramount, Platinum Dunes, Platinum Dunes, Sun..."
1,After We Fell,"CalMaple Films, Vertical Entertainment, CalMap..."
2,Avatar,"20th Century Fox, 20th Century Fox, Ingenious ..."
3,Avengers: Endgame,"Marvel Studios, Marvel Studios, Marvel Studios..."
4,Avengers: Infinity War,"Marvel Studios, Marvel Studios, Marvel Studios..."
...,...,...
62,Venom,"Pascal Pictures, Matt Tolmach Productions, Avi..."
63,Venom: Let There Be Carnage,"Sony Pictures, Columbia Pictures, Pascal Pictu..."
64,Wonder Woman 1984,"DC Entertainment, Atlas Entertainment, DC Comi..."
65,Wrath of Man,"Metro-Goldwyn-Mayer, Lionsgate, CAA Media Fina..."


In [10]:
# Create our features
X = pd.get_dummies(noID_df.drop(columns='revenue'))


# Create our target
y = pd.get_dummies(noID_df['revenue'])

In [11]:
X.describe()

Unnamed: 0,movie_popularity,vote_average,vote_count,budget,actor_popularity,director_popularity,title_A Quiet Place Part II,title_After We Fell,title_Avatar,title_Avengers: Endgame,...,studio_name_Zero Gravity Management,studio_name_ufotable,origin_country_AU,origin_country_CA,origin_country_CN,origin_country_DE,origin_country_FR,origin_country_GB,origin_country_JP,origin_country_US
count,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,...,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0,1165.0
mean,779.779928,7.608927,9050.109013,137777800.0,18.331684,4.505484,0.00515,0.013734,0.013734,0.048069,...,0.001717,0.001717,0.00515,0.008584,0.0103,0.00515,0.016309,0.060086,0.008584,0.885837
std,1209.165937,0.643609,7240.485878,90166380.0,9.934214,3.665962,0.071611,0.116434,0.116434,0.214003,...,0.041416,0.041416,0.071611,0.092289,0.10101,0.071611,0.126715,0.237748,0.092289,0.318146
min,173.026,6.0,40.0,10000000.0,10.005,0.728,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,256.019,7.2,2347.0,70000000.0,11.672,1.836,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,391.146,7.7,6850.0,116000000.0,14.904,2.857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,672.641,8.3,15236.0,200000000.0,21.049,6.15,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,8304.971,8.4,25788.0,356000000.0,60.132,16.39,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Check the balance of our target values
# y['loan_status'].value_counts()
y = noID_df['revenue']
y.value_counts()

657000000     104
375540831      84
2797800564     56
2046239637     42
708000000      40
             ... 
61779301        4
148810604       3
112000000       2
467863133       2
136384442       2
Name: revenue, Length: 67, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
Counter(y_train)

Counter({821708551: 15,
         168285000: 6,
         657000000: 78,
         855013954: 11,
         2046239637: 31,
         789804554: 13,
         783766341: 21,
         954305868: 9,
         297372261: 5,
         375540831: 63,
         167381210: 30,
         629443428: 18,
         890871626: 8,
         2797800564: 42,
         1153296293: 15,
         16000000: 25,
         876688482: 9,
         708962323: 11,
         175302354: 12,
         721077945: 16,
         933959197: 8,
         2847246203: 12,
         155446362: 6,
         331096766: 21,
         378328978: 4,
         503063688: 6,
         757930663: 7,
         165160005: 9,
         880166924: 26,
         4700000: 18,
         348319861: 13,
         36964325: 8,
         708000000: 30,
         25814306: 5,
         344931: 22,
         1341511219: 13,
         938212738: 10,
         34560577: 13,
         61779301: 3,
         442000000: 22,
         22039969: 12,
         45000000: 5,
         11319

# Naive Random Oversampling

In [14]:
# Resample the training data with the RandomOversampler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

TypeError: invalid type promotion

In [None]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

# Undersampling

In [None]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))