In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter


In [6]:
# DB libraries
import psycopg2
import postgres_data
from postgres_data import host
from postgres_data import password
from postgres_data import user
from sqlalchemy import create_engine

ModuleNotFoundError: No module named 'postgres_data'

In [8]:
# ML libraries
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

ModuleNotFoundError: No module named 'imblearn'

In [45]:
columns = [
    'movie_id', 'title', 'release_date', 'movie_popularity', 
    'vote_average', 'vote_count', 'budget', 'revenue', 'ratings', 'actor_name',
    'actor_id', 'character', 'actor_gender', 'actor_popularity', 'director_name', 'director_id', 
    'director_gender', 'director_popularity', 'studio_id', 'studio_name', 'origin_country'
]

target = ['revenue']

In [46]:
# Create an engine instance
alchemyEngine = create_engine(f'postgresql://{user}:{password}@{host}:5432/Movie_Data')

In [47]:
# Connect to PostgreSQL server
dbConnection = alchemyEngine.connect()

# Read data from PostgreSQL database table and load into a DataFrame instance
df = pd.read_sql("select * from meta", dbConnection)
dbConnection.close()

In [48]:

df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df.actor_gender[df.actor_gender == 2] = 'male'
df.actor_gender[df.actor_gender == 1] = 'female'

df.director_gender[df.director_gender == 2] = 'male'
df.director_gender[df.director_gender == 1] = 'female'

df

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_name,...,actor_gender,character,actor_popularity,director_name,director_id,director_gender,director_popularity,studio_id,studio_name,origin_country
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,...,female,Hermione Granger,16.930,Chris Columbus,10965.0,male,2.589,436.0,1492 Pictures,US
1,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,...,female,Hermione Granger,16.930,Chris Columbus,10965.0,male,2.589,174.0,Warner Bros. Pictures,US
2,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,...,male,Draco Malfoy,10.918,Chris Columbus,10965.0,male,2.589,436.0,1492 Pictures,US
3,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,...,male,Draco Malfoy,10.918,Chris Columbus,10965.0,male,2.589,437.0,Heyday Films,GB
4,671,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,...,male,Draco Malfoy,10.918,Chris Columbus,10965.0,male,2.589,174.0,Warner Bros. Pictures,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1264,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,...,male,Dr. Peter Venkman,14.048,Jason Reitman,52443.0,male,4.105,13240.0,Bron Studios,CA
1265,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Carrie Coon,...,female,Callie Spengler,11.375,Jason Reitman,52443.0,male,4.105,34.0,Sony Pictures,US
1266,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Carrie Coon,...,female,Callie Spengler,11.375,Jason Reitman,52443.0,male,4.105,5.0,Columbia Pictures,US
1267,425909,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Finn Wolfhard,...,male,Trevor,17.350,Jason Reitman,52443.0,male,4.105,84042.0,Ghost Corps,US


In [49]:
# More data cleaning
noID_df = df.drop(columns=['movie_id', 'actor_id', 'director_id', 'studio_id'])

noID_df

Unnamed: 0,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_name,actor_gender,character,actor_popularity,director_name,director_gender,director_popularity,studio_name,origin_country
0,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,female,Hermione Granger,16.930,Chris Columbus,male,2.589,1492 Pictures,US
1,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Emma Watson,female,Hermione Granger,16.930,Chris Columbus,male,2.589,Warner Bros. Pictures,US
2,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,1492 Pictures,US
3,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,Heyday Films,GB
4,Harry Potter and the Philosopher's Stone,2001-11-16,305.856,7.9,21398,125000000,976475550,PG,Tom Felton,male,Draco Malfoy,10.918,Chris Columbus,male,2.589,Warner Bros. Pictures,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1264,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Bill Murray,male,Dr. Peter Venkman,14.048,Jason Reitman,male,4.105,Bron Studios,CA
1265,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Carrie Coon,female,Callie Spengler,11.375,Jason Reitman,male,4.105,Sony Pictures,US
1266,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Carrie Coon,female,Callie Spengler,11.375,Jason Reitman,male,4.105,Columbia Pictures,US
1267,Ghostbusters: Afterlife,2021-11-11,428.352,7.6,40,75000000,16000000,PG-13,Finn Wolfhard,male,Trevor,17.350,Jason Reitman,male,4.105,Ghost Corps,US


In [50]:
# # Changing to date and time
# from datetime import datetime

# noID_df['release_date'] = pd.to_datetime(noID_df['release_date'])

# noID_df

In [51]:
# trying out grouping
grouped_df = noID_df.groupby('title')

grouped_lists = grouped_df['studio_name'].agg(lambda column: ", ".join(column))

grouped_lists = grouped_lists.reset_index(name='studio_name')

grouped_lists

Unnamed: 0,title,studio_name
0,A Quiet Place Part II,"Paramount, Platinum Dunes, Platinum Dunes, Sun..."
1,After We Fell,"CalMaple Films, Vertical Entertainment, CalMap..."
2,Avatar,"20th Century Fox, 20th Century Fox, Ingenious ..."
3,Avengers: Endgame,"Marvel Studios, Marvel Studios, Marvel Studios..."
4,Avengers: Infinity War,"Marvel Studios, Marvel Studios, Marvel Studios..."
...,...,...
62,Venom,"Pascal Pictures, Matt Tolmach Productions, Avi..."
63,Venom: Let There Be Carnage,"Sony Pictures, Columbia Pictures, Pascal Pictu..."
64,Wonder Woman 1984,"DC Entertainment, Atlas Entertainment, DC Comi..."
65,Wrath of Man,"Metro-Goldwyn-Mayer, Lionsgate, CAA Media Fina..."


In [52]:
grouped_df

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D1C747FB08>

In [53]:
# Create our features
X = pd.get_dummies(noID_df.drop(columns='revenue'))

# Create our target
y = pd.get_dummies(noID_df['revenue'])

In [54]:
X.describe()

Unnamed: 0,movie_popularity,vote_average,vote_count,budget,actor_popularity,director_popularity,title_A Quiet Place Part II,title_After We Fell,title_Avatar,title_Avengers: Endgame,...,studio_name_Zero Gravity Management,studio_name_ufotable,origin_country_AU,origin_country_CA,origin_country_CN,origin_country_DE,origin_country_FR,origin_country_GB,origin_country_JP,origin_country_US
count,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0,...,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0,1167.0
mean,779.177652,7.608912,9034.667524,137670200.0,18.330002,4.504798,0.005141,0.01371,0.01371,0.047986,...,0.001714,0.001714,0.005141,0.008569,0.010283,0.005141,0.016281,0.059983,0.008569,0.886033
std,1208.21599,0.643057,7243.874912,90126460.0,9.925773,3.662854,0.07155,0.116336,0.116336,0.213829,...,0.04138,0.04138,0.07155,0.092211,0.100925,0.07155,0.126609,0.237557,0.092211,0.317908
min,173.026,6.0,40.0,10000000.0,10.005,0.728,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,256.019,7.2,2347.0,70000000.0,11.672,1.836,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,391.146,7.7,6850.0,116000000.0,14.904,2.915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,672.641,8.3,15236.0,200000000.0,21.049,6.15,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,8304.971,8.4,25788.0,356000000.0,60.132,16.39,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [55]:
X.dtypes

movie_popularity     float64
vote_average         float64
vote_count             int64
budget                 int64
actor_popularity     float64
                      ...   
origin_country_DE      uint8
origin_country_FR      uint8
origin_country_GB      uint8
origin_country_JP      uint8
origin_country_US      uint8
Length: 760, dtype: object

In [56]:
# Check the balance of our target values
y = df['revenue']
y.value_counts()

657000000     104
375540831      84
2797800564     56
2046239637     42
708000000      40
             ... 
61779301        4
148810604       3
112000000       2
467863133       2
136384442       2
Name: revenue, Length: 67, dtype: int64

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({16000000: 26,
         783766341: 19,
         821708551: 16,
         880166924: 28,
         175302354: 10,
         24640084: 7,
         657000000: 80,
         136384442: 2,
         895921036: 9,
         167381210: 31,
         375540831: 54,
         976475550: 7,
         2046239637: 36,
         442000000: 26,
         331096766: 14,
         1341511219: 14,
         2797800564: 40,
         708000000: 36,
         757930663: 6,
         4700000: 16,
         213644366: 15,
         36964325: 5,
         344931: 23,
         938212738: 11,
         354264482: 7,
         89500000: 4,
         348319861: 15,
         430238384: 9,
         789804554: 15,
         130000000: 9,
         127000000: 9,
         297372261: 3,
         855013954: 13,
         629443428: 18,
         22039969: 9,
         199166992: 4,
         155446362: 6,
         103966489: 8,
         34560577: 15,
         721077945: 19,
         933959197: 7,
         1153296293: 12,
         1530000

# Balanced Random Forest Classifier

In [58]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [59]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.01675485008818342

In [60]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [61]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     344931       0.00      0.00      1.00      0.00      0.00      0.00         7
    4700000       0.00      0.00      1.00      0.00      0.00      0.00         8
   16000000       0.00      0.00      1.00      0.00      0.00      0.00         9
   17635215       0.00      0.00      1.00      0.00      0.00      0.00         3
   19000000       0.00      0.00      1.00      0.00      0.00      0.00         1
   22039969       0.00      0.00      1.00      0.00      0.00      0.00         7
   24640084       0.00      0.00      1.00      0.00      0.00      0.00         1
   25814306       0.00      0.00      1.00      0.00      0.00      0.00         1
   34560577       0.00      0.00      1.00      0.00      0.00      0.00         3
   36964325       0.00      0.00      1.00      0.00      0.00      0.00         5
   61779301       0.00      0.00      1.00      0.00      0.00      0.00         2
   

In [62]:
# List the features sorted in descending order by feature importance
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)

[(0.028363300200955815, 'movie_popularity'),
 (0.027670737096059096, 'vote_count'),
 (0.02688692345529223, 'vote_average'),
 (0.024319930441392537, 'director_popularity'),
 (0.021163426291278067, 'actor_popularity'),
 (0.02005167846837679, 'budget'),
 (0.009210989563675337, 'ratings_PG-13'),
 (0.00871125021471441, 'studio_name_Marvel Studios'),
 (0.008700460106141557, 'origin_country_US'),
 (0.00849250090549582, 'actor_gender_male'),
 (0.008368254046251945, 'ratings_R'),
 (0.008256770435715632, 'actor_gender_female'),
 (0.008034421459532008, 'ratings_PG'),
 (0.005927617865884456, 'director_name_David Yates'),
 (0.005854857653303137, 'studio_name_Warner Bros. Pictures'),
 (0.005746228865077976, 'director_gender_male'),
 (0.0056860220939049955, 'release_date_2021-09-01'),
 (0.005661012859564464, 'studio_name_Columbia Pictures'),
 (0.005296509347260998, 'director_name_Jon Watts'),
 (0.0051868941497491385, 'director_name_Sam Raimi'),
 (0.0048338585842465824, 'director_gender_female'),
 (0.

# Easy Ensemble AdaBoost Classifier

In [63]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [64]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8253968253968254

In [65]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 7,  0,  0, ...,  0,  0,  0],
       [ 0,  8,  0, ...,  0,  0,  0],
       [ 0,  0,  9, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  6,  0,  0],
       [ 0,  0,  0, ...,  0, 16,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [66]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     344931       1.00      1.00      1.00      1.00      1.00      1.00         7
    4700000       1.00      1.00      1.00      1.00      1.00      1.00         8
   16000000       1.00      1.00      1.00      1.00      1.00      1.00         9
   17635215       1.00      1.00      1.00      1.00      1.00      1.00         3
   19000000       0.00      0.00      1.00      0.00      0.00      0.00         1
   22039969       1.00      1.00      1.00      1.00      1.00      1.00         7
   24640084       0.50      1.00      1.00      0.67      1.00      1.00         1
   25814306       1.00      1.00      1.00      1.00      1.00      1.00         1
   34560577       1.00      1.00      1.00      1.00      1.00      1.00         3
   36964325       1.00      1.00      1.00      1.00      1.00      1.00         5
   61779301       1.00      1.00      1.00      1.00      1.00      1.00         2
   