In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
columns = [
    'movie_id', 'title', 'release_date', 'movie_popularity', 
    'vote_average', 'vote_count', 'budget', 'revenue', 'ratings', 'genres',
    'studios', 'actor_popularity_mean', 'director_popularity_mean'
]

target = ['revenue']

In [4]:
# Importing
# Load the data
file_path = Path('../Tables/ML_Test_pgs_checked.csv')
df = pd.read_csv(file_path)[:-2]
# , index_col=0
# df = df.loc[:, columns].copy()

# # Drop the null columns where all values are null
# df = df.dropna(axis='columns', how='all')

# # Drop the null rows
# df = df.dropna()

# Encoded Age Ratings
ratings_enc = {
    'G': 1,
    'PG': 2,
    'PG-13': 3,
    'R': 4,
    'NC-17': 5,
    'NR':6
}

df['ratings_enc'] = df['ratings'].apply(lambda x: ratings_enc[x])

df['release_month'] = pd.DatetimeIndex(df['release_date']).month
df.head()

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_popularity_mean,director_popularity_mean,studios,genres,ratings_enc,release_month
0,244,King Kong,1933-03-15,19.367,7.6,1017,672000,10000000,NR,1.102,1.3365,['RKORadioPictures'],"['Adventure', 'Horror', 'ScienceFiction']",6,3
1,408,Snow White and the Seven Dwarfs,1938-02-04,76.361,7.1,5935,1488423,184925486,G,1.185,1.38,['WaltDisneyProductions'],"['Animation', 'Family', 'Fantasy']",1,2
2,630,The Wizard of Oz,1939-08-15,40.364,7.6,4340,2777000,33754967,G,1.233611,1.204,['Metro-Goldwyn-Mayer'],"['Adventure', 'Family', 'Fantasy']",1,8
3,770,Gone with the Wind,1940-02-16,23.816,8.0,3115,4000000,402352579,G,1.045947,1.204,"['Metro-Goldwyn-Mayer', 'Other']","['Drama', 'Romance', 'War']",1,2
4,10895,Pinocchio,1940-02-23,82.431,7.1,4632,2600000,84300000,G,1.116071,1.547,"['RKORadioPictures', 'WaltDisneyProductions']","['Animation', 'Family']",1,2


In [5]:
def profitability_df(df):
    if (df['revenue'] >= df['budget']*2):
        return True
    else:
        return False

df['profitability'] = df.apply(profitability_df, axis = 1)
df

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_popularity_mean,director_popularity_mean,studios,genres,ratings_enc,release_month,profitability
0,244,King Kong,1933-03-15,19.367,7.6,1017,672000,10000000,NR,1.102000,1.3365,['RKORadioPictures'],"['Adventure', 'Horror', 'ScienceFiction']",6,3,True
1,408,Snow White and the Seven Dwarfs,1938-02-04,76.361,7.1,5935,1488423,184925486,G,1.185000,1.3800,['WaltDisneyProductions'],"['Animation', 'Family', 'Fantasy']",1,2,True
2,630,The Wizard of Oz,1939-08-15,40.364,7.6,4340,2777000,33754967,G,1.233611,1.2040,['Metro-Goldwyn-Mayer'],"['Adventure', 'Family', 'Fantasy']",1,8,True
3,770,Gone with the Wind,1940-02-16,23.816,8.0,3115,4000000,402352579,G,1.045947,1.2040,"['Metro-Goldwyn-Mayer', 'Other']","['Drama', 'Romance', 'War']",1,2,True
4,10895,Pinocchio,1940-02-23,82.431,7.1,4632,2600000,84300000,G,1.116071,1.5470,"['RKORadioPictures', 'WaltDisneyProductions']","['Animation', 'Family']",1,2,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1898,610253,Halloween Kills,2021-10-14,434.457,6.9,1290,20000000,127000000,R,3.805955,2.7780,"['BlumhouseProductions', 'Miramax', 'Other', '...","['Horror', 'Thriller']",4,10,True
1899,542178,The French Dispatch,2021-10-21,45.780,7.4,629,25000000,33000000,R,5.813298,6.1810,"['IndianPaintbrush', 'Other', 'StudioBabelsberg']","['Comedy', 'Drama', 'Romance']",4,10,False
1900,576845,Last Night in Soho,2021-10-21,772.004,7.5,615,43000000,19000000,R,2.733038,6.8020,"['BigTalkProductions', 'Film4Productions', 'Fo...","['Horror', 'Mystery', 'Thriller']",4,10,False
1901,524434,Eternals,2021-11-03,1339.598,7.1,1293,200000000,368000000,PG-13,4.528703,2.5580,['MarvelStudios'],"['Action', 'Adventure', 'Drama', 'ScienceFicti...",3,11,False


In [6]:
# creating instance of labelencoder
labelencoder = LabelEncoder()

df['studios_cat'] = labelencoder.fit_transform(df.studios.values)
df['genres_cat'] = labelencoder.fit_transform(df.genres.values)

In [7]:
df = df.drop(['release_date', 'title', 'ratings', 'revenue'], axis=1)
df = df.drop(['studios', 'genres'], axis=1)
df.head()

Unnamed: 0,movie_id,movie_popularity,vote_average,vote_count,budget,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month,profitability,studios_cat,genres_cat
0,244,19.367,7.6,1017,672000,1.102,1.3365,6,3,True,1183,220
1,408,76.361,7.1,5935,1488423,1.185,1.38,1,2,True,1225,241
2,630,40.364,7.6,4340,2777000,1.233611,1.204,1,8,True,985,211
3,770,23.816,8.0,3115,4000000,1.045947,1.204,1,2,True,983,346
4,10895,82.431,7.1,4632,2600000,1.116071,1.547,1,2,True,1182,244


In [8]:
df.dtypes

movie_id                      int64
movie_popularity            float64
vote_average                float64
vote_count                    int64
budget                        int64
actor_popularity_mean       float64
director_popularity_mean    float64
ratings_enc                   int64
release_month                 int64
profitability                  bool
studios_cat                   int32
genres_cat                    int32
dtype: object

In [9]:
# Create our features
X = pd.get_dummies(df.drop(columns='profitability'))


# Create our target
y = pd.get_dummies(df['profitability'])

In [10]:
X.describe()

Unnamed: 0,movie_id,movie_popularity,vote_average,vote_count,budget,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month,studios_cat,genres_cat
count,1903.0,1903.0,1903.0,1903.0,1903.0,1903.0,1903.0,1903.0,1903.0,1903.0,1903.0
mean,139010.456647,65.741284,6.669679,4269.530741,57899160.0,2.670408,2.865496,3.162375,6.89753,712.017867,211.034682
std,175038.196108,324.635011,0.813943,4221.778637,57950850.0,1.135447,4.849721,0.933326,3.285809,378.165082,114.110577
min,11.0,18.4,2.9,48.0,6.0,0.615463,0.6,1.0,1.0,0.0,0.0
25%,9314.5,23.369,6.1,1337.0,15787470.0,1.851008,1.111,3.0,4.0,394.5,107.0
50%,41233.0,32.354,6.6,2906.0,36000000.0,2.456742,1.8,3.0,7.0,752.0,218.0
75%,271843.5,56.028,7.3,5572.0,80000000.0,3.258971,3.202,4.0,10.0,1061.0,306.5
max,800497.0,13071.277,8.7,30334.0,380000000.0,10.473125,122.79,6.0,12.0,1231.0,381.0


In [11]:
# Check the balance of our target values
y = df['profitability']
y.value_counts()

True     1300
False     603
Name: profitability, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({False: 464, True: 963})

# Naive Random Oversampling

In [13]:
# Resample the training data with the RandomOversampler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({False: 963, True: 963})

In [14]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [15]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 93,  46],
       [ 93, 244]], dtype=int64)

In [16]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.6965501782550221

In [17]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      False       0.50      0.67      0.72      0.57      0.70      0.48       139
       True       0.84      0.72      0.67      0.78      0.70      0.49       337

avg / total       0.74      0.71      0.69      0.72      0.70      0.49       476



# Undersampling

In [18]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({False: 464, True: 464})

In [19]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [20]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6965501782550221

In [21]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 89,  50],
       [ 93, 244]], dtype=int64)

In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      False       0.49      0.64      0.72      0.55      0.68      0.46       139
       True       0.83      0.72      0.64      0.77      0.68      0.47       337

avg / total       0.73      0.70      0.66      0.71      0.68      0.47       476

