In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
columns = [
    'movie_id', 'title', 'release_date', 'movie_popularity', 
    'vote_average', 'vote_count', 'budget', 'revenue', 'ratings', 'genres',
    'studios', 'actor_popularity_mean', 'director_popularity_mean'
]

target = ['revenue']

In [4]:
# Importing
# Load the data
file_path = Path('../Tables/ML_Test_pgs_checked.csv')
df = pd.read_csv(file_path)[:-2]
# , index_col=0
# df = df.loc[:, columns].copy()

# # Drop the null columns where all values are null
# df = df.dropna(axis='columns', how='all')

# # Drop the null rows
# df = df.dropna()

# Encoded Age Ratings
ratings_enc = {
    'G': 1,
    'PG': 2,
    'PG-13': 3,
    'R': 4,
    'NC-17': 5,
    'NR':6
}

df['ratings_enc'] = df['ratings'].apply(lambda x: ratings_enc[x])

df['release_month'] = pd.DatetimeIndex(df['release_date']).month
df.head()

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_popularity_mean,director_popularity_mean,studios,genres,ratings_enc,release_month
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,2.993965,2.566,"[""''HeydayFilms''"", ""''Other''"", ""''WarnerBros...","[""''Adventure''"", ""''Fantasy''""]",2,11
1,557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,2.387022,2.914,"[""''ColumbiaPictures''"", ""''MarvelEntertainmen...","[""''Action''"", ""''Fantasy''""]",3,5
2,672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,2.696712,2.566,"[""''HeydayFilms''"", ""''Other''"", ""''WarnerBros...","[""''Adventure''"", ""''Fantasy''""]",2,11
3,673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,3.523069,3.333,"[""''HeydayFilms''"", ""''Other''"", ""''WarnerBros...","[""''Adventure''"", ""''Fantasy''""]",2,5
4,674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,3.234944,2.695,"[""''HeydayFilms''"", ""''Other''"", ""''WarnerBros...","[""''Adventure''"", ""''Family''"", ""''Fantasy''""]",3,11


In [5]:
def profitability_df(df):
    if (df['revenue'] >= df['budget']*2):
        return True
    else:
        return False

df['profitability'] = df.apply(profitability_df, axis = 1)
df

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_popularity_mean,director_popularity_mean,studios,genres,ratings_enc,release_month,profitability
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,2.993965,2.566,"[""''HeydayFilms''"", ""''Other''"", ""''WarnerBros...","[""''Adventure''"", ""''Fantasy''""]",2,11,True
1,557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,2.387022,2.914,"[""''ColumbiaPictures''"", ""''MarvelEntertainmen...","[""''Action''"", ""''Fantasy''""]",3,5,True
2,672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,2.696712,2.566,"[""''HeydayFilms''"", ""''Other''"", ""''WarnerBros...","[""''Adventure''"", ""''Fantasy''""]",2,11,True
3,673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,3.523069,3.333,"[""''HeydayFilms''"", ""''Other''"", ""''WarnerBros...","[""''Adventure''"", ""''Fantasy''""]",2,5,True
4,674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,3.234944,2.695,"[""''HeydayFilms''"", ""''Other''"", ""''WarnerBros...","[""''Adventure''"", ""''Family''"", ""''Fantasy''""]",3,11,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,744275,After We Fell,2021-09-01,1710.038,7.2,956,14000000,19000000,R,4.204667,2.580,"[""''Other''""]","[""''Drama''"", ""''Romance''""]",4,9,False
66,438631,Dune,2021-09-15,1363.113,8.0,4210,165000000,367000000,PG-13,5.641282,7.870,"[""'Other'""]","[""''Adventure''"", ""''ScienceFiction''""]",3,9,True
67,370172,No Time to Die,2021-09-29,3366.389,7.6,2075,242000000,734000000,PG-13,4.164382,4.167,"[""''Metro-Goldwyn-Mayer''"", ""''Other''"", ""''Un...","[""''Action''"", ""''Adventure''"", ""''Thriller''""]",3,9,True
68,580489,Venom: Let There Be Carnage,2021-09-30,5797.863,7.0,2452,110000000,454000000,PG-13,4.363038,13.077,"[""''ColumbiaPictures''"", ""''MarvelEntertainmen...","[""''Action''"", ""''Adventure''"", ""''ScienceFict...",3,9,True


In [6]:
# creating instance of labelencoder
labelencoder = LabelEncoder()

df['studios_cat'] = labelencoder.fit_transform(df.studios.values)
df['genres_cat'] = labelencoder.fit_transform(df.genres.values)

In [7]:
df = df.drop(['release_date', 'title', 'ratings', 'revenue'], axis=1)
df = df.drop(['studios', 'genres'], axis=1)
df.head()

Unnamed: 0,movie_id,movie_popularity,vote_average,vote_count,budget,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month,profitability,studios_cat,genres_cat
0,671,268.472,7.9,21429,125000000,2.993965,2.566,2,11,True,6,25
1,557,480.954,7.2,14421,139000000,2.387022,2.914,3,5,True,1,15
2,672,246.027,7.7,17294,100000000,2.696712,2.566,2,11,True,6,25
3,673,225.882,8.0,17001,130000000,3.523069,3.333,2,5,True,6,25
4,674,244.428,7.8,16341,150000000,3.234944,2.695,3,11,True,6,23


In [8]:
df.dtypes

movie_id                      int64
movie_popularity            float64
vote_average                float64
vote_count                    int64
budget                        int64
actor_popularity_mean       float64
director_popularity_mean    float64
ratings_enc                   int64
release_month                 int64
profitability                  bool
studios_cat                   int32
genres_cat                    int32
dtype: object

In [9]:
# Create our features
X = pd.get_dummies(df.drop(columns='profitability'))


# Create our target
y = pd.get_dummies(df['profitability'])

In [10]:
X.describe()

Unnamed: 0,movie_id,movie_popularity,vote_average,vote_count,budget,actor_popularity_mean,director_popularity_mean,ratings_enc,release_month,studios_cat,genres_cat
count,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0
mean,367984.885714,624.850957,7.431429,7022.314286,114560100.0,3.756995,3.45565,3.014286,6.871429,12.371429,16.785714
std,233193.023886,1012.637394,0.637373,6838.780984,85005740.0,1.835921,2.536486,0.770711,2.953364,5.626468,10.507269
min,557.0,177.734,6.0,71.0,1000000.0,1.318528,0.6,2.0,1.0,0.0,0.0
25%,151670.0,232.83525,6.925,1745.75,39250000.0,2.661262,1.68575,3.0,4.25,7.25,7.25
50%,433293.0,290.6235,7.6,4076.0,105000000.0,3.208295,2.8185,3.0,7.0,14.0,14.5
75%,545541.75,607.275,7.9,10911.25,178750000.0,4.245688,4.22375,3.0,9.0,17.0,25.0
max,791373.0,5884.885,8.4,25738.0,356000000.0,11.495923,13.329,6.0,12.0,20.0,36.0


In [11]:
# Check the balance of our target values
y = df['profitability']
y.value_counts()

True     50
False    20
Name: profitability, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({False: 13, True: 39})

# Naive Random Oversampling

In [13]:
# Resample the training data with the RandomOversampler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({False: 39, True: 39})

In [14]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [15]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[7, 0],
       [7, 4]], dtype=int64)

In [16]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.6818181818181819

In [17]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      False       0.50      1.00      0.36      0.67      0.60      0.39         7
       True       1.00      0.36      1.00      0.53      0.60      0.34        11

avg / total       0.81      0.61      0.75      0.59      0.60      0.36        18



# Undersampling

In [18]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({False: 13, True: 13})

In [19]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [20]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6818181818181819

In [21]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4, 3],
       [8, 3]], dtype=int64)

In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      False       0.33      0.57      0.27      0.42      0.39      0.16         7
       True       0.50      0.27      0.57      0.35      0.39      0.15        11

avg / total       0.44      0.39      0.46      0.38      0.39      0.15        18

