In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
columns = [
    'movie_id', 'title', 'release_date', 'movie_popularity', 
    'vote_average', 'vote_count', 'budget', 'revenue', 'ratings', 'genres',
    'studios', 'actor_popularity_mean', 'director_popularity_mean'
]

target = ['revenue']

In [4]:
# Load the data
file_path = Path('../Tables/ML_Test_pgs_checked.csv')
df = pd.read_csv(file_path, index_col=0)[:-2]
# , index_col=0
# df = df.loc[:, columns].copy()

# # Drop the null columns where all values are null
# df = df.dropna(axis='columns', how='all')

# # Drop the null rows
# df = df.dropna()

# Encoded Age Ratings
ratings_enc = {
    'G': 1,
    'PG': 2,
    'PG-13': 3,
    'R': 4,
    'NC-17': 5,
    'NR':6
}

df['ratings_enc'] = df['ratings'].apply(lambda x: ratings_enc[x])

df['release_month'] = pd.DatetimeIndex(df['release_date']).month
df.head()

Unnamed: 0_level_0,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_popularity_mean,director_popularity_mean,studios,genres,ratings_enc,release_month
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,2.993965,2.566,"[""'WarnerBros.Pictures'"", ""'HeydayFilms'"", ""'O...","['Adventure', ' Fantasy']",2,11
557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,2.387022,2.914,"[""'Other'"", ""'ColumbiaPictures'"", ""'SonyPictur...","['Fantasy', ' Action']",3,5
672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,2.696712,2.566,"[""'WarnerBros.Pictures'"", ""'HeydayFilms'"", ""'O...","['Adventure', ' Fantasy']",2,11
673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,3.523069,3.333,"[""'WarnerBros.Pictures'"", ""'Other'"", ""'HeydayF...","['Adventure', ' Fantasy']",2,5
674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,3.234944,2.695,"[""'WarnerBros.Pictures'"", ""'HeydayFilms'"", ""'O...","['Adventure', ' Fantasy', ' Family']",3,11


In [5]:
def profitability_df(df):
    if (df['revenue'] >= df['budget']*2):
        return True
    else:
        return False

df['profitability'] = df.apply(profitability_df, axis = 1)
df

Unnamed: 0_level_0,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_popularity_mean,director_popularity_mean,studios,genres,ratings_enc,release_month,profitability
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,2.993965,2.566,"[""'WarnerBros.Pictures'"", ""'HeydayFilms'"", ""'O...","['Adventure', ' Fantasy']",2,11,True
557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,2.387022,2.914,"[""'Other'"", ""'ColumbiaPictures'"", ""'SonyPictur...","['Fantasy', ' Action']",3,5,True
672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,2.696712,2.566,"[""'WarnerBros.Pictures'"", ""'HeydayFilms'"", ""'O...","['Adventure', ' Fantasy']",2,11,True
673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,3.523069,3.333,"[""'WarnerBros.Pictures'"", ""'Other'"", ""'HeydayF...","['Adventure', ' Fantasy']",2,5,True
674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,3.234944,2.695,"[""'WarnerBros.Pictures'"", ""'HeydayFilms'"", ""'O...","['Adventure', ' Fantasy', ' Family']",3,11,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370172,No Time to Die,2021-09-29,3366.389,7.6,2075,242000000,734000000,PG-13,4.164382,4.167,"[""'Other'"", ""'Metro-Goldwyn-Mayer'"", ""'Univers...","['Adventure', ' Action', ' Thriller']",3,9,True
580489,Venom: Let There Be Carnage,2021-09-30,5797.863,7.0,2452,110000000,454000000,PG-13,4.363038,13.077,"[""'MarvelEntertainment'"", ""'PascalPictures'"", ...","['Science Fiction', ' Action', ' Adventure']",3,9,True
610253,Halloween Kills,2021-10-14,616.978,6.9,1251,20000000,127000000,R,3.464000,2.385,"[""'UniversalPictures'"", ""'Other'""]","['Horror', ' Thriller']",4,10,True
576845,Last Night in Soho,2021-10-21,685.843,7.5,458,43000000,19000000,R,2.869283,7.468,"[""'Other'""]","['Horror', ' Mystery', ' Thriller']",4,10,False


In [6]:
for test in df['studios']:
    for studio in eval(test):
        (studio)
        
studio_revised = []
for test in df['studios']:
    holder = []
    for studio in eval(test):
        word = studio
        if word not in holder:
            holder.append(word)
    studio_revised.append(holder)
    
df.drop(['studios'],axis=1,inplace=True)
df['studios'] = studio_revised

df

Unnamed: 0_level_0,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_popularity_mean,director_popularity_mean,genres,ratings_enc,release_month,profitability,studios
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,2.993965,2.566,"['Adventure', ' Fantasy']",2,11,True,"['WarnerBros.Pictures', 'HeydayFilms', 'Other']"
557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,2.387022,2.914,"['Fantasy', ' Action']",3,5,True,"['Other', 'ColumbiaPictures', 'SonyPictures', ..."
672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,2.696712,2.566,"['Adventure', ' Fantasy']",2,11,True,"['WarnerBros.Pictures', 'HeydayFilms', 'Other']"
673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,3.523069,3.333,"['Adventure', ' Fantasy']",2,5,True,"['WarnerBros.Pictures', 'Other', 'HeydayFilms']"
674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,3.234944,2.695,"['Adventure', ' Fantasy', ' Family']",3,11,True,"['WarnerBros.Pictures', 'HeydayFilms', 'Other']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370172,No Time to Die,2021-09-29,3366.389,7.6,2075,242000000,734000000,PG-13,4.164382,4.167,"['Adventure', ' Action', ' Thriller']",3,9,True,"['Other', 'Metro-Goldwyn-Mayer', 'UniversalPic..."
580489,Venom: Let There Be Carnage,2021-09-30,5797.863,7.0,2452,110000000,454000000,PG-13,4.363038,13.077,"['Science Fiction', ' Action', ' Adventure']",3,9,True,"['MarvelEntertainment', 'PascalPictures', 'Col..."
610253,Halloween Kills,2021-10-14,616.978,6.9,1251,20000000,127000000,R,3.464000,2.385,"['Horror', ' Thriller']",4,10,True,"['UniversalPictures', 'Other']"
576845,Last Night in Soho,2021-10-21,685.843,7.5,458,43000000,19000000,R,2.869283,7.468,"['Horror', ' Mystery', ' Thriller']",4,10,False,['Other']


In [12]:
# creating instance of labelencoder  lambda x: (for i in x : print i)
labelencoder = LabelEncoder()

df['studios_cat'] = lambda studio: for studio in df.studios.values : labelencoder.fit_transform(studio)

SyntaxError: invalid syntax (<ipython-input-12-9eca2be50d5a>, line 4)

In [None]:
df = df.drop(['release_date', 'title', 'ratings', 'revenue'], axis=1)
# df = df.drop(['studios', 'genres'], axis=1)
df.head()

In [None]:
# Create our features
X = pd.get_dummies(df.drop(columns='profitability'))


# Create our target
y = pd.get_dummies(df['profitability'])

In [None]:
X.describe()

In [None]:
# Check the balance of our target values
y = df['profitability']
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

# Balanced Random Forest Classifier

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=50)
brfc.fit(X_train_scaled, y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)

# Easy Ensemble AdaBoost Classifier

In [None]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(n_estimators=300, random_state=1)
eec.fit(X_train, y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

# Extra ML

In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

predictions