In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier

In [2]:
columns = [
    'movie_id', 'title', 'release_date', 'movie_popularity', 
    'vote_average', 'vote_count', 'budget', 'revenue', 'ratings', 'genres',
    'studios', 'actor_popularity_mean', 'director_popularity_mean'
]

target = ['revenue']

In [3]:
# Load the data
file_path = Path('../Tables/meta_ml.csv')
df = pd.read_csv(file_path)[:-2]
# , index_col=0

movie_data = pd.read_csv('../Tables/Movie_Data.csv')

movie_data

Unnamed: 0,movie_id,title,genre_ids,release_date,movie_popularity,vote_average,vote_count,budget,revenue,rating
0,566525,Shang-Chi and the Legend of the Ten Rings,"[28, 12, 14]",2021-09-01,5884.885,7.9,3428,150000000,430238384,PG-13
1,580489,Venom: Let There Be Carnage,"[878, 28, 12]",2021-09-30,5797.863,7.0,2452,110000000,454000000,PG-13
2,370172,No Time to Die,"[12, 28, 53]",2021-09-29,3366.389,7.6,2075,242000000,734000000,PG-13
3,524434,Eternals,"[28, 12, 878]",2021-11-03,1746.171,7.1,1170,200000000,336000000,PG-13
4,744275,After We Fell,"[10749, 18]",2021-09-01,1710.038,7.2,956,14000000,19000000,R
...,...,...,...,...,...,...,...,...,...,...
69,303857,Dragon Ball Z: Resurrection 'F',"[28, 16, 878]",2015-04-18,192.039,6.8,1246,5000000,61768190,NR
70,565028,Candyman,"[27, 53]",2021-08-25,188.391,6.3,615,25000000,77389310,R
71,400160,The SpongeBob Movie: Sponge on the Run,"[10751, 16, 14, 12, 35]",2020-08-14,186.863,7.7,2351,60000000,4700000,PG
72,72545,Journey 2: The Mysterious Island,"[12, 28, 878]",2012-01-19,185.102,6.1,3312,79000000,355692760,PG


In [4]:
df

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,studios,genres,actor_popularity_mean,director_popularity_mean
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.993965,2.566
1,557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,"[''Other'', '' Columbia Pictures'', '' Sony Pi...","[''Fantasy'', ''Action'']",2.387022,2.914
2,672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'']",2.696712,2.566
3,673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,"[''Warner Bros. Pictures'', '' Other'', '' Hey...","[''Adventure'', ''Fantasy'']",3.523069,3.333
4,674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,"[''Warner Bros. Pictures'', '' Heyday Films'',...","[''Adventure'', ''Fantasy'', ''Family'']",3.234944,2.695
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,370172,No Time to Die,2021-09-29,3366.389,7.6,2075,242000000,734000000,PG-13,"[''Other'', '' Metro-Goldwyn-Mayer'', '' Other...","[''Adventure'', ''Action'', ''Thriller'']",4.164382,4.167
68,580489,Venom: Let There Be Carnage,2021-09-30,5797.863,7.0,2452,110000000,454000000,PG-13,"[''Marvel Entertainment'', '' Pascal Pictures'...","[''Science Fiction'', ''Action'', ''Adventure'']",4.363038,13.077
69,610253,Halloween Kills,2021-10-14,616.978,6.9,1251,20000000,127000000,R,"[''Universal Pictures'', '' Other'', '' Other'...","[''Horror'', ''Thriller'']",3.464000,2.385
70,576845,Last Night in Soho,2021-10-21,685.843,7.5,458,43000000,19000000,R,"[''Other'', '' Other'', '' Other'', '' Other''...","[''Horror'', ''Mystery'', ''Thriller'']",2.869283,7.468


In [5]:
df.dtypes

movie_id                      int64
title                        object
release_date                 object
movie_popularity            float64
vote_average                float64
vote_count                    int64
budget                        int64
revenue                       int64
ratings                      object
studios                      object
genres                       object
actor_popularity_mean       float64
director_popularity_mean    float64
dtype: object

In [6]:
# Encoded Age Ratings
ratings_enc = {
    'G': 1,
    'PG': 2,
    'PG-13': 3,
    'R': 4,
    'NC-17': 5,
    'NR':6
}

df['ratings_enc'] = df['ratings'].apply(lambda x: ratings_enc[x])

In [16]:
df['release_month'] = pd.DatetimeIndex(df['release_date']).month
for lst in df['studios']:
    holder = []
    for studio in lst:
        if studio not in holder:
            holder.append(studio)
    lst = holder
print(df['studios'])

0     [''Warner Bros. Pictures'', '' Heyday Films'',...
1     [''Other'', '' Columbia Pictures'', '' Sony Pi...
2     [''Warner Bros. Pictures'', '' Heyday Films'',...
3     [''Warner Bros. Pictures'', '' Other'', '' Hey...
4     [''Warner Bros. Pictures'', '' Heyday Films'',...
                            ...                        
67    [''Other'', '' Metro-Goldwyn-Mayer'', '' Other...
68    [''Marvel Entertainment'', '' Pascal Pictures'...
69    [''Universal Pictures'', '' Other'', '' Other'...
70    [''Other'', '' Other'', '' Other'', '' Other''...
71                                   ['Marvel Studios']
Name: studios, Length: 72, dtype: object


In [None]:
df = df.drop(['movie_id', 'release_date', 'title', 'ratings'], axis=1)
df = df.drop(['studios', 'genres'], axis=1)
df.head()

In [None]:
# data_scaler = StandardScaler()

In [None]:
# data_scaled = data_scaler.fit_transform(df)

In [None]:
# data_scaled[:5]

In [None]:
# print(np.mean(data_scaled[:,0]))
# print(np.std(data_scaled[:,0]))

In [None]:
# # Create our features
# X = pd.get_dummies(df.drop('revenue', axis=1))

# # Create our target
# y = pd.get_dummies(df['revenue'])

In [None]:
# X.describe()

In [None]:
# Create our features
X = df.drop(['revenue'], axis=1)

y = df[['revenue']].values

In [None]:
# y[:5]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50)

In [None]:
# creating an object of LinearRegression class
LR = LinearRegression()
# fitting the training data
LR.fit(X_train,y_train)

In [None]:
y_prediction =  LR.predict(X_test)
y_prediction

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score
score=r2_score(y_test,y_prediction)
print('r2 score is ',score)
print('mean_sqrd_error is==',mean_squared_error(y_test,y_prediction))
print('root_mean_squared error of is==',np.sqrt(mean_squared_error(y_test,y_prediction)))

In [None]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# # Creating a StandardScaler instance.
# scaler = StandardScaler()
# # Fitting the Standard Scaler with the training data.
# X_scaler = scaler.fit(X_train)

# # Scaling the data.
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [None]:
# # Creating the decision tree classifier instance.
# model = tree.DecisionTreeClassifier()
# # Fitting the model.
# model = model.fit(X_train_scaled, y_train)

In [None]:
# # Making predictions using the testing data.
# predictions = model.predict(X_test_scaled)

# predictions

In [None]:
X.info()

In [None]:
columns = X.columns
columns

In [None]:
X.head()

In [None]:
X.describe()

In [None]:
len(y)

In [None]:
# plt.figure(figsize=(15, 10))
# plt.scatter(X, y_prediction, alpha=0.5)
# plt.title('How Variables Can Effect Movie Revenue')
# plt.xlabel('Various Variables')
# plt.ylabel('Box Office Success')
# plt.ylim(0,3000000000)
# plt.xlim(0,3000000000)
# plt.show

In [None]:
test = df.plot.scatter(y='revenue', x=['movie_popularity'], c='r')
test = df.plot.scatter(y='revenue', x=['vote_average'], c='b')
test = df.plot.scatter(y='revenue', x=['vote_count'], c='g')
test = df.plot.scatter(y='revenue', x=['budget'], c='y')
test = df.plot.scatter(y='revenue', x=['actor_popularity_mean'], c='orange')
test = df.plot.scatter(y='revenue', x=['director_popularity_mean'], c='teal')
test = df.plot.scatter(y='revenue', x=['ratings_enc'], c='purple')
test = df.plot.scatter(y='revenue', x=['release_month'], c='pink')

In [None]:
test.show()