In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import random
import sklearn
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




# fix random seed for reproducibility
np.random.seed(42)
random.seed(42)

In [2]:
classified_summaries = pd.read_csv('../data/processed/movies_with_classifications.csv')
movies = pd.read_csv('../data/processed/movies_summary_BO.csv', sep=',')

In [3]:
import ast
def safe_literal_eval(val):
	try:
		return ast.literal_eval(val)
	except (ValueError, SyntaxError):
		return val

movies['movie_languages'] = movies['movie_languages'].apply(safe_literal_eval)

movies['year_interval'] = (movies['movie_release_date'] // 5) * 5  

In [4]:
movies_budget = movies.dropna(subset=['budget'])  

In [5]:
# Ensure that the movie_genres column is parsed as lists
def parse_genres(genres):
    if isinstance(genres, list):
        return genres
    try:
        return ast.literal_eval(genres)
    except (ValueError, SyntaxError):
        return genres.strip('[]').replace("'", "").split(', ')

In [6]:
movies_budget['movie_countries'] = movies_budget['movie_countries'].apply(parse_genres)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_budget['movie_countries'] = movies_budget['movie_countries'].apply(parse_genres)


In [7]:
movies_budget['movie_genres'] = movies_budget['movie_genres'].apply(parse_genres)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_budget['movie_genres'] = movies_budget['movie_genres'].apply(parse_genres)


In [8]:
movies_budget['profit'] = movies_budget['movie_box_office_revenue'] - movies_budget['budget']
movies_budget['profitability_ratio'] = movies_budget['profit'] / movies_budget['budget']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_budget['profit'] = movies_budget['movie_box_office_revenue'] - movies_budget['budget']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_budget['profitability_ratio'] = movies_budget['profit'] / movies_budget['budget']


In [9]:
common_columns = ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_release_date', 'movie_box_office_revenue', 'movie_runtime']
common_columns =['wikipedia_movie_id','freebase_movie_id', 'movie_name', 'movie_release_date', 'movie_box_office_revenue', 'plot_summary', 'budget', 'opening_weekend', 'rating_score', 'producer', 'title_year']
# Merge the dataframes on common columns
movies_plot = movies_budget.merge(
    classified_summaries,
    on=common_columns,
    how='inner'
)

movies_plot.drop(columns=['movie_languages_y', 'movie_countries_y', 'movie_genres_y', 'movie_runtime_y'], inplace=True)

# Rename the remaining columns to remove the "_x" suffix
movies_plot.rename(columns={
    'movie_languages_x': 'movie_languages',
    'movie_countries_x': 'movie_countries',
    'movie_genres_x': 'movie_genres',
    'movie_runtime_x' : 'movie_runtime'
}, inplace=True)

# Update the 'plot_structure' column to only take elements before the first ":"
movies_plot['plot_structure'] = movies_plot['plot_structure'].str.split(':').str[0]
movies_plot['plot_structure_20'] = movies_plot['plot_structure_20'].str.split(':').str[0]

movies_plot.columns

Index(['wikipedia_movie_id', 'freebase_movie_id', 'movie_name',
       'movie_release_date', 'movie_box_office_revenue', 'movie_runtime',
       'movie_languages', 'movie_countries', 'movie_genres', 'title_year',
       'plot_summary', 'budget', 'opening_weekend', 'rating_score', 'producer',
       'year_interval', 'profit', 'profitability_ratio', 'summarized',
       'plot_structure', 'plot_structure_20'],
      dtype='object')

In [10]:
def list_to_1_hot(df, column_name):
    mlb = MultiLabelBinarizer()
    one_hot_df = pd.DataFrame(mlb.fit_transform(df[column_name]), columns=mlb.classes_, index=df.index)
    df = pd.concat([df, one_hot_df], axis=1)
    df = df.drop(column_name, axis=1)
    return df

def split_x_y(df, y_column, x_columns_to_drop):
    y = df[y_column]
    x = df.drop(columns=x_columns_to_drop)
    return x, y

def split_train_test(x, y, test_size=0.2):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=42)
    return x_train, x_test, y_train, y_test

def scale_data(x_train, x_test):
    scaler = sklearn.preprocessing.StandardScaler()
    train = scaler.fit_transform(x_train)
    test = scaler.transform(x_test)
    x_train_df = pd.DataFrame(train, columns=x_train.columns)
    x_test_df = pd.DataFrame(test, columns=x_test.columns)
    return x_train_df, x_test_df

def preprocess4linreg(df, y_column, x_columns_to_drop, test_size=0.2):
    if 'plot_structure' not in x_columns_to_drop :
        df = pd.get_dummies(df, columns=['plot_structure'], drop_first=True, dtype=int)
    
    df = list_to_1_hot(df, 'movie_genres')
    df = list_to_1_hot(df, 'movie_countries')
    x, y = split_x_y(df, y_column, x_columns_to_drop)
    x_train, x_test, y_train, y_test = split_train_test(x, y, test_size)
    x_train, x_test = scale_data(x_train, x_test)
    
    X_train_scaled_df = sm.add_constant(x_train,has_constant='add')
    X_test_scaled_df = sm.add_constant(x_test,has_constant='add')

    X_train_scaled_df = X_train_scaled_df.fillna(X_train_scaled_df.mean())
    X_test_scaled_df = X_test_scaled_df.fillna(X_train_scaled_df.mean())

    y_train_no_index = y_train.reset_index(drop=True)
    y_test_no_index = y_test.reset_index(drop=True)
    X_train_scaled_df = X_train_scaled_df.reset_index(drop=True)
    X_test_scaled_df = X_test_scaled_df.reset_index(drop=True)
    
    return X_train_scaled_df, X_test_scaled_df, y_train_no_index, y_test_no_index


# Predict profit

### without plot structure

In [15]:
# Features = [movie_release_date, budget, rating_score, movie_genres, movie_countries]
X_train, X_test, y_train, y_test = preprocess4linreg(movies_plot, y_column='profit', x_columns_to_drop=['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profit', 'profitability_ratio', 'summarized', 'plot_structure', 'plot_structure_20', 'producer'])

# Fit the model
model = sm.OLS(y_train, X_train)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 profit   R-squared:                       0.959
Model:                            OLS   Adj. R-squared:                  0.956
Method:                 Least Squares   F-statistic:                     286.8
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        11:30:12   Log-Likelihood:                -92514.
No. Observations:                4660   AIC:                         1.857e+05
Df Residuals:                    4304   BIC:                         1.880e+05
Df Model:                         355                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

### with plot structure

In [12]:
# Features = [movie_release_date, budget, rating_score, producer, movie_genres, movie_countries]
X_train_plot, X_test_plot, y_train_plot, y_test_plot = preprocess4linreg(movies_plot, 'profit', ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profit', 'profitability_ratio', 'summarized', 'plot_structure_20', 'producer', 'rating_score'])

# Fit the model
model = sm.OLS(y_train_plot, X_train_plot)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 profit   R-squared:                       0.956
Model:                            OLS   Adj. R-squared:                  0.952
Method:                 Least Squares   F-statistic:                     254.9
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        11:13:30   Log-Likelihood:                -92691.
No. Observations:                4660   AIC:                         1.861e+05
Df Residuals:                    4291   BIC:                         1.885e+05
Df Model:                         368                                         
Covariance Type:            nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

# Predict profitability ratio

In [115]:
# Features = [movie_release_date, budget, rating_score, producer, movie_genres, movie_countries]
X_train, X_test, y_train, y_test = preprocess4linreg(movies_plot, 'profitability_ratio', ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profit', 'profitability_ratio', 'summarized', 'plot_structure', 'plot_structure_20', 'producer'])

# Fit the model
model = sm.OLS(y_train, X_train)
results = model.fit()

print(results.summary())

                             OLS Regression Results                            
Dep. Variable:     profitability_ratio   R-squared:                       0.166
Model:                             OLS   Adj. R-squared:                  0.097
Method:                  Least Squares   F-statistic:                     2.409
Date:                 Sun, 08 Dec 2024   Prob (F-statistic):           9.70e-38
Time:                         18:47:27   Log-Likelihood:                -31330.
No. Observations:                 4660   AIC:                         6.337e+04
Df Residuals:                     4304   BIC:                         6.567e+04
Df Model:                          355                                         
Covariance Type:             nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

In [20]:
# Features = [movie_release_date, budget, rating_score, producer, movie_genres, movie_countries]
movies_plot['inverse_budget'] = 1 / movies_plot['budget']
X_train_plot, X_test_plot, y_train_plot, y_test_plot = preprocess4linreg(movies_plot, 'profitability_ratio', ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profit', 'profitability_ratio', 'summarized', 'plot_structure_20', 'producer'])

# Fit the model
model = sm.OLS(y_train_plot, X_train_plot)
results = model.fit()

print(results.summary())

                             OLS Regression Results                            
Dep. Variable:     profitability_ratio   R-squared:                       0.357
Model:                             OLS   Adj. R-squared:                  0.301
Method:                  Least Squares   F-statistic:                     6.412
Date:                 Tue, 10 Dec 2024   Prob (F-statistic):          1.21e-215
Time:                         11:38:46   Log-Likelihood:                -30724.
No. Observations:                 4660   AIC:                         6.219e+04
Df Residuals:                     4288   BIC:                         6.459e+04
Df Model:                          371                                         
Covariance Type:             nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------