In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import random
import sklearn
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




# fix random seed for reproducibility
np.random.seed(42)
random.seed(42)

In [57]:
classified_summaries = pd.read_csv('../data/processed/movies_with_classifications.csv')
movies = pd.read_csv('../data/processed/movies_summary_BO.csv', sep=',')

In [58]:
import ast
def safe_literal_eval(val):
	try:
		return ast.literal_eval(val)
	except (ValueError, SyntaxError):
		return val

movies['movie_languages'] = movies['movie_languages'].apply(safe_literal_eval)

movies['year_interval'] = (movies['movie_release_date'] // 5) * 5  

In [59]:
movies_budget = movies.dropna(subset=['budget'])  

In [60]:
# Ensure that the movie_genres column is parsed as lists
def parse_genres(genres):
    if isinstance(genres, list):
        return genres
    try:
        return ast.literal_eval(genres)
    except (ValueError, SyntaxError):
        return genres.strip('[]').replace("'", "").split(', ')

In [61]:
movies_budget['movie_countries'] = movies_budget['movie_countries'].apply(parse_genres)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_budget['movie_countries'] = movies_budget['movie_countries'].apply(parse_genres)


In [62]:
movies_budget['movie_genres'] = movies_budget['movie_genres'].apply(parse_genres)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_budget['movie_genres'] = movies_budget['movie_genres'].apply(parse_genres)


In [63]:
movies_budget['profit'] = movies_budget['movie_box_office_revenue'] - movies_budget['budget']
movies_budget['profitability_ratio'] = movies_budget['profit'] / movies_budget['budget']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_budget['profit'] = movies_budget['movie_box_office_revenue'] - movies_budget['budget']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_budget['profitability_ratio'] = movies_budget['profit'] / movies_budget['budget']


In [64]:
common_columns = ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_release_date', 'movie_box_office_revenue', 'movie_runtime']
common_columns =['wikipedia_movie_id','freebase_movie_id', 'movie_name', 'movie_release_date', 'movie_box_office_revenue', 'plot_summary', 'budget', 'opening_weekend', 'rating_score', 'producer', 'title_year']
# Merge the dataframes on common columns
movies_plot = movies_budget.merge(
    classified_summaries,
    on=common_columns,
    how='inner'
)

movies_plot.drop(columns=['movie_languages_y', 'movie_countries_y', 'movie_genres_y', 'movie_runtime_y'], inplace=True)

# Rename the remaining columns to remove the "_x" suffix
movies_plot.rename(columns={
    'movie_languages_x': 'movie_languages',
    'movie_countries_x': 'movie_countries',
    'movie_genres_x': 'movie_genres',
    'movie_runtime_x' : 'movie_runtime'
}, inplace=True)

# Update the 'plot_structure' column to only take elements before the first ":"
movies_plot['plot_structure'] = movies_plot['plot_structure'].str.split(':').str[0]
movies_plot['plot_structure_20'] = movies_plot['plot_structure_20'].str.split(':').str[0]

movies_plot.columns

Index(['wikipedia_movie_id', 'freebase_movie_id', 'movie_name',
       'movie_release_date', 'movie_box_office_revenue', 'movie_runtime',
       'movie_languages', 'movie_countries', 'movie_genres', 'title_year',
       'plot_summary', 'opening_weekend', 'rating_score', 'producer', 'budget',
       'year_interval', 'profit', 'profitability_ratio', 'summarized',
       'plot_structure', 'plot_structure_20'],
      dtype='object')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# import KMeans
from sklearn.cluster import KMeans

# Text Vectorization with TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_plot['plot_summary'])

combined_matrix = tfidf_matrix.toarray()

# Clustering with KMeans
n_clusters = 15
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
movies_plot['plot_structure_cluster'] = kmeans.fit_predict(combined_matrix)

In [100]:
def list_to_1_hot(df, column_name):
    mlb = MultiLabelBinarizer()
    one_hot_df = pd.DataFrame(mlb.fit_transform(df[column_name]), columns=mlb.classes_, index=df.index)
    df = pd.concat([df, one_hot_df], axis=1)
    df = df.drop(column_name, axis=1)
    return df

def split_x_y(df, y_column, x_columns_to_drop):
    y = df[y_column]
    x = df.drop(columns=x_columns_to_drop)
    return x, y

def split_train_test(x, y, test_size=0.2):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=42)
    return x_train, x_test, y_train, y_test

def scale_data(x_train, x_test):
    scaler = sklearn.preprocessing.StandardScaler()
    train = scaler.fit_transform(x_train)
    test = scaler.transform(x_test)
    x_train_df = pd.DataFrame(train, columns=x_train.columns)
    x_test_df = pd.DataFrame(test, columns=x_test.columns)
    return x_train_df, x_test_df

def preprocess4linreg(df, y_column, x_columns_to_drop, test_size=0.2):
    if 'plot_structure' not in x_columns_to_drop :
        df = pd.get_dummies(df, columns=['plot_structure'], drop_first=True, dtype=int)
    
    if 'movie_genres' not in x_columns_to_drop :
        df = list_to_1_hot(df, 'movie_genres')
    if 'plot_structure_cluster' not in x_columns_to_drop :
        df = pd.get_dummies(df, columns=['plot_structure_cluster'], drop_first=True, dtype=int)
    df = list_to_1_hot(df, 'movie_countries')
    x, y = split_x_y(df, y_column, x_columns_to_drop)
    x_train, x_test, y_train, y_test = split_train_test(x, y, test_size)
    x_train, x_test = scale_data(x_train, x_test)
    
    X_train_scaled_df = sm.add_constant(x_train,has_constant='add')
    X_test_scaled_df = sm.add_constant(x_test,has_constant='add')

    X_train_scaled_df = X_train_scaled_df.fillna(X_train_scaled_df.mean())
    X_test_scaled_df = X_test_scaled_df.fillna(X_train_scaled_df.mean())

    y_train_no_index = y_train.reset_index(drop=True)
    y_test_no_index = y_test.reset_index(drop=True)
    X_train_scaled_df = X_train_scaled_df.reset_index(drop=True)
    X_test_scaled_df = X_test_scaled_df.reset_index(drop=True)
    
    return X_train_scaled_df, X_test_scaled_df, y_train_no_index, y_test_no_index


In [85]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import PolynomialFeatures
def enhance_features(X_train, X_test):
    # Extract numerical columns
    numerical_cols = ['movie_release_date', 'budget', 'movie_runtime']
    
    # 1. Create polynomial features for numerical variables
    poly = PolynomialFeatures(degree=4, include_bias=False)
    numerical_train = poly.fit_transform(X_train[numerical_cols])
    numerical_test = poly.transform(X_test[numerical_cols])
    
    # Add polynomial features back to the dataframe
    poly_features = poly.get_feature_names_out(numerical_cols)
    X_train_poly = pd.DataFrame(numerical_train, columns=poly_features, index=X_train.index)
    X_test_poly = pd.DataFrame(numerical_test, columns=poly_features, index=X_test.index)
    X_train = pd.concat([X_train, X_train_poly], axis=1)
    X_test = pd.concat([X_test, X_test_poly], axis=1)
    
    # 3. Create genre count feature
    genre_cols = [col for col in X_train.columns if col.startswith('movie_genres_')]
    X_train['genre_count'] = X_train[genre_cols].sum(axis=1)
    X_test['genre_count'] = X_test[genre_cols].sum(axis=1)
    
    
    return X_train, X_test

# Predict profit

### without plot structure LLM and without clustering

In [104]:
# Features = [movie_release_date, budget, rating_score, movie_genres, movie_countries]
X_train, X_test, y_train, y_test = preprocess4linreg(movies_plot, y_column='profit', x_columns_to_drop=['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profitability_ratio', 'summarized', 'plot_structure', 'plot_structure_20', 'producer', 'profit', 'plot_structure_cluster'])
#X_train_enhanced, X_test_enhanced = enhance_features(X_train.copy(), X_test.copy())
# Fit the model
model = sm.OLS(y_train, X_train)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 profit   R-squared:                       0.480
Model:                            OLS   Adj. R-squared:                  0.436
Method:                 Least Squares   F-statistic:                     10.70
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        13:00:39   Log-Likelihood:                -87188.
No. Observations:                4438   AIC:                         1.751e+05
Df Residuals:                    4084   BIC:                         1.773e+05
Df Model:                         353                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

### with plot structure without genres

In [105]:
# Features = [movie_release_date, budget, rating_score, producer, movie_genres, movie_countries]
X_train_plot, X_test_plot, y_train_plot, y_test_plot = preprocess4linreg(movies_plot, 'profit', ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profitability_ratio', 'summarized', 'plot_structure_20', 'producer', 'profit', 'movie_genres', 'plot_structure_cluster'])
#X_train_enhanced, X_test_enhanced = enhance_features(X_train.copy(), X_test.copy())
# Fit the model
model = sm.OLS(y_train, X_train_plot)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 profit   R-squared:                       0.317
Model:                            OLS   Adj. R-squared:                  0.302
Method:                 Least Squares   F-statistic:                     21.45
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          2.62e-287
Time:                        13:01:18   Log-Likelihood:                -87795.
No. Observations:                4438   AIC:                         1.758e+05
Df Residuals:                    4343   BIC:                         1.764e+05
Df Model:                          94                                         
Covariance Type:            nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

### with plot structure and genres

In [106]:
# Features = [movie_release_date, budget, rating_score, producer, movie_genres, movie_countries]
X_train_plot, X_test_plot, y_train_plot, y_test_plot = preprocess4linreg(movies_plot, 'profit', ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profitability_ratio', 'summarized', 'plot_structure_20', 'producer', 'profit', 'plot_structure_cluster'])
#X_train_enhanced, X_test_enhanced = enhance_features(X_train.copy(), X_test.copy())
# Fit the model
model = sm.OLS(y_train, X_train_plot)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 profit   R-squared:                       0.484
Model:                            OLS   Adj. R-squared:                  0.438
Method:                 Least Squares   F-statistic:                     10.42
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        13:02:38   Log-Likelihood:                -87171.
No. Observations:                4438   AIC:                         1.751e+05
Df Residuals:                    4070   BIC:                         1.774e+05
Df Model:                         367                                         
Covariance Type:            nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

### adding plot_structure clustering


In [109]:
# Features = [movie_release_date, budget, rating_score, producer, movie_genres, movie_countries]
X_train_plot, X_test_plot, y_train_plot, y_test_plot = preprocess4linreg(movies_plot, 'profit', ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profitability_ratio', 'summarized', 'plot_structure_20', 'producer', 'profit'])
#X_train_enhanced, X_test_enhanced = enhance_features(X_train.copy(), X_test.copy())
# Fit the model
model = sm.OLS(y_train, X_train_plot)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 profit   R-squared:                       0.493
Model:                            OLS   Adj. R-squared:                  0.445
Method:                 Least Squares   F-statistic:                     10.35
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        13:03:25   Log-Likelihood:                -87134.
No. Observations:                4438   AIC:                         1.750e+05
Df Residuals:                    4056   BIC:                         1.775e+05
Df Model:                         381                                         
Covariance Type:            nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

# Predict profitability ratio

In [33]:
# Features = [movie_release_date, budget, rating_score, producer, movie_genres, movie_countries]
X_train, X_test, y_train, y_test = preprocess4linreg(movies_plot, 'profitability_ratio', ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profit', 'profitability_ratio', 'summarized', 'plot_structure', 'plot_structure_20', 'producer'])

# Fit the model
model = sm.OLS(y_train, X_train)
results = model.fit()

print(results.summary())

                             OLS Regression Results                            
Dep. Variable:     profitability_ratio   R-squared:                       0.325
Model:                             OLS   Adj. R-squared:                  0.266
Method:                  Least Squares   F-statistic:                     5.552
Date:                 Tue, 10 Dec 2024   Prob (F-statistic):          1.71e-169
Time:                         12:06:23   Log-Likelihood:                -29475.
No. Observations:                 4438   AIC:                         5.966e+04
Df Residuals:                     4083   BIC:                         6.193e+04
Df Model:                          354                                         
Covariance Type:             nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

In [34]:
# Features = [movie_release_date, budget, rating_score, producer, movie_genres, movie_countries]
movies_plot['inverse_budget'] = 1 / movies_plot['budget']

X_train_plot, X_test_plot, y_train_plot, y_test_plot = preprocess4linreg(movies_plot, 'profitability_ratio', ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_box_office_revenue', 'movie_languages', 'title_year', 'plot_summary', 'opening_weekend', 'year_interval', 'profit', 'profitability_ratio', 'summarized', 'plot_structure_20', 'producer'])

# Fit the model
model = sm.OLS(y_train_plot, X_train_plot)
results = model.fit()

print(results.summary())

                             OLS Regression Results                            
Dep. Variable:     profitability_ratio   R-squared:                       0.325
Model:                             OLS   Adj. R-squared:                  0.264
Method:                  Least Squares   F-statistic:                     5.325
Date:                 Tue, 10 Dec 2024   Prob (F-statistic):          1.94e-164
Time:                         12:06:29   Log-Likelihood:                -29475.
No. Observations:                 4438   AIC:                         5.969e+04
Df Residuals:                     4069   BIC:                         6.205e+04
Df Model:                          368                                         
Covariance Type:             nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [96]:
movies_plot.columns

Index(['wikipedia_movie_id', 'freebase_movie_id', 'movie_name',
       'movie_release_date', 'movie_box_office_revenue', 'movie_runtime',
       'movie_languages', 'movie_countries', 'movie_genres', 'title_year',
       'plot_summary', 'opening_weekend', 'rating_score', 'producer', 'budget',
       'year_interval', 'profit', 'profitability_ratio', 'summarized',
       'plot_structure', 'plot_structure_20'],
      dtype='object')