In [1]:
import os
import glob
import pandas as pd
import numpy as np
import random

# Folder holding the per-batch search-index result files (named results_*).
file_path = '/Users/ryan/Documents/GitHub/MGT4187-Project/searching_index/results_supplement'
os.chdir(file_path)
# glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# concatenated row order (and everything derived from it) is reproducible.
file_ls = sorted(glob.glob('results_*'))
if not file_ls:
    # Fail with a clear message instead of pandas' generic "No objects to concatenate".
    raise FileNotFoundError(f"no 'results_*' files found in {file_path}")

# Stack every result file into a single frame.
df = pd.concat(pd.read_csv(file) for file in file_ls)

df['release_date'] = pd.to_datetime(df['release_date'])
df['profit'] = df['worldwide_gross'] - df['production_budget']

# taking the log
# Profit can be negative, so shift it by |min profit| + 1 before the log;
# this keeps the argument of the log at 1 or above for every row.
min_profit = df['profit'].min()
offset = abs(min_profit) + 1

df['log_profit'] = np.log(df['profit'] + offset)

# +1 keeps the log defined when the raw value is 0.
df['log_production_budget'] = np.log(df['production_budget']+1)

df['log_numVotes'] = np.log(df['numVotes']+1)
# drop null value
# List form of `subset` matches the other dropna call in this notebook and is
# accepted by every pandas version.
df = df.dropna(subset=['search_index'])
# Renumber the rows 0..n-1; the previous row labels are kept in an 'index' column.
df = df.reset_index()
# same period movie metrics
def calculate_same_period_metrics_with_id(index, window=5, data=None, profit_offset=None):
    """Summarise the other movies released within ``window`` days of one movie.

    Parameters
    ----------
    index : label
        Index label (not position) of the focal movie in the frame.
    window : int, default 5
        Half-width of the release window in days; both ends are inclusive.
    data : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``df``.
    profit_offset : number, optional
        Shift added to the mean profit before taking the log. Defaults to the
        notebook-level ``offset``.

    Returns
    -------
    pandas.Series
        Six entries: ``same_period_indicator`` (1 if any other movie falls in
        the window, else 0), ``same_period_profit`` (log of shifted mean
        profit), ``same_period_rating`` (mean ``averageRating``),
        ``same_period_budget`` (log of mean ``production_budget`` + 1),
        ``same_period_movie_id`` (comma-joined ``tconst`` values) and
        ``same_period_search_index`` (mean ``search_index``). When no other
        movie is in the window every numeric entry is 0 and the id is ''.
    """
    frame = df if data is None else data
    shift = offset if profit_offset is None else profit_offset
    labels = [
        'same_period_indicator',
        'same_period_profit',
        'same_period_rating',
        'same_period_budget',
        'same_period_movie_id',
        'same_period_search_index',
    ]

    # Look the movie up by label: callers pass values taken from the frame's
    # index and the self-exclusion mask below also compares labels, so a
    # positional lookup would pick the wrong row on any non 0..n-1 index.
    focal_date = frame.loc[index, 'release_date']
    span = pd.DateOffset(days=window)
    in_window = frame['release_date'].between(focal_date - span, focal_date + span)
    same_period = frame[in_window & (frame.index != index)]

    if same_period.empty:
        return pd.Series([0, 0, 0, 0, '', 0], index=labels)

    # Average first, then apply the log transformation.
    log_profit = np.log(shift + same_period['profit'].mean())
    avg_rating = same_period['averageRating'].mean()
    log_budget = np.log(same_period['production_budget'].mean() + 1)
    # Skip missing ids so a single NaN cannot break the join.
    same_period_movie_id = ','.join(same_period['tconst'].dropna().astype(str))
    avg_search_index = same_period['search_index'].mean()

    return pd.Series(
        [1, log_profit, avg_rating, log_budget, same_period_movie_id, avg_search_index],
        index=labels,
    )

# Run the same-period summary once per movie (keyed by index label) and store
# the six returned fields as new columns of df.
df[[
    'same_period_indicator',
    'same_period_profit',
    'same_period_rating',
    'same_period_budget',
    'same_period_movie_id',
    'same_period_search_index',
]] = df.index.to_series().apply(calculate_same_period_metrics_with_id)


cluster data

In [2]:
# Switch to the clustering output folder and load the workbook that holds the
# per-movie cluster assignment.
cluster_data_path = '/Users/ryan/Documents/GitHub/MGT4187-Project/results_folder/clustering'
os.chdir(cluster_data_path)
cluster = pd.read_excel(io='movie_with clusters.xlsx')

# Preview five randomly chosen rows.
cluster.sample(n=5)

Unnamed: 0,movie,runtimeMinutes,mpaa_rating,movie_popularity,cluster
345,Opal Dream,85,PG,0.2358,0
407,Power Rangers,124,PG-13,0.210762,2
1502,Dead Silence,89,R,0.116966,0
1254,Winter's Bone,100,R,0.238042,0
541,Scooby-Doo 2: Monsters Unleashed,93,PG,0.272235,0


In [3]:
# Attach the cluster label to each movie by title; a left join keeps every
# movie in df, leaving the label missing where no title matches.
df = df.merge(cluster[['movie', 'cluster']], how='left', on='movie')

# Report how many movies have no cluster label ...
print(df['cluster'].isna().sum())

# ... and discard them.
df = df.dropna(subset=['cluster'])

41


In [4]:
# One-hot encode the cluster label as float columns, dropping the first level
# so it serves as the regression baseline.
cluster_dummy = pd.get_dummies(
    df['cluster'].astype(str),
    dtype=float,
    drop_first=True,
)
# NOTE(review): drop_first=True removes the first category level, so the two
# remaining columns appear to represent the 2nd and 3rd cluster labels rather
# than clusters 0 and 1 as these names suggest -- confirm before interpreting
# the coefficients. The assignment also assumes exactly two columns remain.
cluster_dummy.columns = ['cluster_0', 'cluster_1']
df = pd.concat((df, cluster_dummy), axis=1)

sentiment analysis

In [5]:
# Dedicated, seeded generator: re-running this cell recreates it, so the
# num_topics values (and the regression that uses them) are reproducible.
_TOPIC_RNG = random.Random(4187)


def get_num_topics(x, rng=None):
    """Return the count of comma-separated entries in ``x`` plus a random integer.

    Parameters
    ----------
    x : str
        Comma-separated string (the notebook passes the ``genre`` column).
    rng : random.Random, optional
        Generator used for the random term. Defaults to the seeded generator
        defined alongside this function.

    Returns
    -------
    int
        ``len(x.split(','))`` plus a draw from ``randint(1, 5)`` (both ends
        inclusive).

    NOTE(review): the random term makes this value partly synthetic; confirm
    that is intended before interpreting its regression coefficient.
    """
    generator = _TOPIC_RNG if rng is None else rng
    return len(x.split(',')) + generator.randint(1, 5)
# Derive num_topics for every movie from its genre string.
df['num_topics'] = df['genre'].map(get_num_topics)


In [6]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Regression Model
# Formula for an OLS fit of log_profit on the movie's own log budget, the
# same-period competitor metrics with their pairwise interactions, the two
# cluster dummies, num_topics and averageRating.
# In the formula syntax `a*b` contributes both main effects and the `a:b`
# interaction, which is why same_period_search_index shows up as a term in the
# summary although it is never listed on its own here.
model_formula = 'log_profit ~ log_production_budget+same_period_profit+same_period_rating+same_period_budget\
                +same_period_budget*same_period_profit\
                        +same_period_budget*same_period_search_index\
                            +same_period_budget*same_period_rating+\
                                same_period_profit*same_period_search_index+\
                                    same_period_profit*same_period_rating+\
                                        same_period_search_index*same_period_rating+\
                                            cluster_0+ cluster_1+\
                                                num_topics+averageRating'

# Fit ordinary least squares on the prepared frame.
model = ols(model_formula, data=df).fit()

# Conduct ANOVA analysis
# typ=2 requests the Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)

In [7]:
# Show the fitted model's summary tables (fit statistics, coefficients, diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,log_profit,R-squared:,0.24
Model:,OLS,Adj. R-squared:,0.232
Method:,Least Squares,F-statistic:,30.84
Date:,"Mon, 27 Nov 2023",Prob (F-statistic):,1.06e-76
Time:,14:16:29,Log-Likelihood:,-553.02
No. Observations:,1483,AIC:,1138.0
Df Residuals:,1467,BIC:,1223.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,16.4593,0.140,117.752,0.000,16.185,16.733
log_production_budget,0.1295,0.007,18.172,0.000,0.116,0.143
same_period_profit,0.0556,0.090,0.621,0.535,-0.120,0.231
same_period_rating,0.4868,0.768,0.634,0.526,-1.020,1.994
same_period_budget,-0.3312,0.304,-1.088,0.277,-0.928,0.266
same_period_budget:same_period_profit,0.0139,0.015,0.937,0.349,-0.015,0.043
same_period_search_index,0.0498,0.042,1.188,0.235,-0.032,0.132
same_period_budget:same_period_search_index,0.0009,0.001,1.094,0.274,-0.001,0.002
same_period_budget:same_period_rating,0.0112,0.016,0.696,0.486,-0.020,0.043

0,1,2,3
Omnibus:,213.08,Durbin-Watson:,1.773
Prob(Omnibus):,0.0,Jarque-Bera (JB):,470.904
Skew:,0.829,Prob(JB):,5.55e-103
Kurtosis:,5.208,Cond. No.,58900.0
