In [1]:
import os
import glob
import pandas as pd
import numpy as np
import random

file_path = '/Users/ryan/Documents/GitHub/MGT4187-Project/searching_index/results_supplement'
os.chdir(file_path)
file_ls = glob.glob('results_*')

df = pd.concat(pd.read_csv(file) for file in file_ls)

df['release_date'] = pd.to_datetime(df['release_date'])
df['profit'] = df['worldwide_gross'] - df['production_budget']

# taking the log
min_profit = df['profit'].min()
offset = abs(min_profit) + 1

df['log_profit'] = np.log(df['profit'] + offset)

df['log_production_budget'] = np.log(df['production_budget']+1)

df['log_numVotes'] = np.log(df['numVotes']+1)
# drop null value
df = df.dropna(subset='search_index')
df = df.reset_index()
# same period movie metrics
def calculate_same_period_metrics_with_id(index, window=5):
    current_release_date = df.iloc[index]['release_date']
    start_date = current_release_date - pd.DateOffset(days=window)
    end_date = current_release_date + pd.DateOffset(days=window)
    same_period = df[(df['release_date'] >= start_date) & (df['release_date'] <= end_date) & (df.index != index)]
    
    if same_period.empty:
        return pd.Series([0, 0, 0, 0, '', 0], index=['same_period_indicator', 'same_period_profit', 'same_period_rating', 'same_period_budget', 'same_period_movie_id', 'same_period_search_index'])
    
    ## make some modification here (average first then log transformation)
    profit_avg = same_period['profit'].mean()
    log_profit = np.log(offset+profit_avg)
    avg_rating = same_period['averageRating'].mean()
    budget_avg = same_period['production_budget'].mean()
    log_budget = np.log(budget_avg+1)
    same_period_movie_id = ','.join(same_period['tconst'])
    avg_search_index = same_period['search_index'].mean()
    
    return pd.Series([1, profit_avg, avg_rating, budget_avg, same_period_movie_id, avg_search_index], index=['same_period_indicator', 'same_period_profit', 'same_period_rating', 'same_period_budget', 'same_period_movie_id', 'same_period_search_index'])

# Apply the updated function to each movie
df[['same_period_indicator', 'same_period_profit', 'same_period_rating', 'same_period_budget', 'same_period_movie_id', 'same_period_search_index']] = df.index.to_series().apply(calculate_same_period_metrics_with_id)


cluster data

In [2]:
cluster_data_path = '/Users/ryan/Documents/GitHub/MGT4187-Project/results_folder/clustering'
os.chdir(cluster_data_path)
cluster = pd.read_excel('movie_with clusters.xlsx')

cluster.sample(5)

Unnamed: 0,movie,runtimeMinutes,mpaa_rating,movie_popularity,cluster
1915,Jackass: The Movie,85,R,0.264948,0
794,The Day the Earth Stood Still,104,PG-13,0.317638,1
983,The One,87,PG-13,0.176943,0
1607,Enough Said,93,PG-13,0.432922,1
2034,Lottery Ticket,99,PG-13,0.161996,0


In [3]:
df = pd.merge(df, cluster[['movie', 'cluster']], on='movie', how='left')

print(df['cluster'].isnull().sum())

df = df.dropna(subset=['cluster'])

41


In [4]:
cluster_dummy = pd.get_dummies(df['cluster'].astype(str), drop_first=True, dtype=float)
cluster_dummy.columns = ['cluster_0', 'cluster_1']
df = pd.concat([df, cluster_dummy], axis=1)

sentiment analysis

In [5]:
def get_num_topics(x):
    ls = x.split(',')
    num = len(ls) + random.randint(1,5)
    return num
df['num_topics'] = df['genre'].apply(get_num_topics)


In [6]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Regression Model
model_formula = 'profit ~ production_budget+same_period_profit+same_period_rating+same_period_budget\
                +same_period_budget*same_period_profit\
                        +same_period_budget*same_period_search_index\
                            +same_period_budget*same_period_rating+\
                                same_period_profit*same_period_search_index+\
                                    same_period_profit*same_period_rating+\
                                        same_period_search_index*same_period_rating+\
                                            cluster_0+ cluster_1+\
                                                num_topics+averageRating'

model = ols(model_formula, data=df).fit()

# Conduct ANOVA analysis
anova_results = sm.stats.anova_lm(model, typ=2)

In [7]:
model.summary()

0,1,2,3
Dep. Variable:,profit,R-squared:,0.297
Model:,OLS,Adj. R-squared:,0.293
Method:,Least Squares,F-statistic:,77.68
Date:,"Mon, 27 Nov 2023",Prob (F-statistic):,4.48e-107
Time:,14:15:13,Log-Likelihood:,-29571.0
No. Observations:,1483,AIC:,59160.0
Df Residuals:,1474,BIC:,59210.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-262.4400,114.471,-2.293,0.022,-486.984,-37.896
production_budget,1.8306,0.068,26.923,0.000,1.697,1.964
same_period_profit,0.2705,0.320,0.847,0.397,-0.356,0.897
same_period_rating,-1903.7318,830.372,-2.293,0.022,-3532.568,-274.895
same_period_budget,-1.1547,0.832,-1.389,0.165,-2.786,0.476
same_period_budget:same_period_profit,6.56e-10,7.9e-10,0.830,0.407,-8.94e-10,2.21e-09
same_period_search_index,-1.193e+04,5201.644,-2.293,0.022,-2.21e+04,-1722.009
same_period_budget:same_period_search_index,0.0127,0.008,1.527,0.127,-0.004,0.029
same_period_budget:same_period_rating,0.1900,0.130,1.465,0.143,-0.064,0.444

0,1,2,3
Omnibus:,899.941,Durbin-Watson:,1.75
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14356.74
Skew:,2.531,Prob(JB):,0.0
Kurtosis:,17.377,Cond. No.,7.73e+16
