In [10]:
import os
import glob
import pandas as pd
import numpy as np

# Directory that holds the `results_*` CSV files.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
file_path = '/Users/ryan/Documents/GitHub/MGT4187-Project/searching_index/results_supplement'
# Change the working directory so the relative glob pattern below resolves inside file_path.
os.chdir(file_path)
# glob yields matches in filesystem-dependent order; sort so the row order of `df` is reproducible.
file_ls = sorted(glob.glob('results_*'))
if not file_ls:
    # Fail with an explicit message instead of pandas' generic "No objects to concatenate".
    raise FileNotFoundError(f"no files matching 'results_*' found in {file_path}")

# Stack all result files into one frame (row labels of the individual files are kept here
# and replaced further down by reset_index()).
df = pd.concat([pd.read_csv(file) for file in file_ls])

df['release_date'] = pd.to_datetime(df['release_date'])
df['profit'] = df['worldwide_gross'] - df['production_budget']

# taking the log
# Profit can be negative, so shift it by |min profit| + 1: profit + offset is then >= 1
# for every row and the log is always defined.
min_profit = df['profit'].min()
offset = abs(min_profit) + 1

df['log_profit'] = np.log(df['profit'] + offset)

# +1 keeps the log defined when the raw value is 0.
df['log_production_budget'] = np.log(df['production_budget'] + 1)

df['log_numVotes'] = np.log(df['numVotes'] + 1)
# drop null value
# A list for `subset` is accepted by every pandas version (a bare string only by newer ones).
df = df.dropna(subset=['search_index'])
# Renumber rows 0..n-1 so labels and positions coincide. Without drop=True the previous
# labels are kept in a new `index` column.
df = df.reset_index()
# same period movie metrics
def calculate_same_period_metrics_with_id(index, window=5, data=None):
    """Summarise the other movies released within ``window`` days of one movie.

    Parameters
    ----------
    index
        Row label of the movie of interest. Labels are assumed to be unique
        (true after ``reset_index()``, where labels equal positions).
    window : int, default 5
        Half-width of the release-date window in days; both ends are inclusive.
    data : pandas.DataFrame, optional
        Frame to read from. It must provide the columns ``release_date``,
        ``profit``, ``averageRating``, ``production_budget``, ``tconst`` and
        ``search_index``. When omitted, the module-level ``df`` is used.

    Returns
    -------
    pandas.Series
        Labelled ``same_period_indicator`` (1 if at least one other movie falls
        in the window, else 0), ``same_period_profit``, ``same_period_rating``,
        ``same_period_budget``, ``same_period_search_index`` (means over the
        other movies) and ``same_period_movie_id`` (their ``tconst`` values
        joined with commas). Without any other movie in the window the numeric
        entries are 0 and the id string is empty.
    """
    labels = ['same_period_indicator', 'same_period_profit', 'same_period_rating',
              'same_period_budget', 'same_period_movie_id', 'same_period_search_index']
    frame = df if data is None else data

    # Look the row up by label so that it matches the label-based exclusion below.
    current_release_date = frame.loc[index, 'release_date']
    start_date = current_release_date - pd.DateOffset(days=window)
    end_date = current_release_date + pd.DateOffset(days=window)
    in_window = (frame['release_date'] >= start_date) & (frame['release_date'] <= end_date)
    # Exclude the movie itself from its own comparison group.
    same_period = frame[in_window & (frame.index != index)]

    if same_period.empty:
        return pd.Series([0, 0, 0, 0, '', 0], index=labels)

    profit_avg = same_period['profit'].mean()
    avg_rating = same_period['averageRating'].mean()
    budget_avg = same_period['production_budget'].mean()
    # Skip missing ids and coerce to str so the join cannot fail on a null entry.
    same_period_movie_id = ','.join(same_period['tconst'].dropna().astype(str))
    avg_search_index = same_period['search_index'].mean()

    return pd.Series(
        [1, profit_avg, avg_rating, budget_avg, same_period_movie_id, avg_search_index],
        index=labels,
    )

# Evaluate the same-period metrics once per row label and attach the six resulting
# values to df under the column names listed below.
same_period_columns = [
    'same_period_indicator',
    'same_period_profit',
    'same_period_rating',
    'same_period_budget',
    'same_period_movie_id',
    'same_period_search_index',
]
same_period_metrics = df.index.to_series().apply(calculate_same_period_metrics_with_id)
df[same_period_columns] = same_period_metrics


In [11]:
import random

# Draw one value in [10, 40] per row. `random.randint` returns a single int, so assigning
# its result directly would broadcast the same constant to every row of the column.
# A dedicated, seeded generator keeps the column identical across kernel restarts
# without touching the global `random` state.
_comment_length_rng = random.Random(0)
df['comment_length'] = [_comment_length_rng.randint(10, 40) for _ in range(len(df))]

In [12]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Regression Model
# OLS of raw `profit` on the movie's own budget, the same-period aggregates and their
# pairwise products. In the formula language `a*b` is shorthand for `a + b + a:b`, so each
# product term also brings in the main effects of both factors.
# The backslash continuations sit inside the string literal, so the leading whitespace of
# the continued lines is part of the formula text.
model_formula = 'profit ~ production_budget+same_period_profit+same_period_rating+same_period_budget\
    +same_period_budget*same_period_profit\
        +same_period_budget*same_period_search_index\
            +same_period_budget*same_period_rating+\
                same_period_profit*same_period_search_index+\
                    same_period_profit*same_period_rating+\
                        same_period_search_index*same_period_rating'

# Fit on the module-level `df`; `model` is reused by the following cell.
model = ols(model_formula, data=df).fit()

# Conduct ANOVA analysis
# typ=2 requests a Type II ANOVA table; the result is stored but not displayed in this cell.
anova_results = sm.stats.anova_lm(model, typ=2)

In [13]:
# Show the summary of the most recently fitted `model` as the cell output.
model.summary()

0,1,2,3
Dep. Variable:,profit,R-squared:,0.303
Model:,OLS,Adj. R-squared:,0.3
Method:,Least Squares,F-statistic:,82.98
Date:,"Mon, 27 Nov 2023",Prob (F-statistic):,4.63e-114
Time:,13:56:08,Log-Likelihood:,-30552.0
No. Observations:,1533,AIC:,61120.0
Df Residuals:,1524,BIC:,61170.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-284.6804,110.996,-2.565,0.010,-502.402,-66.959
production_budget,1.8380,0.066,27.779,0.000,1.708,1.968
same_period_profit,0.2318,0.311,0.745,0.456,-0.378,0.842
same_period_rating,-2057.7324,802.306,-2.565,0.010,-3631.473,-483.992
same_period_budget,-1.2284,0.816,-1.505,0.133,-2.829,0.373
same_period_budget:same_period_profit,7.191e-10,7.59e-10,0.948,0.343,-7.69e-10,2.21e-09
same_period_search_index,-1.293e+04,5040.913,-2.565,0.010,-2.28e+04,-3040.938
same_period_budget:same_period_search_index,0.0151,0.008,1.860,0.063,-0.001,0.031
same_period_budget:same_period_rating,0.1975,0.127,1.551,0.121,-0.052,0.447

0,1,2,3
Omnibus:,928.648,Durbin-Watson:,1.758
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15052.271
Skew:,2.523,Prob(JB):,0.0
Kurtosis:,17.498,Cond. No.,3.17e+16


Log transformation

In [14]:
import os
import glob
import pandas as pd
import numpy as np

# Directory that holds the `results_*` CSV files.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
file_path = '/Users/ryan/Documents/GitHub/MGT4187-Project/searching_index/results_supplement'
# Change the working directory so the relative glob pattern below resolves inside file_path.
os.chdir(file_path)
# glob yields matches in filesystem-dependent order; sort so the row order of `df` is reproducible.
file_ls = sorted(glob.glob('results_*'))
if not file_ls:
    # Fail with an explicit message instead of pandas' generic "No objects to concatenate".
    raise FileNotFoundError(f"no files matching 'results_*' found in {file_path}")

# Stack all result files into one frame (row labels of the individual files are kept here
# and replaced further down by reset_index()).
df = pd.concat([pd.read_csv(file) for file in file_ls])

df['release_date'] = pd.to_datetime(df['release_date'])
df['profit'] = df['worldwide_gross'] - df['production_budget']

# taking the log
# Profit can be negative, so shift it by |min profit| + 1: profit + offset is then >= 1
# for every row and the log is always defined.
min_profit = df['profit'].min()
offset = abs(min_profit) + 1

df['log_profit'] = np.log(df['profit'] + offset)

# +1 keeps the log defined when the raw value is 0.
df['log_production_budget'] = np.log(df['production_budget'] + 1)

df['log_numVotes'] = np.log(df['numVotes'] + 1)
# drop null value
# A list for `subset` is accepted by every pandas version (a bare string only by newer ones).
df = df.dropna(subset=['search_index'])
# Renumber rows 0..n-1 so labels and positions coincide. Without drop=True the previous
# labels are kept in a new `index` column.
df = df.reset_index()
# same period movie metrics
def calculate_same_period_metrics_with_id(index, window=5, data=None, profit_offset=None):
    """Summarise, on a log scale, the other movies released near one movie.

    Profit and budget are averaged over the other movies first and the log is
    taken of the averages afterwards.

    Parameters
    ----------
    index
        Row label of the movie of interest. Labels are assumed to be unique
        (true after ``reset_index()``, where labels equal positions).
    window : int, default 5
        Half-width of the release-date window in days; both ends are inclusive.
    data : pandas.DataFrame, optional
        Frame to read from. It must provide the columns ``release_date``,
        ``profit``, ``averageRating``, ``production_budget``, ``tconst`` and
        ``search_index``. When omitted, the module-level ``df`` is used.
    profit_offset : float, optional
        Constant added to the mean profit before taking the log. When omitted,
        the module-level ``offset`` is used.

    Returns
    -------
    pandas.Series
        Labelled ``same_period_indicator`` (1 if at least one other movie falls
        in the window, else 0), ``same_period_profit`` (log of mean profit plus
        the offset), ``same_period_rating`` (mean rating),
        ``same_period_budget`` (log of mean budget plus 1),
        ``same_period_movie_id`` (``tconst`` values joined with commas) and
        ``same_period_search_index`` (mean search index). Without any other
        movie in the window the numeric entries are 0 and the id string is
        empty.
    """
    labels = ['same_period_indicator', 'same_period_profit', 'same_period_rating',
              'same_period_budget', 'same_period_movie_id', 'same_period_search_index']
    frame = df if data is None else data
    shift = offset if profit_offset is None else profit_offset

    # Look the row up by label so that it matches the label-based exclusion below.
    current_release_date = frame.loc[index, 'release_date']
    start_date = current_release_date - pd.DateOffset(days=window)
    end_date = current_release_date + pd.DateOffset(days=window)
    in_window = (frame['release_date'] >= start_date) & (frame['release_date'] <= end_date)
    # Exclude the movie itself from its own comparison group.
    same_period = frame[in_window & (frame.index != index)]

    if same_period.empty:
        # NOTE(review): these zeros share columns with log-scale values; confirm that 0 is
        # the intended placeholder for "no same-period movie".
        return pd.Series([0, 0, 0, 0, '', 0], index=labels)

    ## make some modification here (average first then log transformation)
    profit_avg = same_period['profit'].mean()
    log_profit = np.log(shift + profit_avg)
    avg_rating = same_period['averageRating'].mean()
    budget_avg = same_period['production_budget'].mean()
    log_budget = np.log(budget_avg + 1)
    # Skip missing ids and coerce to str so the join cannot fail on a null entry.
    same_period_movie_id = ','.join(same_period['tconst'].dropna().astype(str))
    avg_search_index = same_period['search_index'].mean()

    return pd.Series(
        [1, log_profit, avg_rating, log_budget, same_period_movie_id, avg_search_index],
        index=labels,
    )

# Evaluate the same-period metrics once per row label and attach the six resulting
# values to df under the column names listed below.
same_period_columns = [
    'same_period_indicator',
    'same_period_profit',
    'same_period_rating',
    'same_period_budget',
    'same_period_movie_id',
    'same_period_search_index',
]
same_period_metrics = df.index.to_series().apply(calculate_same_period_metrics_with_id)
df[same_period_columns] = same_period_metrics


In [15]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Regression Model
# OLS of `log_profit` on the movie's own log budget, the same-period aggregates and their
# pairwise products. In the formula language `a*b` is shorthand for `a + b + a:b`, so each
# product term also brings in the main effects of both factors.
# The backslash continuations sit inside the string literal, so the leading whitespace of
# the continued lines is part of the formula text.
model_formula = 'log_profit ~ log_production_budget+same_period_profit+same_period_rating+same_period_budget\
    +same_period_budget*same_period_profit\
        +same_period_budget*same_period_search_index\
            +same_period_budget*same_period_rating+\
                same_period_profit*same_period_search_index+\
                    same_period_profit*same_period_rating+\
                        same_period_search_index*same_period_rating'

# Fit on the module-level `df`; `model` is reused by the following cell.
model = ols(model_formula, data=df).fit()

# Conduct ANOVA analysis
# typ=2 requests a Type II ANOVA table; the result is stored but not displayed in this cell.
anova_results = sm.stats.anova_lm(model, typ=2)

In [16]:
# Show the summary of the most recently fitted `model` as the cell output.
model.summary()

0,1,2,3
Dep. Variable:,log_profit,R-squared:,0.186
Model:,OLS,Adj. R-squared:,0.18
Method:,Least Squares,F-statistic:,31.62
Date:,"Mon, 27 Nov 2023",Prob (F-statistic):,9.13e-61
Time:,13:56:12,Log-Likelihood:,-617.61
No. Observations:,1533,AIC:,1259.0
Df Residuals:,1521,BIC:,1323.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,16.9741,0.123,138.502,0.000,16.734,17.214
log_production_budget,0.1297,0.007,18.317,0.000,0.116,0.144
same_period_profit,0.0509,0.091,0.561,0.575,-0.127,0.229
same_period_rating,0.5986,0.767,0.780,0.436,-0.907,2.104
same_period_budget,-0.3542,0.305,-1.160,0.246,-0.953,0.245
same_period_budget:same_period_profit,0.0153,0.015,1.029,0.304,-0.014,0.044
same_period_search_index,0.0301,0.042,0.717,0.474,-0.052,0.113
same_period_budget:same_period_search_index,0.0012,0.001,1.556,0.120,-0.000,0.003
same_period_budget:same_period_rating,0.0100,0.016,0.618,0.537,-0.022,0.042

0,1,2,3
Omnibus:,234.074,Durbin-Watson:,1.781
Prob(Omnibus):,0.0,Jarque-Bera (JB):,506.921
Skew:,0.883,Prob(JB):,8.38e-111
Kurtosis:,5.195,Cond. No.,58100.0
