# **Regression Analysis**
## **Google Trends vs Song Lyrics**

Note:
- Model 01: Using ConceptNet Approach
- Model 02: Using LDA Approach

In [3]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from linearmodels.panel import PanelOLS, RandomEffects

In [4]:
def _escape_latex(text):
    """Return ``text`` with LaTeX text-mode special characters escaped."""
    replacements = {
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    }
    # Map character by character so that the backslashes and braces inserted
    # by one replacement are never escaped again by another.
    return "".join(replacements.get(char, char) for char in str(text))


def results_latex(model, model_name):
    """Render a fitted model's coefficient table as a LaTeX ``table`` environment.

    Parameters
    ----------
    model : object
        Fitted results object exposing ``params``, ``std_errors`` and
        ``pvalues``; each must be indexable by the labels in ``params.index``.
    model_name : str
        Text appended to the caption ("Regression results for ..."). It is
        inserted verbatim, so it may itself contain LaTeX markup.

    Returns
    -------
    str
        LaTeX source with one row per variable: name, coefficient, standard
        error and p-value, each number formatted to four decimal places.

    Notes
    -----
    Variable names are escaped for LaTeX text mode. Without this, a name such
    as ``count_songs_released`` produces a table that fails to compile because
    ``_`` is only valid in math mode.
    """
    # Extract coefficients, standard errors, and p-values
    coefficients = model.params
    standard_errors = model.std_errors
    p_values = model.pvalues

    # Table preamble and header row
    latex = "\\begin{table}[htbp]\n\\centering\n"
    latex += f"\\caption{{Regression results for {model_name}}}\n"
    latex += "\\begin{tabular}{lccc}\n\\hline\n"
    latex += "Variable & Coefficient & Standard Error & P-value \\\\ \n\\hline\n"

    # One row per variable, looked up by label so the three series stay aligned
    for var in coefficients.index:
        latex += (
            f"{_escape_latex(var)} & {coefficients[var]:.4f} & "
            f"{standard_errors[var]:.4f} & {p_values[var]:.4f} \\\\ \n"
        )

    latex += "\\hline\n\\end{tabular}\n\\end{table}\n"

    return latex

## **Loading Data**

In [5]:
# Input folders are resolved relative to the current working directory.
working_dir = os.getcwd()
path_cne = os.path.join(working_dir, 'ConceptNet')
path_lda = os.path.join(working_dir, 'LDA_Priors')
path_tre = os.path.join(working_dir, 'google_trends')

# ConceptNet regression inputs
cne_file = os.path.join(path_cne, 'cnet_regression_inputs.csv')
cne = pd.read_csv(cne_file)

# LDA relative-relevance results
lda_file = os.path.join(path_lda, 'result_relative_relevance_LDA_priors.csv')
lda = pd.read_csv(lda_file)

# Google Trends timeline; the first two rows of the file are skipped on read
tre_file = os.path.join(path_tre, 'multiTimeline.csv')
tre = pd.read_csv(tre_file, skiprows=2)

In [156]:
# Display the first rows of the `lda` frame as a quick sanity check of its columns.
lda.head()

Unnamed: 0,release_year_month,Health,Justice,Money,Police,Politics,topic_share_5,topic_share_6,topic_share_7,topic_share_8,topic_share_9
0,2004-01-01,0.025013,0.001806,0.351627,0.000858,0.005358,0.02606,0.220086,0.34178,0.02694,0.000473
1,2004-02-01,0.007513,0.03194,0.520524,0.00983,0.013094,0.062232,0.150252,0.102857,0.091748,0.010012
2,2004-03-01,0.075941,0.013204,0.528969,0.000425,0.032071,0.019245,0.188363,0.025692,0.111238,0.004851
3,2004-04-01,0.026549,0.000471,0.434405,0.000921,0.032372,0.045719,0.178884,0.217956,0.060088,0.002635
4,2004-05-01,0.030128,0.001803,0.391479,0.000655,0.000622,0.019045,0.389498,0.098304,0.063384,0.005082


In [157]:
# Display the first rows of the `cne` frame as a quick sanity check of its columns.
cne.head()

Unnamed: 0,release_year,release_month,release_year_month,relative_relevance_health,relative_relevance_justice,relative_relevance_money,relative_relevance_police,relative_relevance_politics,count_songs_released
0,2003,2,2003-02,0.040572,0.023197,0.0,0.004915,0.0,2
1,2003,3,2003-03,0.0,0.043855,0.0,0.040851,0.0,1
2,2003,6,2003-06,0.0,0.0,0.008137,0.0,0.0,1
3,2003,7,2003-07,0.009288,0.0,0.006643,0.0,0.0,3
4,2003,8,2003-08,0.015498,0.013493,0.033819,0.0,0.0,6


In [158]:
# Display the first rows of the `tre` frame as a quick sanity check of its columns.
tre.head()

Unnamed: 0,Month,health: (United States),money: (United States),justice: (United States),police: (United States),politics: (United States)
0,2004-01,82,30,10,31,4
1,2004-02,80,29,11,28,5
2,2004-03,79,30,10,30,5
3,2004-04,79,30,11,29,5
4,2004-05,74,28,9,29,4


## **Cleaning Data**

In [6]:
# Reshape every source from wide (one column per topic) to long
# (one row per month and topic) and parse the month column as datetime.

# 1. Google
trend_topic_names = {
    'health: (United States)': 'Health',
    'money: (United States)': 'Money',
    'justice: (United States)': 'Justice',
    'police: (United States)': 'Police',
    'politics: (United States)': 'Politics',
}
df_tre = (
    tre.rename(columns=trend_topic_names)
    .melt(id_vars=["Month"], var_name="Topic", value_name="GoogleTrend")
    .assign(Month=lambda frame: pd.to_datetime(frame["Month"]))
)


# 2. ConceptNet
relevance_columns = [
    'relative_relevance_health',
    'relative_relevance_justice',
    'relative_relevance_money',
    'relative_relevance_police',
    'relative_relevance_politics',
]
df_cen = (
    cne[['release_year_month', *relevance_columns, 'count_songs_released']]
    .melt(
        id_vars=["release_year_month", "count_songs_released"],
        value_vars=relevance_columns,
        var_name="Topic",
        value_name="ScoreConceptNet",
    )
    .assign(
        # Strip the column prefix so topic labels match the Google Trends ones
        Topic=lambda frame: frame["Topic"].str.replace("relative_relevance_", "").str.capitalize(),
        release_year_month=lambda frame: pd.to_datetime(frame["release_year_month"]),
    )
)


# 3. LDA
lda_columns = ['release_year_month', 'Health', 'Justice', 'Money', 'Police', 'Politics']
df_lda = (
    lda[lda_columns]
    .melt(id_vars=["release_year_month"], var_name="Topic", value_name="ScoreLDA")
    .assign(release_year_month=lambda frame: pd.to_datetime(frame["release_year_month"]))
)

In [173]:
# Display the first rows of the long-format `df_tre` frame.
df_tre.head()

Unnamed: 0,Month,Topic,GoogleTrend
0,2004-01-01,Health,82
1,2004-02-01,Health,80
2,2004-03-01,Health,79
3,2004-04-01,Health,79
4,2004-05-01,Health,74


In [174]:
# Display the first rows of the long-format `df_lda` frame.
df_lda.head()

Unnamed: 0,release_year_month,Topic,ScoreLDA
0,2004-01-01,Health,0.025013
1,2004-02-01,Health,0.007513
2,2004-03-01,Health,0.075941
3,2004-04-01,Health,0.026549
4,2004-05-01,Health,0.030128


In [175]:
# Display the first rows of the long-format `df_cen` frame.
df_cen.head()

Unnamed: 0,release_year_month,count_songs_released,Topic,ScoreConceptNet
0,2003-02-01,2,Health,0.040572
1,2003-03-01,1,Health,0.0
2,2003-06-01,1,Health,0.0
3,2003-07-01,3,Health,0.009288
4,2003-08-01,6,Health,0.015498


## **Modelling**

### **Google vs ConceptNet**

In [7]:
# Keep only month/topic pairs present in both sources, then index the panel
# by entity (Topic) and time (Month).
cen_final = (
    df_cen.merge(
        df_tre,
        how="inner",
        left_on=["release_year_month", "Topic"],
        right_on=["Month", "Topic"],
    )
    .set_index(["Topic", "Month"])
)

In [8]:
# Dependent variable and regressors (with an added constant column)
y_cen = cen_final["GoogleTrend"]
X_cen = sm.add_constant(cen_final[["ScoreConceptNet", "count_songs_released"]])

# ConceptNet: fixed effects specification (entity effects enabled)
fe_spec_cen = PanelOLS(y_cen, X_cen, entity_effects=True)
FE_ConceptNet = fe_spec_cen.fit()
print(FE_ConceptNet)

# ConceptNet: random effects specification
re_spec_cen = RandomEffects(y_cen, X_cen)
RF_ConceptNet = re_spec_cen.fit()
print(RF_ConceptNet)

                          PanelOLS Estimation Summary                           
Dep. Variable:            GoogleTrend   R-squared:                        0.0018
Estimator:                   PanelOLS   R-squared (Between):              0.0035
No. Observations:                1240   R-squared (Within):               0.0018
Date:                Wed, Mar 26 2025   R-squared (Overall):              0.0034
Time:                        12:50:22   Log-likelihood                   -3960.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      1.1309
Entities:                           5   P-value                           0.3231
Avg Obs:                       248.00   Distribution:                  F(2,1233)
Min Obs:                       248.00                                           
Max Obs:                       248.00   F-statistic (robust):             1.1309
                            

In [9]:
# ConceptNet model results
latex_ConceptNet_FE = results_latex(FE_ConceptNet, "Fixed Effects Model ConceptNet")
latex_ConceptNet_RE = results_latex(RF_ConceptNet, "Random Effects Model ConceptNet")

# Save the results to .tex files
with open("results_ConceptNet_FE.tex", "w") as f:
    f.write(latex_ConceptNet_FE)

with open("results_ConceptNet_RE.tex", "w") as f:
    f.write(latex_ConceptNet_RE)

### **Google vs LDA**

In [11]:
# Keep only month/topic pairs present in both sources, then index the panel
# by entity (Topic) and time (Month).
lda_final = (
    df_lda.merge(
        df_tre,
        how="inner",
        left_on=["release_year_month", "Topic"],
        right_on=["Month", "Topic"],
    )
    .set_index(["Topic", "Month"])
)

In [15]:
# Dependent variable and single regressor (with an added constant column)
y_lda = lda_final["GoogleTrend"]
X_lda = sm.add_constant(lda_final["ScoreLDA"])

# LDA: fixed effects specification (entity effects enabled)
fe_spec_lda = PanelOLS(y_lda, X_lda, entity_effects=True)
FE_LDA = fe_spec_lda.fit()
print(FE_LDA)

# LDA: random effects specification
re_spec_lda = RandomEffects(y_lda, X_lda)
RF_LDA = re_spec_lda.fit()
print(RF_LDA)

                          PanelOLS Estimation Summary                           
Dep. Variable:            GoogleTrend   R-squared:                        0.0139
Estimator:                   PanelOLS   R-squared (Between):             -0.0225
No. Observations:                1240   R-squared (Within):               0.0139
Date:                Wed, Mar 26 2025   R-squared (Overall):             -0.0205
Time:                        12:52:15   Log-likelihood                   -3952.9
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      17.413
Entities:                           5   P-value                           0.0000
Avg Obs:                       248.00   Distribution:                  F(1,1234)
Min Obs:                       248.00                                           
Max Obs:                       248.00   F-statistic (robust):             17.413
                            

In [16]:
# LDA model results
latex_LDA_FE = results_latex(FE_LDA, "Fixed Effects Model LDA")
latex_LDA_RE = results_latex(RF_LDA, "Random Effects Model LDA")

# Save the results to .tex files
with open("results_LDA_FE.tex", "w") as f:
    f.write(latex_LDA_FE)

with open("results_LDA_RE.tex", "w") as f:
    f.write(latex_LDA_RE)