In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [2]:
# Load the combined dataset and use its 'dates' column as the row index.
df = pd.read_csv('Combined_Data.csv').set_index('dates')

In [3]:
# Show the column labels of the loaded frame; the later cells select columns by these names.
df.columns

Index(['Filenames', 'tfidf', 'hawkish', 'dovish', 'Neutral', 'Negative',
       'Positive', 'GT10', 'GT2', '2s10s_Spread', 'Gold_Prices', 'VIX',
       'SP500', 'quintile'],
      dtype='object')

In [4]:
# Linear regression method

def get_regression_values(X, y):
    """Fit a linear regression of ``y`` on ``X`` and summarise the fit.

    Parameters
    ----------
    X : array-like
        Predictor data, passed unchanged to ``LinearRegression.fit``.
    y : array-like
        Target data, passed unchanged to ``LinearRegression.fit``.

    Returns
    -------
    dict
        ``"r_squared"``    -- ``model.score(X, y)`` evaluated on the same data
                              the model was fitted on (in-sample, no hold-out).
        ``"coefficients"`` -- the fitted ``model.coef_``.
        ``"intercept"``    -- the fitted ``model.intercept_``.
    """
    # LinearRegression comes from the notebook's import cell; re-importing it
    # on every call was redundant.
    model = LinearRegression().fit(X, y)
    return {
        "r_squared": model.score(X, y),
        "coefficients": model.coef_,
        "intercept": model.intercept_,
    }


In [5]:
# Each sentiment score is regressed on its own so the results are easy to compare;
# the per-score tables are merged at the end of the notebook.
# Model: y = a*x + b, where b is the intercept and x is the 'hawkish' score.

columns = ['hawkish']
index = ['GT10', 'GT2', '2s10s_Spread', 'Gold_Prices', 'VIX', 'SP500']

# The filter and the predictor do not depend on the target, so build them once
# instead of once per loop iteration.
# NOTE(review): abs() is applied to the score, but the threshold is the 95th
# percentile of the signed score. If 'hawkish' can be negative these two are
# not on the same scale -- confirm this is intended.
filtered_df = df[abs(df['hawkish']) >= df['hawkish'].quantile(0.95)]
X = filtered_df[['hawkish']]  # double brackets keep X two-dimensional

# dtype=float keeps the R-squared values numeric; without it the empty frame
# is created with object dtype.
r2_df = pd.DataFrame(index=index, columns=columns, dtype=float)

for target_var in index:
    y = filtered_df[[target_var]]
    result = get_regression_values(X, y)
    r2_df.loc[target_var, 'hawkish'] = result['r_squared']

# Print the R-squared table, scaled by 100 to read as percentages
print('R-Squared Dataframe for Hawkish')
print(r2_df*100)

R-Squared Dataframe for Hawkish
               hawkish
GT10          0.475869
GT2           0.065445
2s10s_Spread  3.255899
Gold_Prices   0.082743
VIX           1.123259
SP500         0.038921


In [6]:
# Each sentiment score is regressed on its own so the results are easy to compare;
# the per-score tables are merged at the end of the notebook.
# Model: y = a*x + b, where b is the intercept and x is the 'dovish' score.

columns = ['dovish']
index = ['GT10', 'GT2', '2s10s_Spread', 'Gold_Prices', 'VIX', 'SP500']

# The filter and the predictor do not depend on the target, so build them once
# instead of once per loop iteration.
# NOTE(review): abs() is applied to the score, but the threshold is the 95th
# percentile of the signed score. If 'dovish' can be negative these two are
# not on the same scale -- confirm this is intended.
filtered_df = df[abs(df['dovish']) >= df['dovish'].quantile(0.95)]
X = filtered_df[['dovish']]  # double brackets keep X two-dimensional

# dtype=float keeps the R-squared values numeric; without it the empty frame
# is created with object dtype.
r2_df_h = pd.DataFrame(index=index, columns=columns, dtype=float)

# Fit one regression per target variable ('GT10', 'GT2', ...)
for target_var in index:
    y = filtered_df[[target_var]]  # dependent variable
    result = get_regression_values(X, y)
    r2_df_h.loc[target_var, 'dovish'] = result['r_squared']

# Print the R-squared table for dovish, scaled by 100 to read as percentages
print('R-Squared Dataframe for Dovish')
print(r2_df_h * 100)

R-Squared Dataframe for Dovish
                dovish
GT10          1.975653
GT2           3.897818
2s10s_Spread  1.130968
Gold_Prices   0.002872
VIX           0.012606
SP500         0.165994


In [7]:
# Each sentiment score is regressed on its own so the results are easy to compare;
# the per-score tables are merged at the end of the notebook.
# Model: y = a*x + b, where b is the intercept and x is the 'tfidf' score.

columns = ['tfidf']
index = ['GT10', 'GT2', '2s10s_Spread', 'Gold_Prices', 'VIX', 'SP500']

# The filter and the predictor do not depend on the target, so build them once
# instead of once per loop iteration.
# NOTE(review): abs() is applied to the score, but the threshold is the 95th
# percentile of the signed score. If 'tfidf' can be negative these two are
# not on the same scale -- confirm this is intended.
filtered_df = df[abs(df['tfidf']) >= df['tfidf'].quantile(0.95)]
X = filtered_df[['tfidf']]  # double brackets keep X two-dimensional

# dtype=float keeps the R-squared values numeric; without it the empty frame
# is created with object dtype.
r2_df_tf = pd.DataFrame(index=index, columns=columns, dtype=float)

# Fit one regression per target variable ('GT10', 'GT2', ...)
for target_var in index:
    y = filtered_df[[target_var]]  # dependent variable
    result = get_regression_values(X, y)
    r2_df_tf.loc[target_var, 'tfidf'] = result['r_squared']

# Print the R-squared table for TF-IDF, scaled by 100 to read as percentages
print('R-Squared Dataframe for TF_IDF')
print(r2_df_tf * 100)

R-Squared Dataframe for TF_IDF
                 tfidf
GT10          0.334636
GT2           0.060404
2s10s_Spread  0.146273
Gold_Prices   1.771619
VIX           1.075799
SP500         0.996451


In [8]:
# Each sentiment score is regressed on its own so the results are easy to compare;
# the per-score tables are merged at the end of the notebook.
# Model: y = a*x + b, where b is the intercept and x is the 'Positive' (BERT) score.

columns = ['Positive']
index = ['GT10', 'GT2', '2s10s_Spread', 'Gold_Prices', 'VIX', 'SP500']

# The filter and the predictor do not depend on the target, so build them once
# instead of once per loop iteration.
# NOTE(review): abs() is applied to the score, but the threshold is the 95th
# percentile of the signed score. If 'Positive' can be negative these two are
# not on the same scale -- confirm this is intended.
filtered_df = df[abs(df['Positive']) >= df['Positive'].quantile(0.95)]
X = filtered_df[['Positive']]  # double brackets keep X two-dimensional

# dtype=float keeps the R-squared values numeric; without it the empty frame
# is created with object dtype.
r2_df_pos = pd.DataFrame(index=index, columns=columns, dtype=float)

# Fit one regression per target variable ('GT10', 'GT2', ...)
for target_var in index:
    y = filtered_df[[target_var]]  # dependent variable
    result = get_regression_values(X, y)
    r2_df_pos.loc[target_var, 'Positive'] = result['r_squared']

# Print the R-squared table for Positive (BERT), scaled by 100 to read as percentages
print('R-Squared Dataframe for Positive - BERT')
print(r2_df_pos * 100)

R-Squared Dataframe for Positive - BERT
              Positive
GT10          0.001123
GT2            0.72549
2s10s_Spread  4.753694
Gold_Prices   0.134738
VIX           2.566851
SP500         2.345252


In [9]:
# Each sentiment score is regressed on its own so the results are easy to compare;
# the per-score tables are merged at the end of the notebook.
# Model: y = a*x + b, where b is the intercept and x is the 'Negative' (BERT) score.

columns = ['Negative']
index = ['GT10', 'GT2', '2s10s_Spread', 'Gold_Prices', 'VIX', 'SP500']

# The filter and the predictor do not depend on the target, so build them once
# instead of once per loop iteration.
# NOTE(review): abs() is applied to the score, but the threshold is the 95th
# percentile of the signed score. If 'Negative' can be negative these two are
# not on the same scale -- confirm this is intended.
filtered_df = df[abs(df['Negative']) >= df['Negative'].quantile(0.95)]
X = filtered_df[['Negative']]  # double brackets keep X two-dimensional

# dtype=float keeps the R-squared values numeric; without it the empty frame
# is created with object dtype.
r2_df_neg = pd.DataFrame(index=index, columns=columns, dtype=float)

# Fit one regression per target variable ('GT10', 'GT2', ...)
for target_var in index:
    y = filtered_df[[target_var]]  # dependent variable
    result = get_regression_values(X, y)
    r2_df_neg.loc[target_var, 'Negative'] = result['r_squared']

# Print the R-squared table for Negative (BERT), scaled by 100 to read as percentages
print('R-Squared Dataframe for Negative - BERT')
print(r2_df_neg * 100)

R-Squared Dataframe for Negative - BERT
               Negative
GT10           0.432224
GT2            0.009653
2s10s_Spread   0.001897
Gold_Prices    1.918018
VIX            17.41261
SP500         17.532986


In [10]:
# Collect the five single-column R-squared tables and place them side by side;
# they share the same target-variable index, so each becomes one column.
dfs = [
    r2_df,
    r2_df_h,
    r2_df_tf,
    r2_df_pos,
    r2_df_neg,
]
merged_df = pd.concat(dfs, axis="columns")

In [11]:
# The values are multiplied by 100 so they read as percentages, which makes
# the differences easier to spot.
# All per-score tables are merged here for a side-by-side comparison.
# Every regression uses only observations at or above the 95th percentile of
# its score, because the aim is to compare extremely hawkish and dovish data.
merged_df.mul(100)

Unnamed: 0,hawkish,dovish,tfidf,Positive,Negative
GT10,0.475869,1.975653,0.334636,0.001123,0.432224
GT2,0.065445,3.897818,0.060404,0.72549,0.009653
2s10s_Spread,3.255899,1.130968,0.146273,4.753694,0.001897
Gold_Prices,0.082743,0.002872,1.771619,0.134738,1.918018
VIX,1.123259,0.012606,1.075799,2.566851,17.41261
SP500,0.038921,0.165994,0.996451,2.345252,17.532986
