## TOC:
* [Setup](#bullet1)
* [STATA code](#bullet2)
    - [Robustness checks](#sub-bullet2.1)


## Setup <a class="anchor" id="bullet1"></a>

In [1]:
# Imports
import pandas as pd
import os

In [2]:
# Setup paths when working from either desktop or laptop
device = "desktop"
if device == "desktop":
    working_dir = r"E:\Users\Christiaan\Large_Files"
elif device == "laptop":
    working_dir = r"C:\Users\Ck0rt\Documents\Large files\School\MSc Finance & Investments"
    
def join_paths(working_dir, *args):
    path = os.path.join(working_dir, *args).replace('\\', '/')
    return path

**Stata functions**

In [3]:
class Tools():
    @staticmethod
    def varlist_to_string(var_list):
        stata_string = ' '.join([str(var) for var in var_list])
        return stata_string

    @staticmethod
    def list_check(list_or_str):
        return_value = list_or_str
        if type(list_or_str).__name__ == 'str':
            return_value = [list_or_str]
            print("Converted var to list")
        return return_value

class Stata():
    # Converts a list of variables to a Stata readable string command
    @staticmethod
    def regress_code(y_vars, x_vars, function="regress", extras=""):
        stata_code = ""
        y_vars = Tools.list_check(y_vars)
        x_vars = Tools.list_check(x_vars)

        x_vars = Tools.varlist_to_string(x_vars)
        for y in y_vars:

            stata_code = stata_code + f"{function} {y} {x_vars}{extras}\n"

        return stata_code

    @staticmethod
    def grstest2_code(y_vars, x_vars, function="grstest2", extras=""):
        stata_code = ""
        y_vars = Tools.list_check(y_vars)
        x_vars = Tools.list_check(x_vars)

        x_vars = Tools.varlist_to_string(x_vars)
        for y in y_vars:
            stata_code = stata_code + f"{function} {y}, flist({x_vars}) {extras}\n"

        return stata_code

    @staticmethod
    def ttest_code(vars, test_value):
        stata_code = ""
        vars = Tools.list_check(vars)

        for var in vars:
            stata_code = stata_code + f"ttest {var} == {test_value}\n"

        return stata_code

    @staticmethod
    def run(dataframe, functions):
#             print(f"\nSTATA CODE\n"
#                   f"||||||||||||||||||||||||||||||||||||||||||\n"
#                   f"{functions}\n"
#                   f"||||||||||||||||||||||||||||||||||||||||||\n")
            stata.pdataframe_to_data(dataframe, True)
            stata.run(f'''{functions}''', echo=True)

    @staticmethod
    def retrieve_dataset():
        current_dataset = stata.pdataframe_from_data()
        return current_dataset


**Function to loop result files**

Function is also used to lag variables.

In [4]:
def select_tickers(selected_tickers):
    # Check for which tickers function needs to be performed. Loops all if no argument is given for 'ticker_list'
    if selected_tickers is None:
        ticker_list = ['AAPL', 'AMD', 'AMZN', 'ATVI', 'BA', 'BABA', 'BAC', 'DIS', 'F', 
               'GE', 'GME', 'IQ', 'LULU', 'MSFT', 'MU', 'NFLX', 'NVDA', 'SBUX', 
               'SHOP', 'SNAP', 'SQ', 'TLRY', 'TSLA', 'V', 'WMT']
        return ticker_list
    
    else:
        return selected_tickers
    
def csv_to_df(ticker, file_dir=join_paths(working_dir, "Thesis\DATA_MERGED")):    
    # file path
    csv_path = os.path.join(file_dir, f"{ticker}.csv").replace('\\', '/')
    
    # Read csv
    df = pd.read_csv(csv_path)
    
    return df

def lag_cols(df, lagged_cols, lag, interact_dict):
    # Check if interaction term needs to be added to dict
    cols = lagged_cols.copy()
    if interact_dict and interact_dict['lagged']:
        cols.append(interact_dict['var_name'])
    # Lag columns
    if lagged_cols is not None:
        print(cols)
        # Lag columns
        df[cols] = df[cols].shift(lag)
        return df
    else:
        print("No columns lagged")
        return df


def retrieve_results(stata_code, selected_tickers=None, lag=0, lagged_cols=None, interact_dict=False):
    # Loop selected tickers
    for ticker in select_tickers(selected_tickers):
        # read csv
        df = csv_to_df(ticker)
        
        # Create interaction term
        if interact_dict:
            df[interact_dict['var_name']] = df[interact_dict['interact_1']] * df[interact_dict['interact_2']]
        
        # lag columns
        df = lag_cols(df, lagged_cols, lag, interact_dict)
        
        # run(df, stata_code)
        Stata.run(df, stata_code)
    return df


In [100]:
stata_code = """regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap"""
selected_tickers=["AAPL", "TSLA"]
lag = 2
lagged_cols=['[twit][f2s2]method_1', '[twit][f2s2]total']
interact_dict = {"var_name": "[f2s2]interact_sent_vol",
                 "interact_1": "[twit][f2s2]method_1",
                 "interact_2": "[twit][f2s2]total", "lagged": True}


df = retrieve_results(stata_code, selected_tickers=selected_tickers, lag=lag, lagged_cols=lagged_cols, interact_dict=interact_dict)
df[['date', '[rh]performance_1', '[twit][f2s2]method_1', '[twit][f2s2]total', '[twit][f2s2]rel_vol', '[f2s2]interact_sent_vol', '[crsp]mrktcap']].head(20)

['[twit][f2s2]method_1', '[twit][f2s2]total', '[f2s2]interact_sent_vol']
. regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol 
> crspmrktcap

      Source |       SS           df       MS      Number of obs   =       561
-------------+----------------------------------   F(4, 556)       =     34.17
       Model |  .004987974         4  .001246994   Prob > F        =    0.0000
    Residual |  .020291878       556  .000036496   R-squared       =    0.1973
-------------+----------------------------------   Adj R-squared   =    0.1915
       Total |  .025279853       560  .000045143   Root MSE        =    .00604

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |   .0054165   .0034884     1.55   0.121    -.0014356    .0122687
twitf2s2to~l |   .0000104   6.30e-06     

Unnamed: 0,date,[rh]performance_1,[twit][f2s2]method_1,[twit][f2s2]total,[twit][f2s2]rel_vol,[f2s2]interact_sent_vol,[crsp]mrktcap
0,2018-05-01,,,,1.463636,,
1,2018-05-02,,,,3.522158,,
2,2018-05-03,,0.671739,460.0,2.835428,309.0,
3,2018-05-04,,0.603422,1987.0,1.185343,1199.0,
4,2018-05-05,,0.601878,2449.0,0.474745,1474.0,
5,2018-05-06,,0.602775,1153.0,0.521951,695.0,
6,2018-05-07,-0.024837,0.573222,478.0,0.920623,274.0,51408529.38
7,2018-05-08,-0.006422,0.563406,552.0,0.570166,311.0,51272694.18
8,2018-05-09,-0.008463,0.623134,1072.0,0.593866,668.0,52101288.9
9,2018-05-10,-0.004242,0.623167,682.0,0.756794,425.0,51790565.88


## STATA code <a class="anchor" id="bullet2"></a>

### Assumptions
- When lagging sentiment data, days in weekends are considered just as valuable as week days. I.e. when lagging by 1, a monday will be predicted using the sentiment of Sunday (and not the whole weekend).

**Initializing STATA**

In [5]:
import os
os.chdir("C:/Program Files/Stata17/utilities")
from pystata import config
config.init("mp")

from pystata import stata



  ___  ____  ____  ____  ____ ©
 /__    /   ____/   /   ____/      17.0
___/   /   /___/   /   /___/       MP—Parallel Edition

 Statistics and Data Science       Copyright 1985-2021 StataCorp LLC
                                   StataCorp
                                   4905 Lakeway Drive
                                   College Station, Texas 77845 USA
                                   800-STATA-PC        https://www.stata.com
                                   979-696-4600        stata@stata.com

Stata license: Unlimited-user 2-core network, expiring 25 May 2023
Serial number: 501709318376
  Licensed to: Christiaan
               Erasmus University Rotterdam

Notes:
      1. Unicode is supported; see help unicode_advice.
      2. More than 2 billion observations are allowed; see help obs_advice.
      3. Maximum number of variables is set to 5,000; see help set_maxvar.


In [6]:
%%stata
di "Hello, World!"

Hello, World!


### Robustness checks <a class="anchor" id="sub-bullet2.1"></a>
**Fixed effects**

I check if time and firm fixed effects are needed using:

`xtset company_num date_numeric`

`testparm i.date_numeric`

`testparm i.company_num`

I find that this indeed the case, as `Prob > F = 0.0000` is the case for both firm as time fixed effects. To account for this I use the fixed effects in my model with: `i.company_num` `i.date_numeric`.

**Heteroskedasticity**

Using the following code:

`reg rhperformance_1 redm2f2BERTmethod_1 redm2f2BERTtotal m2f2BERTinteract_sent_vol crspmrktcap_control taqprice_control i.company_num i.date_numeric`
`estat hettest`

I find that heteroskedasticity is assumed for the regression with `Prob > chi2 = 0.0000`. To adjust for this I use `vce(robust)`.

**Autocorrelation**

The following code is used to test for autocorrelation. 

`xtserial rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap_control taqprice_control`

Auto correlation is found for all four regressions. With all p-values lower than `Prob > F = 0.0072`

To control for this


In [47]:

stata_code = """gen date_numeric = date(date, "YMD")
format date_numeric %td
encode rhticker, gen(company_num)

xtset company_num date_numeric

testparm i.date_numeric
testparm i.company_num

"""


# run(df, stata_code)
df_input = df.dropna().copy()
Stata.run(df_input, stata_code)


. gen date_numeric = date(date, "YMD")

. format date_numeric %td

. encode rhticker, gen(company_num)

. 
. xtset company_num date_numeric

Panel variable: company_num (unbalanced)
 Time variable: date_numeric, 07may2018 to 13aug2020, but with gaps
         Delta: 1 day

. 
. testparm i.date_numeric

 ( 1)  21312.date_numeric = 0
 ( 2)  21313.date_numeric = 0
 ( 3)  21314.date_numeric = 0
 ( 4)  21315.date_numeric = 0
 ( 5)  21318.date_numeric = 0
 ( 6)  21319.date_numeric = 0
 ( 7)  21320.date_numeric = 0
 ( 8)  21321.date_numeric = 0
 ( 9)  21322.date_numeric = 0
 (10)  21325.date_numeric = 0
 (11)  21326.date_numeric = 0
 (12)  21327.date_numeric = 0
 (13)  21328.date_numeric = 0
 (14)  21329.date_numeric = 0
 (15)  21333.date_numeric = 0
 (16)  21334.date_numeric = 0
 (17)  21335.date_numeric = 0
 (18)  21336.date_numeric = 0
 (19)  21339.date_numeric = 0
 (20)  21340.date_numeric = 0
 (21)  21341.date_numeric = 0
 (22)  21342.date_numeric = 0
 (23)  21343.date_numeric = 0
 (24) 


 ( 1)  2.company_num = 0
 ( 2)  3.company_num = 0
 ( 3)  4.company_num = 0
 ( 4)  5.company_num = 0
 ( 5)  6.company_num = 0
 ( 6)  7.company_num = 0
 ( 7)  8.company_num = 0
 ( 8)  9.company_num = 0
 ( 9)  10.company_num = 0
 (10)  11.company_num = 0
 (11)  12.company_num = 0
 (12)  13.company_num = 0
 (13)  14.company_num = 0
 (14)  15.company_num = 0
 (15)  16.company_num = 0
 (16)  17.company_num = 0
 (17)  18.company_num = 0
 (18)  19.company_num = 0
 (19)  20.company_num = 0
 (20)  21.company_num = 0
 (21)  22.company_num = 0
 (22)  23.company_num = 0
 (23)  24.company_num = 0
 (24)  25.company_num = 0

       F( 24,  6309) =   23.25
            Prob > F =    0.0000

. 
. 


Next we run the regression, using the firm and time fixed effects.
This is done using the command
- `areg rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol i.company_num, absorb(date_numeric)`

This command is the same as
- `regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol i.company_num i.date_numeric`

Except for the fact that all days `date_numeric` are absorbed.

In [7]:
def change_cols(df, cols, lag=0, rolling=1):
    df[lagged_cols] = df.groupby('[rh]ticker')[cols].shift(lag)
    
    df[lagged_cols] = df.groupby('[rh]ticker', as_index=False)[cols].rolling(rolling).mean()[cols]
    
#     df.dropna(subset=['[rh]dayname'], inplace=True)

    return df   
    

**Preparing main dataframe**

In [89]:
pd.set_option('display.max_rows', 500)

# Reading in main csv
csv_path = r"E:\Users\Christiaan\Large_Files\Thesis\DATA_MERGED\all_companies\all_companies.csv"
df = pd.read_csv(csv_path)

# Converting date to datetime
df['date'] = pd.to_datetime(df['date'])

# Adding price points to the weekend
cols = ['[crsp]mrktcap']
df[cols] = df.groupby('[rh]ticker')[cols].ffill()

# Lagging control variables
# df['[taq]price_control'] = df.groupby('[rh]ticker')['[taq]price'].shift(1)
df['[taq]price_control'] = df.groupby('[rh]ticker')['[taq]returns'].shift(1)

df['[crsp]mrktcap_control'] = df.groupby('[rh]ticker')['[crsp]mrktcap'].shift(1)



In [41]:
# Repeatable STATA code
stata_code_base = """gen date_numeric = date(date, "YMD")
format date_numeric %td

encode rhticker, gen(company_num)
#xtset company_num date_numeric
tsset company_num date_numeric
"""

**Summary statistics**

In [92]:
custom_code = """
summarize rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap_control taqprice_control
"""
stata_code = stata_code_base + custom_code
Stata.run(df_input, stata_code)



. gen date_numeric = date(date, "YMD")

. format date_numeric %td

. 
. encode rhticker, gen(company_num)

. #xtset company_num date_numeric
Unknown #command
. tsset company_num date_numeric

Panel variable: company_num (strongly balanced)
 Time variable: date_numeric, 01may2018 to 31aug2020
         Delta: 1 day

. 
. summarize rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vo
> l crspmrktcap_control taqprice_control

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
rhperforma~1 |     13,970    .0014372    .0109256  -.1122662   .2452852
twitf2s2me~1 |     20,592    .6991973    .1845348          0          1
twitf2s2to~l |     20,592    95.70945    282.6752          1      12166
f2s2intera~l |     20,592    63.22688    187.4822          0       8425
crspmrktca~l |     21,102    2.17e+08    3.29e+08   180482.4   1.97e+09
-------------+----------------------------------------

In [None]:
# df_input = df_input[df_input['[rh]ticker'].isin(['TSLA', 'AMZN', 'AAPL', 'MSFT', 'NFLX', 'NVDA'])]
# df_input = df_input[~df_input['[rh]ticker'].isin(['IQ', 'GE'])]
# df_input.groupby('[rh]ticker').mean()['[crsp]mrktcap'].rank(method='average')
# df_input.reset_index(inplace=True)

### Twitter


**Regression for Twitter with VADER sentiment**

In [112]:
custom_code = """
#areg rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap_control taqprice_control i.company_num, absorb(date_numeric)
reg rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap_control taqprice_control i.company_num i.date_numeric, vce(robust) beta
#summarize rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap_control taqprice_control

"""
stata_code = stata_code_base + custom_code

df_input = df.copy()

sentiment_col = '[twit][f2s2]method_1'
volume_col = '[twit][f2s2]total'
interaction_col = '[f2s2]interact_sent_vol'

# Creating interaction term
df_input[interaction_col] = df_input[sentiment_col] * df_input[volume_col]

# Lag and/or rolling
lagged_cols=[sentiment_col, volume_col, interaction_col]
df_input = change_cols(df_input, lagged_cols, lag=1, rolling=1)

# Filter out day-ticker combinations with less than 10 posts
# df_input = df_input[df_input[volume_col] > 50]

Stata.run(df_input, stata_code)



. gen date_numeric = date(date, "YMD")

. format date_numeric %td

. 
. encode rhticker, gen(company_num)

. #xtset company_num date_numeric
Unknown #command
. tsset company_num date_numeric

Panel variable: company_num (strongly balanced)
 Time variable: date_numeric, 01may2018 to 31aug2020
         Delta: 1 day

. 
. #areg rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol cr
> spmrktcap_control taqprice_control i.company_num, absorb(date_numeric)
Unknown #command
. reg rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crsp
> mrktcap_control taqprice_control i.company_num i.date_numeric, vce(robust) be
> ta

Linear regression                               Number of obs     =     13,517
                                                F(586, 12929)     =          .
                                                Prob > F          =          .
                                                R-squared         =     0.1074
                            

      21605  |  -.0020043   .0033655    -0.60   0.551                -.0077058
      21606  |  -.0040079   .0032313    -1.24   0.215                -.0157263
      21607  |  -.0029102   .0039307    -0.74   0.459                -.0114192
      21608  |  -.0026892   .0033916    -0.79   0.428                -.0105519
      21609  |   .0007268    .004345     0.17   0.867                 .0028517
      21612  |  -.0004792   .0036452    -0.13   0.895                -.0018804
      21613  |  -.0005639   .0034838    -0.16   0.871                -.0022125
      21614  |  -.0021384   .0031918    -0.67   0.503                -.0083907
      21615  |   -.002425   .0031871    -0.76   0.447                -.0095152
      21616  |  -.0015934   .0031847    -0.50   0.617                -.0062523
      21619  |   .0053773   .0094627     0.57   0.570                 .0210994
      21620  |   .0009293   .0053253     0.17   0.861                 .0036463
      21621  |  -.0010605    .004891    -0.22   0.82

      21878  |  -.0020198   .0035179    -0.57   0.566                -.0076024
      21879  |  -.0039348   .0032086    -1.23   0.220                -.0148102
      21880  |  -.0021321   .0031976    -0.67   0.505                -.0080251
      21882  |  -.0022924   .0031516    -0.73   0.467                -.0086283
      21885  |   -.001781   .0031758    -0.56   0.575                -.0069884
      21886  |  -.0034046   .0031767    -1.07   0.284                 -.013359
      21887  |  -.0021367   .0031892    -0.67   0.503                -.0080423
      21888  |  -.0030865    .003194    -0.97   0.334                -.0121109
      21889  |  -.0033576   .0031992    -1.05   0.294                -.0131746
      21892  |  -.0010018   .0034595    -0.29   0.772                -.0039309
      21893  |   -.001929    .003373    -0.57   0.567                -.0075689
      21894  |   .0005645    .004581     0.12   0.902                 .0022151
      21895  |  -.0026807   .0034399    -0.78   0.43

      22049  |   .0018624   .0033565     0.55   0.579                 .0071603
      22050  |   .0009346   .0032547     0.29   0.774                 .0035931
      22053  |   .0026378   .0041846     0.63   0.528                 .0103501
      22054  |  -.0003197   .0036205    -0.09   0.930                -.0012543
      22055  |   .0005522   .0034048     0.16   0.871                 .0021669
      22056  |   .0038993   .0038878     1.00   0.316                    .0153
      22057  |   .0042018   .0037394     1.12   0.261                 .0164872
      22061  |   .0064219   .0041372     1.55   0.121                 .0246901
      22062  |   .0012705   .0034295     0.37   0.711                 .0049851
      22063  |  -.0002354   .0033381    -0.07   0.944                -.0009237
      22064  |  -.0015743   .0032984    -0.48   0.633                -.0061773
      22067  |   .0002324   .0036031     0.06   0.949                 .0009117
      22068  |  -.0026915     .00322    -0.84   0.40

**Regression for Twitter with finBERT sentiment**

In [115]:
custom_code = """
#areg rhperformance_1 twitf2BERTmethod_1 twitf2BERTtotal f2BERTinteract_sent_vol crspmrktcap_control taqprice_control i.company_num, absorb(date_numeric)
reg rhperformance_1 twitf2BERTmethod_1 twitf2BERTtotal f2BERTinteract_sent_vol crspmrktcap_control taqprice_control i.company_num i.date_numeric, vce(robust) beta
"""
stata_code = stata_code_base + custom_code

df_input = df.copy()

sentiment_col = '[twit][f2BERT]method_1'
volume_col = '[twit][f2BERT]total'
interaction_col = '[f2BERT]interact_sent_vol'

# Creating interaction term
df_input[interaction_col] = df_input[sentiment_col] * df_input[volume_col]

# Lag and/or rolling
lagged_cols=[sentiment_col, volume_col, interaction_col]
df_input = change_cols(df_input, lagged_cols, lag=1, rolling=1)

# df_input = df_input[df_input[volume_col] > 10]


Stata.run(df_input, stata_code)



. gen date_numeric = date(date, "YMD")

. format date_numeric %td

. 
. encode rhticker, gen(company_num)

. #xtset company_num date_numeric
Unknown #command
. tsset company_num date_numeric

Panel variable: company_num (strongly balanced)
 Time variable: date_numeric, 01may2018 to 31aug2020
         Delta: 1 day

. 
. #areg rhperformance_1 twitf2BERTmethod_1 twitf2BERTtotal f2BERTinteract_sent_
> vol crspmrktcap_control taqprice_control i.company_num, absorb(date_numeric)
Unknown #command
. reg rhperformance_1 twitf2BERTmethod_1 twitf2BERTtotal f2BERTinteract_sent_vo
> l crspmrktcap_control taqprice_control i.company_num i.date_numeric, vce(robu
> st) beta

Linear regression                               Number of obs     =     10,910
                                                F(586, 10322)     =          .
                                                Prob > F          =          .
                                                R-squared         =     0.1327
                

      21447  |    .006744   .0030827     2.19   0.029                 .0251872
      21448  |  -.0063017    .006424    -0.98   0.327                -.0235353
      21451  |  -.0019106   .0020016    -0.95   0.340                -.0073204
      21452  |   .0019688   .0027517     0.72   0.474                 .0077297
      21453  |  -.0001417   .0023914    -0.06   0.953                -.0005562
      21454  |    .001346   .0024763     0.54   0.587                 .0052843
      21455  |    .002059   .0023934     0.86   0.390                 .0080836
      21458  |  -.0020402   .0047902    -0.43   0.670                -.0078173
      21459  |  -.0058141   .0040566    -1.43   0.152                -.0222772
      21460  |   .0013026   .0027337     0.48   0.634                 .0049912
      21461  |   .0025696   .0031346     0.82   0.412                 .0098456
      21462  |   .0032401   .0039569     0.82   0.413                 .0124149
      21465  |   .0008241   .0029391     0.28   0.77

      21635  |   .0024035   .0025775     0.93   0.351                 .0087372
      21636  |     -.0023   .0037553    -0.61   0.540                -.0083611
      21637  |  -.0027374   .0029069    -0.94   0.346                -.0099514
      21640  |  -.0013816   .0020375    -0.68   0.498                -.0055516
      21641  |   .0009166   .0019189     0.48   0.633                  .003683
      21642  |  -.0005009   .0019878    -0.25   0.801                -.0019664
      21643  |   .0025016   .0042676     0.59   0.558                 .0100519
      21644  |   .0001472   .0023267     0.06   0.950                 .0005916
      21647  |   .0025117   .0023344     1.08   0.282                 .0098611
      21648  |   .0004061   .0019964     0.20   0.839                 .0016318
      21649  |  -.0001719   .0019367    -0.09   0.929                -.0006908
      21650  |   .0012921   .0021046     0.61   0.539                 .0053083
      21651  |   .0014651   .0029409     0.50   0.61

      21797  |   .0014076   .0042765     0.33   0.742                 .0046718
      21798  |  -.0045564   .0047843    -0.95   0.341                -.0151229
      21801  |   .0033318    .004037     0.83   0.409                  .012112
      21802  |   .0021115   .0022425     0.94   0.346                  .007886
      21803  |   .0002634   .0026171     0.10   0.920                 .0010092
      21804  |   .0008873    .002326     0.38   0.703                 .0033137
      21805  |   .0002986   .0019052     0.16   0.875                 .0011151
      21808  |   .0004708   .0019524     0.24   0.809                 .0018039
      21809  |  -.0000506   .0018307    -0.03   0.978                -.0001939
      21810  |   .0001739   .0018863     0.09   0.927                 .0006493
      21811  |   .0011111    .001913     0.58   0.561                  .004039
      21812  |    .002199   .0019758     1.11   0.266                 .0079938
      21815  |   .0004406   .0018889     0.23   0.81

      22028  |  -.0001107   .0023497    -0.05   0.962                -.0004742
      22029  |  -.0004138   .0027428    -0.15   0.880                -.0017721
      22032  |   .0026299   .0028232     0.93   0.352                 .0103249
      22033  |   .0040073   .0024096     1.66   0.096                 .0153541
      22034  |     .00119   .0037854     0.31   0.753                 .0045595
      22035  |   .0026282   .0026323     1.00   0.318                   .01007
      22036  |  -.0002409   .0028928    -0.08   0.934                -.0009231
      22039  |   .0017162   .0021496     0.80   0.425                 .0068962
      22040  |   .0024279   .0026299     0.92   0.356                  .009532
      22041  |   .0017365   .0023486     0.74   0.460                 .0066535
      22042  |  -.0006043   .0021758    -0.28   0.781                -.0023155
      22043  |  -.0001112   .0024858    -0.04   0.964                -.0004259
      22046  |   .0090416   .0021407     4.22   0.00

### Reddit

**Regression for Reddit with VADER sentiment**

In [116]:
custom_code = """
#areg rhperformance_1 redm2f2s2method_1 redm2f2s2total m2f2s2interact_sent_vol crspmrktcap_control taqprice_control i.company_num, absorb(date_numeric)
reg rhperformance_1 redm2f2s2method_1 redm2f2s2total m2f2s2interact_sent_vol crspmrktcap_control taqprice_control i.company_num i.date_numeric, vce(robust) beta

"""
stata_code = stata_code_base + custom_code

df_input = df.copy()

sentiment_col = '[red][m2f2s2]method_1'
volume_col = '[red][m2f2s2]total'
interaction_col = '[m2f2s2]interact_sent_vol'

# Creating interaction term
df_input[interaction_col] = df_input[sentiment_col] * df_input[volume_col]

# Lag and/or rolling
lagged_cols=[sentiment_col, volume_col, interaction_col]
df_input = change_cols(df_input, lagged_cols, lag=1, rolling=1)

Stata.run(df_input, stata_code)


. gen date_numeric = date(date, "YMD")

. format date_numeric %td

. 
. encode rhticker, gen(company_num)

. #xtset company_num date_numeric
Unknown #command
. tsset company_num date_numeric

Panel variable: company_num (strongly balanced)
 Time variable: date_numeric, 01may2018 to 31aug2020
         Delta: 1 day

. 
. #areg rhperformance_1 redm2f2s2method_1 redm2f2s2total m2f2s2interact_sent_vo
> l crspmrktcap_control taqprice_control i.company_num, absorb(date_numeric)
Unknown #command
. reg rhperformance_1 redm2f2s2method_1 redm2f2s2total m2f2s2interact_sent_vol 
> crspmrktcap_control taqprice_control i.company_num i.date_numeric, vce(robust
> ) beta

Linear regression                               Number of obs     =     11,160
                                                F(586, 10572)     =          .
                                                Prob > F          =          .
                                                R-squared         =     0.1258
                    

      21586  |  -.0039038   .0043436    -0.90   0.369                -.0138837
      21587  |  -.0045211   .0050194    -0.90   0.368                -.0164959
      21588  |   -.003715   .0044131    -0.84   0.400                -.0128605
      21591  |  -.0017196   .0046649    -0.37   0.712                 -.005435
      21592  |   -.004384   .0044903    -0.98   0.329                -.0151763
      21593  |  -.0037564   .0045346    -0.83   0.407                -.0137058
      21594  |  -.0055692   .0044495    -1.25   0.211                  -.02131
      21595  |  -.0059087   .0043922    -1.35   0.179                -.0220902
      21599  |  -.0047457   .0043736    -1.09   0.278                -.0173153
      21600  |  -.0014947   .0046778    -0.32   0.749                -.0050288
      21601  |  -.0030984   .0044281    -0.70   0.484                -.0110192
      21602  |  -.0040529   .0044838    -0.90   0.366                  -.01403
      21605  |  -.0043518   .0044777    -0.97   0.33

      21788  |  -.0046606    .004392    -1.06   0.289                 -.017424
      21789  |  -.0039586    .004452    -0.89   0.374                -.0147997
      21790  |   -.004972   .0045325    -1.10   0.273                -.0185883
      21791  |  -.0037327    .004435    -0.84   0.400                -.0146031
      21795  |  -.0016974   .0044339    -0.38   0.702                -.0060367
      21796  |  -.0040015   .0044972    -0.89   0.374                -.0149599
      21797  |  -.0035415   .0051354    -0.69   0.490                -.0132403
      21798  |  -.0079011   .0054214    -1.46   0.145                 -.029539
      21801  |  -.0007272    .005333    -0.14   0.892                -.0025863
      21802  |   -.002548    .004483    -0.57   0.570                -.0099683
      21803  |  -.0046843    .004666    -1.00   0.315                -.0183259
      21804  |  -.0052223   .0045197    -1.16   0.248                -.0199825
      21805  |  -.0047173   .0043775    -1.08   0.28

      21952  |  -.0047491   .0045205    -1.05   0.293                -.0177549
      21955  |  -.0015049   .0046266    -0.33   0.745                -.0052095
      21956  |  -.0028809   .0045927    -0.63   0.530                -.0110236
      21957  |  -.0053213   .0044517    -1.20   0.232                -.0198942
      21958  |  -.0064081   .0046704    -1.37   0.170                -.0250701
      21959  |  -.0052681   .0044984    -1.17   0.242                -.0192215
      21963  |  -.0046697   .0045212    -1.03   0.302                -.0166073
      21964  |  -.0054409   .0045676    -1.19   0.234                -.0203413
      21965  |  -.0055328   .0044153    -1.25   0.210                -.0221101
      21966  |  -.0050397   .0044344    -1.14   0.256                -.0188413
      21969  |  -.0062254   .0045008    -1.38   0.167                -.0215506
      21970  |   -.007334   .0045128    -1.63   0.104                -.0274188
      21971  |   .0007062    .004449     0.16   0.87

      22112  |  -.0008627   .0060359    -0.14   0.886                -.0033749
      22113  |  -.0047827   .0045229    -1.06   0.290                -.0187112
      22116  |  -.0049733   .0044372    -1.12   0.262                 -.018146
      22117  |  -.0010216   .0049076    -0.21   0.835                -.0038192
      22118  |   -.003372   .0045006    -0.75   0.454                -.0129027
      22119  |  -.0024405   .0046374    -0.53   0.599                -.0084483
      22120  |  -.0039692   .0044536    -0.89   0.373                -.0151878
      22123  |  -.0026484   .0044929    -0.59   0.556                 -.009663
      22124  |    -.00472   .0045139    -1.05   0.296                -.0184658
      22125  |  -.0067127   .0046971    -1.43   0.153                -.0256857
      22126  |  -.0060462   .0044424    -1.36   0.174                -.0215029
      22127  |  -.0003173   .0048082    -0.07   0.947                -.0012682
      22130  |   .0071803   .0061926     1.16   0.24

**Regression for Reddit with finBERT sentiment**

In [118]:
custom_code = """
#areg rhperformance_1 redm2f2BERTmethod_1 redm2f2BERTtotal m2f2BERTinteract_sent_vol crspmrktcap_control taqprice_control i.company_num, absorb(date_numeric)
reg rhperformance_1 redm2f2BERTmethod_1 redm2f2BERTtotal m2f2BERTinteract_sent_vol crspmrktcap_control taqprice_control i.company_num i.date_numeric, vce(robust) beta

"""
stata_code = stata_code_base + custom_code

df_input = df.copy()

sentiment_col = '[red][m2f2BERT]method_1'
volume_col = '[red][m2f2BERT]total'
interaction_col = '[m2f2BERT]interact_sent_vol'

# Creating interaction term
df_input[interaction_col] = df_input[sentiment_col] * df_input[volume_col]

# Lag and/or rolling
lagged_cols=[sentiment_col, volume_col, interaction_col]
df_input = change_cols(df_input, lagged_cols, lag=1, rolling=1)

Stata.run(df_input, stata_code)


. gen date_numeric = date(date, "YMD")

. format date_numeric %td

. 
. encode rhticker, gen(company_num)

. #xtset company_num date_numeric
Unknown #command
. tsset company_num date_numeric

Panel variable: company_num (strongly balanced)
 Time variable: date_numeric, 01may2018 to 31aug2020
         Delta: 1 day

. 
. #areg rhperformance_1 redm2f2BERTmethod_1 redm2f2BERTtotal m2f2BERTinteract_s
> ent_vol crspmrktcap_control taqprice_control i.company_num, absorb(date_numer
> ic)
Unknown #command
. reg rhperformance_1 redm2f2BERTmethod_1 redm2f2BERTtotal m2f2BERTinteract_sen
> t_vol crspmrktcap_control taqprice_control i.company_num i.date_numeric, vce(
> robust) beta

Linear regression                               Number of obs     =      8,837
                                                F(586, 8249)      =          .
                                                Prob > F          =          .
                                                R-squared         =     0.1663
     

      21440  |  -.0007992   .0089727    -0.09   0.929                -.0030532
      21441  |  -.0085701   .0063828    -1.34   0.179                -.0308711
      21444  |  -.0073379   .0056447    -1.30   0.194                -.0255946
      21445  |  -.0068771   .0055543    -1.24   0.216                -.0269907
      21446  |  -.0013663   .0098395    -0.14   0.890                -.0046043
      21447  |   .0006359   .0061454     0.10   0.918                 .0024291
      21448  |  -.0119811   .0084059    -1.43   0.154                 -.045771
      21451  |  -.0077091   .0054953    -1.40   0.161                -.0286224
      21452  |  -.0043869   .0059666    -0.74   0.462                 -.016759
      21453  |  -.0063443    .005644    -1.12   0.261                -.0255448
      21454  |  -.0045708    .005775    -0.79   0.429                -.0169707
      21455  |   -.003189   .0063784    -0.50   0.617                -.0125158
      21458  |    -.00789    .008223    -0.96   0.33

      21675  |  -.0027161   .0055247    -0.49   0.623                 -.009153
      21676  |  -.0033779   .0054138    -0.62   0.533                -.0117821
      21677  |   -.000776   .0054711    -0.14   0.887                -.0026151
      21678  |  -.0053992   .0053809    -1.00   0.316                -.0194489
      21679  |  -.0054771   .0053974    -1.01   0.310                -.0191042
      21682  |  -.0054558   .0056272    -0.97   0.332                -.0170237
      21683  |   .0010184   .0057463     0.18   0.859                 .0034319
      21684  |   -.002924   .0055184    -0.53   0.596                -.0101988
      21685  |  -.0048781   .0054535    -0.89   0.371                -.0186357
      21686  |  -.0048616   .0054228    -0.90   0.370                -.0180503
      21689  |  -.0025434   .0056078    -0.45   0.650                 -.008571
      21690  |  -.0041686   .0054115    -0.77   0.441                -.0140478
      21691  |  -.0056157   .0053893    -1.04   0.29

      21860  |  -.0081176   .0054804    -1.48   0.139                -.0283141
      21861  |  -.0055356   .0054636    -1.01   0.311                 -.019308
      21864  |  -.0012716   .0056211    -0.23   0.821                -.0044354
      21865  |  -.0040629   .0054803    -0.74   0.458                -.0146352
      21866  |  -.0035329   .0057413    -0.62   0.538                -.0138655
      21867  |  -.0016305   .0056269    -0.29   0.772                -.0052951
      21868  |  -.0065162   .0054723    -1.19   0.234                -.0241935
      21871  |  -.0054326   .0056074    -0.97   0.333                 -.018949
      21872  |  -.0056077   .0054457    -1.03   0.303                -.0208204
      21873  |  -.0048236   .0053952    -0.89   0.371                -.0184272
      21874  |  -.0058473   .0054458    -1.07   0.283                -.0210631
      21875  |  -.0043353   .0055503    -0.78   0.435                -.0151215
      21878  |  -.0031385   .0059911    -0.52   0.60

      22032  |   -.005348   .0059618    -0.90   0.370                -.0198562
      22033  |  -.0010442   .0055625    -0.19   0.851                -.0042043
      22034  |  -.0059111   .0065692    -0.90   0.368                -.0231995
      22035  |  -.0050183    .005824    -0.86   0.389                -.0207036
      22036  |  -.0099607   .0061727    -1.61   0.107                -.0369825
      22039  |   -.005957   .0055206    -1.08   0.281                -.0227575
      22040  |  -.0032538     .00595    -0.55   0.584                -.0134241
      22041  |  -.0069363   .0056468    -1.23   0.219                -.0272231
      22042  |  -.0082042   .0055248    -1.48   0.138                -.0346423
      22043  |  -.0072423    .005629    -1.29   0.198                -.0298791
      22046  |   .0028624   .0056054     0.51   0.610                 .0109351
      22047  |  -.0058308   .0054324    -1.07   0.283                 -.024056
      22048  |  -.0011504   .0056019    -0.21   0.83

In [32]:

stata_code = """gen date_numeric = date(date, "YMD")
format date_numeric %td
encode rhticker, gen(company_num)

xtset company_num date_numeric
xtreg rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol i.company_num, fe
"""


# run(df, stata_code)
df_input = df.dropna().copy()
Stata.run(df_input, stata_code)
df


. gen date_numeric = date(date, "YMD")

. format date_numeric %td

. encode rhticker, gen(company_num)

. 
. xtset company_num date_numeric

Panel variable: company_num (unbalanced)
 Time variable: date_numeric, 07may2018 to 13aug2020, but with gaps
         Delta: 1 day

. xtreg rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol i.
> company_num, fe
note: 2.company_num omitted because of collinearity.
note: 3.company_num omitted because of collinearity.
note: 4.company_num omitted because of collinearity.
note: 5.company_num omitted because of collinearity.
note: 6.company_num omitted because of collinearity.
note: 7.company_num omitted because of collinearity.
note: 8.company_num omitted because of collinearity.
note: 9.company_num omitted because of collinearity.
note: 10.company_num omitted because of collinearity.
note: 11.company_num omitted because of collinearity.
note: 12.company_num omitted because of collinearity.
note: 13.company_num omitted because of co

Unnamed: 0,date,company_id,[rh]dayname,[rh]ticker,[rh]RHtotal_holdings,[rh]RHtotal_holdings_change,[rh]user_estimate,[rh]AUSpU,[rh]users_holding,[rh]change,...,[red][m2f1BERT]rel_vol,[red][m2f1BERT]final_sent,[red][m2f2BERT]pos,[red][m2f2BERT]neg,[red][m2f2BERT]total,[red][m2f2BERT]method_1,[red][m2f2BERT]rel_vol,[red][m2f2BERT]final_sent,[crsp]mrktcap,[f2s2]interact_sent_vol
0,2018-05-01,1,,AAPL,,,,,,,...,,,,,,,,,,1184.0
1,2018-05-02,1,,AAPL,,,,,,,...,,,,,,,,,,876.0
2,2018-05-03,1,,AAPL,,,,,,,...,,,,,,,,,,173.0
3,2018-05-04,1,,AAPL,,,,,,,...,,,,,,,,,,654.0
4,2018-05-05,1,,AAPL,,,,,,,...,,,,,,,,,,184.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21345,2020-08-27,25,,WMT,,,,,,,...,2.376147,1.637615,32.0,12.0,44.0,0.727273,2.961538,2.153846,,335.0
21346,2020-08-28,25,,WMT,,,,,,,...,2.341379,1.786207,19.0,4.0,23.0,0.826087,1.364407,1.127119,,185.0
21347,2020-08-29,25,,WMT,,,,,,,...,1.043478,0.652174,16.0,9.0,25.0,0.640000,1.258993,0.805755,,18.0
21348,2020-08-30,25,,WMT,,,,,,,...,0.367284,0.259259,4.0,3.0,7.0,0.571429,0.347518,0.198582,,32.0


In [104]:
stata_code = """regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap"""
# run(df, stata_code)
Stata.run(df, stata_code)

. regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol 
> crspmrktcap

      Source |       SS           df       MS      Number of obs   =       561
-------------+----------------------------------   F(4, 556)       =     34.17
       Model |  .004987974         4  .001246994   Prob > F        =    0.0000
    Residual |  .020291878       556  .000036496   R-squared       =    0.1973
-------------+----------------------------------   Adj R-squared   =    0.1915
       Total |  .025279853       560  .000045143   Root MSE        =    .00604

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |   .0054165   .0034884     1.55   0.121    -.0014356    .0122687
twitf2s2to~l |   .0000104   6.30e-06     1.65   0.101    -2.01e-06    .0000228
f2s2intera~l |  -.0000138   9.36e-0

In [None]:


# columns = ['[twit][f2s2]method_1', '[twit][f2s2]total', '[twit][f2s2]rel_vol', '[crsp]mrktcap']
# df[columns] = df[columns].shift(1)
# df.dropna(inplace=True)
df[['date', '[rh]performance_1', '[twit][f2s2]method_1', '[twit][f2s2]total', '[twit][f2s2]rel_vol', '[f2s2]interact_sent_vol', '[crsp]mrktcap']]

In [None]:
lagged_cols=['[twit][f2s2]method_1', '[twit][f2s2]total']
df[lagged_cols] = df.groupby('[rh]ticker')[lagged_cols].shift(2)

lagged_cols=['date', '[rh]ticker', '[twit][f2s2]method_1', '[twit][f2s2]total']
df[lagged_cols].iloc[800:900]

In [38]:
interact_dict = {"var_name": "[f2s2]interact_sent_vol",
                 "interact_1": "[twit][f2s2]method_1",
                 "interact_2": "[twit][f2s2]total", "lagged": True}

df = retrieve_results(selected_tickers=["AAPL"], interact_dict=interact_dict, lagged_cols=['[twit][f2s2]method_1', '[twit][f2s2]total'], lag=2)
df[['date', '[rh]performance_1', '[twit][f2s2]method_1', '[twit][f2s2]total', '[twit][f2s2]rel_vol', '[f2s2]interact_sent_vol', '[crsp]mrktcap']].head(20)

['[twit][f2s2]method_1', '[twit][f2s2]total', '[f2s2]interact_sent_vol']


Unnamed: 0,date,[rh]performance_1,[twit][f2s2]method_1,[twit][f2s2]total,[twit][f2s2]rel_vol,[f2s2]interact_sent_vol,[crsp]mrktcap
0,2018-05-01,,,,4.272192,,
1,2018-05-02,,,,2.180548,,
2,2018-05-03,,0.714544,1657.0,0.479733,1184.0,
3,2018-05-04,,0.770449,1137.0,1.373905,876.0,
4,2018-05-05,,0.673152,257.0,0.365468,173.0,
5,2018-05-06,,0.748284,874.0,0.158439,654.0,
6,2018-05-07,-0.000588,0.763485,241.0,0.49358,184.0,910087000.0
7,2018-05-08,-0.00311,0.714286,105.0,0.395633,75.0,914461400.0
8,2018-05-09,-0.000739,0.746914,324.0,0.349832,242.0,920900300.0
9,2018-05-10,0.000816,0.607955,176.0,0.905489,107.0,934072800.0


In [125]:
stata_code = """regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap"""
# run(df, stata_code)
Stata.run(df, stata_code)

. regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol 
> crspmrktcap

      Source |       SS           df       MS      Number of obs   =       548
-------------+----------------------------------   F(4, 543)       =     45.19
       Model |  .006257554         4  .001564388   Prob > F        =    0.0000
    Residual |  .018798487       543   .00003462   R-squared       =    0.2497
-------------+----------------------------------   Adj R-squared   =    0.2442
       Total |   .02505604       547  .000045806   Root MSE        =    .00588

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |  -.0022792   .0037684    -0.60   0.546    -.0096816    .0051231
twitf2s2to~l |   7.07e-06   3.88e-06     1.82   0.069    -5.47e-07    .0000147
f2s2intera~l |  -2.63e-06   5.97e-0

In [127]:

stata_code = """regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol"""
# run(df, stata_code)
Stata.run(df, stata_code)

. regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol

      Source |       SS           df       MS      Number of obs   =       548
-------------+----------------------------------   F(3, 544)       =     20.40
       Model |  .002533516         3  .000844505   Prob > F        =    0.0000
    Residual |  .022522524       544  .000041402   R-squared       =    0.1011
-------------+----------------------------------   Adj R-squared   =    0.0962
       Total |   .02505604       547  .000045806   Root MSE        =    .00643

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |   .0039622   .0040681     0.97   0.330    -.0040288    .0119533
twitf2s2to~l |   3.06e-06   4.22e-06     0.73   0.468    -5.23e-06    .0000113
f2s2intera~l |   5.27e-06   6.48e-06     0.81   0.

In [128]:

stata_code = """regress rhperformance_1 twitf2s2method_1 twitf2s2total"""
# run(df, stata_code)
Stata.run(df, stata_code)

. regress rhperformance_1 twitf2s2method_1 twitf2s2total

      Source |       SS           df       MS      Number of obs   =       548
-------------+----------------------------------   F(2, 545)       =     30.28
       Model |  .002506102         2  .001253051   Prob > F        =    0.0000
    Residual |  .022549938       545  .000041376   R-squared       =    0.1000
-------------+----------------------------------   Adj R-squared   =    0.0967
       Total |   .02505604       547  .000045806   Root MSE        =    .00643

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |   .0060289   .0031769     1.90   0.058    -.0002115    .0122693
twitf2s2to~l |   6.42e-06   8.61e-07     7.46   0.000     4.73e-06    8.11e-06
       _cons |  -.0040004   .0020734    -1.93   0.054    -.0080733    .0

In [130]:

stata_code = """regress rhperformance_1 twitf2s2method_1"""
# run(df, stata_code)
Stata.run(df, stata_code)

. regress rhperformance_1 twitf2s2method_1

      Source |       SS           df       MS      Number of obs   =       548
-------------+----------------------------------   F(1, 546)       =      4.53
       Model |  .000206278         1  .000206278   Prob > F        =    0.0337
    Residual |  .024849762       546  .000045512   R-squared       =    0.0082
-------------+----------------------------------   Adj R-squared   =    0.0064
       Total |   .02505604       547  .000045806   Root MSE        =    .00675

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |   .0070863   .0033286     2.13   0.034     .0005479    .0136246
       _cons |  -.0028336   .0021684    -1.31   0.192     -.007093    .0014259
------------------------------------------------------------------------------


Stata

In [104]:
df = pd.DataFrame()
stata_code = 'di "Hello, World!"'

stata.run(f'''{stata_code}''', echo=True)

. di "Hello, World!"
Hello, World!


In [103]:
import os
os.chdir("C:/Program Files/Stata17/utilities")
from pystata import config
config.init("mp")

from pystata import stata

# # Setup Stata
# import stata_setup
# stata_setup.config("C:/Program Files/Stata17", "mp")

# from pystata import stata
# # stata.config.status()




## Vault

In [56]:
def add_marketcap(results_df, ticker):
    # Path
    csv_path = r"C:\Users\Ck0rt\Documents\Large files\School\MSc Finance & Investments\Thesis\market_cap_info.csv"
    
    # Read csv
    df = pd.read_csv(csv_path)
    
    # Rename twitter columns to contain [crsp]
    df.rename(columns={"Ticker": "ticker", "DlyCalDt": "date", "DlyCap": "[crsp]mrktcap"}, inplace=True)
    
    # Filter observations for ticker
    df = df[df['ticker'] == ticker].reset_index(drop=True)
    
    # Drop ticker column to prevent duplicate
    df.drop(columns=['ticker'], inplace=True)

    # Merge data
    results_df = results_df.merge(df, how='left', left_on='date', right_on='date')

    return results_df


Loop files and add marktcap info

In [58]:
file_dir = r"C:\Users\Ck0rt\Documents\Large files\School\MSc Finance & Investments\Thesis\MERGED_DATA"
ticker_list = ['AAPL', 'AMD', 'AMZN', 'ATVI', 'BA', 'BABA', 'BAC', 'DIS', 'F', 'GE', 'GME', 'IQ', 'LULU', 'MSFT', 'MU', 'NFLX', 'NVDA', 'SBUX', 'SHOP', 'SNAP', 'SQ', 'TLRY', 'TSLA', 'V', 'WMT']

save = False

if save:
    for ticker in ticker_list:
        # file path
        file_path = os.path.join(file_dir, f"{ticker}.csv").replace('\\', '/')

        # Read csv
        merged_df = pd.read_csv(file_path)
        
        # Add files
        df = add_marketcap(merged_df, ticker)


        df.to_csv(file_path, encoding='utf-8', index=False)
        print(f"Merged file saved at: {file_path}")

Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/AAPL.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/AMD.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/AMZN.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/ATVI.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/BA.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/BABA.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/BAC.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/DIS.csv
Merged file saved at: C:/Users/Ck0rt/Document

In [58]:
csv_path = join_paths(working_dir, "Thesis\DATA_MERGED", "AAPL.csv")
df = pd.read_csv(csv_path)

columns = ['[twit][f2s2]method_1', '[twit][f2s2]total', '[twit][f2s2]rel_vol', '[crsp]mrktcap']
df[columns] = df[columns].shift(1)
df.dropna(inplace=True)
df['[f2s2]interact_sent_vol'] = df['[twit][f2s2]method_1'] * df['[twit][f2s2]total']
df[['date', '[rh]performance_1', '[twit][f2s2]method_1', '[twit][f2s2]total', '[twit][f2s2]rel_vol', '[f2s2]interact_sent_vol', '[crsp]mrktcap']]

Unnamed: 0,date,[rh]performance_1,[twit][f2s2]method_1,[twit][f2s2]total,[twit][f2s2]rel_vol,[f2s2]interact_sent_vol,[crsp]mrktcap
7,2018-05-08,-0.003110,0.746914,324.0,0.493580,242.0,9.100870e+08
8,2018-05-09,-0.000739,0.607955,176.0,0.395633,107.0,9.144614e+08
9,2018-05-10,0.000816,0.673077,104.0,0.349832,70.0,9.209003e+08
14,2018-05-15,-0.001767,0.666667,87.0,0.657667,58.0,9.247832e+08
15,2018-05-16,-0.000028,0.576923,130.0,1.034091,75.0,9.163783e+08
...,...,...,...,...,...,...,...
828,2020-08-06,0.017180,0.787815,476.0,0.506845,375.0,1.882348e+09
829,2020-08-07,0.012765,0.792222,900.0,1.071064,713.0,1.948022e+09
833,2020-08-11,0.010803,0.654717,530.0,0.959897,347.0,1.927926e+09
834,2020-08-12,0.016233,0.641243,354.0,0.717221,227.0,1.870590e+09


In [71]:
import pystata
import pandas as pd

stata.pdataframe_to_data(df, True)
stata_code = """regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap"""

# Run the regression and capture the output in a string
stata.run(stata_code, echo=True)
# regression_output = pystata.run_statamc(stata_code)

# Convert the output to a pandas DataFrame
# regression_results = pd.read_csv(StringIO(regression_output), delim_whitespace=True, skiprows=1)

# Process the regression results here
# r = stata.get_return()['r(table)']
print()
r = stata.get_return()

print(r['r(table)'])
# print(r)


. regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol 
> crspmrktcap

      Source |       SS           df       MS      Number of obs   =       427
-------------+----------------------------------   F(4, 422)       =     28.00
       Model |  .003893954         4  .000973489   Prob > F        =    0.0000
    Residual |  .014673348       422  .000034771   R-squared       =    0.2097
-------------+----------------------------------   Adj R-squared   =    0.2022
       Total |  .018567302       426  .000043585   Root MSE        =     .0059

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |   .0021138   .0043585     0.48   0.628    -.0064533    .0106809
twitf2s2to~l |   .0000127   4.14e-06     3.06   0.002     4.53e-06    .0000208
f2s2intera~l |  -.0000141   6.42e-0

In [77]:
import pandas as pd
from pystata import stata

def run(dataframe, commands):
    # Convert Pandas DataFrame to Stata dataset
    stata.pdataframe_to_data(dataframe, True)

    # Execute Stata commands
    stata.run(commands, echo=True)

    # Print processed instructions
    print(f"Stata processed the following instructions\n"
          f"----------------------------------------\n"
          f"{commands}"
          f"----------------------------------------")


In [123]:
csv_path = r"C:\Users\Ck0rt\Documents\Large files\School\MSc Finance & Investments\Thesis\MERGED_DATA\AAPL.csv"
df = pd.read_csv(csv_path)

columns = ['[twit][f2s2]method_1', '[twit][f2s2]total', '[twit][f2s2]rel_vol', '[crsp]mrktcap']
df[columns] = df[columns].shift(1)
df.dropna(inplace=True)
df['[f2s2]interact_sent_vol'] = df['[twit][f2s2]method_1'] * df['[twit][f2s2]total']
df[['date', '[rh]performance_1', '[twit][f2s2]method_1', '[twit][f2s2]total', '[twit][f2s2]rel_vol', '[f2s2]interact_sent_vol', '[crsp]mrktcap']]

Unnamed: 0,date,[rh]performance_1,[twit][f2s2]method_1,[twit][f2s2]total,[twit][f2s2]rel_vol,[f2s2]interact_sent_vol,[crsp]mrktcap
1,2018-05-08,-0.003110,0.746914,324.0,0.493580,242.0,9.100870e+08
2,2018-05-09,-0.000739,0.607955,176.0,0.395633,107.0,9.144614e+08
3,2018-05-10,0.000816,0.673077,104.0,0.349832,70.0,9.209003e+08
5,2018-05-14,0.000028,0.694118,170.0,0.855500,118.0,9.269459e+08
6,2018-05-15,-0.001767,0.666667,87.0,0.657667,58.0,9.247832e+08
...,...,...,...,...,...,...,...
556,2020-08-07,0.012765,0.792222,900.0,1.071064,713.0,1.948022e+09
557,2020-08-10,0.022062,0.765583,738.0,1.186768,565.0,1.900306e+09
558,2020-08-11,0.010803,0.654717,530.0,0.959897,347.0,1.927926e+09
559,2020-08-12,0.016233,0.641243,354.0,0.717221,227.0,1.870590e+09
