In [53]:
import pandas as pd
import os

## Add market cap info

In [56]:
def add_marktcap(results_df, ticker, csv_path=None):
    """Attach one ticker's daily market capitalisation to a results frame.

    The market-cap CSV is read, its ``Ticker``/``DlyCalDt``/``DlyCap`` columns
    are renamed to ``ticker``/``date``/``[crsp]mrktcap``, the rows of ``ticker``
    are selected, and the result is left-merged onto ``results_df`` by ``date``.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Frame to extend; it must contain a ``date`` column. It is not modified.
    ticker : str
        Value matched against the CSV's ``Ticker`` column.
    csv_path : str, optional
        Location of the market-cap CSV. When omitted, the notebook's original
        hard-coded location is used.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every row of ``results_df`` plus the market-cap
        columns; dates without a match get missing values.
    """
    if csv_path is None:
        csv_path = r"C:\Users\Ck0rt\Documents\Large files\School\MSc Finance & Investments\Thesis\market_cap_info.csv"

    # Rename the CRSP columns so the date key lines up with results_df and the
    # market-cap column carries the [crsp] prefix.
    market_cap = pd.read_csv(csv_path).rename(
        columns={"Ticker": "ticker", "DlyCalDt": "date", "DlyCap": "[crsp]mrktcap"}
    )

    # Keep this ticker's rows only; the ticker column itself is dropped so it
    # is not duplicated in the merged output.
    market_cap = (
        market_cap[market_cap['ticker'] == ticker]
        .drop(columns=['ticker'])
        .reset_index(drop=True)
    )

    # Drop a market-cap column left by an earlier run. Without this, merging a
    # frame that already has it produces '[crsp]mrktcap_x' / '[crsp]mrktcap_y'.
    results_df = results_df.drop(columns=['[crsp]mrktcap'], errors='ignore')

    return results_df.merge(market_cap, how='left', on='date')


Loop over the merged files and add market cap info

In [58]:
# Folder with one merged CSV per ticker; each file is read and then written
# back to the same path.
file_dir = r"C:\Users\Ck0rt\Documents\Large files\School\MSc Finance & Investments\Thesis\MERGED_DATA"
ticker_list = [
    'AAPL', 'AMD', 'AMZN', 'ATVI', 'BA',
    'BABA', 'BAC', 'DIS', 'F', 'GE',
    'GME', 'IQ', 'LULU', 'MSFT', 'MU',
    'NFLX', 'NVDA', 'SBUX', 'SHOP', 'SNAP',
    'SQ', 'TLRY', 'TSLA', 'V', 'WMT',
]

# Guard flag: nothing is read or written unless this is switched to True.
save = False

if save:
    for ticker in ticker_list:
        # Build <file_dir>/<ticker>.csv and normalise backslashes to forward slashes.
        file_path = os.path.join(file_dir, ticker + ".csv").replace('\\', '/')

        # Load the ticker's merged data and attach its market-cap column.
        merged_df = pd.read_csv(file_path)
        df = add_marktcap(merged_df, ticker)

        # Overwrite the input file with the extended frame.
        df.to_csv(file_path, encoding='utf-8', index=False)
        print(f"Merged file saved at: {file_path}")

Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/AAPL.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/AMD.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/AMZN.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/ATVI.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/BA.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/BABA.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/BAC.csv
Merged file saved at: C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/MERGED_DATA/DIS.csv
Merged file saved at: C:/Users/Ck0rt/Document

## STATA code

In [66]:
# Start Stata for use from Python through pystata.
import os
# The working directory is switched to Stata's utilities folder before pystata
# is imported (presumably so the package can be found there; confirm).
# NOTE: os.chdir changes the working directory for the rest of the session.
os.chdir("C:/Program Files/Stata17/utilities")
from pystata import config
# "mp" selects the Stata edition to initialise (the banner reports MP).
config.init("mp")


  ___  ____  ____  ____  ____ ©
 /__    /   ____/   /   ____/      17.0
___/   /   /___/   /   /___/       MP—Parallel Edition

 Statistics and Data Science       Copyright 1985-2021 StataCorp LLC
                                   StataCorp
                                   4905 Lakeway Drive
                                   College Station, Texas 77845 USA
                                   800-STATA-PC        https://www.stata.com
                                   979-696-4600        stata@stata.com

Stata license: Unlimited-user 2-core network, expiring 25 May 2023
Serial number: 501709318376
  Licensed to: Christiaan
               Erasmus University Rotterdam

Notes:
      1. Unicode is supported; see help unicode_advice.
      2. More than 2 billion observations are allowed; see help obs_advice.
      3. Maximum number of variables is set to 5,000; see help set_maxvar.


In [68]:
%%stata
di "Hello, World!"

Hello, World!


In [77]:
import pandas as pd
from pystata import stata

def run(dataframe, commands):
    """Load a DataFrame into Stata, execute commands on it and print a recap.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data that becomes Stata's current dataset.
    commands : str
        One or more Stata commands, passed to ``stata.run`` unchanged.

    Notes
    -----
    A static method ``Stata.run`` with the same behaviour exists further down
    in this notebook.
    """
    # force=True lets the load replace a dataset already held in Stata's memory.
    stata.pdataframe_to_data(dataframe, force=True)

    # echo=True makes Stata print each command ahead of its output.
    stata.run(commands, echo=True)

    # Put the closing rule on its own line even when `commands` has no trailing
    # newline; previously it was glued onto the last command.
    separator = "" if str(commands).endswith("\n") else "\n"
    print(f"Stata processed the following instructions\n"
          f"----------------------------------------\n"
          f"{commands}{separator}"
          f"----------------------------------------")


In [100]:
# Load the merged AAPL file and list its column names.
csv_path = r"C:\Users\Ck0rt\Documents\Large files\School\MSc Finance & Investments\Thesis\MERGED_DATA\AAPL.csv"
df = pd.read_csv(csv_path)
list(df.columns)

['date',
 '[rh]dayname',
 '[rh]ticker',
 '[rh]RHtotal_holdings',
 '[rh]RHtotal_holdings_change',
 '[rh]user_estimate',
 '[rh]AUSpU',
 '[rh]users_holding',
 '[rh]change',
 '[rh]pct_change',
 '[rh]market_share',
 '[rh]expected_position_1',
 '[rh]performance_1',
 '[rh]expected_position_2',
 '[rh]performance_2',
 '[taq]price',
 '[taq]total_vol',
 '[taq]buy',
 '[taq]sell',
 '[taq]buy_vol',
 '[taq]sell_vol',
 '[taq]total_bs',
 '[taq]total_price',
 '[taq]bs_change',
 '[taq]vol_change',
 '[twit][f1s2]pos',
 '[twit][f1s2]neg',
 '[twit][f1s2]total',
 '[twit][f2s2]pos',
 '[twit][f2s2]neg',
 '[twit][f2s2]total',
 '[twit][f1s2]method_1',
 '[twit][f2s2]method_1',
 '[twit][f1s2]method_3',
 '[twit][f2s2]method_3',
 '[twit][f1s2]rel_vol',
 '[twit][f2s2]rel_vol',
 '[twit][f1s2M1]final_sent',
 '[twit][f1s2M3]final_sent',
 '[twit][f2s2M1]final_sent',
 '[twit][f2s2M3]final_sent',
 '[twit][f1BERT]pos',
 '[twit][f1BERT]neg',
 '[twit][f1BERT]total',
 '[twit][f2BERT]pos',
 '[twit][f2BERT]neg',
 '[twit][f2BERT]

In [123]:
# Reload the merged AAPL file so this cell starts from the data on disk.
csv_path = r"C:\Users\Ck0rt\Documents\Large files\School\MSc Finance & Investments\Thesis\MERGED_DATA\AAPL.csv"
df = pd.read_csv(csv_path)

# Lag the regressors by one row, so each row pairs its own outcome with the
# previous row's sentiment, tweet count, relative volume and market cap.
# The lag is positional (previous row), not calendar-based.
columns = ['[twit][f2s2]method_1', '[twit][f2s2]total', '[twit][f2s2]rel_vol', '[crsp]mrktcap']
df[columns] = df[columns].shift(periods=1)

# NOTE(review): no `subset` is given, so a row is dropped when ANY column is
# missing, not only the lagged ones. Confirm this is intended.
df.dropna(inplace=True)

# Interaction term: lagged sentiment score times lagged tweet count.
df['[f2s2]interact_sent_vol'] = df['[twit][f2s2]method_1'].mul(df['[twit][f2s2]total'])

# Show only the columns that enter the regressions below.
df[[
    'date',
    '[rh]performance_1',
    '[twit][f2s2]method_1',
    '[twit][f2s2]total',
    '[twit][f2s2]rel_vol',
    '[f2s2]interact_sent_vol',
    '[crsp]mrktcap',
]]

Unnamed: 0,date,[rh]performance_1,[twit][f2s2]method_1,[twit][f2s2]total,[twit][f2s2]rel_vol,[f2s2]interact_sent_vol,[crsp]mrktcap
1,2018-05-08,-0.003110,0.746914,324.0,0.493580,242.0,9.100870e+08
2,2018-05-09,-0.000739,0.607955,176.0,0.395633,107.0,9.144614e+08
3,2018-05-10,0.000816,0.673077,104.0,0.349832,70.0,9.209003e+08
5,2018-05-14,0.000028,0.694118,170.0,0.855500,118.0,9.269459e+08
6,2018-05-15,-0.001767,0.666667,87.0,0.657667,58.0,9.247832e+08
...,...,...,...,...,...,...,...
556,2020-08-07,0.012765,0.792222,900.0,1.071064,713.0,1.948022e+09
557,2020-08-10,0.022062,0.765583,738.0,1.186768,565.0,1.900306e+09
558,2020-08-11,0.010803,0.654717,530.0,0.959897,347.0,1.927926e+09
559,2020-08-12,0.016233,0.641243,354.0,0.717221,227.0,1.870590e+09


In [125]:

# Specification 1: performance on sentiment, tweet count, their interaction
# and market cap. The Stata names are the DataFrame column names without the
# bracket characters. `Stata` is the helper class defined in a later cell.
stata_code = "regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol crspmrktcap"
Stata.run(dataframe=df, functions=stata_code)

. regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol 
> crspmrktcap

      Source |       SS           df       MS      Number of obs   =       548
-------------+----------------------------------   F(4, 543)       =     45.19
       Model |  .006257554         4  .001564388   Prob > F        =    0.0000
    Residual |  .018798487       543   .00003462   R-squared       =    0.2497
-------------+----------------------------------   Adj R-squared   =    0.2442
       Total |   .02505604       547  .000045806   Root MSE        =    .00588

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |  -.0022792   .0037684    -0.60   0.546    -.0096816    .0051231
twitf2s2to~l |   7.07e-06   3.88e-06     1.82   0.069    -5.47e-07    .0000147
f2s2intera~l |  -2.63e-06   5.97e-0

In [127]:

# Specification 2: as the full model but without the market-cap regressor.
# `Stata` is the helper class defined in a later cell of this notebook.
stata_code = "regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol"
Stata.run(dataframe=df, functions=stata_code)

. regress rhperformance_1 twitf2s2method_1 twitf2s2total f2s2interact_sent_vol

      Source |       SS           df       MS      Number of obs   =       548
-------------+----------------------------------   F(3, 544)       =     20.40
       Model |  .002533516         3  .000844505   Prob > F        =    0.0000
    Residual |  .022522524       544  .000041402   R-squared       =    0.1011
-------------+----------------------------------   Adj R-squared   =    0.0962
       Total |   .02505604       547  .000045806   Root MSE        =    .00643

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |   .0039622   .0040681     0.97   0.330    -.0040288    .0119533
twitf2s2to~l |   3.06e-06   4.22e-06     0.73   0.468    -5.23e-06    .0000113
f2s2intera~l |   5.27e-06   6.48e-06     0.81   0.

In [128]:

# Specification 3: performance on the sentiment score and the tweet count only.
# `Stata` is the helper class defined in a later cell of this notebook.
stata_code = "regress rhperformance_1 twitf2s2method_1 twitf2s2total"
Stata.run(dataframe=df, functions=stata_code)

. regress rhperformance_1 twitf2s2method_1 twitf2s2total

      Source |       SS           df       MS      Number of obs   =       548
-------------+----------------------------------   F(2, 545)       =     30.28
       Model |  .002506102         2  .001253051   Prob > F        =    0.0000
    Residual |  .022549938       545  .000041376   R-squared       =    0.1000
-------------+----------------------------------   Adj R-squared   =    0.0967
       Total |   .02505604       547  .000045806   Root MSE        =    .00643

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |   .0060289   .0031769     1.90   0.058    -.0002115    .0122693
twitf2s2to~l |   6.42e-06   8.61e-07     7.46   0.000     4.73e-06    8.11e-06
       _cons |  -.0040004   .0020734    -1.93   0.054    -.0080733    .0

In [130]:

# Specification 4: performance on the sentiment score alone.
# `Stata` is the helper class defined in a later cell of this notebook.
stata_code = "regress rhperformance_1 twitf2s2method_1"
Stata.run(dataframe=df, functions=stata_code)

. regress rhperformance_1 twitf2s2method_1

      Source |       SS           df       MS      Number of obs   =       548
-------------+----------------------------------   F(1, 546)       =      4.53
       Model |  .000206278         1  .000206278   Prob > F        =    0.0337
    Residual |  .024849762       546  .000045512   R-squared       =    0.0082
-------------+----------------------------------   Adj R-squared   =    0.0064
       Total |   .02505604       547  .000045806   Root MSE        =    .00675

------------------------------------------------------------------------------
rhperforma~1 | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
twitf2s2me~1 |   .0070863   .0033286     2.13   0.034     .0005479    .0136246
       _cons |  -.0028336   .0021684    -1.31   0.192     -.007093    .0014259
------------------------------------------------------------------------------


Stata

In [104]:
# Smoke test of the pystata bridge: have Stata display a fixed string.
# Note that `df` is rebound to an empty frame here.
df = pd.DataFrame()
stata_code = 'di "Hello, World!"'

# The command string is passed as-is; echo=True prints it ahead of its output.
stata.run(stata_code, echo=True)

. di "Hello, World!"
Hello, World!


In [103]:
# Start Stata for use from Python. This repeats the initialisation done in the
# "STATA code" section, so this cell also works on its own.
import os
# The working directory is switched to Stata's utilities folder before pystata
# is imported (presumably so the package can be found there; confirm).
# NOTE: os.chdir changes the working directory for the rest of the session.
os.chdir("C:/Program Files/Stata17/utilities")
from pystata import config
# "mp" selects the Stata edition to initialise.
config.init("mp")

# `stata` is the module the helper classes below call; it is imported only
# after config.init has run.
from pystata import stata

# Disabled alternative: initialise through the stata_setup package instead of
# changing the working directory.
# # Setup Stata
# import stata_setup
# stata_setup.config("C:/Program Files/Stata17", "mp")

# from pystata import stata
# # stata.config.status()


class Tools():
    """Small helpers for turning Python variable lists into Stata syntax."""

    @staticmethod
    def varlist_to_string(var_list):
        """Join the items of ``var_list`` into one space-separated string.

        Each item is converted with ``str`` first, so non-string items are
        accepted.
        """
        return ' '.join(str(var) for var in var_list)

    @staticmethod
    def list_check(list_or_str):
        """Return ``list_or_str`` as a list, wrapping a lone string.

        A string (including instances of ``str`` subclasses) is wrapped in a
        one-element list and a notice is printed; any other value is returned
        unchanged.
        """
        # isinstance also catches str subclasses. The previous type-name
        # comparison let those through unwrapped, and a bare string is later
        # iterated character by character when the variable list is joined.
        if isinstance(list_or_str, str):
            print("Converted var to list")
            return [list_or_str]
        return list_or_str

class Stata():
    """Build Stata command strings and run them through pystata.

    The ``*_code`` methods only assemble text; ``run`` and ``retrieve_dataset``
    talk to the Stata session via the ``stata`` module.
    """

    @staticmethod
    def regress_code(y_vars, x_vars, function="regress", extras=""):
        """Return one ``<function> <y> <x...><extras>`` line per dependent variable.

        ``y_vars`` and ``x_vars`` may each be a single name or a list of names.
        ``extras`` is appended directly after the regressors with no separator,
        so it has to carry its own leading comma or space. Every line ends with
        a newline.
        """
        y_vars = Tools.list_check(y_vars)
        x_vars = Tools.varlist_to_string(Tools.list_check(x_vars))
        return "".join(f"{function} {y} {x_vars}{extras}\n" for y in y_vars)

    @staticmethod
    def grstest2_code(y_vars, x_vars, function="grstest2", extras=""):
        """Return one ``<function> <y>, flist(<x...>) <extras>`` line per dependent variable.

        ``y_vars`` and ``x_vars`` may each be a single name or a list of names.
        Every line ends with a newline.
        """
        y_vars = Tools.list_check(y_vars)
        x_vars = Tools.varlist_to_string(Tools.list_check(x_vars))
        return "".join(f"{function} {y}, flist({x_vars}) {extras}\n" for y in y_vars)

    @staticmethod
    def ttest_code(vars, test_value):
        """Return one ``ttest <var> == <test_value>`` line per variable.

        ``vars`` may be a single name or a list of names. Every line ends with
        a newline.
        """
        vars = Tools.list_check(vars)
        return "".join(f"ttest {var} == {test_value}\n" for var in vars)

    @staticmethod
    def run(dataframe, functions):
        """Load ``dataframe`` into Stata, execute ``functions`` and print a recap.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Data that becomes Stata's current dataset.
        functions : str
            One or more Stata commands, e.g. the output of the ``*_code``
            builders of this class.
        """
        # force=True lets the load replace a dataset already held in Stata's memory.
        stata.pdataframe_to_data(dataframe, force=True)

        # echo=True makes Stata print each command ahead of its output.
        commands = f"{functions}"
        stata.run(commands, echo=True)

        # Put the closing rule on its own line even when the command text has
        # no trailing newline; previously it was glued onto the last command.
        separator = "" if commands.endswith("\n") else "\n"
        print(f"\n\nStata processed the following instructions\n"
              f"----------------------------------------\n"
              f"{commands}{separator}"
              f"----------------------------------------")

    @staticmethod
    def retrieve_dataset():
        """Return Stata's current dataset as produced by ``stata.pdataframe_from_data``."""
        return stata.pdataframe_from_data()

