# Robinhood data

The data from Robintrack is used as a proxy for Robinhood trader behaviour. It contains quite a lot of information which is not relevant for this research. As I am only looking at daily data, most hourly data points within this dataset are not used. Hence, the code in this file filters and adjusts this data.


The Robintrack data set contains data points outside trading hours.
This is the case either
- during the weekend, or
- on weekdays outside trading hours.
  Trading hours are weekdays from (Robinhood, 2022a):
  - 09:30 to 16:00 Eastern Daylight Time (EDT)
  - 15:30 to 22:00 Central European Summer Time (CEST)
  - 13:30 to 20:00 Coordinated Universal Time (UTC)
  (When daylight saving time is not in effect in the US, the UTC window is 14:30 to 21:00.)

The Robintrack data set is based on UTC times. I take the last `users_holding` value of each day and use it as that day's closing value.

In [3]:
import os
import pandas as pd
pd.set_option('display.max_rows', 500)

In [4]:
def loop_tickers(func):
    """Decorator that calls ``func`` once for every file in a directory.

    The decorated function must be called with a ``file_dir`` keyword
    argument naming the directory to loop over. For each regular file in
    that directory (sub-directories are skipped) ``func`` is invoked with:

    - ``csv_path``: the full path of the file,
    - ``ticker``: the file name without its extension,
    - every positional and keyword argument given to the wrapper
      (including ``file_dir``).

    Files are visited in sorted name order so runs are reproducible.

    Returns:
        The return value of the last ``func`` call, or ``None`` when the
        directory contains no files.

    Raises:
        ValueError: if ``file_dir`` is not passed as a keyword argument.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def wrap(*args, **kwargs):
        if 'file_dir' not in kwargs:
            raise ValueError("Wrapper can not loop, as no file directory given. Please specify which folder needs to be looped by setting 'file_dir' variable.")

        # Use the directory that was actually passed in, not a global.
        file_dir = kwargs['file_dir']

        # Defined up front so an empty directory returns None instead of
        # failing with UnboundLocalError.
        result = None

        for filename in sorted(os.listdir(file_dir)):
            csv_path = os.path.join(file_dir, filename)

            # Only files are processed; nested folders are ignored.
            if not os.path.isfile(csv_path):
                continue

            # Strip only the final extension, independent of the path
            # separator, so e.g. 'BRK.A.csv' and 'BRK.B.csv' stay distinct.
            kwargs['ticker'] = os.path.splitext(filename)[0]

            message = f"""Now executing function {func.__name__} for [{filename}], variables:
                    - csv_path: [{csv_path}]
                    - kwargs: [{kwargs}])"""
            print(message)

            result = func(*args, csv_path=csv_path, **kwargs)

        return result
    return wrap

In [5]:
# NOTE(review): `x` is not referenced anywhere in the visible code; confirm
# it is unused elsewhere before removing it.
x = False
def filters(df):
    """Reduce intraday observations to one row per weekday.

    Args:
        df: DataFrame with a ``timestamp`` column (anything
            ``pd.to_datetime`` can parse) and a numeric ``users_holding``
            column. The input is not modified.

    Returns:
        A new DataFrame holding, for every weekday present in the data, the
        last observation of that day, extended with the columns:

        - ``date``: calendar date of the timestamp,
        - ``dayname``: English weekday name,
        - ``change``: difference in ``users_holding`` versus the previous
          retained day,
        - ``pct_change``: relative difference, rounded to 4 decimals.

        Rows containing any missing value are dropped; this removes at
        least the first retained day, which has no previous day to compare
        against.
    """
    #     -----------------     Filtering data and convert to daily     -----------------
    # Parse timestamps; unparseable values become NaT.
    stamps = pd.to_datetime(df['timestamp'], errors='coerce')

    # assign() returns a new frame, so the caller's DataFrame is untouched.
    daily = df.assign(
        timestamp=stamps,
        date=stamps.dt.date,
        dayname=stamps.dt.day_name(),
    )

    # Keep weekday rows with a valid timestamp. Rows without a timestamp
    # cannot be assigned to a day, so they are removed before grouping.
    keep = stamps.notna() & ~daily['dayname'].isin(['Saturday', 'Sunday'])

    # Keep only the last value of each day. This will portray the closing
    # value of the users_holding variable for each day. The copy() makes the
    # result independent, so the column assignments below are safe.
    daily = daily[keep].sort_values('timestamp').groupby('date').tail(1).copy()

    #     -----------------     Creating new data     -----------------
    # Numerical day-over-day difference in users
    daily['change'] = daily['users_holding'].diff()

    # Relative day-over-day difference in users
    daily['pct_change'] = daily['users_holding'].pct_change().round(4)

    # Dropping na values (this should only be the first row)
    return daily.dropna()

In [None]:
@loop_tickers
def filter_data(csv_path, *args, **kwargs):
    """Convert one raw ticker file to daily data and save it as CSV.

    Called once per file by the ``loop_tickers`` decorator.

    Args:
        csv_path: path of the raw CSV file to read.
        **kwargs: must contain ``file_dir`` (the directory being looped)
            and ``ticker`` (used as the output file name).

    The result is written to ``<parent of file_dir>/daily_data/<ticker>.csv``.
    """
    df = filters(pd.read_csv(csv_path))

    # Save into a 'daily_data' folder next to the input folder. normpath
    # removes a trailing separator, which would otherwise make dirname()
    # return the input folder itself.
    parent_dir = os.path.dirname(os.path.normpath(kwargs['file_dir']))
    base_path = os.path.join(parent_dir, 'daily_data').replace('\\', '/')

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(base_path, exist_ok=True)

    ticker = kwargs['ticker']
    save_path = os.path.join(base_path, f"{ticker}.csv").replace('\\', '/')

    # Save results
    df.to_csv(save_path, encoding='utf-8')


# Directory whose files are processed one by one (machine-specific absolute path).
filedir = r"E:/Users/Christiaan/Large_Files/Thesis/robintrack/popularity_export"
# Run the daily filter over every file in that directory.
filter_data(file_dir= filedir)



In [32]:
# Spot check: run the daily filter on a single ticker file and display the result.
csv_path = r"E:\Users\Christiaan\Large_Files\Thesis\robintrack\popularity_export\AAPL.csv"
df = filters(pd.read_csv(csv_path))
df

Unnamed: 0,timestamp,users_holding,date,dayname,change,pct_change
20,2018-05-03 23:43:06,143747,2018-05-03,Thursday,-7038.0,-0.0467
44,2018-05-04 23:43:08,141424,2018-05-04,Friday,-2323.0,-0.0162
112,2018-05-07 23:43:08,141179,2018-05-07,Monday,-245.0,-0.0017
136,2018-05-08 23:43:12,140820,2018-05-08,Tuesday,-359.0,-0.0025
160,2018-05-09 23:43:07,140742,2018-05-09,Wednesday,-78.0,-0.0006
...,...,...,...,...,...,...
19656,2020-08-07 23:46:09,683299,2020-08-07,Friday,9310.0,0.0138
19728,2020-08-10 23:45:58,698981,2020-08-10,Monday,15682.0,0.0230
19752,2020-08-11 23:46:11,706672,2020-08-11,Tuesday,7691.0,0.0110
19771,2020-08-12 23:46:30,718187,2020-08-12,Wednesday,11515.0,0.0163


## ToDo


In [7]:
# Remove firms which don't have data for the full sample period.

# Calculate total amount of stock holdings for ALL STOCKS

# See effect forecasting method: % change in total users * % of total users which own particular stock