In [1]:
# Import statements go here
import numpy as np
import emoji
import regex
import pandas as pd

from ipynb.fs.full.WordProcessing import get_processed_data, convert_to_dict


In [2]:
# Preprocessing functions go here

def extract_emojis(input_file):
    emoji_list = []
    emoji_mat = {}
    posts = input_file.readlines()
    for line in posts:
        post = line[0]
        data = regex.findall(r'\X', post)
        for word in data:
            if any(char in emoji.EMOJI_DATA for char in word):
                if word not in emoji_list:
                    emoji_list.append(word)
    
    for key in emoji_list:
        emoji_mat[key] = np.zeros(len(posts)-1)

    i = 0
    for line in posts:
        post = line[0]
        data = regex.findall(r'\X', post)
        for word in data:
            for char in word:
                if emoji_mat.get(char) is not None:
                    emoji_mat[char][i] += 1
        i += 1
    return emoji_mat


In [3]:
# Run preprocessing code
def preprocess():
    f = open('dataset/reddit_wsb.csv', "r")
    emoji_mat = extract_emojis(f)
    for emoji_val in mat.keys():
        print(emoji_val, np.sum(mat[emoji_val]))
    return emoji_mat

In [4]:
# Run preprocessing code
def preprocess(f_in):
    print("[preprocess::extract emojis]")
    f = open(f_in, "r")
    my_dict_emoji = extract_emojis(f)

    # open and process
    print("[preprocess::extract titles and body]")
    data = pd.read_csv(f_in)
    processed=get_processed_data(data)
    my_dict_titles = convert_to_dict(processed,"title")
    my_dict_body = convert_to_dict(processed,"body")
    print("[preprocess::complete]")
    return my_dict_emoji, my_dict_titles, my_dict_body
    
#dict_emojis, dict_titles, dict_body = preprocess('dataset/reddit_wsb.csv')

In [5]:
# sanity check
#dict_titles[2345]
#dict_body[2345]
#for emoji_val in dict_emojis.keys():
#        print(emoji_val, np.sum(dict_emojis[emoji_val]))

In [51]:
import yfinance as yf

def get_ticker_data(f_in, tickers_found):
    file = pd.read_csv(f_in)
    tickers_file = pd.read_csv(tickers_found)
    tickers_set = set()
    for i, row in tickers_file.iterrows():
        #print(row)
        if not pd.isnull(row['Ticker']):
            row_ticker_list = row['Ticker'].replace('{','').replace('}','').replace('\'', '').strip().split(',')
            #print(row_ticker_list)
            for ticker in row_ticker_list:
                tickers_set.add(ticker.strip())
            
    #print(tickers_set)
    
    data_tickers = pd.concat([file, tickers_file['Ticker']], axis=1)
    condensed_file = data_tickers.dropna(subset=['Ticker'])
    sorted_file = condensed_file.sort_values(['timestamp'],ascending=True)
    start_date = sorted_file.iloc[0]['timestamp'].split()[0]
    end_date = sorted_file.iloc[-1]['timestamp'].split()[0]
    #print(start_date,end_date)
    
    ticker_str = ' '.join(tickers_set)
    #all_tickers = yf.Tickers(ticker_str)
    data = yf.download(ticker_str, start=start_date, end=end_date, group_by = 'ticker', auto_adjust=True, prepost=True)
    return data, tickers_set
    
    

In [73]:
data, ticker_set = get_ticker_data('dataset/reddit_wsb.csv','dataset/ticker_location.csv')
data.to_excel("tickerdata.xlsx")
print(ticker_set)

[*********************100%***********************]  25 of 25 completed
{'PLTR', 'AMD', 'SPCE', 'RKT', 'REAL', 'AMC', 'DD', 'ONE', 'GME', 'TSLA', 'CRSR', 'BB', 'RH', 'CLOV', 'AM', 'NOK', 'WISH', 'BY', 'UWMC', 'MVIS', 'APP', 'NIO', 'TD', 'SNDL', 'AAL'}


In [53]:
print(data['GME']['Open'] - data['GME']['Close'])
print(data['GME'])
#for line in data['GME','Open']:
#    print(line)
    
#print(data['GME']['2021-01-28'])

Date
2021-01-28    71.399994
2021-01-29    54.709991
2021-02-01    91.559998
2021-02-02    50.759995
2021-02-03    19.599998
                ...    
2021-08-09    -9.330002
2021-08-10     2.309998
2021-08-11    -0.350006
2021-08-12    -2.470001
2021-08-13    -2.050003
Length: 138, dtype: float64
                  Open        High         Low       Close    Volume
Date                                                                
2021-01-28  265.000000  483.000000  112.250000  193.600006  58815800
2021-01-29  379.709991  413.980011  250.000000  325.000000  50566100
2021-02-01  316.559998  322.000000  212.000000  225.000000  37382200
2021-02-02  140.759995  158.000000   74.220001   90.000000  78183100
2021-02-03  112.010002  113.400002   85.250000   92.410004  42698500
...                ...         ...         ...         ...       ...
2021-08-09  151.800003  164.710007  150.660004  161.130005   2249200
2021-08-10  161.360001  166.899994  155.350006  159.050003   1623300
2021-08-11  1

In [68]:
from dateutil import parser

def append_ticker_data(data, ticker_set):
    file = pd.read_csv('dataset/reddit_wsb.csv')
    tickers_file = pd.read_csv('dataset/ticker_location.csv')
    data_tickers = pd.concat([file, tickers_file['Ticker']], axis=1)
    condensed_file = data_tickers.dropna(subset=['Ticker'])
    sorted_file = condensed_file.sort_values(['timestamp'],ascending=True)
    start_date_str = sorted_file.iloc[0]['timestamp'].split()[0]
    start_date = parser.parse(start_date_str)
    # Store the change in stock prices for each day
    ticker_deltas = {}
    for ticker in ticker_set:
        ticker_deltas[ticker] = data[ticker, 'Close'] - data[ticker, 'Open']
    
    print(ticker_deltas['GME'][5])
    # Line up those changing stock prices with the dataset
    dataset_deltas = []
    for i, row in tickers_file.iterrows():
        if not pd.isnull(row['Ticker']):
            row_ticker_list = row['Ticker'].replace('{','').replace('}','').replace('\'', '').strip().split(',')
            date_str = file.iloc[i]['timestamp'].split()[0] # Rows correspond to each other
            date = parser.parse(date_str)
            
            # Get the date difference
            str_diff = str(date - start_date)
            datediff = 0
            if str_diff[0] == '0':
                datediff = 0
            else:
                datediff = int(str_diff.split()[0])
            ticker_dict = {}
            for ticker in row_ticker_list:
                check_ticker = ticker.strip()
                ticker_dict[check_ticker] = ticker_deltas[check_ticker][datediff]
            dataset_deltas.append(ticker_dict)
        else:
            dataset_deltas.append(None)
    print(dataset_deltas)
            
                

In [69]:
append_ticker_data(data, ticker_set)

-37.69000244140625


IndexError: index 143 is out of bounds for axis 0 with size 138