# Setup

In [91]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import torch
import pickle

In [3]:
df = pd.read_json("data/analysis/stock_data.json")
df_batch = pd.read_json("data/analysis/batch_data.json")
df_train = pd.read_json("data/analysis/stock_data_train.json")
df_test = pd.read_json("data/analysis/stock_data_test.json")

In [4]:
df_train.head(1)

Unnamed: 0,y_batch,main_mv_percent_batch,word_batch,n_msgs_batch,T_batch,main_target_date_batch,batch_size,price_batch,texts_batch,stock_batch,s,n_words_batch
0,"[[[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0....","[-0.005330999847501, 0.006130000110715001, 0.0...","[[[[18764, 25572, 4165, 26136, 16603, 322, 247...","[[12, 2, 1, 4, 0], [20, 2, 1, 0, 0], [3, 20, 2...","[4, 3, 3, 3, 5, 4, 3, 3, 5, 4, 3, 2, 3, 3, 4, ...","[2015-07-30, 2015-07-29, 2015-07-28, 2015-07-2...",135,"[[[0.06799300014972601, 0.04161800071597, -0.4...","[[[['unh', 'unitedhealth', 'group', ',', 'inc'...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",GE,"[[[20, 21, 27, 23, 18, 19, 19, 22, 19, 18, 27,..."


# Create Training Data: Price & Label

In [5]:
# Select only stocks with batch_size > 120
# We take arbitrary number 120
B = 120
sel_stocks = df_train[df_train.batch_size > 120]
sel_stocks.s

0       GE
9      JPM
11     BAC
13       C
19       D
26    AAPL
44    GOOG
45    MSFT
46      FB
47       T
52    INTC
63    CELG
64    AMZN
73    PCLN
Name: s, dtype: object

In [6]:
# Sanity check: check the ratio of up and down labels
# Get y_batch from sel_stocks and count number of [1,0] and [0,1] labels
y_counts = {'up': 0, 'down': 0}
for y_batch in sel_stocks.y_batch:
    y_arr = np.array(y_batch)
    y_counts['up'] += np.sum(np.all(y_arr == [1,0], axis=1))
    y_counts['down'] += np.sum(np.all(y_arr == [0,1], axis=1))

# find ratio of up and down
ratio = y_counts['up'] / (y_counts['up'] + y_counts['down'])
ratio, 1-ratio

(0.4614065180102916, 0.5385934819897085)

In [7]:
OUT = False
if OUT:
    # Fetch price_batch from sel_stocks and limit them to 100 batches per stock
    price_batches = []
    for price_batch in sel_stocks.price_batch:
        price_batches.append(price_batch[:120])
    price_batches = np.array(price_batches)
    # Save 120 .npy files of (14, 5, 3) dim numpy arrays
    for i in range(price_batches.shape[1]):
        np.save(f"data/out/train_price/{str(i).zfill(10)}.npy", price_batches[:, i, :])

    # Fetch y_batch from sel_stocks and limit them to 100 batches per stock
    y_batches = []
    for y_batch in sel_stocks.y_batch:
        y_batches.append(y_batch[:120])
    y_batches = np.array(y_batches)
    # Save 120 .npy files of (14, 5, 2) dim numpy arrays
    for i in range(y_batches.shape[1]):
        np.save(f"data/out/train_label/{str(i).zfill(10)}.npy", y_batches[:, i, :])


In [57]:
# Copy newly generated data to man_sf_emnlp repo
!cp -r data/out/train_label/ ../man-sf-emnlp/train_label/
!cp -r data/out/train_price/ ../man-sf-emnlp/train_price/


# Create Training Data: Tweets

In [9]:
# Get texts_batch from sel_stocks and convert to embeddings
print("Loading Universal Sentence Encoder model...")
embed = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4")

Loading Universal Sentence Encoder model...


In [11]:
# Process each batch of texts
# Numpy array to store all embeddings

debug = True
max_tweets = 30  # Max tweets per day
n_window = 5     # Days window
emb_size = 512   # Embedding dimension
padding = np.zeros(emb_size)  # Use numpy zeros for padding

num_stocks = len(sel_stocks)
B = 120  # Number of batches

# Initialize output array with zeros
all_embeddings = np.zeros((B, num_stocks, n_window, max_tweets, emb_size))

# Process each stock
for stock_idx, texts_batch in enumerate(sel_stocks.texts_batch):
    # Process each batch
    for batch_idx, batch in enumerate(texts_batch[:B]):
        # Process each day window
        for day_idx, days in enumerate(batch[:n_window]):
            # Process tweets for this day
            day_tweets = []
            for tweets in days:
                for tweet in tweets:
                    # Join words into single string, filtering out empty strings
                    tweet_text = ' '.join([w for w in tweet if w])
                    tweet_vec = embed([tweet_text])[0].numpy()
                    day_tweets.append(tweet_vec)

            # Pad or truncate tweets for this day
            if len(day_tweets) > 0:
                day_tweets = np.array(day_tweets[:max_tweets])
                if len(day_tweets) < max_tweets:
                    padding_needed = max_tweets - len(day_tweets)
                    padding_array = np.zeros((padding_needed, emb_size))
                    day_tweets = np.vstack([day_tweets, padding_array])
            else:
                day_tweets = np.zeros((max_tweets, emb_size))
                
            # Store in final array
            all_embeddings[batch_idx, stock_idx, day_idx] = day_tweets

    if debug and stock_idx == 0:
        break

print(f"Final shape: {all_embeddings.shape}")


Final shape: (120, 14, 5, 30, 512)


In [21]:
for i in range(all_embeddings.shape[0]):
    np.save(f"data/out/train_text/{str(i).zfill(10)}.npy", all_embeddings[i, :, :, :, :])

In [25]:
# Sanity check the embedding shapes
print(np.load(f"../man-sf-emnlp/train_text/{str(1).zfill(10)}.npy").shape)


(14, 5, 30, 512)


In [83]:
# Copy newly generated data to man-sf-emnlp repo
!cp -r data/out/train_text/ ../man-sf-emnlp/train_text/

# Create Testing Data: Price & Label

In [None]:
# Save df_test y_batch and price_batch as pickle files based on date

# Select only stocks in df_test that are in df_train.s 
sel_stocks_test = df_test[df_test.s.isin(sel_stocks.s)].set_index('s')
test_batch_size = sel_stocks_test.batch_size.min()

In [None]:
np.array(sel_stocks_test.y_batch[0]).shape

(23, 5, 2)

In [93]:
OUT = True
if OUT:
    # Fetch price_batch from sel_stocks_test and limit them to 100 batches per stock
    price_batches = []
    for price_batch in sel_stocks_test.price_batch:
        price_batches.append(price_batch[:test_batch_size])
    price_batches = np.array(price_batches)
    
    # Create dictionary to store price data
    price_dict = {}
    for i in range(price_batches.shape[1]):
        key = str(i).zfill(10)
        price_dict[key] = price_batches[:, i, :]
    
    # Save price dictionary
    with open('data/out/price_feature_data.p', 'wb') as f:
        pickle.dump(price_dict, f)

    # Fetch y_batch from sel_stocks_test and limit them to 100 batches per stock
    y_batches = []
    for y_batch in sel_stocks_test.y_batch:
        y_batches.append(y_batch[:test_batch_size])
    y_batches = np.array(y_batches)
    
    # Create dictionary to store label data
    label_dict = {}
    
    # Original shape:(num_stocks, test_batch_size, 5, 2)
    for batch_idx in range(y_batches.shape[1]):
        batch_labels = []
        for stock_idx in range(y_batches.shape[0]):
            # Get the 5-day window for current stock and batch
            window = y_batches[stock_idx, batch_idx]
            # Find rows where one-hot encoding sums to 1 (valid labels)
            nonzero_rows = (window.sum(axis=1) == 1.0).nonzero()[0]
            if len(nonzero_rows) > 0:
                # Get last valid label in window
                last_effective_idx = nonzero_rows[-1]
                last_effective_label = window[last_effective_idx]
                batch_labels.append(last_effective_label)
        # Store labels for this batch in dictionary
        batch_labels = np.array(batch_labels)  # Shape: (num_stocks, 2)
        key = str(batch_idx).zfill(10)
        label_dict[key] = batch_labels
    
    # Save label dictionary
    with open('data/out/label_data.p', 'wb') as f:
        pickle.dump(label_dict, f)

In [95]:
# Sanity check the pickle data
with open('data/out/price_feature_data.p', 'rb') as f:
    price_dict = pickle.load(f)
print(price_dict['0000000001'].shape)

with open('data/out/label_data.p', 'rb') as f:
    label_dict = pickle.load(f)
print(label_dict['0000000001'].shape)


(14, 5, 3)
(14, 2)


In [98]:
# Copy newly generated data to man_sf_emnlp repo
!cp -r data/out/label_data.p ../man-sf-emnlp/label_data.p
!cp -r data/out/price_feature_data.p ../man-sf-emnlp/price_feature_data.p

# Create Testing Data: Tweets

In [85]:
# Get texts_batch from sel_stocks and convert to embeddings
print("Loading Universal Sentence Encoder model...")
embed = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4")

Loading Universal Sentence Encoder model...


In [86]:
# Process each batch of texts
# Numpy array to store all embeddings

debug = True
max_tweets = 30  # Max tweets per day
n_window = 5     # Days window
emb_size = 512   # Embedding dimension
padding = np.zeros(emb_size)  # Use numpy zeros for padding

num_stocks = len(sel_stocks_test)
B = test_batch_size  # Number of batches

# Initialize output array with zeros
all_embeddings = np.zeros((B, num_stocks, n_window, max_tweets, emb_size))

# Process each stock
for stock_idx, texts_batch in enumerate(sel_stocks_test.texts_batch):
    # Process each batch
    for batch_idx, batch in enumerate(texts_batch[:B]):
        # Process each day window
        for day_idx, days in enumerate(batch[:n_window]):
            # Process tweets for this day
            day_tweets = []
            for tweets in days:
                for tweet in tweets:
                    # Join words into single string, filtering out empty strings
                    tweet_text = ' '.join([w for w in tweet if w])
                    tweet_vec = embed([tweet_text])[0].numpy()
                    day_tweets.append(tweet_vec)

            # Pad or truncate tweets for this day
            if len(day_tweets) > 0:
                day_tweets = np.array(day_tweets[:max_tweets])
                if len(day_tweets) < max_tweets:
                    padding_needed = max_tweets - len(day_tweets)
                    padding_array = np.zeros((padding_needed, emb_size))
                    day_tweets = np.vstack([day_tweets, padding_array])
            else:
                day_tweets = np.zeros((max_tweets, emb_size))
                
            # Store in final array
            all_embeddings[batch_idx, stock_idx, day_idx] = day_tweets

    if debug and stock_idx == 0:
        break

print(f"Final shape: {all_embeddings.shape}")


Final shape: (20, 14, 5, 30, 512)


In [96]:
# Create dictionary to store embeddings
embeddings_dict = {}
for i in range(all_embeddings.shape[0]):
    embeddings_dict[str(i).zfill(10)] = all_embeddings[i, :, :, :, :]

# Save dictionary as pickle file
with open('data/out/text_feature_data.p', 'wb') as f:
    pickle.dump(embeddings_dict, f)

In [97]:
# Sanity check the embedding shapes
with open('data/out/text_feature_data.p', 'rb') as f:
    embeddings_dict = pickle.load(f)
print(embeddings_dict['0000000001'].shape)


(14, 5, 30, 512)


In [99]:
# Copy newly generated data to man-sf-emnlp repo
!cp -r data/out/text_feature_data.p ../man-sf-emnlp/text_feature_data.p

# Sandbox

In [77]:
# Read train_label from data/out/train_label
train_label = np.load("data/out/train_label/0000000001.npy")

# df_train[['y_batch', 'price_batch']].head(1).values
torch.max(torch.LongTensor(train_label), 1)[1], '\n', train_label
# train_label

(tensor([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1]),
 '\n',
 array([[0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.]]))

In [50]:
    
# Read all files within data/out/train_label/ and rewrite each file 
import os
import numpy as np

# Get all files in train_label_v1 directory
label_files = sorted(os.listdir("data/out/train_label_v1/"))

# Create output directory if it doesn't exist
os.makedirs("data/out/train_label/", exist_ok=True)

# Process each file
for filename in label_files:
    # Load label file
    train_label = np.load(f"data/out/train_label_v1/{filename}")
    
    # Extract last effective label for each sequence
    labels = []
    for i in range(train_label.shape[0]):
        nonzero_rows = (train_label[i].sum(axis=1) == 1.0).nonzero()[0]
        if len(nonzero_rows) > 0:
            last_effective_idx = nonzero_rows[-1]
            last_effective_label = train_label[i, last_effective_idx]
            labels.append(last_effective_label)
        
    # Save processed labels
    labels = np.array(labels)
    np.save(f"data/out/train_label/{filename}", labels)



In [44]:

# Read train_price from data/out/train_price
train_price = np.load("data/out/train_price/0000000001.npy")
train_price[0]
# df_train[['price_batch']].head(1).values


array([[ 0.067993,  0.041618, -0.478625],
       [ 0.075076,  0.056041,  0.187695],
       [ 0.077466,  0.065559,  0.140772],
       [ 0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ]])

In [55]:
train_label = np.load("data/out/train_label_v1/0000000001.npy")
train_label[0]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 0.],
       [0., 0.]])

In [78]:
dfexp = pd.read_json("data/analysis/stock_data_v2.json")
dfexp.head(1)


Unnamed: 0,y_batch,main_mv_percent_batch,word_batch,n_msgs_batch,T_batch,main_target_date_batch,batch_size,price_batch,texts_batch,stock_batch,s,n_words_batch
0,"[[[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 0....","[-0.007352999877184001, 0.012298000045120001, ...","[[[[20239, 22919, 26136, 16603, 322, 2804, 819...","[[4, 2, 5, 0, 0], [8, 2, 0, 0, 0], [1, 4, 2, 0...","[3, 2, 3, 2, 3, 4, 3, 4, 3, 3, 3, 5, 3, 3, 4, ...","[2015-12-30, 2015-12-29, 2015-12-23, 2015-11-3...",178,"[[[0.047589000314474, 0.039459999650716004, -0...","[[[['aa', 'alcoa', ',', 'inc', '.', 'ask', 'UR...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",GE,"[[[18, 18, 26, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [64]:
def find_factors(n):
    return [i for i in range(1, n+1) if n % i == 0]
find_factors(26614)

[1, 2, 7, 14, 1901, 3802, 13307, 26614]

In [12]:
df_train.shape, df_test.shape, df.shape

((74, 11), (58, 11), (75, 11))

In [18]:
df_train.batch_size.sum(), df_test.batch_size.sum(), df.batch_size.sum()

(4987, 1008, 6651)

In [17]:
# np.array(df.y_batch[0]).shape
shapes = []
df.y_batch.apply(lambda x: shapes.append(np.array(x).shape))
pd.Series(shapes).shape

(75,)

In [12]:
nsample_per_stock = []
df.y_batch.apply(lambda x: nsample_per_stock.append(len(np.array(x))))
pd.Series(nsample_per_stock).value_counts(), sum(nsample_per_stock)


(5      3
 2      3
 93     2
 3      2
 15     2
       ..
 43     1
 156    1
 123    1
 64     1
 218    1
 Name: count, Length: 61, dtype: int64,
 6651)

In [5]:
ndatapoint_per_stock = []
df.y_batch.apply(lambda x: ndatapoint_per_stock.append(
        np.array(x).shape[0] * np.array(x).shape[1]
    )
)
sum(ndatapoint_per_stock)

33255

In [24]:
len(df.n_words_batch.apply(lambda x: np.array(x))[0][0][1])

30

In [23]:
# The column T_batch is a list of lists, each sublist is a list of days
# Find the number of unique days in T_batch
days = []
df_batch.T_batch.apply(lambda x: days.extend(x))
pd.Series(days).value_counts()

3    2029
4     515
2     374
5     282
Name: count, dtype: int64

In [46]:
df_batch.head(1)

Unnamed: 0,T_batch,word_batch,main_mv_percent_batch,n_msgs_batch,n_words_batch,y_batch,stock_batch,price_batch,batch_size
0,"[4, 3, 2, 5, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 2, ...","[[[[6833, 12221, 22879, 22971, 22030, 25982, 2...","[0.012745999731123002, 0.006849000230431001, -...","[[2, 6, 1, 4, 0], [1, 2, 1, 0, 0], [1, 2, 0, 0...","[[[28, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1....","[59, 29, 8, 80, 71, 82, 56, 53, 52, 64, 66, 78...","[[[0.06393899768590901, 0.051995001733303, -0....",32
