SLOT (Self-supervised Learning of Tweets for Capturing Multi-level Price Trends) aims to address two problems:
1. the sparsity of tweets: the number of tweets is heavily biased towards the most popular stocks.
2. the noise in tweets: they contain information that is often irrelevant to the actual stock movement.

The first problem was addressed by having SLOT learn the stock and tweet embeddings in the same vector space through self-supervised learning. This allows the use of any tweet for even unpopular stocks.

To tackle the second problem, SLOT uses tweets to learn multi-level relationships between stocks, rather than using them as direct evidence for stock prediction (e.g. positive sentiment = up).

## Attention LSTM

In [1]:
import torch
from torch import nn

class ALSTM(nn.Module):
    """Attention LSTM: an LSTM followed by additive attention over time steps.

    The module returns the concatenation of the LSTM's final hidden state and
    an attention-weighted summary of the (projected) per-step outputs.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
    """

    def __init__(self, input_size, hidden_size):
        # nn.Module must be initialised first so that sub-modules and
        # parameters assigned below are registered (and thus trainable,
        # movable with .to(), and saved in state_dict()).
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.ln = nn.Linear(hidden_size, hidden_size)
        self.tanh = nn.Tanh()
        # Learnable attention query vector.
        self.u = nn.Parameter(data=torch.randn(hidden_size))

    def forward(self, x):
        """Encode a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, 2*hidden_size): the final hidden state
            concatenated with the attention summary.
        """
        # output: (batch, seq_len, hidden_size)
        # h_n, c_n: (num_layers, batch, hidden_size)
        # nn.LSTM returns (output, (h_n, c_n)), i.e. a 2-tuple whose second
        # element is itself a tuple.
        output, (h_n, _c_n) = self.lstm(x)
        output = self.tanh(self.ln(output))

        # query: u, key: output, values: output
        # (batch, seq_len, hidden_size) @ (hidden_size) -> (batch, seq_len)
        attn_scores = torch.matmul(output, self.u)
        # Softmax over the time axis: the raw scores can be negative or sum to
        # ~0, so dividing by their plain sum does not give valid weights.
        weights = torch.softmax(attn_scores, dim=1)
        weights = weights.unsqueeze(dim=-1)  # (batch, seq_len, 1)

        # (batch, seq_len, 1) * (batch, seq_len, hidden_size) -> (batch, seq_len, hidden_size)
        # (batch, seq_len, hidden_size) -> (batch, hidden_size)
        h_attn = (weights * output).sum(dim=1)

        # (batch, hidden_size) || (batch, hidden_size) -> (batch, 2*hidden_size)
        # Last layer's final hidden state (the general summary) plus attention.
        h_out = torch.cat((h_n[-1], h_attn), dim=1)

        return h_out

In [None]:
class SLOT(nn.Module):
    """Prediction head: projects the concatenated inputs and runs them through an ALSTM.

    Args:
        input_size: Feature size of each of the three input tensors.
        hidden_size: Hidden size of the ALSTM.
        output_size: Size of the prediction per sample (default 1).
    """

    def __init__(self, input_size, hidden_size, output_size = 1):
        # Required so the layers below are registered as sub-modules.
        super().__init__()
        self.ln_1 = nn.Linear(3*input_size, 3*input_size)
        self.alstm = ALSTM(input_size=3*input_size, hidden_size=hidden_size)
        self.ln_f = nn.Linear(hidden_size*2, output_size)

    def forward(self, features, global_trend, local_trend):
        """Predict from price features and the two trend tensors.

        Args:
            features, global_trend, local_trend: Tensors of shape
                (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # Concatenate along the feature axis: (batch, seq_len, 3*input_size)
        final_input = torch.cat((features, global_trend, local_trend), dim=-1)
        # Feed the *projected* input to the ALSTM; previously the projection
        # was assigned to a misspelled name and silently discarded.
        final_input = self.ln_1(final_input)
        h_out = self.alstm(final_input) # (batch, 2*hidden_size)
        y_pred = self.ln_f(h_out) # (batch, output_size)

        return y_pred


        

## Self-supervised Learning of Embeddings

The goal is to learn tweet (h_e) and stock (h_s) embeddings in the same semantic space, that is, learning the embeddings together such that one embedding can be used to query for the other. 
- a stock embedding and the embeddings of tweets relevant to it are close together (higher dot product)
- solves problem of tweet sparsity; the model can associate stocks with tweets that don't directly mention it as long as they are close in vector space.

This was done by training for stock identification: predict the mentioned stock in a tweet when the stock symbol is masked.

First, every tweet is tokenized with SentencePiece. Next, the tokens that correspond to the stock the tweet mentions are replaced with the special token `<MASK>`.
- "Thank you Apple for the new iPhone." -> "Thank you <MASK> for the new iPhone."

### Stock Identification Model 

Use a BiLSTM (need to understand the context on both sides of the mask token).
- Using a transformer risks overfitting.

The stock embedding is a learnable parameter. 

The hidden state vector generated at the masked token is used as the tweet embedding because it
- captures the immediate left (via the forward LSTM) and right context (via the backward LSTM) of the tweet, making it exactly what we need for stock identification,
- and it is the part of the tweet that most directly connects to the mentioned stock.





In [26]:
from pathlib import Path   
import json

# Merge every per-stock tweet file into a single JSONL file, one
# {"stock", "tweet"} record per line. Records are buffered and flushed in
# batches of `buffer_size`; `lines` ends up holding the total record count.
data_dir = Path("data/bigdata22")
tweet_dir = data_dir / "tweet"

tweet_files_list = list(tweet_dir.rglob("*/*"))

json_file = "all_tweets.jsonl"

buffer_size = 100_000
buffer = []
lines = 0

with open(json_file, "w", encoding="utf-8") as out:
    for tweet_file in tweet_files_list:
        # The parent directory name is used as the stock label.
        stock = tweet_file.parent.name
        with open(tweet_file, "r", encoding="utf-8") as f:
            for line in f:
                text = json.loads(line)["text"]
                # Flatten embedded newlines so each tweet stays on one line.
                record = {"stock": stock, "tweet": text.replace("\n", " ")}
                buffer.append(json.dumps(record))

                if len(buffer) < buffer_size:
                    continue

                # Buffer is full: flush it and report progress.
                out.writelines(f"{row}\n" for row in buffer)
                lines += len(buffer)
                buffer.clear()
                print(f"{lines} lines written")

    # Flush whatever is left after the last file.
    if buffer:
        lines += len(buffer)
        out.writelines(f"{row}\n" for row in buffer)
        print(f"{lines} lines written")

print("All tweets written")

100000 lines written
200000 lines written
300000 lines written
324573 lines written
All tweets written


In [15]:
# Strip the JSONL records down to raw tweet text, one tweet per line, which is
# the input file later handed to the SentencePiece trainer.
input_file = json_file
output_file = "tweets_for_sp.txt"

with open(input_file, "r", encoding="utf-8") as f, open(output_file, "w", encoding="utf-8") as out:
    out.writelines(json.loads(record)["tweet"] + "\n" for record in f)

In [16]:
import sentencepiece as sp

# Train a SentencePiece tokenizer on the plain-text tweet dump.
# `spm_file` starts out as the model prefix passed to the trainer and is turned
# into the model path at the end of this cell.
spm_file = "spm/slot_tweet_spm"

sp.SentencePieceTrainer.train(
    input=output_file,
    model_prefix=spm_file,
    vocab_size=16_000,
    model_type="unigram",
    character_coverage=1.0,
    # `lines` is the record count accumulated while writing all_tweets.jsonl.
    input_sentence_size=lines,
    shuffle_input_sentence=True,
    # Reserve dedicated pieces for masking the stock symbol and for padding.
    user_defined_symbols=["<MASK>", "<PAD>"]
)

# Later cells load the model from this path.
# NOTE(review): assumes the trainer writes "<model_prefix>.model" and that the
# "spm/" directory already exists — confirm.
spm_file += ".model"

In [None]:
import sentencepiece as sp

# Load the trained tokenizer and sanity-check masking on a single tweet.
slot_spm = sp.SentencePieceProcessor()
slot_spm.load(spm_file)

import re

tweet = "$aapl laser game controller?  URL $mvis disruptive tech ..? $qqq $mu $goog $bbry $sne $txn $fb $twtr $msft $himx $iwm"
stock_name = "AAPL"

# Match the cashtag ("$" + ticker, case-insensitive) only when it is not
# followed by another word character, so "$aapl" matches but "$aaple" does not.
# The ticker is escaped so symbols containing regex metacharacters
# (e.g. "BRK.B") are matched literally.
pattern = rf"\${re.escape(stock_name)}(?=\s|$|\W)"
masked_tweet = re.sub(pattern, "<MASK>", tweet, flags=re.IGNORECASE)

# Show the encoding both as ids and as pieces.
print(slot_spm.encode(masked_tweet))
print(slot_spm.encode(masked_tweet, out_type=str))

[11562, 212, 1434, 762, 96, 6, 4567, 1810, 384, 5, 102, 671, 63, 5, 8032, 15, 382, 931, 32, 83, 11784, 8476, 6, 3, 259, 135, 1034, 15, 3077, 3845, 842, 29, 41, 39, 32, 520, 1244, 3498, 2494, 96, 1125, 457, 9]
['▁weather', 'ly', '▁asset', '▁management', '▁has', '▁', 'upped', '▁boeing', '▁co', '▁$', 'ba', '▁stake', '▁by', '▁$', '333', ',', '9', '14', ';', '▁as', '▁honeywell', '▁intl', '▁', '<MASK>', '▁share', '▁price', '▁rose', ',', '▁holder', '▁pic', 'te', 't', '▁&', 'amp', ';', '▁c', 'ie', '▁europe', '▁sa', '▁has', '▁raised', '▁holding', '▁URL']


In [36]:
# Build the stock-name -> integer-label mapping from the tweet directory tree.
# The names are sorted because directory traversal order is not guaranteed;
# without sorting, label ids could differ between machines or runs and no
# longer match previously tokenized data or trained stock embeddings.
stock_names = sorted(stock.name for stock in tweet_dir.rglob("*") if stock.is_dir())
stock_ids = list(range(len(stock_names)))
stock_name_to_ids = dict(zip(stock_names, stock_ids))
stock_name_to_ids


{'AAPL': 0,
 'AEP': 1,
 'AGFS': 2,
 'AMGN': 3,
 'AMZN': 4,
 'BA': 5,
 'BAC': 6,
 'C': 7,
 'CAT': 8,
 'CMCSA': 9,
 'CODI': 10,
 'CSCO': 11,
 'CVX': 12,
 'D': 13,
 'DIS': 14,
 'DUK': 15,
 'EXC': 16,
 'GD': 17,
 'GE': 18,
 'GMRE': 19,
 'GOOG': 20,
 'HD': 21,
 'HON': 22,
 'INTC': 23,
 'JNJ': 24,
 'JPM': 25,
 'KO': 26,
 'LMT': 27,
 'MA': 28,
 'MCD': 29,
 'MDT': 30,
 'MMM': 31,
 'MO': 32,
 'MRK': 33,
 'MSFT': 34,
 'NEE': 35,
 'ORCL': 36,
 'PCG': 37,
 'PM': 38,
 'PPL': 39,
 'REX': 40,
 'SO': 41,
 'SRE': 42,
 'T': 43,
 'UPS': 44,
 'V': 45,
 'VZ': 46,
 'WFC': 47,
 'WMT': 48,
 'XOM': 49}

In [42]:
# Look up the integer ids the tokenizer uses for the two user-defined symbols;
# later cells use them to locate the mask position and to pad tweets.
mask_id = slot_spm.piece_to_id("<MASK>")
pad_id = slot_spm.piece_to_id("<PAD>")

# Notebook display of both ids.
mask_id, pad_id

(3, 4)

In [47]:
# Scratch check: decode a tokenized tweet back to text to inspect an entry in
# which the cashtag was not replaced by <MASK>.
slot_spm.decode([6, 12, 11, 10, 13, 16, 7, 2672, 76, 16, 7, 474, 42, 15, 7, 2799, 6866, 26, 150, 2852, 24, 16, 7, 56, 3033, 8, 5, 60, 123, 152, 4198, 23, 135, 8221, 38, 421, 27, 1960, 100, 778, 195, 6, 12, 11, 10, 13, 2139, 4978, 2126, 15, 6, 14, 2933, 3122, 5, 2247, 23, 135, 8, 37, 36, 14, 10584, 17, 7515, 2232, 6, 12, 11, 10, 13, 152, 9802, 68, 8, 7, 5070, 7913, 7, 5190, 1811, 56, 9])

'AT_USER the #fed like the #msm, #financialmedia is an extension of the #dnc. $aaple was involved in price manipulation with them on jan 2nd when AT_USER gave false guidance, stripping $trillions in price. it’s intention to disrupt economy AT_USER was strengthening. #secfraud #voterid URL'

In [55]:
import re
# Mask each tweet's own cashtag, tokenize it, and write one JSON record per
# tweet: {"tweet": token ids, "stock_id": label, "mask_idx": first <MASK> position}.
# Tweets in which no <MASK> token is found are written to `bad_data_file`
# instead and skipped.
tokens_file = "all_tweets_tokenized.jsonl"
bad_data_file = "bad_data.jsonl"

buffer = []
buffer_size = 100_000
lines = 0

# One compiled pattern per ticker, built on first use, instead of rebuilding
# the same regex for every line.
mask_patterns = {}

with open(json_file, "r", encoding="utf-8") as f, open(tokens_file, "w", encoding="utf-8") as o, open(bad_data_file, "w", encoding="utf-8") as b:
    for line in f:

        entry = json.loads(line)

        stock_name = entry["stock"]
        tweet = entry["tweet"]

        pattern = mask_patterns.get(stock_name)
        if pattern is None:
            # "$" + ticker, case-insensitive, not followed by a word character.
            # The ticker is escaped so it is always matched literally.
            pattern = re.compile(
                rf"\${re.escape(stock_name)}(?=\s|$|\W)",
                flags=re.IGNORECASE,
            )
            mask_patterns[stock_name] = pattern
        masked_tweet = pattern.sub("<MASK>", tweet)

        masked_tweet_tokenized = slot_spm.encode(masked_tweet)
        try:
            mask_idx = masked_tweet_tokenized.index(mask_id)
        except ValueError:
            # list.index raises ValueError when the mask id is absent, i.e.
            # the tweet never mentioned its own cashtag in a maskable form.
            bad_entry = {
                "stock": stock_name,
                "tweet": slot_spm.decode(masked_tweet_tokenized)
            }
            b.write(json.dumps(bad_entry) + "\n")
            continue

        tokenized_tweet = {
            "tweet": masked_tweet_tokenized,
            "stock_id": stock_name_to_ids[stock_name],
            "mask_idx": mask_idx
        }

        buffer.append(json.dumps(tokenized_tweet))

        # Flush in batches to limit the number of write calls.
        if len(buffer) >= buffer_size:
            o.write("\n".join(buffer) + "\n")
            lines += len(buffer)
            buffer.clear()

            print(f"{lines} lines written")

    # Flush the remainder.
    if buffer:
        o.write("\n".join(buffer) + "\n")
        lines += len(buffer)
        print(f"{lines} lines written")


45389 lines written


In [40]:
# Load the tokenized tweets back into three parallel lists: token ids, the
# position of the <MASK> token, and the stock label.
tweet_tokens = []
mask_indices = []
stock_labels = []

with open(tokens_file, "r", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        tokens = [int(tok) for tok in entry["tweet"]]
        tweet_tokens.append(tokens)
        # Previously declared but never filled; the model's forward pass needs
        # the mask position for every tweet.
        mask_indices.append(entry["mask_idx"])
        stock_labels.append(entry["stock_id"])

tweet_tokens, stock_labels

([[6,
   3,
   8691,
   202,
   725,
   8531,
   202,
   35,
   9,
   5,
   3359,
   6,
   8046,
   692,
   1416,
   35,
   5,
   168,
   5,
   498,
   5,
   258,
   5,
   979,
   1033,
   5,
   2580,
   5,
   2210,
   5,
   99,
   5,
   234,
   5,
   110,
   5,
   11542,
   75,
   5,
   369],
  [13876,
   354,
   44,
   172,
   2850,
   1480,
   96,
   4374,
   23,
   16,
   193,
   154,
   561,
   8,
   5,
   75,
   5,
   29,
   5,
   389,
   5,
   4762,
   5,
   438,
   5,
   612,
   5,
   859,
   5,
   14,
   11339,
   5,
   169,
   6,
   3,
   5,
   258,
   5,
   110,
   5,
   73,
   5,
   99,
   5,
   10224,
   5,
   377,
   5,
   197,
   5,
   398,
   6,
   12,
   11,
   10,
   13,
   6,
   12,
   11,
   10,
   13,
   9],
  [9524,
   1994,
   56,
   563,
   157,
   201,
   30,
   5,
   90,
   5,
   160,
   5,
   1210,
   6,
   3,
   823,
   12008,
   6,
   6999,
   1379,
   683,
   9],
  [6,
   3,
   574,
   1150,
   218,
   24,
   5,
   6211,
   121,
   403,
   259,
   15,
   7

In [21]:

# Right-pad every token sequence with the <PAD> id up to the longest tweet and
# stack the result into a single (num_tweets, max_len) long tensor.
max_len = max(map(len, tweet_tokens))

pad_id = slot_spm.piece_to_id("<PAD>")

padded_rows = [
    row + [pad_id] * (max_len - len(row))
    for row in tweet_tokens
]
padded_tweets = torch.tensor(padded_rows, dtype=torch.long)
padded_tweets, padded_tweets.shape

(tensor([[    6,     3,  8691,  ...,     4,     4,     4],
         [13876,   354,    44,  ...,     4,     4,     4],
         [ 9524,  1994,    56,  ...,     4,     4,     4],
         ...,
         [    6,     3,    16,  ...,     4,     4,     4],
         [ 2009,   631,   856,  ...,     4,     4,     4],
         [ 2009,   631,   856,  ...,     4,     4,     4]]),
 torch.Size([324573, 337]))

In [None]:

class StockIdentification(nn.Module):
    """BiLSTM that predicts which stock a masked tweet mentions.

    The BiLSTM output at the <MASK> position is used as the tweet embedding
    and scored against every learnable stock embedding by dot product.

    Args:
        num_stocks: Number of distinct stocks (rows of the stock embedding).
        embd_size: Token embedding size.
        hidden_size: Hidden size per LSTM direction; tweet and stock
            embeddings have size 2*hidden_size.
        vocab_size: Tokenizer vocabulary size.
        padding_idx: Token id used for padding. When None (the default) the
            notebook-level `pad_id` variable is used, as before.
    """

    def __init__(self, num_stocks, embd_size, hidden_size, vocab_size, padding_idx=None):
        # Required so the embeddings and LSTM are registered as sub-modules.
        super().__init__()

        if padding_idx is None:
            # Backward-compatible fallback to the notebook global.
            padding_idx = pad_id

        # Same width as the BiLSTM output so the two can be dot-multiplied.
        self.stock_embd = nn.Embedding(num_embeddings=num_stocks,
                                       embedding_dim=2*hidden_size)

        self.token_embd = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embd_size,
                                       padding_idx=padding_idx)

        self.bi_lstm = nn.LSTM(
            input_size=embd_size,
            hidden_size=hidden_size,
            bidirectional=True,
            batch_first=True
        )

    def forward(self, x, mask_idx):
        """Score every stock for each tweet.

        Args:
            x: (B, T) tensor of token ids.
            mask_idx: (B,) tensor with the index of the <MASK> token in each
                tweet.

        Returns:
            (B, num_stocks) tensor of unnormalised logits.
        """
        batch_size = x.size(0)
        token_embs = self.token_embd(x) # (B, T, embd_size)

        # output: (B, T, 2*hidden_size)
        # nn.LSTM returns (output, (h_n, c_n)); only the per-step output is
        # needed here.
        # NOTE(review): the backward direction also runs over trailing <PAD>
        # positions; consider packed sequences if that matters.
        output, _ = self.bi_lstm(token_embs)

        # Pick the hidden state at each tweet's mask position.
        batch_idx = torch.arange(batch_size, device=x.device)
        h_e = output[batch_idx, mask_idx] # (B, 2*hidden_size)

        stock_embs = self.stock_embd.weight # (num_stocks, 2*hidden_size)
        # (B, 2*hidden_size) @ (2*hidden_size, num_stocks) -> (B, num_stocks)
        logits = torch.matmul(h_e, stock_embs.T)

        return logits




In [None]:
import torch.nn.functional as F

def calculate_loss(logits, stock_labels):
    """Return the summed negative log-likelihood of the true stock labels.

    Args:
        logits: (B, num_stocks) unnormalised scores.
        stock_labels: (B,) index of the correct stock for each row.

    Returns:
        Scalar tensor: the sum over the batch of -log p(true stock).
    """
    labels = torch.as_tensor(stock_labels, dtype=torch.long, device=logits.device)

    # (B, num_stocks) log-probabilities, then pick the true-class column per row.
    per_class = F.log_softmax(logits, dim=1)
    picked = per_class.gather(1, labels.unsqueeze(1)).squeeze(1)  # (B,)

    return -picked.sum()