# LSTM Model for Analyzing StockTwitz Sentiment

In [2]:
import json
import nltk
import os
import random
import re
import torch
import numpy as np

from torch import nn, optim
import torch.nn.functional as F

## Import Twits 
### Load Twits Data 
This JSON file contains a list of objects for each twit in the `'data'` field:

```
{'data':
  {'message_body': 'Neutral twit body text here',
   'sentiment': 0},
  {'message_body': 'Happy twit body text here',
   'sentiment': 1},
   ...
}
```

The fields represent the following:

* `'message_body'`: The text of the twit.
* `'sentiment'`: Sentiment score for the twit, ranges from -2 to 2 in steps of 1, with 0 being neutral.

In [2]:
with open(os.path.join('..', '..', 'data', 'project_6_stocktwits', 'twits.json'), 'r') as f:
    twits = json.load(f)

json_object = json.dumps(twits)
with open('twits.json','w') as outfile:
    outfile.write(json_object)

print(twits['data'][:20])

[{'message_body': '$FITB great buy at 26.00...ill wait', 'sentiment': 2, 'timestamp': '2018-07-01T00:00:09Z'}, {'message_body': '@StockTwits $MSFT', 'sentiment': 1, 'timestamp': '2018-07-01T00:00:42Z'}, {'message_body': '#STAAnalystAlert for $TDG : Jefferies Maintains with a rating of Hold setting target price at USD 350.00. Our own verdict is Buy  http://www.stocktargetadvisor.com/toprating', 'sentiment': 2, 'timestamp': '2018-07-01T00:01:24Z'}, {'message_body': '$AMD I heard there’s a guy who knows someone who thinks somebody knows something - on StockTwits.', 'sentiment': 1, 'timestamp': '2018-07-01T00:01:47Z'}, {'message_body': '$AMD reveal yourself!', 'sentiment': 0, 'timestamp': '2018-07-01T00:02:13Z'}, {'message_body': '$AAPL Why the drop? I warren Buffet taking out his position?', 'sentiment': 1, 'timestamp': '2018-07-01T00:03:10Z'}, {'message_body': '$BA bears have 1 reason on 06-29 to pay more attention https://dividendbot.com?s=BA', 'sentiment': -2, 'timestamp': '2018-07-01T

### Length of Data

In [3]:
"""print out the number of twits"""
total_twits = len(twits['data'])
print(total_twits)

1548010


### Split Message Body and Sentiment Score

In [4]:
messages = [twit['message_body'] for twit in twits['data']]
# Since the sentiment scores are discrete, we'll scale the sentiments to 0 to 4 for use in our network
sentiments = [twit['sentiment'] + 2 for twit in twits['data']]


## Preprocessing the Data
The preprocessing steps we will take will be

### Pre-Processing

In [None]:
nltk.download('wordnet')

Create `preprocess` function to remove any characters that doesn't add any sentiment meaning

In [5]:
def preprocess(message):
    """
    This function takes a string as input, then performs these operations: 
        - lowercase
        - remove URLs
        - remove ticker symbols 
        - removes punctuation
        - tokenize by splitting the string on whitespace 
        - removes any single character tokens
    
    Parameters
    ----------
        message : The text message to be preprocessed.
        
    Returns
    -------
        tokens: The preprocessed text into tokens.
    """ 
     
    # Lowercase the twit message
    text = message.lower()
    
    # Replace URLs with a space in the message
    text = re.sub("https?://\S+",' ',text)
    
    # Replace ticker symbols with a space. The ticker symbols are any stock symbol that starts with $.
    text = re.sub("\$\S+",' ',text)
    
    # Replace StockTwits usernames with a space. The usernames are any word that starts with @.
    text = re.sub("@\S+",' ',text)

    # Replace everything not a letter with a space
    text = re.sub("[^a-zA-Z]",' ',text)
    
    # Tokenize by splitting the string on whitespace into a list of words
    tokens = text.split()

    # Lemmatize words using the WordNetLemmatizer. You can ignore any word that is not longer than one character.
    wnl = nltk.stem.WordNetLemmatizer()
    tokens = [wnl.lemmatize(token) for token in tokens if len(token) > 1]

    
    assert type(tokens) == list, 'Tokens should be list'
    return tokens

[nltk_data] Downloading package wordnet to /root/nltk_data...


### Preprocess All the Twits 
Apply the function `preprocess` to all the twit messages.

In [6]:
# TODO Implement
tokenized = [preprocess(msg) for msg in messages]

In [7]:
print("Raw Text", messages[3])
print("Tokenized List", tokenized[3])

Raw Text $AMD I heard there’s a guy who knows someone who thinks somebody knows something - on StockTwits.
Tokenized List ['heard', 'there', 'guy', 'who', 'know', 'someone', 'who', 'think', 'somebody', 'know', 'something', 'on', 'stocktwits']


### Bag of Words
Count how often each words appear in this dataset

In [8]:
from collections import Counter


"""
Create a vocabulary by using Bag of words
"""
# TODO: Implement 
word_tokens = []
for token in tokenized:
    for word in token:
        word_tokens.append(word)
counts = Counter(word_tokens)

bow = counts

In [9]:
print(bow.most_common(20))

[('the', 398754), ('to', 379487), ('is', 284865), ('for', 273538), ('on', 241663), ('of', 211334), ('and', 208471), ('in', 205307), ('this', 203540), ('it', 193485), ('at', 138453), ('will', 128180), ('up', 121567), ('are', 101424), ('you', 94275), ('that', 89655), ('be', 89277), ('short', 86639), ('what', 79113), ('today', 76240)]


### Frequency of Words Appearing in Message
Remove most common words such as 'the', 'and', 'it' and also rare words that apepars in a few twits. The purpose of this is to reduce the amount of noise in our input

In [10]:
"""
Set the following variables:
    freqs
    low_cutoff
    high_cutoff
    K_most_common
"""
# Dictionart that contains the Frequency of words appearing in messages.
# The key is the token and the value is the frequency of that word in the corpus.
#print(len(messages) == total_twits)
total_word = sum(bow.values()) #Total number of words not total number of twits 

freqs = {word: cnt/total_word for word, cnt in bow.items()}

# Float that is the frequency cutoff. Drop words with a frequency that is lower or equal to this number.
low_cutoff = 1e-6

# Integer that is the cut off for most common words. Drop words that are the `high_cutoff` most common words.
high_cutoff = 17

# The k most common words in the corpus. Use `high_cutoff` as the k.
K_most_common = [word[0] for word in bow.most_common(high_cutoff)]


filtered_words = [word for word in freqs if (freqs[word] > low_cutoff and word not in K_most_common)]
print(K_most_common)
len(filtered_words) 

True
['the', 'to', 'is', 'for', 'on', 'of', 'and', 'in', 'this', 'it', 'at', 'will', 'up', 'are', 'you', 'that', 'be']


14958

### Updating Vocabulary by Removing Filtered Words
Let's creat three variables that will help with our vocabulary.

In [11]:
"""
Set the following variables:
    vocab
    id2vocab
    filtered
"""

#TODO Implement

# A dictionary for the `filtered_words`. The key is the word and value is an id that represents the word. 
vocab = {word:i  for i,word in enumerate(filtered_words)}
# Reverse of the `vocab` dictionary. The key is word id and value is the word. 
id2vocab = {i:word for word,i in vocab.items()}
# tokenized with the words not in `filtered_words` removed.

assert set(vocab.keys()) == set(id2vocab.values()), 'Check vocab and id2vocab dictionaries'
#filtered =  [[word for word in msg if (freqs[word] > low_cutoff and word not in K_most_common)]for msg in tokenized]
filtered = [[word for word in msg if word in filtered_words] for msg in tokenized]

### Balancing the classes
If we look at how our twits are labeled, we'll find that 50% of them are neutral. This means that our network will be 50% accurate just by guessing 0 every single time. To help our network learn appropriately, we'll want to balance our classes.

In [12]:
balanced = {'messages': [], 'sentiments':[]}

n_neutral = sum(1 for each in sentiments if each == 2)
N_examples = len(sentiments)
keep_prob = (N_examples - n_neutral)/(4*n_neutral)

for idx, sentiment in enumerate(sentiments):
    message = filtered[idx]
    if len(message) == 0:
        # skip this message because it has length zero
        continue
    elif sentiment != 2 or random.random() < keep_prob:
        balanced['messages'].append(message)
        balanced['sentiments'].append(sentiment) 

Check if the number of neutral twits are reduced

In [13]:
n_neutral = sum(1 for each in balanced['sentiments'] if each == 2)
N_examples = len(balanced['sentiments'])
n_neutral/N_examples

0.19396845955927397

Convert our tokens into integer ids which we can pass to the network.

In [14]:
token_ids = [[vocab[word] for word in message] for message in balanced['messages']]
sentiments = balanced['sentiments']

## Neural Network
The diagram of the network 

#### Embed -> LSTM> Dense -> Softmax
### Implement the text classifier

In [15]:
train_on_gpu = torch.cuda.is_available()
print(train_on_gpu)

True


In [16]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_size, lstm_size, output_size, lstm_layers=1, dropout=0.1):
        """
        Initialize the model by setting up the layers.
        
        Parameters
        ----------
            vocab_size : The vocabulary size.
            embed_size : The embedding layer size.
            lstm_size : The LSTM layer size.
            output_size : The output size.
            lstm_layers : The number of LSTM layers.
            dropout : The dropout probability.
        """
        
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.lstm_size = lstm_size
        self.output_size = output_size
        self.lstm_layers = lstm_layers
        self.dropout = dropout
        
        # TODO Implement

        # Setup embedding layer
        self.embedding = nn.Embedding(vocab_size,embed_size)
        self.lstm = nn.LSTM(embed_size,lstm_size,lstm_layers,dropout= dropout, batch_first= True)
        
        self.fc = nn.Linear(lstm_size,output_size)
        self.dropout = nn.Dropout(p=0.4)
        # Setup additional layers


    def init_hidden(self, batch_size):
        """ 
        Initializes hidden state
        
        Parameters
        ----------
            batch_size : The size of batches.
        
        Returns
        -------
            hidden_state
            
        """
        
        # TODO Implement 
        
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_().cuda(),
                      weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_().cuda())
        else:
            hidden = (weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_(),
                      weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_())
        return hidden


    def forward(self, nn_input, hidden_state):
        """
        Perform a forward pass of our model on nn_input.
        
        Parameters
        ----------
            nn_input : The batch of input to the NN.
            hidden_state : The LSTM hidden state.

        Returns
        -------
            logps: log softmax output
            hidden_state: The new hidden state.

        """
        batch_size = nn_input.size(0)
        #print("Batch Size:",batch_size)
        seq_length = nn_input.size(1)
        embedding = self.embedding(nn_input)
        
        lstm_out, hidden_state = self.lstm(embedding, hidden_state)
        lstm_out = lstm_out.contiguous().view(-1, self.lstm_size)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        logsoft_out = F.log_softmax(out)
        logsoft_out = logsoft_out.view(batch_size,seq_length, -1)
        logsoft_out = logsoft_out[:,-1,:]
        # TODO Implement 
        
        return logsoft_out, hidden_state

## Training
### DataLoaders and Batching
Build `dataloader` to support batching to generate the current batch inputs and labels. This enable the program to loop through the training data

In [18]:
def dataloader(messages, labels, sequence_length=30, batch_size=32, shuffle=False):
    """ 
    Build a dataloader.
    """
    if shuffle:
        indices = list(range(len(messages)))
        random.shuffle(indices)
        messages = [messages[idx] for idx in indices]
        labels = [labels[idx] for idx in indices]
    
    total_sequences = len(messages)
    n_batch = total_sequences // batch_size
    messages = messages[:n_batch * batch_size]
    
    for ii in range(0, n_batch * batch_size, batch_size):
        batch_messages = messages[ii: ii+batch_size]
        
        # First initialize a tensor of all zeros
        batch = torch.zeros((len(batch_messages), sequence_length), dtype=torch.int64)
        for batch_num, tokens in enumerate(batch_messages):
            token_tensor = torch.tensor(tokens)
            # Left pad!
            start_idx = max(sequence_length - len(token_tensor), 0)
            batch[batch_num, start_idx:] = token_tensor[:sequence_length]
        
        label_tensor = torch.tensor(labels[ii: ii+len(batch_messages)])
        yield batch, label_tensor

### Training and  Validation
With our data in nice shape, we'll split it into training and validation sets.

In [20]:
"""
Split data into training and validation datasets. Use an appropriate split size.
The features are the `token_ids` and the labels are the `sentiments`.
"""   
split= int(0.8 * len(token_ids))
train_features, remain_features = token_ids[:split], token_ids[split:]
train_labels, remain_labels = sentiments[:split], sentiments[split:]

split = int(0.5 * len(remain_features))
valid_features, test_features = remain_features[:split],remain_features[split:]
valid_labels, test_labels = remain_labels[:split],remain_labels[split:]

822715
102839


### Training

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TextClassifier(len(vocab)+1, 512, 512, 5, lstm_layers=2, dropout=0.4)
model.embedding.weight.data.uniform_(-1, 1)
model.to(device)

TextClassifier(
  (embedding): Embedding(14959, 512)
  (lstm): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.4)
  (fc): Linear(in_features=512, out_features=5, bias=True)
  (dropout): Dropout(p=0.4)
)

In [23]:
"""
Train your model with dropout. Make sure to clip your gradients.
Print the training loss, validation loss, and validation accuracy for every 100 steps.
"""

epochs = 1
batch_size = 512
learning_rate = 0.005
sl = 50
print_every = 100
clip = 5
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model.train()

for epoch in range(epochs):
    print('Starting epoch {}'.format(epoch + 1))
    steps = 0
    
    hidden = model.init_hidden(batch_size)
    for text_batch, labels in dataloader(train_features, train_labels, \
                                         batch_size=batch_size, sequence_length=sl, shuffle=True):
        steps += 1
        
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        hidden = tuple([each.data for each in hidden])
        # Set Device
        text_batch, labels = text_batch.to(device), labels.to(device)
        for each in hidden:
            each.to(device)
        
        # TODO Implement: Train Model
        model.zero_grad() #Reset Gradient
        log_ps, hidden = model.forward(text_batch, hidden) #Compute Result
        loss = criterion(log_ps,labels) #Compute Loss
        loss.backward() #Backpropagate
        nn.utils.clip_grad_norm_(model.parameters(),clip) #Gradient Clip
        optimizer.step() #Update parameters
        
        if steps % print_every == 0:
            model.eval()
            val_losses = []
            val_accuracy = []
            
            valid_loader = dataloader(valid_features, valid_labels, batch_size=batch_size, \
                                      sequence_length=sl, shuffle=True)
            val_h = model.init_hidden(batch_size)
            for val_batch, val_labels in valid_loader:
                val_h = tuple([each.data for each in val_h])

                if train_on_gpu:
                    val_batch, val_labels = val_batch.cuda(), val_labels.cuda()
                
                log_ps, val_h = model(val_batch, val_h)
                val_loss = criterion(log_ps, val_labels)
                val_losses.append(val_loss.item())
                
                val_ps = torch.exp(log_ps)
                _,top_class = val_ps.topk(1,dim=1)
                
                val_accuracy.append(torch.mean(top_class.eq(val_labels.view_as(top_class)).float()).item())
            
            print("Epoch: {} ...".format(epoch+1),
                  "Step: {} ...".format(steps),
                  "Loss:{:.3f} ...".format(loss.item()),
                  "Val Loss:{:.3f} ...".format(np.mean(val_losses)),
                  "Val Accruacy: {:.3f}".format(np.mean(val_accuracy)))
            
            
            model.train()

Starting epoch 1




Epoch: 1 ... Step: 100 ... Loss:0.925 ... Val Loss:0.926 ... Val Accruacy: 0.632
Epoch: 1 ... Step: 200 ... Loss:0.854 ... Val Loss:0.846 ... Val Accruacy: 0.672
Epoch: 1 ... Step: 300 ... Loss:0.862 ... Val Loss:0.803 ... Val Accruacy: 0.691
Epoch: 1 ... Step: 400 ... Loss:0.839 ... Val Loss:0.783 ... Val Accruacy: 0.700
Epoch: 1 ... Step: 500 ... Loss:0.735 ... Val Loss:0.775 ... Val Accruacy: 0.691
Epoch: 1 ... Step: 600 ... Loss:0.719 ... Val Loss:0.769 ... Val Accruacy: 0.705
Epoch: 1 ... Step: 700 ... Loss:0.755 ... Val Loss:0.756 ... Val Accruacy: 0.711
Epoch: 1 ... Step: 800 ... Loss:0.762 ... Val Loss:0.749 ... Val Accruacy: 0.713
Epoch: 1 ... Step: 900 ... Loss:0.803 ... Val Loss:0.741 ... Val Accruacy: 0.718
Epoch: 1 ... Step: 1000 ... Loss:0.761 ... Val Loss:0.738 ... Val Accruacy: 0.718
Epoch: 1 ... Step: 1100 ... Loss:0.784 ... Val Loss:0.755 ... Val Accruacy: 0.713
Epoch: 1 ... Step: 1200 ... Loss:0.750 ... Val Loss:0.729 ... Val Accruacy: 0.719
Epoch: 1 ... Step: 1300 .

## Making Predictions
### Prediction 
The `predict` function generate the prediction vector from a message.

In [24]:
def predict(text, model, vocab):
    """ 
    Make a prediction on a single sentence.

    Parameters
    ----------
        text : The string to make a prediction on.
        model : The model to use for making the prediction.
        vocab : Dictionary for word to word ids. The key is the word and the value is the word id.

    Returns
    -------
        pred : Prediction vector
    """    
    
    # TODO Implement
    
    tokens = preprocess(text)
    
    # Filter non-vocab words
    tokens = [word for word in tokens if word in filtered_words]
    
    # Convert words to ids
    tokens = [vocab[word] for word in tokens]
        
    # Adding a batch dimension
    text_input = torch.tensor(tokens).unsqueeze(0)

    # Get the NN output
    hidden = model.init_hidden(1)
    text_input = text_input.to('cuda')
    logps, _ = model.forward(text_input,hidden)
    
    # Take the exponent of the NN output to get a range of 0 to 1 for each label.
    pred = torch.exp(logps).to('cpu')
    
    return pred.detach().numpy()

In [25]:
text = "Google is working on self driving cars, I'm bullish on $goog"
model.eval()
model.to("cuda")
predict(text, model, vocab)



array([[  5.42058668e-04,   1.43642137e-02,   7.90409371e-03,
          7.46714711e-01,   2.30474994e-01]], dtype=float32)

## Testing
### Load the Data 

In [26]:
with open(os.path.join('..', '..', 'data', 'project_6_stocktwits', 'test_twits.json'), 'r') as f:
    test_data = json.load(f)

### Twit Stream

In [27]:
def twit_stream():
    for twit in test_data['data']:
        yield twit

next(twit_stream())

{'message_body': '$JWN has moved -1.69% on 10-31. Check out the movement and peers at  https://dividendbot.com?s=JWN',
 'timestamp': '2018-11-01T00:00:05Z'}

Using the `prediction` function, let's apply it to a stream of twits.

In [28]:
def score_twits(stream, model, vocab, universe):
    """ 
    Given a stream of twits and a universe of tickers, return sentiment scores for tickers in the universe.
    """
    for twit in stream:

        # Get the message text
        text = twit['message_body']
        symbols = re.findall('\$[A-Z]{2,4}', text)
        score = predict(text, model, vocab)

        for symbol in symbols:
            if symbol in universe:
                yield {'symbol': symbol, 'score': score, 'timestamp': twit['timestamp']}

In [29]:
universe = {'$BBRY', '$AAPL', '$AMZN', '$BABA', '$YHOO', '$LQMT', '$FB', '$GOOG', '$BBBY', '$JNUG', '$SBUX', '$MU'}

score_stream = score_twits(twit_stream(), model, vocab, universe)

next(score_stream)



{'symbol': '$AAPL',
 'score': array([[ 0.12625226,  0.11622299,  0.17909808,  0.30883443,  0.26959223]], dtype=float32),
 'timestamp': '2018-11-01T00:00:18Z'}