## **_Using News Data to Predict Movements in the Financial Markets_**

We'll be using two approaches here:

* Continuous Bag of Words Model
* RNN Models using Word Embeddings

In [36]:
%load_ext autoreload
%autoreload 2

import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext import data

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from collections import Counter, defaultdict
import operator
import os, math
import random
import copy
import string
import multiprocessing as mp
import time

from split_data import split_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt to /home/antimony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/antimony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/antimony/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/antimony/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Seed every random number generator used in this notebook with the same
# value so that experiments can be reproduced exactly.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(72689)
if torch.cuda.is_available():
    torch.cuda.manual_seed(72689)

# Global class labels.
POS_LABEL, NEG_LABEL = 'up', 'down'

**Reading in all the Data**

In [5]:
# Read the combined table and discard the 'Unnamed: 0' column in one
# chained expression, so re-running the cell always starts from the file.
all_data = pd.read_csv("ProcessedData/CombinedData.csv").drop(columns=['Unnamed: 0'])
all_data.head()

Unnamed: 0,Title,Date,Content,OpenMove,CloseMove
0,Top U.S. General Praises Iran-Backed Shiite Mi...,2017-01-04,The top commander of the U.S.-led coalition ag...,1.0,1.0
1,Extremists Turn to a Leader to Protect Western...,2017-01-04,As the founder of the Traditionalist Worker Pa...,1.0,1.0
2,How Julian Assange evolved from pariah to paragon,2017-01-04,President-elect Donald Trump tweeted some pra...,1.0,1.0
3,House panel recommends cutting funding for Pla...,2017-01-04,A House panel formed by Republicans to invest...,1.0,1.0
4,Missouri Bill: Gun-Banning Businesses Liable f...,2017-01-04,As Missouri lawmakers convene for the 2017 leg...,1.0,1.0


*Using a Small Subset of Data for Development*

In [6]:
# Draw a fixed 10,000-row random subset and renumber its rows from zero.
data_sample = all_data.sample(10000, random_state=68).reset_index(drop=True)
data_sample.head()

Unnamed: 0,Title,Date,Content,OpenMove,CloseMove
0,Another government shutdown over Obamacare? On...,2017-04-24,"\nThis is the web version of VoxCare, a daily ...",1.0,1.0
1,Tourists Helped Fatten Him Up; Now Thai Monkey...,2017-05-19,"[Whether he likes it or not, a morbidly obese ...",1.0,1.0
2,"Indian Premier, in Israel Visit, Seeks to Brea...",2017-07-05,JERUSALEM — Prime Minister Benjamin Netanyahu ...,0.0,0.0
3,Kiribati Ends Aerial Search for Missing Ferry ...,2018-03-21,"WELLINGTON, New Zealand — The aerial search fo...",0.0,0.0
4,The American Model,2017-07-28,Amidst a string of pat introductory refl...,0.0,0.0


### Preprocessing the Data For Feeding Into The Model

Preprocessing Involves (in our case):
* Turning All Words into lower/upper case, Normalization
* removing punctuations, accent marks and other diacritics
* removing stop words, sparse terms, and particular words
* Lemmatize using NLTK (It's generally better than Stemming, but way slower)

In [51]:
# Removing all Punctuation
# The deletion table is the same for every document, so it is built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '—' + '’' + '“' + '”' + '…')


def remove_punctuation(text):
    """Return ``text`` with all punctuation characters deleted.

    Deleted characters are ``string.punctuation`` plus the typographic
    marks em dash, right single quote, curly double quotes and ellipsis.
    Characters are removed, not replaced, so surrounding whitespace is
    left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Removing all Stop Words
def remove_stopwords(text, stop_words):
    """Drop every token of ``text`` that appears in ``stop_words``.

    Args:
        text: The string to filter; it is split with ``word_tokenize``.
        stop_words: Container of tokens to discard (membership-tested).

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept.append(token)
    return " ".join(kept)

def lemmetize(text, lemmatizer, pos_tag_dict):
    """Lemmatize every token of ``text`` using its part-of-speech tag.

    Args:
        text: The string to process; it is split with ``word_tokenize``.
        lemmatizer: Object exposing ``lemmatize(word, pos)``.
        pos_tag_dict: Maps the first letter of a tag returned by
            ``nltk.pos_tag`` to the POS constant handed to the lemmatizer.
            Tags whose first letter is missing fall back to ``wordnet.NOUN``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    return " ".join(
        lemmatizer.lemmatize(token, pos_tag_dict.get(tag[0], wordnet.NOUN))
        for token, tag in tagged_tokens
    )

The pre_process function below performs all the preprocessing we defined above. 

In [52]:
def pre_process(df):
    """Clean the 'Title' and 'Content' columns of a news DataFrame.

    Each text column goes through, in order: lower-casing, punctuation
    removal, English stop-word removal and POS-aware lemmatization.

    The input frame is not modified: the work is done on a copy. This
    matters because callers may pass slices of a larger frame, and writing
    into a slice is ambiguous in pandas. Missing text is treated as an
    empty string so the string helpers never receive a float NaN.

    Args:
        df: DataFrame with at least the columns 'Title' and 'Content'.

    Returns:
        A new DataFrame with the same columns and index, where 'Title'
        and 'Content' hold the cleaned text.
    """
    df = df.copy()

    stop_words = set(stopwords.words('english'))
    lemmer = WordNetLemmatizer()
    # First letter of the POS tag -> WordNet part-of-speech constant.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV
               }

    for column in ('Title', 'Content'):
        # Normalization
        cleaned = df[column].fillna('').astype(str).str.lower()
        # Removing Punctuation
        cleaned = cleaned.apply(remove_punctuation)
        # Remove Stopwords
        cleaned = cleaned.apply(remove_stopwords, args=(stop_words, ))
        # Lemmatization
        df[column] = cleaned.apply(lemmetize, args=(lemmer, tag_dict))

    return df

**We run the pre_process function in parallel to make it faster using the Multi-Processing Module**

In [53]:
# Processing in Parallel
# Leave one core free for the notebook itself, but never ask for fewer than
# one worker (cpu_count() - 1 is 0 on a single-core machine).
n_threads = max(1, mp.cpu_count() - 1)
data_pieces = np.array_split(data_sample, n_threads)
startTime = time.time()
# The context manager shuts the pool down even when a worker raises, so no
# worker processes are left behind after a failed run.
with mp.Pool(n_threads) as pool:
    data_sample = pd.concat(pool.map(pre_process, data_pieces))

totalTime = time.time() - startTime
print("Time taken in Pre-Processing: {}m {}s".format(totalTime // 60, totalTime%60))
data_sample.head()

TypeError: 'float' object is not callable

### Preparing Data for the CBOW Model

* Building the Vocabulary (Using Spacy) | **MAX_VOCAB_SIZE** = 70000
* Splitting the data for Test and Training


In [4]:
# Seed handed to split_data when the train/validation/test split is made.
SEED = 68

In [5]:
# torchtext field definitions: the text field is tokenized with spaCy and
# the label field is declared with a float dtype.
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

Splitting the Data and Storing it such that torch text can easily ingest it.

In [6]:
# Split the (Content, CloseMove) columns of the sample using the 'dev' prefix.
# NOTE(review): presumably this writes dev_train.csv / dev_val.csv /
# dev_test.csv under ProcessedData/ (the files read in the next section) —
# confirm against split_data.py.
split_data(df=data_sample[['Content', 'CloseMove']],prefix='dev',seed=SEED)

NameError: name 'data_sample' is not defined

#### Reading in Data Using TorchText

In [7]:
# Load the three CSV splits from ./ProcessedData/ as torchtext datasets whose
# examples expose the attributes `Text` and `Label`.
# NOTE(review): assumes each file has exactly two columns in the order
# (text, label) — confirm against the files written by split_data.
train, val, test = data.TabularDataset.splits(
        path='./ProcessedData/', train='dev_train.csv',
        validation='dev_val.csv', test='dev_test.csv', format='csv',
        fields=[('Text', TEXT), ('Label', LABEL)])


In [8]:
# Vocabulary cap for the bag-of-words model; it is also the length of each
# count vector and the input size of the BOW classifier.
MAX_VOCAB_SIZE = 70000

#### Setting up the dataloader

In [71]:
# Mini-batch size and the device to compute on (GPU when one is available).
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class BOWDataLoader(tud.Dataset):
    """Dataset that serves each example as a bag-of-words count vector.

    Despite its name this is a ``torch.utils.data.Dataset``; wrap it in a
    ``DataLoader`` to obtain batches.
    """

    def __init__(self, data, vocab_size, text, field):
        """
        Args:
            data: Dataset exposing ``examples``, ``fields`` and integer
                indexing; every example carries ``Text`` (a token sequence)
                and ``Label``.
            vocab_size: Length of the count vector produced per example.
            text: Text field; after ``build_vocab`` its ``vocab.stoi`` maps
                a token to an integer index.
            field: Label field.
        """
        self.vocab_size = vocab_size
        self.TEXT = text
        self.LABEL = field
        # The same field objects are shared by the train/val/test wrappers.
        # Build each vocabulary only once (from the first dataset wrapped,
        # i.e. the training split); rebuilding it for every wrapper would
        # silently replace the token -> index mapping of the wrappers
        # created earlier. Create fresh fields to force a rebuild.
        if not hasattr(self.TEXT, 'vocab'):
            self.TEXT.build_vocab(data, max_size = vocab_size)
        if not hasattr(self.LABEL, 'vocab'):
            self.LABEL.build_vocab(data)
        self.data = data

    def __len__(self):
        '''
        Returns the number of Examples
        '''
        return len(self.data.examples)

    def __getitem__(self, idx):
        """
        Returns a tuple of text and label at the given index.
        If label is not present None is returned.

        The text is a float tensor of length ``vocab_size`` in which
        position ``i`` counts the tokens mapped to vocabulary index ``i``.
        """
        itm = torch.zeros(self.vocab_size)
        stoi = self.TEXT.vocab.stoi
        for word in self.data[idx].Text:
            index = stoi[word]
            # The vocabulary can hold more entries than vocab_size (special
            # tokens are added on top of max_size), so an index may fall
            # outside the vector. Count such tokens in slot 0 instead of
            # raising IndexError.
            # NOTE(review): slot 0 is assumed to be the unknown-token index
            # (torchtext's default layout) — confirm.
            if index >= self.vocab_size:
                index = 0
            itm[index] += 1

        # To Differentiate Train and Test data
        if len(self.data.fields) == 2:
            label = self.data[idx].Label
            return itm, label
        else:
            return itm, None

# Wrap the three splits, in train -> val -> test order, as bag-of-words datasets.
train_dataset, val_dataset, test_dataset = [
    BOWDataLoader(split, MAX_VOCAB_SIZE, TEXT, LABEL)
    for split in (train, val, test)
]

### Bag of Words Model

Here we define:
* The INPUT, OUTPUT dimensions.
* The Loss Function and
* The optimizer

for the model.


Also, the training, classification and evaluation functions.


In [72]:
# Best-scoring copy of the classifier found by BOWClassifier.train_model.
best_model = None
class BOWClassifier(nn.Module):
    """Logistic-regression classifier over bag-of-words count vectors."""

    def __init__(self, input_size, output_size, batch_size):
        """
        Constructing a Logistic Regression Model

        Loss Function: Cross Entropy Loss
        Optimizer: Adam

        Args:
            input_size: Length of each input count vector.
            output_size: Number of classes.
            batch_size: Mini-batch size used for training and inference.
        """
        super(BOWClassifier, self).__init__()

        self.batch_size = batch_size
        # NOTE: no boolean `cuda` attribute is stored on the instance. An
        # attribute with that name hides nn.Module.cuda(), which makes
        # `model.cuda()` fail. The device is read from the parameters
        # instead (see _device).

        # Linear layer
        self.fc = nn.Linear(input_size, output_size)

        # Loss Function
        self.loss_fn = nn.CrossEntropyLoss()

        # Optimizer
        self.optimizer = torch.optim.Adam(self.parameters())

    def _device(self):
        """Return the device the model parameters currently live on."""
        return next(self.parameters()).device

    def forward(self, text):
        """
        Passes the data through the network and return the output
        (unnormalised class scores, one row per example).
        """
        return self.fc(text)

    def train_epoch(self, dataset):
        """
        Trains a logistic regression model across all examples in the dataset.

        Labels may be anything ``np.asarray(..., dtype=np.float64)`` accepts
        (for example the strings '0.0' / '1.0'); they are cast to integer
        class indices.
        """
        # Kept local: storing the loader on the module would make every
        # deepcopy in train_model copy the whole training set as well.
        dataloader = tud.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
        device = self._device()
        self.train()
        for i, (X,y) in enumerate(dataloader):
            X = X.float().to(device)
            y = torch.as_tensor(np.asarray(y, dtype=np.float64)).long().to(device)

            self.optimizer.zero_grad()

            predictions = self.forward(X)

            loss = self.loss_fn(predictions, y)

            loss.backward()

            if (i+1) % 100 == 0:
                print("Iteration : {:4d} | Loss : {:4.4f}".format(i+1, loss.item()))

            self.optimizer.step()

    def train_model(self, train_data, val_data, num_epocs = 2):
        """
        Trains the model and saves the best model according to the validation score

        After every epoch the validation accuracy is computed. Whenever it
        beats the best value seen so far, a deep copy of the model is
        stored in the module-level ``best_model``.

        Returns:
            The best copy found during this call, or None when no epoch
            reached an accuracy above zero.
        """
        global best_model
        best_accuracy = 0.
        best = None
        for epoch in range(num_epocs):
            self.train_epoch(train_data)
            val_accuracy = self.evaluate(val_data)
            print("Validation Accuracy: {:4.4f}".format(val_accuracy))
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best = copy.deepcopy(self)
                best_model = best
        return best

    def classify(self, data):
        """
        Returns the results of the dataset passed.

        Returns:
            A list with one string per example: "UP" for predicted class 1
            and "DOWN" otherwise.
        """
        self.eval()
        device = self._device()
        dataloader = tud.DataLoader(data, batch_size=self.batch_size, shuffle=False)
        results = []
        # Inference only: no gradients are needed.
        with torch.no_grad():
            for X, _ in dataloader:
                X = X.float().to(device)
                predictions = self.forward(X).max(1)[1].cpu().numpy().reshape(-1)
                results.extend(predictions.tolist())
        labels = ["UP" if i == 1 else "DOWN" for i in results]
        return labels

    def evaluate(self, data):
        """
        Returns the fraction of examples in ``data`` whose predicted class
        equals the label (0.0 when ``data`` is empty).
        """
        self.eval()
        device = self._device()
        dataloader = tud.DataLoader(data, batch_size=self.batch_size, shuffle=False)
        correct = 0
        total = 0

        with torch.no_grad():
            for X, y in dataloader:
                X = X.float().to(device)
                predictions = self.forward(X).max(1)[1].cpu().numpy().reshape(-1)
                correct += (predictions == np.asarray(y, dtype=np.float64)).sum()
                total += predictions.shape[0]

        if total == 0:
            return 0.0
        return correct/total

In [74]:
# Two output classes (down / up); mini-batches of 8 examples.
model = BOWClassifier(MAX_VOCAB_SIZE, 2, 8)
# Move the model with .to(device): one call covers both the CPU and the GPU
# case and does not rely on the instance's `cuda` attribute being callable.
model = model.to(device)
# model.train_model(train_dataset, val_dataset, num_epocs=5)

### Neural Network based Model with Word Embeddings

We use a Neural Network now with Word Embeddings, whose:
* Input : A sentence
* Output: Label : {UP, DOWN}

The basic structure of a model class is as above. Functions like classify, evaluate and train will be defined along with pretrained word-embeddings.

The **word embeddings** we use are **GloVe**.

**Reference:**

Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/pubs/glove.pdf)

In [9]:
def load_glove(path_file):
    """
    Loads the Glove Pre-Trained Embeddings

    Each non-empty line of the file is expected to hold a word followed by
    its whitespace-separated vector components. Blank lines are skipped.

    Args:
        path_file: Path to the official glove embedding text file

    Returns: Dictionary {Word: [Embedding]}, where every embedding is a
        float64 numpy array.

    """
    start_time = time.time()
    print("Loading Glove Model ...")
    glove = {}
    # Read as UTF-8 explicitly: the vocabulary contains non-ASCII words, and
    # the platform default encoding is not UTF-8 everywhere.
    with open(path_file, encoding='utf-8') as f:
        for line in f:
            tmp = line.split()
            if not tmp:
                # e.g. a trailing empty line; there is no word to index
                continue
            glove[tmp[0]] = np.asarray(tmp[1:], dtype=np.float64)
    print("Glove Model Loaded in {} s".format(time.time()-start_time))
    return glove

def gloveWordIndex(glove):
    """
    Builds the word <-> index lookup tables for a loaded Glove model.

    Words are numbered from 2 in the iteration order of ``glove``; the two
    lowest indices are reserved:
    0 --> <unk>
    1 --> <pad>

    Args:
        Loaded Glove Model as a dict

    Returns:
        word to index map {word:idx} and index to word map{idx:word}

    """
    w_i, i_w = {}, {}
    for position, word in enumerate(glove.keys(), start=2):
        w_i[word] = position
        i_w[position] = word

    # Reserved entries are written last, exactly like the word entries above
    # they overwrite any clash.
    w_i['<unk>'], w_i['<pad>'] = 0, 1
    i_w[0], i_w[1] = '<unk>', '<pad>'
    return w_i, i_w

def getWeightMatrix(glove):
    """
    Builds the embedding matrix matching the indices of gloveWordIndex.

    Row 0 (<unk>) is filled with uniform random values in [0, 1), row 1
    (<pad>) is all zeros, and row ``i + 2`` holds the vector of the i-th
    word in the iteration order of ``glove``.

    Args:
        glove: Loaded Glove Model as a dict {word: 1-D numpy array}

    Returns:
        numpy array of shape (len(glove) + 2, embedding dimension)

    Raises:
        ValueError: if ``glove`` is empty, because the embedding dimension
            cannot be determined.
    """
    if not glove:
        raise ValueError("glove must contain at least one embedding")

    # Read the dimension from whichever vector comes first instead of
    # assuming that a particular word (such as 'a') is in the vocabulary.
    embd_dim = next(iter(glove.values())).shape[0]
    w_m = np.zeros((len(glove) + 2, embd_dim))
    w_m[0] = np.random.rand(embd_dim)
    # w_m[1] (the <pad> row) keeps the zeros it was created with.
    for i, vector in enumerate(glove.values(), start=2):
        w_m[i] = vector

    return w_m

In [10]:
# Load the 100-dimensional GloVe 6B vectors and derive both lookup tables
# (word -> index and index -> word) from them.
glove = load_glove("Embeddings/glove.6B.100d.txt")
word_to_idx, idx_to_word = gloveWordIndex(glove)

Loading Glove Model ...
Glove Model Loaded in 10.682385921478271 s


In [11]:
# Embedding matrix: one row per GloVe word plus the two reserved rows
# (<unk> at index 0, <pad> at index 1).
weight_matrix = getWeightMatrix(glove)
weight_matrix.shape

(400002, 100)

In [12]:
# Rebinds MAX_VOCAB_SIZE (70000 in the bag-of-words section) to the number
# of GloVe words; the two reserved indices are not counted.
MAX_VOCAB_SIZE = len(glove.keys())

In [103]:
# Same values as in the bag-of-words section, repeated so this section can
# be run on its own.
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class NeuralNetDataLoader(tud.Dataset):
    """Dataset that serves each example as a fixed-length index sequence.

    Despite its name this is a ``torch.utils.data.Dataset``; wrap it in a
    ``DataLoader`` to obtain batches.
    """

    # Index used for tokens missing from word_to_idx.
    UNK_IDX = 0
    # Index used to fill sequences shorter than max_len.
    PAD_IDX = 1

    def __init__(self, data, word_to_idx, idx_to_word, vocab_size, max_len=1000):
        """
        Args:
            data: Dataset exposing ``examples``, ``fields`` and integer
                indexing; every example carries ``Text`` (a token sequence)
                and ``Label``.
            word_to_idx: Mapping token -> integer index.
            idx_to_word: Mapping integer index -> token.
            vocab_size: Number of words in the vocabulary (stored only).
            max_len: Length of every returned sequence; longer texts are
                truncated, shorter ones padded with PAD_IDX.
        """
        self.vocab_size = vocab_size
        self.data = data
        self.word_to_idx = word_to_idx
        # Attribute name (capital W) kept as is for existing users.
        self.idx_to_Word = idx_to_word
        self.max_len = max_len

    def __len__(self):
        '''
        Returns the number of Examples
        '''
        return len(self.data.examples)

    def __getitem__(self, idx):
        """
        Returns a tuple of text and label at the given index.
        If label is not present None is returned.

        The text is a long tensor of exactly ``max_len`` vocabulary indices.
        """
        # zip() against range() stops after max_len tokens (truncation).
        itm = [
            self.word_to_idx.get(word, self.UNK_IDX)
            for _, word in zip(range(self.max_len), self.data[idx].Text)
        ]
        itm.extend([self.PAD_IDX] * (self.max_len - len(itm)))

        itm = torch.tensor(itm).long()
        # To Differentiate Train and Test data
        if len(self.data.fields) == 2:
            label = self.data[idx].Label
            return itm, label
        else:
            return itm, None

# Wrap the three splits, in train -> val -> test order, as index-sequence
# datasets backed by the GloVe vocabulary.
train_dataset, val_dataset, test_dataset = [
    NeuralNetDataLoader(split, word_to_idx, idx_to_word, MAX_VOCAB_SIZE)
    for split in (train, val, test)
]

#### Neural Network Model with Glove Embeddings

In [183]:
class NeuralNetClassifier(nn.Module):
    """Feed-forward classifier over the average word embedding of a text.

    Pipeline: embedding lookup -> mean over the non-padding tokens ->
    Linear(64) + ReLU + Dropout -> Linear(32) + ReLU + Dropout -> Linear.
    The output is a raw score (logit) per output unit.
    """

    def __init__(self, input_dim, output_dim, pad_index, embedding_weights):
        """
        Args:
            input_dim: Number of rows of the embedding matrix. Not used;
                the size is taken from ``embedding_weights``.
            output_dim: Number of output units.
            pad_index: Vocabulary index of the padding token.
            embedding_weights: 2-D array (vocabulary size x embedding
                dimension) with the pre-trained vectors.
        """
        super().__init__()
        # Store the table as float32: the linear layers are float32 anyway,
        # and a float64 table doubles the memory of the embedding and of
        # its optimizer state.
        weights = torch.tensor(embedding_weights, dtype=torch.float32)
        embd_dim = weights.shape[1]
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False, padding_idx = pad_index)

        hid_dim1 = 64
        hid_dim2 = 32

        self.drop_out = nn.Dropout()

        self.hd1 = nn.Linear(embd_dim, hid_dim1)
        self.hd2 = nn.Linear(hid_dim1, hid_dim2)
        self.out = nn.Linear(hid_dim2, output_dim)

        self.activate = nn.ReLU()
        self.loss_fn = None

    def forward(self,text):
        """
        Args:
            text: Long tensor of vocabulary indices, shape (batch, sequence).

        Returns:
            Tensor of shape (batch, output_dim) with raw scores.
        """
        embds = self.embedding(text)

        pad_idx = self.embedding.padding_idx
        if pad_idx is None:
            mean_embd = embds.mean(dim=1)
        else:
            # Average over the real tokens only. Averaging over the padded
            # length would shrink the vector of a short text towards zero,
            # so the representation would mostly encode text length.
            mask = (text != pad_idx).unsqueeze(-1).to(embds.dtype)
            # clamp: a text made only of padding must not divide by zero
            token_count = mask.sum(dim=1).clamp(min=1.0)
            mean_embd = (embds * mask).sum(dim=1) / token_count

        output = self.activate(self.hd1(mean_embd))
        output = self.drop_out(output)
        output = self.activate(self.hd2(output))
        output = self.drop_out(output)
        output = self.out(output)
        return output
        

### Training Module
This module contains the evaluate and training functions.

In [184]:
class TrainingModule():
    """Training and evaluation loop for a binary classifier.

    The wrapped model must map a long tensor of shape (batch, sequence) to
    logits of shape (batch, 1). Iterators must yield ``(X, y)`` pairs, for
    example a ``torch.utils.data.DataLoader``; ``y`` may be anything that
    ``np.asarray(..., dtype=np.float64)`` accepts (such as the strings
    '0.0' / '1.0').
    """

    def __init__(self, model):
        self.model = model
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.cuda = torch.cuda.is_available()
        self.optimizer = optim.Adam(self.model.parameters())

    def _prepare_batch(self, X, y):
        """Cast a raw batch to tensors on the device the model lives on."""
        # Follow the model rather than CUDA availability, so a CPU model on
        # a GPU machine still receives CPU tensors.
        target_device = next(self.model.parameters()).device
        X = X.long().to(target_device)
        y = torch.as_tensor(np.asarray(y, dtype=np.float64)).float().to(target_device)
        return X, y

    @staticmethod
    def _count_correct(logits, y):
        """Number of examples whose thresholded prediction equals the label."""
        # logit > 0 is the same as sigmoid(logit) > 0.5
        return ((logits > 0).float() == y).sum().item()

    def train_epoch(self, iterator):
        """Run one optimisation pass over ``iterator``.

        Returns:
            (mean batch loss, accuracy over all examples)
        """
        epoch_loss = 0.0
        n_correct = 0
        n_seen = 0
        self.model.train()
        for i, (X,y) in enumerate(iterator):
            self.optimizer.zero_grad()
            X, y = self._prepare_batch(X, y)
            preds = self.model(X).squeeze(1)

            loss = self.loss_fn(preds, y)

            if i % 100 == 0:
                print("Iteration: {:4d} | Loss : {:4.4f}".format(i, loss.item()))

            loss.backward()
            self.optimizer.step()

            epoch_loss += loss.item()
            n_correct += self._count_correct(preds.detach(), y)
            n_seen += y.shape[0]

        return epoch_loss/max(len(iterator), 1), n_correct/max(n_seen, 1)

    def train_model(self, train_iterator, dev_iterator, num_epocs = 5):
        """Train for ``num_epocs`` epochs and keep the best model.

        After each epoch the model is scored on ``dev_iterator``; a deep
        copy is taken whenever the accuracy improves.

        Returns:
            The copy with the highest dev accuracy (the first such copy on
            ties), or the wrapped model itself when ``num_epocs`` is 0.
        """
        best_acc = float('-inf')
        best_model = self.model
        for epoch in range(num_epocs):
            self.train_epoch(train_iterator)
            dev_loss, dev_acc = self.evaluate(dev_iterator)
            print("Dev. Loss : {} | Dev. Accuracy : {}".format(dev_loss, dev_acc))
            if dev_acc > best_acc:
                best_acc = dev_acc
                best_model = copy.deepcopy(self.model)

        return best_model

    def evaluate(self, iterator):
        """Score the model on ``iterator`` without updating it.

        Returns:
            (mean batch loss, accuracy over all examples)
        """
        epoch_loss = 0.0
        n_correct = 0
        n_seen = 0

        self.model.eval()

        with torch.no_grad():
            for X, y in iterator:
                X, y = self._prepare_batch(X, y)
                predictions = self.model(X).squeeze(1)
                loss = self.loss_fn(predictions, y)

                epoch_loss += loss.item()
                n_correct += self._count_correct(predictions, y)
                n_seen += y.shape[0]

        return epoch_loss/max(len(iterator), 1), n_correct/max(n_seen, 1)

### Initializing the Model 
with the appropriate dimensions

In [185]:
# Number of embedding rows (GloVe words plus the two reserved indices).
INPUT_DIM = weight_matrix.shape[0]
# A single output unit: one logit per example for the binary label.
OUTPUT_DIM = 1
# Index that gloveWordIndex assigns to '<pad>'.
PAD_IDX = 1
BATCH_SIZE = 64
# NOTE: this rebinds `model`, which held the bag-of-words classifier earlier.
model = NeuralNetClassifier(INPUT_DIM, OUTPUT_DIM, PAD_IDX, weight_matrix)
if torch.cuda.is_available():
    model = model.cuda()
print(model)

NeuralNetClassifier(
  (embedding): Embedding(400002, 100, padding_idx=1)
  (drop_out): Dropout(p=0.5)
  (hd1): Linear(in_features=100, out_features=64, bias=True)
  (hd2): Linear(in_features=64, out_features=32, bias=True)
  (out): Linear(in_features=32, out_features=1, bias=True)
  (activate): ReLU()
)


#### Setting up the data iterators

In [186]:
# Shuffle the training batches every epoch so the optimizer does not see the
# examples in the same file order each time. The validation and test
# iterators keep a fixed order so their results stay comparable.
train_iter = tud.DataLoader(train_dataset, batch_size= BATCH_SIZE, shuffle=True)
test_iter = tud.DataLoader(test_dataset, batch_size= BATCH_SIZE, shuffle=False)
val_iter = tud.DataLoader(val_dataset, batch_size= BATCH_SIZE, shuffle=False)

### Training the Model

In [157]:
# Smoke test: push only the first training batch through the model and keep
# the output in `t` for inspection in the next cell.
# NOTE(review): X is not moved to the model's device here, so this
# presumably only works while the model is on the CPU — confirm.
for (X,y) in train_iter:
    t = model.forward(X)
    break

Text:  torch.Size([64, 1000])
Embds:  torch.Size([64, 1000, 100])
Embedding: torch.Size([64, 100])
Layer 1:  torch.Size([64, 64])


In [158]:
# Shape of the batch output after dropping dimension 1, as done before the
# loss is computed in TrainingModule.
t.squeeze(1).shape

torch.Size([64])