## **_Using News Data to Predict Movements in the Financial Movements_**

We'll be using two apporaches here:

* Continuous Bag of Words Model
* RNN Models using Word Embeddings

In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext import data

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

from collections import Counter, defaultdict
import operator
import os, math
import random
import copy
import string
import multiprocessing as mp

from split_data import split_data

[nltk_data] Downloading package punkt to /home/antimony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/antimony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# set the random seeds so the experiments can be replicated exactly
random.seed(72689)
np.random.seed(72689)
torch.manual_seed(72689)
if torch.cuda.is_available():
    torch.cuda.manual_seed(72689)

# Global class labels.
POS_LABEL = 'up'
NEG_LABEL = 'down'

**Reading in all the Data**

In [109]:
all_data = pd.read_csv("ProcessedData/CombinedData.csv")
all_data.drop(columns=['Unnamed: 0'], inplace=True)
all_data.head()

Unnamed: 0,Title,Date,Content,OpenMove,CloseMove
0,Top U.S. General Praises Iran-Backed Shiite Mi...,2017-01-04,The top commander of the U.S.-led coalition ag...,1.0,1.0
1,Extremists Turn to a Leader to Protect Western...,2017-01-04,As the founder of the Traditionalist Worker Pa...,1.0,1.0
2,How Julian Assange evolved from pariah to paragon,2017-01-04,President-elect Donald Trump tweeted some pra...,1.0,1.0
3,House panel recommends cutting funding for Pla...,2017-01-04,A House panel formed by Republicans to invest...,1.0,1.0
4,Missouri Bill: Gun-Banning Businesses Liable f...,2017-01-04,As Missouri lawmakers convene for the 2017 leg...,1.0,1.0


*Using a Small Subset of Data fro Development*

In [110]:
data_sample = all_data.sample(10000, random_state=68)
data_sample.reset_index(drop=True, inplace=True)
data_sample.head()

Unnamed: 0,Title,Date,Content,OpenMove,CloseMove
0,Another government shutdown over Obamacare? On...,2017-04-24,"\nThis is the web version of VoxCare, a daily ...",1.0,1.0
1,Tourists Helped Fatten Him Up; Now Thai Monkey...,2017-05-19,"[Whether he likes it or not, a morbidly obese ...",1.0,1.0
2,"Indian Premier, in Israel Visit, Seeks to Brea...",2017-07-05,JERUSALEM — Prime Minister Benjamin Netanyahu ...,0.0,0.0
3,Kiribati Ends Aerial Search for Missing Ferry ...,2018-03-21,"WELLINGTON, New Zealand — The aerial search fo...",0.0,0.0
4,The American Model,2017-07-28,Amidst a string of pat introductory refl...,0.0,0.0


### Preprocessing the Data For Feeding Into The Model

Preprocessing Involves (in our case):
* Turning All Words into lower/upper case, Normalization
* removing punctuations, accent marks and other diacritics
* removing stop words, sparse terms, and particular words
* Stemming using a Porter Stemmer from NLTK 

In [111]:
# Removing all Punctuation
def remove_punctuation(text):
    more_puncs = '—'+ '’'+ '“'+ '”'+ '…'
    return text.translate(str.maketrans('', '', string.punctuation+more_puncs))

# Removing all Stop Words
def remove_stopwords(text, stop_words):
    text = word_tokenize(text)
    return  " ".join([i for i in text if i not in stop_words])

def stem(text, stemmer):
    text = word_tokenize(text)
    return " ".join([stemmer.stem(i) for i in text])

The pre_process function below performs all the preprocessing we defined above. 

In [112]:
def pre_process(df):
    # Normalization
    df['Title'] = df['Title'].str.lower()
    df['Content'] = df['Content'].str.lower()

    # Removing Punctuation
    df['Title'] = df['Title'].apply(remove_punctuation)
    df['Content'] = df['Content'].apply(remove_punctuation)
    
    STOP_WORDS = set(stopwords.words('english'))
    # Remove Stopwords
    df['Title'] = df['Title'].apply(remove_stopwords, args=(STOP_WORDS, ))
    df['Content'] = df['Content'].apply(remove_stopwords, args=(STOP_WORDS, ))

    # Stemming
    stemmer = PorterStemmer()
    df['Title'] = df['Title'].apply(stem, args=(stemmer,))
    df['Content'] = df['Content'].apply(stem, args=(stemmer,))

    return df

**We run the pre_process function in parallel to make it faster using the Multi-Processing Module**

In [113]:
# Processing in Parallel
n_threads = mp.cpu_count()-1
data_pieces = np.array_split(data_sample, n_threads)

pool = mp.Pool(n_threads)
data_sample = pd.concat(pool.map(pre_process, data_pieces))
pool.close()
pool.join()

data_sample.head()

Unnamed: 0,Title,Date,Content,OpenMove,CloseMove
0,anoth govern shutdown obamacar trump want,2017-04-24,web version voxcar daili newslett vox latest t...,1.0,1.0
1,tourist help fatten thai monkey diet,2017-05-19,whether like morbidli obes wild monkey thailan...,1.0,1.0
2,indian premier israel visit seek break barrier...,2017-07-05,jerusalem prime minist benjamin netanyahu long...,0.0,0.0
3,kiribati end aerial search miss ferri passeng ...,2018-03-21,wellington new zealand aerial search ferri kir...,0.0,0.0
4,american model,2017-07-28,amidst string pat introductori reflect recent ...,0.0,0.0


### Preparing Data for the CBOW Model

* Building the Vocabulary (Using Spacy) | **MAX_VOCAB_SIZE** = 25000
* Splitting the data for Test and Training


In [114]:
SEED = 68

In [115]:
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

Splitting the Data and Storing it such that torch text can easily ingest it.

In [116]:
split_data(df=data_sample[['Content', 'CloseMove']],prefix='dev',seed=SEED)

#### Reading in Data Using TorchText

In [117]:
train, val, test = data.TabularDataset.splits(
        path='./ProcessedData/', train='dev_train.csv',
        validation='dev_val.csv', test='dev_test.csv', format='csv',
        fields=[('Text', TEXT), ('Label', LABEL)])


In [118]:
MAX_VOCAB_SIZE = 50000

#### Setting up the dataloader

In [119]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class BOWDataLoader(tud.Dataset):
    def __init__(self, data, vocab_size, text, field):
        self.vocab_size = vocab_size
        self.TEXT = text
        self.LABEL = field
        self.TEXT.build_vocab(data, max_size = MAX_VOCAB_SIZE)
        self.LABEL.build_vocab(data)
        self.data = data
        
    def __len__(self):
        return len(self.data.examples)
    
    def __getitem__(self, idx):
        itm = torch.zeros(self.vocab_size)
        for word in self.data[idx].Text:
            itm[self.TEXT.vocab.stoi[word]] += 1
        
        # To Differentiate Train and Test data
        if len(self.data.fields) == 2:
            label = self.data[idx].Label
            return itm, label
        else:
            return itm, None

train_dataset = BOWDataLoader(train, MAX_VOCAB_SIZE, TEXT, LABEL)
val_dataset = BOWDataLoader(val, MAX_VOCAB_SIZE, TEXT, LABEL)
test_dataset = BOWDataLoader(test, MAX_VOCAB_SIZE, TEXT, LABEL)

### Bag of Words Model

Here we define:
* The INPUT, OUTPUT dimrensions.
* The Loss Function and
* The optimizer

In [135]:
class BOWClassifier(nn.Module):
    
    def __init__(self, input_size, output_size, batch_size):
        nn.Module.__init__(self)
        
        self.batch_size = batch_size
        # Cuda Availability
        self.cuda = torch.cuda.is_available()
        
        # Linear layer
        self.fc = nn.Linear(input_size, output_size)
        # Activation 
#         self.activate = nn.Sigmoid()
        
        # Loss Function
        self. loss_fn = nn.CrossEntropyLoss()
        
        # Optimizer
        self.optimizer = torch.optim.Adam(self.parameters())
    
    def forward(self, text):
        result = self.fc(text)
        return (result)
    
    def train_epoch(self, dataset):
        self.dataloader = tud.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
        self.train()
        for i, (X,y) in enumerate(self.dataloader):
            X = X.float()
            y = torch.Tensor(np.asarray(y, dtype=np.float64)).long()
            if self.cuda:
                X  = X.cuda()
                y = y.cuda()
            
            self.optimizer.zero_grad()
            
            predictions = self.forward(X)
            
            loss = self.loss_fn(predictions, y)
            
            loss.backward()
            
            if (i+1) % 100 == 0:
                print("Iteration : {:4d} | Loss : {:4.4f}".format(i+1, loss.item()))
            
            self.optimizer.step()
        
    def train_model(self, train_data, val_data, num_epocs = 2):
        accuracy = 0
        for epoch in range(num_epocs):
            self.train_epoch(train_data)
            val_accuracy = self.evaluate(val_data)
            print("Validation Accuracy: {:4.4f}".format(val_accuracy))
            if val_accuracy > accuracy:
                accuracy = val_accuracy
                best_model = copy.deepcopy(self)        
    
    def classify(self, data):
        dataloader = tud.DataLoader(data, batch_size=self.batch_size, shuffle=False)
        results = np.asarray([])
        for i, (X,y) in enumerate(dataloader):
            X = X.float()
            if self.cuda:
                X = X.cuda()
            predictions = self.forward(X).max(1)[1].cpu().numpy().reshape(-1)
            results = np.append(results, predictions)
        labels = ["UP" if i == 1 else "DOWN" for i in results]
        return labels
    
    def evaluate(self, data):
        self.eval()
        dataloader = tud.DataLoader(data, batch_size=self.batch_size, shuffle=False)
        correct = 0
        total = 0
        
        for i, (X,y) in enumerate(dataloader):
            X = X.float()
            if self.cuda:
                X = X.cuda()
            predictions = self.forward(X).max(1)[1].cpu().numpy().reshape(-1)
            correct += (predictions == np.asarray(y, dtype=np.float64)).sum()
            total += predictions.shape[0]
        
        return correct/total

In [137]:
model = BOWClassifier(MAX_VOCAB_SIZE, 2, 8)
model.train_model(train_dataset, val_dataset, num_epocs=5)

Iteration :  100 | Loss : 0.9569
Iteration :  200 | Loss : 0.6613
Iteration :  300 | Loss : 0.9165
Iteration :  400 | Loss : 0.7737
Iteration :  500 | Loss : 0.6956
Iteration :  600 | Loss : 0.9078
Validation Accuracy: 0.5686
Iteration :  100 | Loss : 0.1815
Iteration :  200 | Loss : 0.4284
Iteration :  300 | Loss : 0.4284
Iteration :  400 | Loss : 0.3501
Iteration :  500 | Loss : 0.4605
Iteration :  600 | Loss : 0.3707
Validation Accuracy: 0.5348
Iteration :  100 | Loss : 0.1545
Iteration :  200 | Loss : 0.2308
Iteration :  300 | Loss : 0.0866
Iteration :  400 | Loss : 0.2568
Iteration :  500 | Loss : 0.1381
Iteration :  600 | Loss : 0.3024
Validation Accuracy: 0.5338
Iteration :  100 | Loss : 0.2689
Iteration :  200 | Loss : 0.1747
Iteration :  300 | Loss : 0.0883
Iteration :  400 | Loss : 0.3071
Iteration :  500 | Loss : 0.2298
Iteration :  600 | Loss : 0.1342
Validation Accuracy: 0.5514


KeyboardInterrupt: 