In [1]:
import torch
import torchtext
import torch.nn as nn
import pandas as pd
import os
import csv
import numpy as np
import matplotlib.pyplot as plt

cwd = os.getcwd()
glove = torchtext.vocab.GloVe(name="6B", dim=300) 

In [2]:
def split_headline(headline):
    
    headline = headline.replace(".", " . ") \
                 .replace(",", " , ") \
                 .replace(";", " ; ") \
                 .replace("?", " ? ") \
                 .replace(":", " : ") \
                 .replace("/", " / ") \
                 .replace("$", " $ ") \
                 .replace("€", " € ") \
                 .replace("(", " ") \
                 .replace(")", " ") \
                 .replace("-", " ") \
                 .replace("~", " ") \
                 .replace("'", "")
    return headline.lower().split()

def get_ML_data(glove):
    train, valid, test = [], [], []
    #df = pd.read_csv('final_input.csv').drop(['Date'], axis=1)
    df = pd.read_csv('testfile.csv')
    for i, line in enumerate(df.values):
        headline = str(line[0])
        word_index = [glove.stoi[w]        
                for w in split_headline(headline)
                if w in glove.stoi] 
        if not word_index: 
            continue
        word_index = torch.tensor(word_index) 
        label1 = torch.tensor(float(line[1]))
        label2 = torch.tensor(float(line[2]))
        label3 = torch.tensor(float(line[3]))
        label4 = torch.tensor(float(line[4]))
        label5 = torch.tensor(float(line[5]))
        label6 = torch.tensor(float(line[6]))
        labels = torch.tensor([label1, label2, label3, label4, label5, label6])
        if i % 5 in [1,2,3]:
            train.append((word_index, labels))
        elif i % 5 == 4:
            valid.append((word_index, labels))
        else:
            test.append((word_index, labels))
    return train, valid, test

train, valid, test = get_ML_data(glove)

In [546]:
train

[(tensor([46768,   661,  1077,     4,   285,    46,   192,   630,   378,     2]),
  tensor([ 0.0300, -0.0100,  0.2300, -0.0120, -0.0110,  0.0210])),
 (tensor([ 2214, 29642, 19568]),
  tensor([0.0400, 0.0100, 0.2400, 0.0120, 0.0110, 0.0280])),
 (tensor([20198,    14]),
  tensor([0.0500, 0.0300, 0.2500, 0.0360, 0.0320, 0.0350])),
 (tensor([ 430,   14, 3082]),
  tensor([0.0800, 0.0900, 0.2800, 0.1080, 0.0970, 0.0560])),
 (tensor([ 521,   14,    7, 8237,   34,   36,    7, 2487]),
  tensor([0.0900, 0.1100, 0.2900, 0.1320, 0.1190, 0.0630])),
 (tensor([  41,   33, 7523]),
  tensor([0.1000, 0.1300, 0.3000, 0.1560, 0.1400, 0.0700])),
 (tensor([5199,   14,   29, 4308,  128]),
  tensor([0.1300, 0.1900, 0.3300, 0.2280, 0.2050, 0.0910])),
 (tensor([143666,     14,     29,    734]),
  tensor([0.1400, 0.2100, 0.3400, 0.2520, 0.2270, 0.0980])),
 (tensor([   41,   913, 22392,    13,  2058, 19036,     2]),
  tensor([0.1500, 0.2300, 0.3500, 0.2760, 0.2480, 0.1050])),
 (tensor([86908,     7, 34136]),
  te

In [3]:
from torch.nn.utils.rnn import pad_sequence

train_data = []
valid_data = []
test_data = []
train_labels = []
valid_labels = []
test_labels = []


train_padded = pad_sequence([headline for headline, labels in train],
                            batch_first=True)
for headline, labels in train:
    train_labels.append(labels)
    
valid_padded = pad_sequence([headline for headline, labels in valid],
                            batch_first=True)
for headline, labels in valid:
    valid_labels.append(labels)
    
test_padded = pad_sequence([headline for headline, labels in test],
                            batch_first=True)
for headline, labels in test:
    test_labels.append(labels)
    
for i in range(len(train_labels)):
    train_data.append((train_padded[i], train_labels[i]))
    
for i in range(len(valid_labels)):
    valid_data.append((valid_padded[i], valid_labels[i]))
    
for i in range(len(test_labels)):
    test_data.append((test_padded[i], test_labels[i]))

In [4]:
def get_error(model, data):
    for headlines, labels in data:
        output = (model(headlines))        
        error = np.square(np.subtract(labels,output.detach().numpy())).mean() 
    return error

In [5]:
def train_rnn_network(model, train, valid, num_epochs=5, learning_rate=1e-6, batch_size = 128):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    train_loader = torch.utils.data.DataLoader(train, 
                                               batch_size=batch_size, 
                                               shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, 
                                           batch_size=batch_size, 
                                           shuffle=True)
    epochs, losses, train_err, valid_err = [], [], [], []
    n=0
    
    for epoch in range(num_epochs):
        for headlines, labels in train_loader:
            optimizer.zero_grad()
            pred = model(headlines)
            loss = criterion(pred, labels.float())
            loss.backward()
            optimizer.step()
        losses.append(float(loss))
        
        
        epochs.append(epoch)
        train_err.append(get_error(model, train_loader))
        valid_err.append(get_error(model, valid_loader))
        print("Epoch %d; Loss %f; Train Err %f; Val Err %f" % (
            epoch+1, loss, train_err[-1], valid_err[-1]))
        
    
         if (epoch + 1) % 5 == 0:
             torch.save(model, cwd + '\\Checkpoint.pth' + str(n))
             n += 1
            
    plt.title("Training Curve")
    plt.plot(losses, label="Train")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()
    
    plt.title("Error Curve")
    plt.plot(epochs, train_err, label="Train")
    plt.plot(epochs, valid_err, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("MSE")
    plt.legend(loc='best')
    plt.show()

IndentationError: unexpected indent (<ipython-input-5-09c10a949804>, line 30)

In [6]:
class Exch_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Exch_LSTM, self).__init__()
        self.emb = nn.Embedding.from_pretrained(glove.vectors)
        self.hidden_size = hidden_size
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        x = self.emb(x)
        h0 = torch.zeros(1, x.size(0), self.hidden_size)
        c0 = torch.zeros(1, x.size(0), self.hidden_size)
        out, _ = self.rnn(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

#model_lstm = Exch_LSTM(300, 300, 6)
#train_rnn_network(model_lstm, train_data, valid_data, num_epochs=20, learning_rate=2e-5)

In [7]:
def headline_to_index(headline):
    word_index = [glove.stoi[w]        
        for w in split_headline(headline)
                        if w in glove.stoi] 
    if not word_index: 
        pass
    word_index = torch.tensor(word_index) 
    return word_index.unsqueeze(0)

In [57]:
headline = headline_to_index("Mark")

model = torch.load(cwd + "//final_model.pth1")

-model(headline)

tensor([[ 0.1208, -0.0694, -0.0868, -0.0176, -0.0687,  0.0282]],
       grad_fn=<NegBackward>)

In [13]:
def get_accuracy(model, data):
    total = 0
    correct = 0
    for headlines, labels in data:
        output = (model(headlines.unsqueeze(0)))        
        for i in output:
          for j in range(6):
            if i[j] > 0 and labels[j] > 0:
              correct += 1
              total += 1
            elif i[j] < 0 and labels[j] < 0:
              correct +=1
              total += 1
            else:
              total += 1
    return correct/total

In [18]:
get_accuracy(final_model, test_data)

0.8