In [1]:
import torch
import torchtext
import torch.nn as nn
import pandas as pd

glove = torchtext.vocab.GloVe(name="6B", dim=300) 

.vector_cache\glove.6B.zip: 862MB [13:02, 1.10MB/s]                                                                    
100%|███████████████████████████████████████████████████████████████████████▉| 399476/400000 [01:25<00:00, 7192.94it/s]

In [13]:
import csv

def get_data():
    return csv.reader(open("testfile.csv", "rt", encoding="latin-1"))


In [21]:
def split_headline(headline):
    
    headline = headline.replace(".", " . ") \
                 .replace(",", " , ") \
                 .replace(";", " ; ") \
                 .replace("?", " ? ") \
                 .replace(":", " : ") \
                 .replace("/", " / ") \
                 .replace("(", "") \
                 .replace(")", "") \
                 .replace("-", "") \
                 .replace("~", "")
    return headline.lower().split()

def get_ML_data(glove):
    train, valid, test = [], [], []
    for i, line in enumerate(get_data()):
        headline = line[0]
        word_index = [glove.stoi[w]        # lookup the index of word
                for w in split_headline(headline)
                if w in glove.stoi] # keep words that has an embedding
        if not word_index: # ignore tweets without any word with an embedding
            continue
        word_index = torch.tensor(word_index) # convert list to pytorch tensor
        label1 = torch.tensor(int(line[1]))
        label2 = torch.tensor(int(line[2]))
        label3 = torch.tensor(int(line[3]))
        label4 = torch.tensor(int(line[4]))
        label5 = torch.tensor(int(line[5]))
        label6 = torch.tensor(int(line[6]))
        if i % 5 in [0,1,2,3]:
            train.append((word_index, label1, label2, label3, label4, label5, label6))
        elif i % 5 == 4:
            valid.append((word_index, label1, label2, label3, label4, label5, label6))
        else:
            test.append((word_index, label1, label2, label3, label4, label5, label6))
    return train, valid, test

train, valid, test = get_ML_data(glove)

In [24]:
class Exch_RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Exch_RNN, self).__init__()
        self.emb = nn.Embedding.from_pretrained(glove.vectors)
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        # Look up the embedding
        x = self.emb(x)
        # Set an initial hidden state
        h0 = torch.zeros(1, x.size(0), self.hidden_size)
        # Forward propagate the RNN
        out, _ = self.rnn(x, h0)
        # Pass the output of the last time step to the classifier
        out = self.fc(out[:, -1, :])
        return out

model = Exch_RNN(300, 300, 6)

In [40]:
from torch.nn.utils.rnn import pad_sequence

train_padded = pad_sequence([headline for headline, label1, label2, label3, label4, label5, label6 in train],
                            batch_first=True)
train_padded.shape

torch.Size([11, 9])

In [41]:
model(train_padded)

tensor([[ 0.1586,  0.1899,  0.0399,  0.1140, -0.0206,  0.1224],
        [-0.0202,  0.1809,  0.1517,  0.1377, -0.1003, -0.0245],
        [ 0.1903,  0.1204,  0.0219,  0.1442, -0.0491,  0.1473],
        [ 0.1903,  0.1204,  0.0219,  0.1442, -0.0491,  0.1473],
        [ 0.1903,  0.1204,  0.0219,  0.1442, -0.0491,  0.1473],
        [ 0.1903,  0.1204,  0.0219,  0.1442, -0.0491,  0.1473],
        [ 0.1903,  0.1204,  0.0219,  0.1442, -0.0491,  0.1473],
        [ 0.1903,  0.1204,  0.0219,  0.1442, -0.0491,  0.1473],
        [ 0.1903,  0.1204,  0.0219,  0.1442, -0.0491,  0.1473],
        [ 0.1903,  0.1204,  0.0219,  0.1442, -0.0491,  0.1473],
        [ 0.1903,  0.1204,  0.0219,  0.1442, -0.0491,  0.1473]],
       grad_fn=<AddmmBackward>)