In [1]:
# Required imports
import torch
import numpy as np
import pandas as pd
from torch.nn import Linear, Embedding, RNN, GRU, LSTM
from torch.nn import Sigmoid, LogSoftmax
from torch.optim import SGD
from torch.nn import BCELoss, NLLLoss, CrossEntropyLoss
from string import punctuation
import itertools
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [2]:
# Load the data into a DataFrame
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source. The path is relative to the notebook's working directory.
data = pd.read_pickle('../data/newsgroups.pkl')

# Define a simple convenience function for cleaning the strings
def clean_text(text):
    """Return ``text`` lowercased with every ``string.punctuation`` character removed."""
    lowered = text.lower()
    # A translation table that maps each punctuation character to "deleted".
    return lowered.translate(str.maketrans("", "", punctuation))

# Clean the raw text column (lowercase, strip punctuation) into a new column
data['text_cleaned'] = data['text'].map(clean_text)
# Preview the first rows to sanity-check the cleaning
data.head()

Unnamed: 0,text,label,set,text_cleaned
0,\tolivetti does not exclude stake in sgs thoms...,1,train,\tolivetti does not exclude stake in sgs thoms...
1,\tu k firm ups italy fund ita stake to pct llo...,1,train,\tu k firm ups italy fund ita stake to pct llo...
2,\tmoniterm corp mtrm th qtr loss shr loss cts ...,0,train,\tmoniterm corp mtrm th qtr loss shr loss cts ...
3,\twal mart stores inc wmt raises quarterly qtl...,0,train,\twal mart stores inc wmt raises quarterly qtl...
4,\tautospa lube to buy control of cardis cds au...,1,train,\tautospa lube to buy control of cardis cds au...


In [3]:
# Tokenise each cleaned document once (whitespace split); the tokens are reused
# below for both the vocabulary and the index encoding.
text_split = data['text_cleaned'].map(lambda x: x.split())

# Vocabulary: every distinct token in the corpus.
all_words = set(itertools.chain.from_iterable(text_split))
vocab_size = len(all_words)

# Sort before enumerating: iteration order of a set of strings can differ
# between interpreter runs (string hash randomisation), which would make the
# word -> index mapping irreproducible from one kernel restart to the next.
word2idx = {word: idx for idx, word in enumerate(sorted(all_words))}
idx2word = {idx: word for word, idx in word2idx.items()}

# Encode every document as a list of vocabulary indices.
data['idx_encoded'] = text_split.map(lambda tokens: [word2idx[word] for word in tokens])

labels = data['label'].values
features = data['idx_encoded']

# Pair each encoded document with its label and hold out a test split.
# A fixed seed keeps the train/test partition identical across re-runs.
train_data, test_data = train_test_split(list(zip(features, labels)), random_state=42)

In [49]:

# NOTE(review): this cell repeats the label/feature extraction and split that
# the previous cell already performs; it is a candidate for removal.
# The split is seeded so that re-running this cell cannot silently reshuffle
# train/test underneath a model that has already been trained.
labels = data['label'].values
features = data['idx_encoded']
train_data, test_data = train_test_split(list(zip(features, labels)), random_state=42)

In [39]:
# Inspect the shape of the label array.
# NOTE(review): the saved output below, (7674, 8), is two-dimensional, while
# `labels = data['label'].values` takes a single column. The output looks stale
# (possibly from a one-hot-encoded variant); re-run to refresh.
labels.shape

(7674, 8)

In [40]:
class rnn_classifier(torch.nn.Module):
    """Embedding -> single-layer LSTM -> linear -> log-softmax sequence classifier.

    The recurrent state lives on ``self.hidden`` and is carried over from one
    ``forward`` call to the next. Assign ``self.hidden = self.init_hidden()``
    before feeding each independent sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (the LSTM input size).
        hidden_dim: size of the LSTM hidden and cell state.
        output_dim: number of classes scored by the final linear layer.
        batch_size: number of sequences per forward call; fixes the shape of
            the hidden state and of the reshaped embeddings.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, batch_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.embedding = Embedding(num_embeddings=vocab_size,
                                   embedding_dim=embedding_dim)
        self.rnn = LSTM(input_size=embedding_dim,
                        hidden_size=hidden_dim)
        self.linear = Linear(hidden_dim, output_dim)
        # The linear layer emits (batch_size, output_dim) scores; normalise over
        # the class axis explicitly. Omitting ``dim`` relies on a deprecated
        # implicit choice and raises a warning on every forward call.
        self.softmax = LogSoftmax(dim=1)
        self.hidden = self.init_hidden()

    def forward(self, x):
        """Return log-probabilities of shape ``(batch_size, output_dim)``.

        Args:
            x: LongTensor of token indices whose first dimension is the
               sequence length (a 1-D tensor when ``batch_size`` is 1).
        """
        e = self.embedding(x)
        # LSTM default layout is (seq_len, batch, features).
        e = e.view(len(x), self.batch_size, -1)
        out, self.hidden = self.rnn(e, self.hidden)
        # Classify from the output at the last time step only.
        output = self.linear(out[-1])
        return self.softmax(output)

    def init_hidden(self):
        """Return a fresh zero ``(h0, c0)`` pair shaped ``(1, batch_size, hidden_dim)``.

        The tensors are created with the same dtype and device as the model's
        parameters so the state stays compatible after ``.to(...)`` / ``.double()``.
        """
        weight = self.linear.weight
        shape = (1, self.batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, dtype=weight.dtype, device=weight.device)
        c0 = torch.zeros(shape, dtype=weight.dtype, device=weight.device)
        return (h0, c0)

In [59]:
# Hyper-parameters for this run
N_EPOCHS = 10
MAX_TOKENS = 32        # each document is truncated to its first MAX_TOKENS tokens
LEARNING_RATE = 0.01

# Seed torch so weight initialisation (and therefore the run) is repeatable.
torch.manual_seed(0)

model = rnn_classifier(vocab_size=vocab_size,
                       embedding_dim=100,
                       hidden_dim=50,
                       output_dim=8,
                       batch_size=1)

optim = SGD(params=model.parameters(), lr=LEARNING_RATE)
criterion = NLLLoss()  # expects log-probabilities, which the model's log-softmax provides

for epoch in range(N_EPOCHS):
    # ---- training pass: one SGD step per document (batch size 1) ----
    model.train()
    total_loss = 0.0
    for f, t in train_data:
        X = torch.LongTensor(f[:MAX_TOKENS])
        y = torch.LongTensor([t])

        # Fresh recurrent state per document so neither state nor autograd
        # graph carries over from the previous example.
        model.hidden = model.init_hidden()
        optim.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optim.step()

        total_loss += loss.item()

    # Mean training loss per example for this epoch.
    total_loss /= len(train_data)

    # ---- evaluation pass: no gradients needed ----
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for f, t in test_data:
            X = torch.LongTensor(f[:MAX_TOKENS])

            model.hidden = model.init_hidden()
            output = model(X)

            y_true.append(int(t))
            y_pred.append(torch.argmax(output).item())

    # Score once per epoch rather than after every test example.
    val_accuracy = accuracy_score(y_true, y_pred)

    print("Loss: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, val_accuracy))



Loss: 0.98, Validation Accuracy: 0.78
Loss: 0.60, Validation Accuracy: 0.82
Loss: 0.44, Validation Accuracy: 0.85
Loss: 0.35, Validation Accuracy: 0.87
Loss: 0.27, Validation Accuracy: 0.88
Loss: 0.19, Validation Accuracy: 0.90
Loss: 0.15, Validation Accuracy: 0.91
Loss: 0.11, Validation Accuracy: 0.90
Loss: 0.08, Validation Accuracy: 0.91
Loss: 0.07, Validation Accuracy: 0.91


In [52]:
# Inspect the log-probabilities produced by the most recent forward pass.
# NOTE(review): the saved output below has 2 entries, but the model above is
# built with output_dim=8. The output appears to come from an earlier run;
# re-run to refresh.
output

tensor([[-0.5880, -0.8107]], grad_fn=<LogSoftmaxBackward>)

In [28]:
output

tensor([[-0.6805, -0.7059]], grad_fn=<LogSoftmaxBackward>)

In [None]:
output