In [4]:
# Required imports
import torch
import numpy as np
import pandas as pd
from torch.nn import Linear, Embedding, RNN
from torch.nn import Sigmoid, LogSoftmax
from torch.optim import SGD
from torch.nn import BCELoss, NLLLoss
from string import punctuation
import itertools
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [7]:
# Load the data into a DataFrame
data = pd.read_pickle('../data/stackoverflow_gbq.pkl')

# Definne a simple convenience function for cleaning the strings
def clean_text(text):
    return "".join([c for c in text.lower() if c not in punctuation])

# Clean the string labels
data['text_cleaned'] = data['text'].map(clean_text)
data.head()

Unnamed: 0,label,text,text_cleaned
0,javascript,Angular.js expression: output transformed obje...,angularjs expression output transformed object...
1,android,How restart Fragment in my Activity - Android,how restart fragment in my activity android
2,javascript,"node.js express, a very strange behaviour",nodejs express a very strange behaviour
3,java,Open Editor Part in E4,open editor part in e4
4,javascript,Prevent Lazy Load of a Div in Laxy Load XT,prevent lazy load of a div in laxy load xt


In [29]:
text_split = data['text_cleaned'].map(lambda x: x.split())
all_words = set(list(itertools.chain.from_iterable(text_split)))
vocab_size = len(all_words)
word2idx = {word: idx for idx, word in enumerate(all_words)}
idx2word = {idx: word for word, idx in word2idx.items()}


In [20]:
embedding = Embedding(num_embeddings=vocab_size, embedding_dim=100)

In [35]:
word = 'python'
idx = word2idx[word]
embedding(torch.LongTensor([idx]))

tensor([[ 0.4265, -0.8815,  0.8797, -0.5368,  0.9310, -0.8299, -0.1891,  0.1967,
          0.0526, -2.3317,  0.5544, -0.1294,  0.4531, -0.1707, -0.7045, -0.2932,
          0.8764,  0.0311,  0.0602,  0.5395,  1.7740, -1.8474,  1.7088, -0.4878,
          0.4022,  0.4565, -0.1093, -1.0329,  1.2101,  0.0685,  0.5724, -0.6685,
          0.1558,  1.3574, -0.6990,  0.6137, -0.2524, -0.2739, -1.5589, -1.0522,
          0.3693, -1.0989,  0.9679,  0.6798,  1.0760, -1.4661,  0.8950, -0.1049,
          0.3754, -0.9723,  0.0988, -0.1588,  0.3688,  0.7010, -0.6596,  0.6013,
          0.0742,  0.0147,  0.1082, -0.6603,  0.5044, -0.8691, -0.3181, -1.6238,
          0.4992, -1.9662, -0.5189, -0.2854, -0.9715,  1.0123, -0.6982, -0.1395,
         -0.0131,  2.0896, -0.6927,  0.0927, -0.5720, -0.7166, -1.9849, -0.4276,
          1.2579, -0.8066, -1.3512, -0.1447, -0.5385,  0.5194,  1.7851,  0.0293,
          0.5075, -0.6541,  0.7000, -1.7866, -0.4941,  0.1871, -1.4818, -0.9636,
          0.1738,  0.3717, -

In [36]:
data['idx_encoded'] = data['text_cleaned'].map(lambda x: [word2idx[word] for word in x.split()])

In [75]:
embedding(torch.LongTensor(data.idx_encoded[0])).unsqueeze(1).shape

torch.Size([6, 1, 100])

In [76]:
rnn = RNN(input_size=100, hidden_size=50)

In [81]:
e = embedding(torch.LongTensor(data.idx_encoded[0])).unsqueeze(1)

In [82]:
e.shape

torch.Size([6, 1, 100])

In [85]:
rnn(e)[1].shape

torch.Size([1, 1, 50])

In [87]:
labels = data.label.values

In [141]:
le = LabelEncoder()
labels = le.fit_transform(data.label.values).reshape(-1,1)

features = data['idx_encoded']

train_data, test_data = train_test_split(list(zip(features, labels)))

In [172]:
class rnn_classifier(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(rnn_classifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = Embedding(num_embeddings=vocab_size, 
                                   embedding_dim=embedding_dim)
        self.rnn = RNN(input_size=embedding_dim, 
                       hidden_size=hidden_dim)
        self.linear = Linear(hidden_dim, output_dim)
        self.softmax = LogSoftmax()
        
        
    def forward(self, x):
        e = self.embedding(x).unsqueeze(1)
        out, hidden = self.rnn(e)
        output = self.linear(hidden)
        so = self.softmax(output.view(-1))
        return so
    
    def init_hidden(self):
        self.rnn.weight_hh_l0 = torch.autograd.Variable(torch.zeros(self.hidden_dim, 
                                            self.hidden_dim))
        self.rnn.weight_ih_l0 = torch.autograd.Variable(torch.zeros(self.hidden_dim, 
                                            2*self.hidden_dim))        
        
        

In [173]:
model = rnn_classifier(vocab_size = vocab_size, 
                       embedding_dim=100, 
                       hidden_dim=50, 
                       output_dim=5)

optim = SGD(params=model.parameters(), lr=0.01)
criterion = NLLLoss()

for f, t in train_data[:100]:
    X = torch.LongTensor(f)
    y = torch.LongTensor(t)
    
    model.init_hidden()
    out = model.forward(X)
    
    loss = criterion(out.unsqueeze(0), y)
    
    loss.backward()
    
    optim.step()

TypeError: cannot assign 'torch.FloatTensor' as parameter 'weight_hh_l0' (torch.nn.Parameter or None expected)

In [153]:
model = rnn_classifier(vocab_size = vocab_size, 
                       embedding_dim=100, 
                       hidden_dim=50, 
                       output_dim=5)

optim = SGD(params=model.parameters(), lr=0.01)
criterion = NLLLoss()

for f, t in train_data[:100]:
    X = torch.LongTensor(f)
    y = torch.LongTensor(t)
    
    e = embedding(X).unsqueeze(1)
    
    out, hidden = rnn(e)
    
    output = linear(hidden)
    so = softmax(output.view(-1))
    
    loss = criterion(so.unsqueeze(0), y)
    
    loss.backward()
    
    optim.step()

  from ipykernel import kernelapp as app


NameError: name 'optim' is not defined

In [135]:
output.view(-1)

tensor([-0.0770, -0.0111, -0.0640,  0.1984, -0.0457], grad_fn=<ViewBackward>)

In [147]:
so.shape

torch.Size([5])

In [155]:
torch.zeros(100)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])

In [159]:
rnn.weight_hh_l0.shape

torch.Size([50, 50])

In [166]:
Embedding?