In [50]:
from torchtext.data import Field, TabularDataset, BucketIterator,LabelField
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import random
import torch

In [51]:
path = '/Users/babyhandzzz/Desktop/ELEPH@NT/Datasets/clean_IMDB.csv'

In [52]:
# Load the data
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize,batch_first=True)
LABEL = LabelField(dtype=torch.float,batch_first=True)
fields = [('text', TEXT), ('label',LABEL)]
training_data=TabularDataset(path=path,format='csv',fields = fields, skip_header=True)

In [53]:
# train test split
SEED=2020
train_data, valid_data = training_data.split(split_ratio=0.7, random_state=random.seed(SEED))

In [54]:
# vectorize words GloVe
TEXT.build_vocab(train_data,min_freq=3,vectors="glove.6B.100d")  
LABEL.build_vocab(train_data)

In [55]:
train_iterator, valid_iterator = BucketIterator.splits((train_data, valid_data), batch_size = 32,
sort_key = lambda x: len(x.text),sort_within_batch=True)

In [56]:
class LSTMClassifier(nn.Module):
	def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
		super(LSTMClassifier, self).__init__()
		
		self.batch_size = batch_size
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length
		
		self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
		self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
		self.lstm = nn.LSTM(embedding_length, hidden_size)
		self.label = nn.Linear(hidden_size, output_size)
		
	def forward(self, input_embedding, batch_size=None):
	
		input_ = self.word_embeddings(input_embedding) 
		input_ = input_.permute(1, 0, 2)
		if batch_size is None:
			h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size)) 
			c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size))
		else:
			h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size))
			c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size))
		output, (final_hidden_state, final_cell_state) = self.lstm(input_, (h_0, c_0))
		final_output = self.label(final_hidden_state[-1]) 		
		return final_output

In [57]:
batch_size = 32
output_size = 1
hidden_size = 150
vocab_size = len(TEXT.vocab)
embedding_length = 100
word_embeddings = TEXT.vocab.vectors

In [58]:
model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
loss_fn = nn.BCELoss()
optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))

In [59]:
model.train()
for idx, batch in enumerate(train_iterator):
    text = batch.text
    target = batch.label
    if (text.size()[0] is not 32):
            continue
    optim.zero_grad()
    prediction = model(text)
    loss = loss_fn(F.sigmoid(prediction), target)
    print(loss.item())
    loss.backward()
    optim.step()

0.6933239698410034
0.6928998827934265
0.6946552991867065
0.6910000443458557
0.6919717788696289
0.6925653219223022
0.6938671469688416
0.6881396174430847
0.6871300339698792
0.6969411373138428
0.68184894323349
0.6823946833610535
0.6900270581245422
0.6696925163269043
0.679523229598999
0.6788899302482605
0.666327714920044
0.6704390645027161
0.7030359506607056
0.719089686870575
0.7318813800811768
0.7316553592681885
0.7288674712181091
0.6906354427337646
0.6850621700286865
0.6783450841903687
0.6935259103775024
0.674932062625885
0.6944950819015503
0.6963032484054565
0.6707750558853149
0.6902047395706177
0.6976813077926636
0.695963978767395
0.7015758752822876
0.6983414888381958
0.680922269821167
0.6677483916282654
0.6704552173614502


KeyboardInterrupt: 