In [53]:
import numpy as np
import pandas as pd
import torch
import re
import time
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import Module, GRU, Embedding, Linear, Sigmoid, CrossEntropyLoss

# Part 1

In [42]:
"data/sentiment_analysis/train_pos_merged.txt"
# function clearing HTML tags from text
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

# preprocessing
def clean_text(path):
    reviews = []
    all_words = []
    with open(path) as pos:
        lines = pos.readlines()
        for line in lines:
            #clear html tags
            line = cleanhtml(line)
            # lower case and punctuation
            line = re.sub(r'[^a-zA-Z]', ' ', line.lower())
            # split to list of words
            words = line.split()
            # add list to reviews
            reviews.append(words)
            # extend words with new review
            all_words.extend(words)

    return reviews, all_words

def create_vocab(words):
    # create vocabulary with indexes
    vocab = {}
    id = 1
    for word in words:
        if word not in vocab.keys():
            vocab[word] = id
            id += 1
    return vocab


def vectorize_data(reviews, y, vocab, LENGTH=400):
    y = np.array([y for _ in range(len(reviews))])
    indexed_reviews = np.zeros((len(reviews), LENGTH), dtype = np.int64)
    for i, review in enumerate(reviews):
        indexed_review = []
        for word in review:
            indexed_review.append(vocab[word])
        indexed_reviews[i, max(LENGTH-len(review),0):] = indexed_review[:400]
    return indexed_reviews, y

def preprocessing(path1, path2, y1, y2, LENGTH=400):
    reviews1, words1 = clean_text(path1)
    reviews2, words2 = clean_text(path2)
    words1.extend(words2)
    # print(words1)

    del words2

    vocab = create_vocab(words1)

    x1, y1 = vectorize_data(reviews1, y1, vocab, LENGTH)
    x2, y2 = vectorize_data(reviews2, y2, vocab, LENGTH)

    x = np.concatenate((x1, x2))
    y = np.concatenate((y1, y2))

    return x, y, vocab



        

In [43]:
train_x, train_y, vocab = preprocessing("data/sentiment_analysis/train_pos_merged.txt", "data/sentiment_analysis/train_neg_merged.txt", 0, 1)

In [57]:
torch.from_numpy(train_y).dtype

torch.int64

In [44]:
batch_size = 8
train_data =  TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)


# Part 2

In [88]:
class GRU_model(Module):

    def __init__(self, vocab_size, input_dim, n_layers, hidden_dim, LENGTH=400):
        
        super(GRU_model, self).__init__()
        
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = Embedding(vocab_size, input_dim)
        self.gru = GRU(input_dim, hidden_dim, n_layers, batch_first=True)
        self.linear = Linear(hidden_dim, 2)
        self.sigmoid = Sigmoid()

    def forward(self, x, h):
        x = self.embedding(x)
        x, h = self.gru(x, h)
        print(f"shape of x: {x.shape}; shape of h: {h.shape}; shape of x[:,-1]: {x[:,-1].shape}")
        x = self.linear(x[:,-1])
        print(f"shape of x: {x.shape}")
        x = self.sigmoid(x)
        return x, h

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device)
        return hidden


In [46]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [83]:
def train(train_loader, vocab_size, learn_rate, hidden_dim=256, EPOCHS=5):
    
    # Setting common hyperparameters
    input_dim = next(iter(train_loader))[0].shape[1]
    # print(next(iter(train_loader))[0].shape)
    output_dim = 1
    n_layers = 2
    # Instantiating the model
    model = GRU_model(vocab_size, input_dim, hidden_dim, output_dim, n_layers)
    
    # Defining loss function and optimizer
    criterion = CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
    
    model.train()
    print("Starting Training")
    epoch_times = []
    # Start training loop
    for epoch in range(1,EPOCHS+1):
        start_time = time.time()
        h = model.init_hidden(batch_size)
        avg_loss = 0.
        counter = 0
        for x, label in train_loader:
            counter += 1
            h = h.data
            model.zero_grad()
            
            out, h = model(x.to(device), h)
            print(f"shape of out.squeeze(): {out.squeeze().shape}; shape of label: {label.shape}")
            loss = criterion(out.squeeze(), label.to(device))
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            if counter%200 == 0:
                print("Epoch {}......Step: {}/{}....... Average Loss for Epoch: {}".format(epoch, counter, len(train_loader), avg_loss/counter))
        current_time = time.time()
        print("Epoch {}/{} Done, Total Loss: {}".format(epoch, EPOCHS, avg_loss/len(train_loader)))
        print("Total Time Elapsed: {} seconds".format(str(current_time-start_time)))
        epoch_times.append(current_time-start_time)
    print("Total Training Time: {} seconds".format(str(sum(epoch_times))))
    return model

In [89]:
train(
    train_loader, 
    vocab_size = len(vocab), 
    learn_rate=0.1, 
    hidden_dim=16, 
    EPOCHS=5
    )

Starting Training
shape of x: torch.Size([64, 400, 1]); shape of h: torch.Size([16, 64, 1]); shape of x[:,-1]: torch.Size([64, 1])
shape of x: torch.Size([64, 2])
shape of out.squeeze(): torch.Size([64, 2]); shape of label: torch.Size([64])
shape of x: torch.Size([64, 400, 1]); shape of h: torch.Size([16, 64, 1]); shape of x[:,-1]: torch.Size([64, 1])
shape of x: torch.Size([64, 2])
shape of out.squeeze(): torch.Size([64, 2]); shape of label: torch.Size([64])
shape of x: torch.Size([64, 400, 1]); shape of h: torch.Size([16, 64, 1]); shape of x[:,-1]: torch.Size([64, 1])
shape of x: torch.Size([64, 2])
shape of out.squeeze(): torch.Size([64, 2]); shape of label: torch.Size([64])
shape of x: torch.Size([64, 400, 1]); shape of h: torch.Size([16, 64, 1]); shape of x[:,-1]: torch.Size([64, 1])
shape of x: torch.Size([64, 2])
shape of out.squeeze(): torch.Size([64, 2]); shape of label: torch.Size([64])
shape of x: torch.Size([64, 400, 1]); shape of h: torch.Size([16, 64, 1]); shape of x[:,-1

IndexError: index out of range in self

In [87]:
input = torch.randn(3, 5, requires_grad=True)

target = torch.empty(3, dtype=torch.long).random_(5)

print(input.shape)
print(target.shape)

torch.Size([3, 5])
torch.Size([3])
