In [29]:
import numpy as np
import pandas as pd
import torch
import re
import time
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import Module, GRU, Embedding, Linear, Sigmoid, CrossEntropyLoss

# Part 1

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
"data/sentiment_analysis/train_pos_merged.txt"
# function clearing HTML tags from text
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

# preprocessing
def clean_text(path):
    reviews = []
    all_words = []
    with open(path) as pos:
        lines = pos.readlines()
        for line in lines:
            #clear html tags
            line = cleanhtml(line)
            # lower case and punctuation
            line = re.sub(r'[^a-zA-Z]', ' ', line.lower())
            # split to list of words
            words = line.split()
            # add list to reviews
            reviews.append(words)
            # extend words with new review
            all_words.extend(words)

    return reviews, all_words

def create_vocab(words):
    # create vocabulary with indexes
    vocab = {}
    id = 1
    for word in words:
        if word not in vocab.keys():
            vocab[word] = id
            id += 1
    return vocab


def vectorize_data(reviews, y, vocab, LENGTH=400):
    y = np.array([y for _ in range(len(reviews))])
    indexed_reviews = np.zeros((len(reviews), LENGTH), dtype = np.int64)
    for i, review in enumerate(reviews):
        indexed_review = []
        for word in review:
            indexed_review.append(vocab[word])
        indexed_reviews[i, max(LENGTH-len(review),0):] = indexed_review[:400]
    return indexed_reviews, y

def preprocessing(path1, path2, y1, y2, vocab, LENGTH=400):
    reviews1, words1 = clean_text(path1)
    reviews2, words2 = clean_text(path2)
    # words1.extend(words2)
    # print(words1)

    del words1, words2

    # vocab = create_vocab(words1)

    x1, y1 = vectorize_data(reviews1, y1, vocab, LENGTH)
    x2, y2 = vectorize_data(reviews2, y2, vocab, LENGTH)

    x = np.concatenate((x1, x2))
    y = np.concatenate((y1, y2))

    return x, y, vocab



        

In [31]:
reviews, words = clean_text("all_merged.txt")

In [32]:
vocab = create_vocab(words)
# vocab

In [34]:
train_x, train_y, vocab = preprocessing("train_pos_merged.txt", "train_neg_merged.txt", 0, 1, vocab)

In [53]:
input = torch.from_numpy(train_x[0])
embedding = Embedding(len(vocab), 3, padding_idx=0)
embedding(input)

tensor([[ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000],
        ...,
        [-0.2770,  1.4500,  1.1403],
        [-0.3521, -0.6153, -0.2365],
        [-0.6114,  0.4785, -1.5156]], grad_fn=<EmbeddingBackward>)

In [43]:
batch_size = 100
train_data =  TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)


# Part 2

In [55]:
class GRU_model(Module):

    def __init__(self, vocab_size, input_dim, hidden_dim, n_layers=1, LENGTH=400):
        
        super(GRU_model, self).__init__()
        
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = Embedding(vocab_size, input_dim, padding_idx=0)
        self.gru = GRU(input_dim, hidden_dim, n_layers, batch_first=True)
        self.linear = Linear(hidden_dim, 2)
        self.sigmoid = Sigmoid()

    def forward(self, x, h):
        x = self.embedding(x)
        x, h = self.gru(x, h)
        # print(f"shape of x: {x.shape}; shape of h: {h.shape}; shape of x[:,-1]: {x[:,-1].shape}")
        x = self.linear(x[:,-1])
        # print(f"shape of x: {x.shape}")
        x = self.sigmoid(x)
        return x, h

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device)
        return hidden


In [37]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [61]:
def train(train_loader, vocab_size, learn_rate, input_dim=10, hidden_dim=16, EPOCHS=5):
    
    # Setting common hyperparameters
    # input_dim = next(iter(train_loader))[0].shape[1]
    # print(next(iter(train_loader))[0].shape[1])
    output_dim = 1
    n_layers = 1
    # Instantiating the model
    model = GRU_model(vocab_size, input_dim, hidden_dim, output_dim, n_layers)
    model.to(device)
    
    # Defining loss function and optimizer
    criterion = CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
    
    model.train()
    print("Starting Training")
    epoch_times = []
    # Start training loop
    for epoch in range(1,EPOCHS+1):
        start_time = time.time()
        h = model.init_hidden(batch_size)
        avg_loss = 0.
        counter = 0
        for x, label in train_loader:
            counter += 1
            h = h.data
            model.zero_grad()
            
            out, h = model(x.to(device), h)
            # print(f"shape of out.squeeze(): {out.squeeze().shape}; shape of label: {label.shape}")
            loss = criterion(out.squeeze(), label.to(device))
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            if counter%100 == 0:
                print("Epoch {}......Step: {}/{}....... Average Loss for Epoch: {}".format(epoch, counter, len(train_loader), avg_loss/counter))
        current_time = time.time()
        print("Epoch {}/{} Done, Total Loss: {}".format(epoch, EPOCHS, avg_loss/len(train_loader)))
        print("Total Time Elapsed: {} seconds".format(str(current_time-start_time)))
        epoch_times.append(current_time-start_time)
    print("Total Training Time: {} seconds".format(str(sum(epoch_times))))
    return model

def evaluate(model, test_x, test_y, label_scalers):
    model.eval()
    outputs = []
    targets = []
    start_time = time.clock()
    for i in test_x.keys():
        inp = torch.from_numpy(np.array(test_x[i]))
        labs = torch.from_numpy(np.array(test_y[i]))
        h = model.init_hidden(inp.shape[0])
        out, h = model(inp.to(device).float(), h)
        outputs.append(label_scalers[i].inverse_transform(out.cpu().detach().numpy()).reshape(-1))
        targets.append(label_scalers[i].inverse_transform(labs.numpy()).reshape(-1))
    print("Evaluation Time: {}".format(str(time.clock()-start_time)))
    sMAPE = 0
    for i in range(len(outputs)):
        sMAPE += np.mean(abs(outputs[i]-targets[i])/(targets[i]+outputs[i])/2)/len(outputs)
    print("sMAPE: {}%".format(sMAPE*100))
    return outputs, targets, sMAPE

In [65]:
train(
    train_loader, 
    vocab_size = len(vocab), 
    learn_rate=0.001, 
    hidden_dim=32, 
    EPOCHS=100
    )

Starting Training
Epoch 1/100 Done, Total Loss: 0.6935751636823019
Total Time Elapsed: 0.5336148738861084 seconds
Epoch 2/100 Done, Total Loss: 0.6920966704686483
Total Time Elapsed: 0.38527798652648926 seconds
Epoch 3/100 Done, Total Loss: 0.6907771289348602
Total Time Elapsed: 0.38527631759643555 seconds
Epoch 4/100 Done, Total Loss: 0.6887146393458049
Total Time Elapsed: 0.38058948516845703 seconds
Epoch 5/100 Done, Total Loss: 0.6853117565313975
Total Time Elapsed: 0.3940920829772949 seconds
Epoch 6/100 Done, Total Loss: 0.6785733779271443
Total Time Elapsed: 0.38701462745666504 seconds
Epoch 7/100 Done, Total Loss: 0.6672797044118245
Total Time Elapsed: 0.3849959373474121 seconds
Epoch 8/100 Done, Total Loss: 0.6521039068698883
Total Time Elapsed: 0.38364577293395996 seconds
Epoch 9/100 Done, Total Loss: 0.6364308436711629
Total Time Elapsed: 0.3886713981628418 seconds
Epoch 10/100 Done, Total Loss: 0.6135675470034282
Total Time Elapsed: 0.3943307399749756 seconds
Epoch 11/100 Don

GRU_model(
  (embedding): Embedding(39237, 10, padding_idx=0)
  (gru): GRU(10, 32, batch_first=True)
  (linear): Linear(in_features=32, out_features=2, bias=True)
  (sigmoid): Sigmoid()
)

torch.Size([3, 5])
torch.Size([3])
