In [54]:
import torch
import numpy as np
from string import punctuation
from collections import Counter
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [21]:
SEQ_LENGTH = 200
SPLIT = 0.8
BATCH_SIZE = 50
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EMBEDDING_DIM = 150
HIDDEN_DIM = 256
N_LAYERS = 2
OUTPUT_SIZE = 1
EPOCHS = 1
PRINT_EVERY = 100
CLIP=5
MODEL_ARCH='LSTM'

In [88]:
class SentimentDataset:
    def __init__(self):
        self.review = None
        self.labels = None
        self.reviews_split = None
        self.vocab_to_int = None
        self.int_to_vocab = None
        self.review_ints = None
        self.encoded_labels = None
        self.features = None

    def load_data(self):
        with open('D:/Projects/Sessions/TextClassification/DeepLearning/input/reviews.txt', 'r') as f:
            self.reviews = f.read()
        with open('/TextClassification/DeepLearning/input/labels.txt', 'r') as f:
            self.labels = f.read()

    def clean_data(self):
        self.reviews = self.reviews.lower()
        self.all_text = ''.join([c for c in self.reviews if c not in punctuation])
        self.reviews_split = self.all_text.split('\n')
        self.all_text = ' '.join(self.reviews_split)
        self.words = self.all_text.split()
        # return self.reviews_split

    def vocab_dict(self):
        counts = Counter(self.words)
        vocab = sorted(counts, key=counts.get, reverse=True)
        self.vocab_to_int = {word: ii for ii, word in enumerate(vocab)}
        self.int_to_vocab = {ii: word for word,ii in self.vocab_to_int.items() }
        # return self.vocab_to_int, self.int_to_vocab

    def encode_text(self):
        self.review_ints = []
        for reviews in self.reviews_split:
            self.review_ints.append([self.vocab_to_int[word] for word in reviews.split()])
        # return self.review_ints

    def encode_label(self):
        self.labels_split = self.labels.split('\n')
        self.encoded_labels = np.array([1 if label == 'positive' else 0 for label in self.labels_split])
        # return self.encoded_labels

    def remove_outliers(self):
        review_lens = Counter([len(x) for x in self.review_ints])
        print("Zero-length reviews: {}".format(review_lens[0]))
        print("Maximum review length: {}".format(max(review_lens)))
        print('Number of reviews before removing outliers: ', len(self.review_ints))
        non_zero_idx = [ii for ii, review in enumerate(self.review_ints) if len(review) != 0]
        self.review_ints = [self.review_ints[ii] for ii in non_zero_idx]
        self.encoded_labels = np.array([self.encoded_labels[ii] for ii in non_zero_idx])
        print('Number of reviews after removing outliers: ', len(self.review_ints))

        # return self.review_ints, self.encoded_labels

    def pad_features(self, seq_length):
        self.features = np.zeros((len(self.review_ints), config.SEQ_LENGTH), dtype=int)
        for i, row in enumerate(self.review_ints):
            self.features[i, -len(row):] = np.array(row)[:config.SEQ_LENGTH]

    def process_new_instance(self, input):
        input = str(input).lower()
        input =''.join([x for x in input if x not in punctuation])
        encoded_input =[]
        for token in input.split():
            encoded_input.append(self.vocab_to_int[token])
        padded_input = np.zeros((1, config.SEQ_LENGTH), dtype=int)
        padded_input[0, -len(encoded_input):] = np.array(encoded_input)[0:config.SEQ_LENGTH]
        return padded_input


In [48]:
def data_split(features, encoded_labels):
    split_idx = int(len(features) * config.SPLIT)
    train_x, remaining_x = features[:split_idx], features[split_idx:]
    train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

    test_idx = int(len(remaining_x) * 0.5)
    val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
    val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

    ## print out the shapes of your resultant feature data
    print("\t\t\tFeature Shapes:")
    print("Train set: \t\t{}".format(train_x.shape),
          "\nValidation set: \t{}".format(val_x.shape),
          "\nTest set: \t\t{}".format(test_x.shape))

    return train_x, train_y, val_x, val_y, test_x, test_y

In [79]:
sent_data = SentimentDataset()
sent_data.load_data()
sent_data.clean_data()

In [80]:
sent_data.vocab_dict()

In [81]:
sent_data.encode_text()

In [82]:
sent_data.encode_label()

In [83]:
sent_data.remove_outliers()

Zero-length reviews: 1
Maximum review length: 2514
Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  25000


In [84]:
sent_data.pad_features(SEQ_LENGTH)

In [19]:
# sent_data = SentimentDataset()
# sent_data.load_data()
# sent_data.clean_data()
# sent_data.vocab_dict()
# sent_data.encode_text()
# sent_data.encode_label()
# sent_data.remove_outliers()
# sent_data.pad_features(SEQ_LENGTH)

Zero-length reviews: 1
Maximum review length: 2514
Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  25000


In [49]:
features= sent_data.features
encoded_labels = sent_data.encoded_labels
train_x, train_y, val_x, val_y, test_x, test_y = data_split(features, encoded_labels)

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [50]:
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

In [51]:
# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, shuffle=True, batch_size=BATCH_SIZE)

vocab_size = len(sent_data.vocab_to_int)+1
output_size = OUTPUT_SIZE

In [55]:
class SentimentLSTM(nn.Module):

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):

        super(SentimentLSTM, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x):

        batch_size = x.size(0)
        # embeddings and lstm_out
        x = x.long()
        # initial hidden states
        h0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim).to(config.DEVICE)
        c0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim).to(config.DEVICE)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, (h0,c0))

        # # stack up lstm outputs
        # lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        lstm_out, _ = torch.max(lstm_out,1)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)

        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]  # get last batch of labels

        # return last sigmoid output and hidden state
        return sig_out, hidden

In [56]:
net = SentimentLSTM(vocab_size, output_size, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS)

In [59]:
net

SentimentLSTM(
  (embedding): Embedding(74073, 150)
  (lstm): LSTM(150, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [61]:
lr=0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
net.to(device=DEVICE)
net.train()

SentimentLSTM(
  (embedding): Embedding(74073, 150)
  (lstm): LSTM(150, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [63]:
def train_fn(data_loader, valid_loader, model, optimizer, criterion, device, model_name="lstm"):
    counter = 0
    for e in range(config.EPOCHS):
        for inputs, labels in data_loader:
            counter += 1
            inputs, labels = inputs.to(device), labels.to(device)
            model.zero_grad()
            inputs = inputs.long()
            output, h = model(inputs)
            
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            optimizer.step()

            # loss stats
            if counter % config.PRINT_EVERY == 0:
                # Get validation loss
                val_losses = []
                model.eval()
                for inputs, labels in valid_loader:
                    inputs, labels = inputs.to(config.DEVICE), labels.to(config.DEVICE)
                    # output, val_h = model(inputs)
                    output, val_h = model(inputs)
                    # output = model(inputs)
                    val_loss = criterion(output.squeeze(), labels.float())
                    val_losses.append(val_loss.item())

                model.train()
                print("Epoch: {}/{}...".format(e+1, config.EPOCHS),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))
    return model

In [64]:
net = train_fn(train_loader, valid_loader, net, optimizer, criterion, config.DEVICE)

Epoch: 1/1... Step: 100... Loss: 0.634634... Val Loss: 0.620509
Epoch: 1/1... Step: 200... Loss: 0.507708... Val Loss: 0.538738
Epoch: 1/1... Step: 300... Loss: 0.492999... Val Loss: 0.507180
Epoch: 1/1... Step: 400... Loss: 0.378106... Val Loss: 0.438782


In [91]:
def test_fn(test_loader, model, criterion, device):
    # Get test data loss and accuracy
    test_losses = []  # track loss
    num_correct = 0
    model.eval()
    # iterate over test data
    for inputs, labels in test_loader:

        inputs, labels = inputs.to(device), labels.to(device)
        # get predicted outputs

        # output, h = model(inputs)
        # output = model(inputs)
        if config.MODEL_ARCH == 'CNN':
            output = model(inputs)
        else:
            output, h = model(inputs)

        # calculate loss
        test_loss = criterion(output.squeeze(), labels.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class (0 or 1)
        pred = torch.round(output.squeeze())  # rounds to the nearest integer

        # compare predictions to true label
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy()) #if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)

    # -- stats! -- ##
    # avg test loss
    print("Test loss: {:.3f}".format(np.mean(test_losses)))

    # accuracy over all test data
    test_acc = num_correct / len(test_loader.dataset)
    print("Test accuracy: {:.3f}".format(test_acc))

In [92]:
test_fn(test_loader, net, criterion, config.DEVICE)

Test loss: 0.444
Test accuracy: 0.794


In [None]:
[9, 114,10,  17, -1]
[0....0,9, 114,10,  17]

In [89]:
sent_data.vocab_to_int.get('dkjnqdfkjvn')

In [90]:
sent_data.process_new_instance(str("I love this movie abcd"))

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [None]:
def predict(model, data_object, input):

    padded_input = data_object.process_new_instance(str(input))
    padded_input = torch.from_numpy(padded_input)
    padded_input = padded_input.to(config.DEVICE)

    if config.MODEL_ARCH == 'CNN':
        output = model(padded_input)
    else:
        output, h = model(padded_input)
    # output, h = model(padded_input)
    # output= model(padded_input)
    confidence = output.squeeze()
    pred = torch.round(confidence)
    result = None
    if pred == 1:
        result = 'Positive'
    else:
        result = 'Negative'
        confidence = 1-confidence

    print(f' Review - {input} ::{result} --> confidence :{confidence}')

In [86]:
print(" Testing few insances ")
engine.predict(net, sent_data ," I Love this movie")
engine.predict(net, sent_data, " This movie is not good")
engine.predict(net, sent_data, "The worst movie I have seen; acting was terrible and I want my money back")
engine.predict(net, sent_data, " I enjoyed this movie")
engine.predict(net, sent_data, " this movie is pathetic")

 Testing few insances 
 Review -  I Love this movie ::Positive --> confidence :0.7799832820892334
 Review -  This movie is not good ::Positive --> confidence :0.5787915587425232
 Review - The worst movie I have seen; acting was terrible and I want my money back ::Negative --> confidence :0.9721530675888062
 Review -  I enjoyed this movie ::Positive --> confidence :0.7330417037010193
 Review -  this movie is pathetic ::Positive --> confidence :0.5494117140769958
