In [1]:
import torch
import gensim
from torch import nn

In [2]:
import numpy as np
with open('reviews.txt', 'r') as f:
  reviews = f.read()
with open('labels.txt', 'r') as f:
  labels = f.read()

In [3]:
reviews[:200]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  '

In [4]:
labels[:30]

'positive\nnegative\npositive\nneg'

In [5]:
from string import punctuation

In [6]:
reviews = reviews.lower()
all_text = ''.join([review for review in reviews if review not in punctuation])

reviews_split = all_text.split('\n')

all_text = ' '.join(reviews_split)

all_words = all_text.split()

In [87]:
labels_split = labels.split('\n')
encoded_labels = np.array([1 if label == 'positive' else 0 for label in labels_split])

In [8]:
from collections import Counter

In [9]:
counts = Counter(all_words)
reviews_lens = Counter([len(review.split()) for review in reviews_split])
print(f"Zero-length reviews: {reviews_lens[0]}")
print(f'Maximum review length: {max(reviews_lens)}')

Zero-length reviews: 1
Maximum review length: 2514


In [88]:
# remove outliers (0 length reviews)
print('Number of reviews before removing outliers: ', len(reviews_split))

non_zero_len_idx = [i for i, review in enumerate(reviews_split) if len(review) !=0]

reviews_split = [reviews_split[i] for i in non_zero_len_idx]
encoded_labels = [encoded_labels[i] for i in non_zero_len_idx]
print('Number of reviews after removing outliers: ', len(reviews_split))

Number of reviews before removing outliers:  25000
Number of reviews after removing outliers:  25000


In [11]:
from gensim.models import KeyedVectors

In [12]:
# decompressing pretrained embedding
! gzip -d GoogleNews-vectors-negative300-SLIM.bin.gz

In [13]:
embed_lookup = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300-SLIM.bin',
                                                 binary=True)

In [14]:
pretrained_words = []
# index_to_key: list contains all the word in corpus
for word in embed_lookup.index_to_key:
  pretrained_words.append(word)

In [15]:
row_idx = 1

word = pretrained_words[row_idx] # get words by index
embedding = embed_lookup[word] # embeddings by word
print(f"Size of Vocab: {len(pretrained_words)}")
print(f'Word in vocab: {word}')
print(f'Length of embedding: {len(embedding)}')

Size of Vocab: 299567
Word in vocab: for
Length of embedding: 300


In [16]:
print([word for word in pretrained_words[:10]])

['in', 'for', 'that', 'is', 'on', 'The', 'with', 'said', 'was', 'the']


In [17]:
# caculate cosine similarity

test_word = 'fabulous'
print(f'Similar words to {test_word}')

for similar_word in embed_lookup.similar_by_word(test_word):
  print(f'{similar_word[0]} (Similarity: {similar_word[1]:.2f})')

Similar words to fabulous
wonderful (Similarity: 0.76)
fantastic (Similarity: 0.76)
marvelous (Similarity: 0.73)
gorgeous (Similarity: 0.71)
lovely (Similarity: 0.71)
terrific (Similarity: 0.69)
amazing (Similarity: 0.69)
beautiful (Similarity: 0.67)
magnificent (Similarity: 0.67)
splendid (Similarity: 0.65)


In [18]:
from tqdm.auto import tqdm

In [19]:
# tokenize reviews
def tokenize_all_reviews(embed_lookup, reviews_split, number_of_reviews):
  review_words = [review.split() for review in reviews_split]
  tokenized_reviews = []
  for review_word in tqdm(review_words[:number_of_reviews]):
    tokenized_review = [embed_lookup.key_to_index[word] if word in embed_lookup.index_to_key else 0
                        for word in review_word]
    tokenized_reviews.append(tokenized_review)

  return tokenized_reviews

In [None]:
tokenized_reviews = tokenize_all_reviews(embed_lookup, reviews_split, 1000)
# run on colab to see tqdm 

  0%|          | 0/1000 [00:00<?, ?it/s]

In [21]:
seq_len = 200
features = np.zeros((len(reviews_split), seq_len), dtype = int)
features.shape

(25000, 200)

In [70]:
# padding
def pad_features(tokenized_reviews, seq_len):
  features = np.zeros((len(tokenized_reviews), seq_len), dtype = int)
  for i, row in tqdm(enumerate(tokenized_reviews)):
    features[i, -len(row):] = np.array(row)[:seq_len]

  return features

In [None]:
features = pad_features(tokenized_reviews,seq_len)
# run on colab to see tqdm 

0it [00:00, ?it/s]

In [24]:
features

array([[    0,     0,     0, ...,    13, 85275,  7451],
       [    0,     0,     0, ...,    14,   441,  4699],
       [16483,    26,     0, ...,   726,     6,     0],
       ...,
       [ 4365,    42,    19, ...,    78,     9,  1083],
       [51760,  2659,    16, ...,     3,  6917,     0],
       [    0,     0,     0, ...,    71,  3225,    13]])

In [79]:
features.shape

(1000, 200)

In [85]:
def config_labels(encoded_labels, number_of_reviews):
  encoded_labels = [encoded_labels[i] for i in range(number_of_reviews)]
  encoded_labels = np.array(encoded_labels)
  return encoded_labels

In [89]:
encoded_labels = config_labels(encoded_labels, 1000)

In [91]:
split_frac = 0.8
split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(800, 200) 
Validation set: 	(100, 200) 
Test set: 		(100, 200)


In [29]:
from torch.utils.data import TensorDataset, DataLoader
# TensorDataset: default dataset

In [92]:
torch.from_numpy(train_x).shape

torch.Size([800, 200])

In [93]:
torch.from_numpy(train_y).shape

torch.Size([800])

In [94]:
train_dataset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_dataset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_dataset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

train_loader = DataLoader(train_dataset, batch_size = 20, shuffle = True)
valid_loader = DataLoader(valid_dataset, batch_size= 20, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size= 20, shuffle = False)

In [31]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [33]:
from torch.nn import functional as F

In [96]:

class SentimentCNN(nn.Module):
  def __init__(self, embed_model, vocab_size, output_size, embedding_dim,
               num_filters=100, kernel_sizes=[3, 4, 5], freeze_embeddings=True, drop_prob=0.5):
    super().__init__()
    self.num_filters = num_filters
    self.embedding_dim = embedding_dim
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.embedding.weight.data.copy_(torch.from_numpy(embed_model.vectors))

    if freeze_embeddings:
      self.embedding.requires_grad = False

    # need to add the channels dim as embedding dim is (batch_size, seq_len, embedding_dim)
    self.convs_1d = nn.ModuleList([
        nn.Conv2d(1, num_filters, kernel_size= (k, embedding_dim), padding = (k-2, 0))
        for k in kernel_sizes
    ])

    self.classifier = nn.Linear(len(kernel_sizes) * num_filters, output_size)
    self.dropout = nn.Dropout(p = drop_prob)
    self.sig = nn.Sigmoid()

  def conv_and_pool(self, x, conv):
    # squeeze last dim to get size: (batch_size, num_filters, conv_seq_length)
    x = F.relu(conv(x)).squeeze(3)
    # squeeze to get size: (batch_size, num_filters)
    x_max = F.max_pool1d(x, x.size(2)).squeeze(2)
    return x_max
  def forward(self, x):
    embeds = self.embedding(x)
    # unsqueeze for the channel dim
    embeds = embeds.unsqueeze(1)
    conv_results = [self.conv_and_pool(embeds, conv) for conv in self.convs_1d]
    x = torch.cat(conv_results, 1)
    x = self.dropout(x)
    logit = self.classifier(x)
    return self.sig(logit)

In [97]:
vocab_size = len(pretrained_words)
output_size = 1 # binary class (1 or 0)
embedding_dim = len(embed_lookup[pretrained_words[0]]) # 300-dim vectors
num_filters = 100
kernel_sizes = [3, 4, 5]

net = SentimentCNN(embed_lookup, vocab_size, output_size, embedding_dim,
                   num_filters, kernel_sizes).to(device)

print(net)

SentimentCNN(
  (embedding): Embedding(299567, 300)
  (convs_1d): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1), padding=(1, 0))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1), padding=(2, 0))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1), padding=(3, 0))
  )
  (classifier): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (sig): Sigmoid()
)


In [100]:
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [102]:
def train(model: torch.nn.Module,
          train_loader: torch.utils.data.DataLoader,
          valid_loader: torch.utils.data.DataLoader,
          epochs,
          device = device):
  model.train()
  for epoch in tqdm(range(epochs)):
    train_loss, train_acc = 0, 0
    for i, (X, y) in enumerate(train_loader):
      X, y = X.to(device), y.to(device)
      y_prob = model(X)
      loss = criterion(y_prob.squeeze(), y.float())
      train_loss += loss.item()
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      y_pred = y_prob.squeeze().round()
      corrects_per_batch_train = torch.eq(y_pred, y.float()).sum().item()
      train_acc += (corrects_per_batch_train / len(y))

    valid_loss, valid_acc = 0, 0
    model.eval()
    with torch.inference_mode():
      for i, (X, y) in enumerate(valid_loader):
        X, y = X.to(device), y.to(device)
        y_prob = model(X)
        loss = criterion(y_prob.squeeze(), y.float())
        valid_loss += loss.item()
        y_pred = y_prob.squeeze().round()
        corrects_per_batch_valid = torch.eq(y_pred, y.float()).sum().item()
        valid_acc += (corrects_per_batch_valid / len(y))

    print(f'Epoch {epoch+1}')
    print(f'Train_loss: {train_loss / len(train_loader):.4f}')
    print(f'Train_acc: {train_acc / len(train_loader):.4f}')
    print(f'Valid_loss: {valid_loss / len(valid_loader):.4f}')
    print(f'Valid_acc: {valid_acc / len(valid_loader):.4f}')



In [None]:
train(net, train_loader, valid_loader, 5)
# run on colab to see tqdm 

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1
Train_loss: 0.1584
Train_acc: 0.9675
Valid_loss: 0.4119
Valid_acc: 0.8000
Epoch 2
Train_loss: 0.0380
Train_acc: 1.0000
Valid_loss: 0.4332
Valid_acc: 0.8000
Epoch 3
Train_loss: 0.0105
Train_acc: 1.0000
Valid_loss: 0.4568
Valid_acc: 0.8100
Epoch 4
Train_loss: 0.0051
Train_acc: 1.0000
Valid_loss: 0.4751
Valid_acc: 0.8200
Epoch 5
Train_loss: 0.0032
Train_acc: 1.0000
Valid_loss: 0.4940
Valid_acc: 0.8300


In [105]:
def test(model: torch.nn.Module,
          test_loader: torch.utils.data.DataLoader,
          device = device):
  test_loss, test_acc = 0, 0
  model.eval()
  with torch.inference_mode():
    for i, (X, y) in tqdm(enumerate(test_loader)):
      X, y = X.to(device), y.to(device)
      y_prob = model(X)
      loss = criterion(y_prob.squeeze(), y.float())
      test_loss += loss.item()
      y_pred = y_prob.squeeze().round()
      corrects_per_batch = torch.eq(y_pred, y.float()).sum().item()
      test_acc += (corrects_per_batch / len(y))

  print(f'Test_loss: {test_loss / len(test_loader):.4f}')
  print(f'Test_acc: {test_acc / len(test_loader):.4f}')

In [None]:
test(net, test_loader)
# run on colab to see tqdm 

0it [00:00, ?it/s]

Test_loss: 0.4903
Test_acc: 0.8500
