<a href="https://colab.research.google.com/github/aravindh1293/beginners-pytorch-deep-learning/blob/master/pytorch_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xvf /content/aclImdb_v1.tar.gz

In [0]:
import torch
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm
import glob
import os
import pandas as pd
import numpy as np

In [0]:
# Directories holding the positive and negative training reviews.
pos_path, neg_path = "/content/aclImdb/train/pos", "/content/aclImdb/train/neg"

In [0]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the file order (and everything derived from it) repeatable between runs.
pos_files = sorted(glob.glob(pos_path + "/*.txt"))
neg_files = sorted(glob.glob(neg_path + "/*.txt"))

In [0]:
# Load every review into memory; positive files get label 1, negative files 0.
reviews = []
labels = []

for label, files in ((1, pos_files), (0, neg_files)):
  for doc in files:
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the corpus is UTF-8 text - confirm.
    with open(doc, "r", encoding="utf-8") as f:
      reviews.append(f.read())
    labels.append(label)

In [0]:
# Build the frame column-wise so `targets` keeps an integer dtype (a list of
# rows followed by .T turns every column into `object`), then shuffle the rows
# with a fixed seed so the notebook is reproducible.
imdb_raw = (
    pd.DataFrame({"reviews": reviews, "targets": labels})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
imdb_raw.head()

In [0]:
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
imdb_raw.info()
print("value_counts : ",imdb_raw['targets'].value_counts())

In [0]:
# Bar chart of the class balance. `kind` must be passed by keyword: current
# pandas raises TypeError for positional arguments to Series.plot().
imdb_raw.targets.value_counts().plot(kind="bar")

In [0]:
def tokenize(review):
  """Lower-case a review and split it into whitespace-separated tokens.

  Args:
    review: raw review text.

  Returns:
    List of lower-cased tokens; empty when the text contains no
    non-whitespace characters.
  """
  # split() without an argument treats any run of spaces, tabs or newlines as
  # a single separator, so it never yields empty-string tokens.
  return review.lower().split()

In [0]:
# Token list for every review, in DataFrame row order.
tokenized_review = [tokenize(text) for text in imdb_raw.reviews.values]

In [0]:
# Store each review's token list next to its raw text.
imdb_raw['tokenized_review'] = tokenized_review

In [0]:
# Preview a few rows rather than rendering the whole frame.
imdb_raw.head()

In [0]:
# clean punctuations from reviews
from string import punctuation

# Translation table that deletes every punctuation character. The previous
# `word not in punctuation` check was a substring test: it only dropped tokens
# that were themselves a piece of the punctuation string and left punctuation
# attached to words ("movie." stayed "movie.").
punctuation_table = str.maketrans("", "", punctuation)

cleaned_rev = []
for rev in imdb_raw.tokenized_review.values:
  stripped_words = (word.translate(punctuation_table) for word in rev)
  # Tokens that consisted only of punctuation are now empty; drop them.
  cleaned_rev.append([word for word in stripped_words if word])

In [0]:
# Attach the punctuation-filtered token lists as a new column.
imdb_raw['cleaned_reviews'] = cleaned_rev

In [0]:
# Preview a few rows rather than rendering the whole frame.
imdb_raw.head()

In [0]:
# create vocab with most common words

from collections import Counter

In [0]:
# Every token of the corpus in reading order, plus the set of distinct tokens.
all_word_list = [
    token
    for review_tokens in imdb_raw.cleaned_reviews.values
    for token in review_tokens
]
unique_words = set(all_word_list)

In [0]:
# Number of distinct tokens left after cleaning.
len(unique_words)

In [0]:
# Count token frequencies and list the tokens from most to least frequent
# (ties keep first-seen order).
word_counts = Counter(all_word_list)
vocab = [token for token, _ in word_counts.most_common()]

In [0]:
# Token ids start at 1: id 0 is what the padding step (np.zeros) fills short
# reviews with, so it must not also stand for the most frequent word. The
# model's `len(vocab_to_int) + 1` embedding rows already allow for this slot.
vocab_to_int = {word : ii for ii, word in enumerate(vocab, 1)}

In [0]:
# Encode each cleaned review as the list of its tokens' integer ids.
review_ints = [
    [vocab_to_int[token] for token in review_tokens]
    for review_tokens in imdb_raw.cleaned_reviews.values
]

In [0]:
# Keep the integer-encoded reviews alongside the text columns.
imdb_raw['review_ints'] = review_ints

In [0]:
# Preview the first rows, now including the encoded column.
imdb_raw.head()

In [0]:
# Keep only the model inputs and labels. Take an explicit copy: a column is
# added to `data` later, and assigning into a slice of `imdb_raw` triggers
# pandas' SettingWithCopyWarning.
data = imdb_raw[["review_ints", "targets"]].copy()
data.head()

In [0]:
# truncate review to 200 words
# or pad if review has less than 200 words
SEQ_LEN = 200

truncated_rev = []
for rev in data.review_ints.values:
  token_ids = np.asarray(rev[:SEQ_LEN], dtype=np.int64)
  # Left-pad with zeros so the real tokens sit at the end of the sequence.
  padded_rev = np.zeros(SEQ_LEN, dtype=np.int64)
  # An empty review must skip the assignment: `padded_rev[-0:]` addresses the
  # whole array, and assigning an empty list to it raises ValueError.
  if token_ids.size:
    padded_rev[-token_ids.size:] = token_ids
  truncated_rev.append(padded_rev)

In [0]:
# Add the fixed-length id arrays as a new column.
data['truncated_reviews'] = truncated_rev

In [0]:
# Preview a few rows rather than rendering the whole frame.
data.head()

In [0]:
# Stack the per-review arrays into one (n_reviews, sequence_length) integer
# matrix instead of a 1-D object array whose elements are arrays, and give the
# labels a real integer dtype.
x = np.stack(data['truncated_reviews'].values)
y = data['targets'].to_numpy(dtype=np.int64)

In [0]:
# Features and labels should have the same number of entries.
print(len(x), len(y))

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# First hold out 10% of all samples for testing, then 10% of what remains for
# validation; both splits are stratified on the label and seeded.
x_rest, x_test, y_rest, y_test = train_test_split(
    x, y, test_size=0.1, stratify=y, random_state=42
)
x_train, x_val, y_train, y_val = train_test_split(
    x_rest, y_rest, test_size=0.1, stratify=y_rest, random_state=42
)

In [0]:
# Shapes of the train, test and validation features/labels, in that order.
for split_array in (x_train, y_train, x_test, y_test, x_val, y_val):
  print(split_array.shape)

In [0]:
# Sanity check: wrapping a single label in np.array gives an ndarray, which is
# the input type torch.from_numpy requires.
type(np.array(y_train[5]))

In [0]:
class ImdbDataset(Dataset):
  """Map-style dataset pairing encoded reviews with their labels.

  Args:
    x: indexable collection with one numeric sequence per review.
    y: indexable collection of labels, aligned with `x`.

  Raises:
    ValueError: if `x` and `y` have different lengths.
  """

  def __init__(self, x, y):
    # Fail at construction time instead of with an IndexError in the middle
    # of an epoch.
    if len(x) != len(y):
      raise ValueError(
          "x and y must have the same length, got %d and %d" % (len(x), len(y))
      )
    self.x = x
    self.y = y

  def __len__(self):
    """Return the number of samples."""
    return len(self.x)

  def __getitem__(self, idx):
    """Return the `(features, label)` tensor pair for sample `idx`."""
    # np.asarray leaves ndarray rows untouched and also accepts plain lists
    # and scalars, which torch.from_numpy on its own would reject.
    x = torch.from_numpy(np.asarray(self.x[idx]))
    y = torch.from_numpy(np.asarray(self.y[idx]))
    return x,y

In [0]:
# Wrap each split in an ImdbDataset so it can be fed to a DataLoader.
train_dataset, valid_dataset, test_dataset = (
    ImdbDataset(features, targets)
    for features, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

In [0]:
# Sample counts for the train, validation and test datasets.
for split_dataset in (train_dataset, valid_dataset, test_dataset):
  print(len(split_dataset))

In [0]:
# Only the training loader shuffles. The evaluation cells average per-batch
# losses, and the last batch is smaller whenever the split size is not a
# multiple of the batch size, so shuffling the validation data would make the
# reported loss vary from run to run.
train_loader = DataLoader(train_dataset, batch_size=2000, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=500, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=500, shuffle=False, num_workers=4)

In [0]:
import torch.nn as nn

In [0]:
class SentimentLSTM(nn.Module):
  """Embedding -> stacked LSTM -> linear -> sigmoid binary classifier.

  The previous `nn.ReLU(nn.LSTM(120, 256, 8))` handed the LSTM to ReLU's
  `inplace` argument, so the LSTM was never registered or run and the network
  was only embedding -> ReLU -> flatten -> linear. nn.LSTM also returns a
  tuple, which nn.Sequential cannot pass on, hence this small module.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: size of the LSTM hidden state.
    num_layers: number of stacked LSTM layers.
  """

  def __init__(self, vocab_size, embedding_dim=120, hidden_dim=256, num_layers=8):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
    self.fc = nn.Linear(in_features=hidden_dim, out_features=1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, token_ids):
    """Map a (batch, seq_len) tensor of token ids to (batch, 1) probabilities."""
    embedded = self.embedding(token_ids)
    _, (h_n, _) = self.lstm(embedded)
    # h_n[-1] is the top layer's hidden state after the final time step.
    return self.sigmoid(self.fc(h_n[-1]))


# One extra embedding row on top of the vocabulary entries.
vocab_size = len(vocab_to_int) + 1
# Dimensions match the values used in the original layer calls.
model = SentimentLSTM(vocab_size, embedding_dim=120, hidden_dim=256, num_layers=8)

In [0]:
# Display the module's repr to inspect its layers.
model

In [0]:
# Binary cross-entropy on the model's probabilities, optimised with Adam.
LEARNING_RATE = 0.001
criterion = nn.BCELoss()
opt = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [0]:
# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
  DEVICE = "cuda"
else:
  DEVICE = "cpu"

In [0]:
EPOCHS = 50

# Train for EPOCHS passes over train_loader and print the mean per-batch loss
# after each epoch.
model = model.to(DEVICE)
for epoch in range(EPOCHS):
  total_loss = 0.0
  counter = 0
  model.train()
  # The loop variable is `batch`, not `data`, so the `data` DataFrame built
  # earlier in the notebook is not overwritten.
  for batch in train_loader:
    counter += 1
    inp, target = batch
    inp = inp.to(DEVICE)
    target = target.to(DEVICE)

    opt.zero_grad()
    out = model(inp)
    # Drop only the trailing size-1 dimension: a bare squeeze() would also
    # remove the batch dimension for a batch of one and no longer match target.
    loss = criterion(out.squeeze(-1), target.float())
    loss.backward()
    total_loss += loss.item()
    opt.step()
  print("train loss : epoch-",epoch, "-",  total_loss/counter)

In [0]:
# Mean per-batch loss on the held-out test set.
model = model.to(DEVICE)

total_loss = 0.0
counter = 0
model.eval()
# Evaluation needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
  # `batch`, not `data`, so the `data` DataFrame is not overwritten.
  for batch in test_loader:
    counter += 1
    inp, target = batch
    inp = inp.to(DEVICE)
    target = target.to(DEVICE)

    out = model(inp)
    # Squeeze only the trailing dimension so a batch of one keeps its shape.
    loss = criterion(out.squeeze(-1), target.float())
    total_loss += loss.item()
# There is no epoch in this cell, so the label no longer mentions one.
print("test loss : ", total_loss/counter)

In [0]:
# Mean per-batch loss on the validation set.
model = model.to(DEVICE)

total_loss = 0.0
counter = 0
model.eval()
# Evaluation needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
  # `batch`, not `data`, so the `data` DataFrame is not overwritten.
  for batch in valid_loader:
    counter += 1
    inp, target = batch
    inp = inp.to(DEVICE)
    target = target.to(DEVICE)

    out = model(inp)
    # Squeeze only the trailing dimension so a batch of one keeps its shape.
    loss = criterion(out.squeeze(-1), target.float())
    total_loss += loss.item()
# This loop reads valid_loader, so the figure is labelled as validation loss.
print("validation loss : ", total_loss/counter)