<a href="https://colab.research.google.com/github/bhagatpandey369/sentiment-analysis/blob/main/sentimental_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Workflow**
1. Import libraries
2. Preparing data
3. Build a model
4. Loss function and optimizer
5. Train and evaluation functions
6. Train the model
7. Visualizing our model
8. Testing the model

In [None]:
!pip install datasets torchtext torchdata

# **Preparing data**
1. Load data
2. Tokenizing data
3. Creating data splits
4. Creating a vocabulary
5. Numericalizing data
6. Converting into tensors
7. Creating the data loaders


In [None]:
import collections
import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
import torch.optim as optim
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import tqdm

In [None]:
# Load the 'train' and 'test' splits of the 'imdb' dataset in a single call;
# the two requested splits are unpacked in the order they were listed.
train_data, test_data = datasets.load_dataset('imdb',split=['train','test'])

In [None]:
train_data, test_data

In [None]:
train_data.features

In [None]:
train_data[24999]

# **Tokenization**

In [None]:

# torchtext's 'basic_english' tokenizer; the same callable is reused for both
# dataset splits and later for single-sentence predictions.
tokenizer = get_tokenizer('basic_english')


In [None]:
tokenizer('Hi my name is Bhagat Pandey and my email address is pandeybhagat369@gmail.com')

In [None]:
def tokenize_example(example, tokenizer, max_length):
  """Tokenize one dataset row and keep at most ``max_length`` tokens.

  Args:
    example: Mapping whose ``'text'`` entry holds the raw string.
    tokenizer: Callable that turns a string into a sequence of tokens.
    max_length: Number of leading tokens to keep.

  Returns:
    A dict with the single key ``'tokens'`` holding the truncated tokens.
  """
  all_tokens = tokenizer(example['text'])
  truncated = all_tokens[:max_length]
  return dict(tokens=truncated)

In [None]:
# Cap every review at `max_length` tokens, then add a 'tokens' column to the
# train and test splits by mapping tokenize_example over each example.
max_length = 256
train_data = train_data.map(
    tokenize_example,
    fn_kwargs={'tokenizer':tokenizer,'max_length':max_length}
)
# Same tokenizer and length cap for the test split so both are processed alike.
test_data = test_data.map(
    tokenize_example,
    fn_kwargs={'tokenizer':tokenizer,'max_length':max_length}
)

In [None]:
train_data

In [None]:
train_data.features

In [None]:
train_data[24999]['tokens'][:25]

In [None]:
# Hold out 25% of the training split as a validation set.
# NOTE(review): no seed is passed, so the split presumably differs from run
# to run — confirm whether reproducibility matters here.
test_size = 0.25
train_valid_data  = train_data.train_test_split(test_size=test_size)
# The two parts are stored under the keys 'train' and 'test'; the 'test'
# part is used as the validation set from here on.
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

In [None]:
train_valid_data['train']

In [None]:
len(train_data), len(valid_data), len(test_data)

# **Creating Vocabulary**

In [None]:
# Build the vocabulary from the training tokens only, so validation/test
# text does not influence it. `min_freq` is the minimum token count passed
# to the builder; '<unk>' and '<pad>' are registered as special tokens.
min_freq = 5
special_tokens = ['<unk>','<pad>']
vocab = build_vocab_from_iterator(
    train_data['tokens'],
    min_freq = min_freq,
    specials = special_tokens
)

In [None]:
len(vocab)

In [None]:
vocab.get_itos()[:10]

In [None]:
vocab['and']

In [None]:
# Keep the indices of the special tokens: unk_index becomes the vocabulary's
# default index and pad_index is used for batch padding and the embedding.
unk_index = vocab['<unk>']
pad_index = vocab['<pad>']

In [None]:
'cat' in vocab

In [None]:
'bhagat' in vocab

In [None]:
# Make lookups of tokens that are not in the vocabulary return the '<unk>'
# index instead of failing.
vocab.set_default_index(unk_index)

In [None]:
vocab['some token']

In [None]:
vocab.lookup_indices(['and','bhagat','some token','cat','dog','kathmandu'])

# **Numericalization**

In [None]:
def numericalize_example(example, vocab):
  """Convert the tokens of one dataset row into vocabulary indices.

  Args:
    example: Mapping whose ``'tokens'`` entry holds the token sequence.
    vocab: Object exposing ``lookup_indices(tokens)``.

  Returns:
    A dict with the single key ``'ids'`` holding the looked-up indices.
  """
  token_ids = vocab.lookup_indices(example['tokens'])
  return dict(ids=token_ids)

In [None]:
# Add an 'ids' column to every split, always using the vocabulary that was
# built from the training data.
train_data = train_data.map(numericalize_example,fn_kwargs={'vocab':vocab})
valid_data = valid_data.map(numericalize_example,fn_kwargs={'vocab':vocab})
test_data = test_data.map(numericalize_example,fn_kwargs={'vocab':vocab})

In [None]:
train_data

In [None]:
train_data[0]['tokens'][:5]

In [None]:
train_data[0]['ids'][:10]

# **Converting into tensors**

In [None]:
train_data

In [None]:
# Switch every split to the 'torch' format, restricted to the 'ids' and
# 'label' columns — the only two the collate function reads.
train_data = train_data.with_format(type='torch',columns=['ids','label'])
valid_data = valid_data.with_format(type='torch',columns=['ids','label'])
test_data = test_data.with_format(type='torch',columns=['ids','label'])

In [None]:
train_data[100]

In [None]:
train_data[100]['label']

In [None]:
train_data[100]['ids'][:10]

In [None]:
train_data[0].keys()

In [None]:
vocab.lookup_tokens(train_data[0]['ids'].tolist())

# **Creating Data Loaders**

In [None]:
def get_collate_fn(pad_index):
  """Build a collate function that pads the 'ids' of a batch to equal length.

  Args:
    pad_index: Value used to fill the shorter id sequences.

  Returns:
    A function taking a list of examples (each with 'ids' and 'label'
    tensors) and returning a dict with the padded, batch-first 'ids' tensor
    and the stacked 'label' tensor.
  """
  def collate_fn(batch):
    ids_list, label_list = [], []
    for example in batch:
      ids_list.append(example['ids'])
      label_list.append(example['label'])
    # batch_first=True puts the batch on dim 0 and the sequence on dim 1.
    padded_ids = torch.nn.utils.rnn.pad_sequence(
        ids_list,
        batch_first=True,
        padding_value=pad_index,
    )
    stacked_labels = torch.stack(label_list)
    return {'ids': padded_ids, 'label': stacked_labels}

  return collate_fn


In [None]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
  """Wrap ``dataset`` in a DataLoader that pads each batch with ``pad_index``.

  Args:
    dataset: Dataset handed to ``torch.utils.data.DataLoader``.
    batch_size: Number of examples per batch.
    pad_index: Padding value forwarded to ``get_collate_fn``.
    shuffle: Whether the loader shuffles the data (default ``False``).

  Returns:
    The configured ``torch.utils.data.DataLoader``.
  """
  padding_collate = get_collate_fn(pad_index)
  return torch.utils.data.DataLoader(
      dataset=dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      collate_fn=padding_collate,
  )

In [None]:
# One loader per split, all with the same batch size and padding index.
batch_size = 512

# Only the training loader shuffles; validation and test are not shuffled.
train_data_loader = get_data_loader(train_data,batch_size,pad_index,shuffle=True)
valid_data_loader = get_data_loader(valid_data,batch_size,pad_index,shuffle=False)
test_data_loader = get_data_loader(test_data,batch_size,pad_index,shuffle=False)

In [None]:
len(train_data_loader), len(valid_data_loader), len(test_data_loader)

# **Build the Model**

In [None]:
class NBow(nn.Module):
  """Bag-of-words classifier: embed the ids, average them, apply a linear layer."""

  def __init__(self,vocab_size,embedding_dim,output_dim,pad_indes):
    """Create the embedding table and the output layer.

    Args:
      vocab_size: Number of rows in the embedding table.
      embedding_dim: Size of each embedding vector.
      output_dim: Number of output units of the linear layer.
      pad_indes: Index of the padding token, passed to the embedding as
        ``padding_idx``. The (misspelled) parameter name is kept unchanged
        so existing callers keep working.
    """
    super().__init__()
    # Use the constructor argument, not a module-level `pad_index` global:
    # relying on the global made the class fail (or silently use a stale
    # value) whenever it was built outside the notebook's global state.
    self.embedding = nn.Embedding(vocab_size,embedding_dim,padding_idx=pad_indes)
    self.fc = nn.Linear(embedding_dim,output_dim)

  def forward(self,ids):
    """Return one score vector per input sequence.

    Args:
      ids: Integer tensor of token indices; dim 1 is treated as the sequence
        axis (the data loaders in this file produce batch-first tensors).

    Returns:
      Tensor produced by the linear layer from the averaged embeddings.
    """
    embedded = self.embedding(ids)
    # Plain mean over dim 1: every position, padding included, is averaged.
    pooled = embedded.mean(dim=1)
    prediction = self.fc(pooled)
    return prediction

In [None]:
# One embedding row per vocabulary entry and one output unit per distinct
# label value found in the training data.
vocab_size = len(vocab)
# NOTE(review): 300 presumably matches the width of the pretrained GloVe
# vectors copied into the embedding further down — confirm.
embedding_dim = 300
output_dim = len(train_data.unique('label'))
model = NBow(vocab_size,embedding_dim,output_dim,pad_index)

In [None]:
len(vocab)

In [None]:
len(train_data.unique('label'))

In [None]:
def count_parameters(model):
  """Return the total element count of the model's trainable parameters.

  Only parameters with ``requires_grad`` set are counted.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total
print(f"The model has {count_parameters(model):,} trainable parameters")

In [None]:
# Load pretrained GloVe vectors using torchtext's default arguments.
# NOTE(review): this presumably downloads the vectors on first use — confirm
# before running in an offline or disk-constrained environment.
vectors=torchtext.vocab.GloVe()


In [None]:
vectors.get_vecs_by_tokens(['and']).shape

In [None]:
vectors.get_vecs_by_tokens(['apple'])[:,:10]

In [None]:
# Fetch one pretrained vector per vocabulary entry, in the order returned by
# vocab.get_itos(), so the rows can be used directly as the embedding matrix.
pretrain_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

In [None]:
pretrain_embedding.shape

In [None]:
model.embedding

In [None]:
model.embedding.weight

In [None]:
pretrain_embedding

In [None]:
# Overwrite the embedding matrix with the pretrained vectors.
# Assumes pretrain_embedding has the same shape as model.embedding.weight
# (both are displayed in the cells above) — verify if the vocabulary or
# embedding_dim changes.
model.embedding.weight.data = pretrain_embedding

In [None]:
model.embedding.weight

# **Loss and Optimizer**

In [None]:
# Adam over all model parameters, with the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [None]:
# Cross-entropy loss; train() and evaluate() call it with the model output
# and the batch labels.
criterion = nn.CrossEntropyLoss()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# train() and evaluate() move every batch to `device`, so the model and the
# loss must live there as well; without this a CUDA runtime fails with a
# device-mismatch error on the first forward pass. Module.to() modifies the
# module in place, so the optimizer created above keeps the same parameters.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
device

# **Train and Evaluation Model**

In [None]:
def train(data_loader,model,criterion,optimizer,device):
  """Run one training epoch over ``data_loader``.

  Args:
    data_loader: Iterable of batches; each batch is a dict with 'ids' and
      'label' tensors.
    model: Module called on the 'ids' tensor to produce predictions.
    criterion: Loss callable taking ``(prediction, label)``.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to before the forward pass.

  Returns:
    Tuple ``(mean_loss, mean_accuracy)`` averaged over all batches.
  """
  model.train()
  epoch_loss = []
  epoch_acc = []
  for batch in tqdm.tqdm(data_loader,desc='training...'):
    ids = batch['ids'].to(device)
    label = batch['label'].to(device)
    prediction = model(ids)
    loss = criterion(prediction, label)
    accuracy = get_accuracy(prediction, label)
    # Clear gradients left over from the previous batch before backprop.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    epoch_loss.append(loss.item())
    epoch_acc.append(accuracy.item())
  # The return sits outside the loop: previously it was indented into the
  # loop body, so each "epoch" stopped after a single batch.
  return np.mean(epoch_loss), np.mean(epoch_acc)


In [None]:
def evaluate(data_loader, model, criterion, device):
  """Score ``model`` on every batch of ``data_loader`` without training.

  Args:
    data_loader: Iterable of batches; each batch is a dict with 'ids' and
      'label' tensors.
    model: Module called on the 'ids' tensor to produce predictions.
    criterion: Loss callable taking ``(prediction, label)``.
    device: Device the batch tensors are moved to before the forward pass.

  Returns:
    Tuple ``(mean_loss, mean_accuracy)`` averaged over all batches.
  """
  model.eval()
  losses, accuracies = [], []
  progress = tqdm.tqdm(data_loader, desc='evaluating...')
  # Gradients are not needed for scoring, so disable autograd tracking.
  with torch.no_grad():
    for batch in progress:
      inputs = batch['ids'].to(device)
      targets = batch['label'].to(device)
      outputs = model(inputs)
      batch_loss = criterion(outputs, targets)
      batch_acc = get_accuracy(outputs, targets)
      losses.append(batch_loss.item())
      accuracies.append(batch_acc.item())
  return np.mean(losses), np.mean(accuracies)

In [None]:
def get_accuracy(prediction, label):
  """Return the fraction of rows whose highest-scoring class equals the label.

  Args:
    prediction: 2-D tensor with one row of class scores per example.
    label: Tensor of class indices, one per row of ``prediction``.

  Returns:
    A scalar tensor: number of correct rows divided by the number of rows.
  """
  # Unpacking into exactly two names keeps the requirement that
  # `prediction` is two-dimensional.
  num_examples, _num_classes = prediction.shape
  top_class = prediction.argmax(dim=-1)
  num_correct = (top_class == label).sum()
  return num_correct / num_examples

# **Train Loop**

In [None]:
# Train for n_epoch epochs, recording the metrics of every epoch and keeping
# a checkpoint of the weights with the lowest validation loss seen so far.
n_epoch = 15
best_valid_loss = float('inf')
# metric name -> list with one value per epoch (used for the plots below).
metrics = collections.defaultdict(list)
for epoch in range(n_epoch):
  train_loss, train_acc = train(train_data_loader,model,criterion,optimizer,device)
  valid_loss, valid_acc = evaluate(valid_data_loader,model,criterion,device)
  metrics['train_loss'].append(train_loss)
  metrics['train_acc'].append(train_acc)
  metrics['valid_loss'].append(valid_loss)
  metrics['valid_acc'].append(valid_acc)
  # Save only on improvement, so 'nbow.pt' always holds the best weights.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(),'nbow.pt')
  print(f'Epoch: {epoch+1:02}')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


# **Visualizing our Model**

In [None]:
# Plot training and validation loss per epoch on a single axis.
fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(1,1,1)
ax.plot(metrics['train_loss'],label='train loss')
ax.plot(metrics['valid_loss'],label='valid loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
# One tick per epoch; ticks are 0-based indices into the metric lists.
ax.set_xticks(range(n_epoch))
ax.legend()
ax.grid()


In [None]:

# Plot training and validation accuracy per epoch on a single axis.
fig=plt.figure(figsize=(10,6))
ax=fig.add_subplot(1,1,1)
ax.plot(metrics['train_acc'],label='train acc')
ax.plot(metrics['valid_acc'],label='valid acc')
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
# One tick per epoch; ticks are 0-based indices into the metric lists.
ax.set_xticks(range(n_epoch))
ax.legend()
ax.grid()

In [None]:
# Restore the checkpoint written by the training loop (the weights with the
# lowest validation loss) and score it once on the held-out test split.
model.load_state_dict(torch.load('nbow.pt'))
test_loss, test_acc = evaluate(test_data_loader,model,criterion,device)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
def predict_sentiment(text,model,tokenizer,vocab,device):
  """Predict the class of a single piece of text.

  Args:
    text: Raw input string.
    tokenizer: Callable that turns the string into a sequence of tokens.
    vocab: Object exposing ``lookup_indices(tokens)``.
    model: Callable taking a ``(1, num_tokens)`` integer tensor and returning
      one row of class scores.
    device: Device the input tensor is moved to before calling the model.

  Returns:
    Tuple ``(predicted_class, predicted_probability)``: the index of the
    highest-scoring class and its softmax probability, both Python scalars.
  """
  tokens = tokenizer(text)
  ids = vocab.lookup_indices(tokens)
  # Add a leading batch dimension of size 1 for the model.
  tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
  # Inference only: skip autograd bookkeeping so no graph is built or kept.
  with torch.no_grad():
    prediction = model(tensor).squeeze(dim=0)
    probability = torch.softmax(prediction,dim=-1)
  predicted_class = prediction.argmax(dim=-1).item()
  predicted_probability = probability[predicted_class].item()
  return predicted_class, predicted_probability

In [None]:

text="this film is terrible!"
predict_sentiment(text,model,tokenizer,vocab,device)

In [None]:

text="this film is great!"
predict_sentiment(text,model,tokenizer,vocab,device)

In [None]:

text = "This film is not great, it's terrible!"

predict_sentiment(text, model, tokenizer, vocab, device)

In [None]:

text = "This film is happy"

predict_sentiment(text, model, tokenizer, vocab, device)