In [None]:
from  google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so files under MyDrive can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torchtext==0.10.0.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0.
  Downloading torchtext-0.10.0-cp38-cp38-manylinux1_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp38-cp38-manylinux1_x86_64.whl (831.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m831.4/831.4 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.13.0+cu116
    Uninstalling torch-1.13.0+cu116:
      Successfully uninstalled torch-1.13.0+cu116
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.14.0
    Uninstalling torchtext-0.14.0:
      Successfully uninstalled torchtext-0.14.0
[31mERROR: pip's dependency resolver does not currently take into account all the

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data , datasets
import random
import spacy

In [None]:
# Google Drive directory that holds the train/valid/test CSV files.
source_folder = '/content/drive/MyDrive/data'
# Presumably the directory for saved artifacts; not referenced by the visible cells.
destination_folder = '/content/drive/MyDrive/data_save'

Preparing the data: we'll set the seed, define the Fields, and get the train/valid/test splits.

In [None]:
seed = 1234
# Seed both RNGs in play: torch drives weight init and dropout, while
# torchtext's legacy iterators shuffle using Python's `random` state.
random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# include_lengths=True makes each batch's `text` a (token_ids, lengths) pair,
# which the model needs in order to pack the padded sequences.
Text = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm', include_lengths=True)
# Float labels so they can be fed straight into BCEWithLogitsLoss.
Label = data.LabelField(dtype=torch.float)

define the fields

In [None]:
# (column name, Field) pairs, listed in the same order as the CSV columns.
fields = [
    ('text', Text),
    ('Sentiment', Label),
]

pass the location of the train/valid/test data

In [None]:
# Read the three CSV splits from `source_folder`, parsing every row with `fields`.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path=source_folder,
    format='csv',
    fields=fields,
    skip_header=True,  # the first row of each file is a header, not an example
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
)

In [None]:
# Sanity check: show the attributes (tokenized text and raw label) of the first training example.
print(vars(train_data[0]))

{'text': ['man', 'need', 'finish', 'working', 'play', 'persona'], 'Sentiment': '0'}


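Next we build the vocabulary from the training data, download the pre-trained vectors, and associate them with the corresponding words in our vocabulary. We'll be using the "glove.6B.100d" vectors; GloVe is the algorithm used to calculate the vectors. Words in our vocabulary that have no pre-trained vector would otherwise be initialized to zeros; by setting `unk_init` to `torch.Tensor.normal_`, those words are instead initialized from a Gaussian distribution.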

In [None]:
from torchtext.vocab import Vectors
max_size = 25000
# Build the text vocabulary from the training split only, capped at `max_size`
# words, and attach the pre-trained GloVe vectors to it.
Text.build_vocab(
    train_data,
    max_size=max_size,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
Label.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                           
100%|█████████▉| 399999/400000 [00:12<00:00, 32776.63it/s]


placing the tensors on the GPU if one is available

In [None]:
Batch_size = 64
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


create the iterators

In [None]:
# One BucketIterator per split; examples of similar length are batched together.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=Batch_size,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,  # the model packs batches, which expects them sorted by length
    device=device,
)

build the LSTM class

In [None]:
class RNN(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of logits produced per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers, and to the final hidden state.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # nn.LSTM applies dropout only between stacked layers;
                           # passing a non-zero value with one layer just warns.
                           dropout=dropout if n_layers > 1 else 0.0)

        # The head consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of logits per sequence in the batch.

        Args:
            text: token-id tensor laid out as (seq_len, batch), since the LSTM
                is built with the default batch_first=False.
            text_lengths: true length of every sequence in the batch; must be
                sorted in decreasing order (pack_padded_sequence default).

        Returns:
            Tensor of shape (batch, output_dim) holding raw logits.
        """
        embedded = self.dropout(self.embedding(text))

        # Lengths must live on the CPU for pack_padded_sequence.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))

        # Only the final hidden state is used; the per-step outputs are dropped.
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: the last entry is the top layer's final state.
            hidden = hidden[-1, :, :]

        return self.fc(self.dropout(hidden))



Set the hyperparameters and create the LSTM instance.

In [None]:
# Model hyperparameters.
input_dim = len(Text.vocab)  # one embedding row per vocabulary entry
Embedding_dim = 100          # matches the 100-d GloVe vectors loaded into the vocab
Hidden_dim = 256
Output_dim = 1               # a single logit per example
N_layers = 2
Bidirectional = True
Dropout = 0.5
pad_IDX = Text.vocab.stoi[Text.pad_token]

model = RNN(
    vocab_size=input_dim,
    embedding_dim=Embedding_dim,
    hidden_dim=Hidden_dim,
    output_dim=Output_dim,
    n_layers=N_layers,
    bidirectional=Bidirectional,
    dropout=Dropout,
    pad_idx=pad_IDX,
)

We retrieve the embeddings from the field's vocab, and check they're the correct size

In [None]:
# Pre-trained vectors attached to the vocabulary by build_vocab; the shape
# should be (len(Text.vocab), Embedding_dim) to line up with model.embedding.
pre_embeddings = Text.vocab.vectors
print(pre_embeddings.shape)

torch.Size([25002, 100])


We then replace the initial weights of the embedding layer with the pre-trained embeddings

In [None]:
# Overwrite the embedding weights in place with the pre-trained vectors.
model.embedding.weight.data.copy_(pre_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.3669,  0.4154,  0.1348,  ...,  0.0244,  0.2211,  0.4317],
        ...,
        [-0.2222, -0.8740,  0.2754,  ..., -0.2819, -0.2160, -0.2122],
        [ 1.2713,  0.6307,  0.4610,  ..., -0.9668,  1.5761, -1.0732],
        [-0.9261,  0.5799, -1.9593,  ..., -0.6351, -0.1507,  0.9007]])

Initialize the `<unk>` and `<pad>` token embeddings to all zeros to explicitly tell our model that, initially, they are irrelevant for determining sentiment.

In [None]:
unk_idx = Text.vocab.stoi[Text.unk_token]
# Blank out the embedding rows of the unknown and padding tokens.
for special_idx in (unk_idx, pad_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(Embedding_dim)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3669,  0.4154,  0.1348,  ...,  0.0244,  0.2211,  0.4317],
        ...,
        [-0.2222, -0.8740,  0.2754,  ..., -0.2819, -0.2160, -0.2122],
        [ 1.2713,  0.6307,  0.4610,  ..., -0.9668,  1.5761, -1.0732],
        [-0.9261,  0.5799, -1.9593,  ..., -0.6351, -0.1507,  0.9007]])


We define the optimizer and the criterion and place the model and criterion on the GPU

In [None]:
# Move the model to the target device first: the torch.optim docs recommend
# constructing the optimizer only after the parameters are on their final device.
model = model.to(device)
optimizer = optim.Adam(model.parameters())
# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

implement the function to calculate accuracy

In [None]:
def binary_accuracy(preds, y):
  """Return the fraction of predictions that agree with the labels.

  `preds` are raw logits: they are squashed with a sigmoid and rounded to
  0/1 before being compared element-wise with `y`. The result is a 0-d tensor.
  """
  predicted_labels = torch.sigmoid(preds).round()
  matches = (predicted_labels == y).float()
  return matches.sum() / len(matches)

define the train function

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one optimisation pass over `iterator`.

  Each batch must expose `text` as a (token_ids, lengths) pair and the targets
  as `Sentiment`. Returns (mean loss, mean accuracy), both averaged over batches.
  """
  model.train()
  total_loss, total_acc = 0, 0
  for batch in iterator:
    tokens, lengths = batch.text
    labels = batch.Sentiment

    optimizer.zero_grad()
    # The model returns (batch, 1); drop the trailing dim to match the labels.
    logits = model(tokens, lengths).squeeze(1)
    batch_loss = criterion(logits, labels)
    batch_acc = binary_accuracy(logits, labels)
    batch_loss.backward()
    optimizer.step()

    total_loss += batch_loss.item()
    total_acc += batch_acc.item()

  n_batches = len(iterator)
  return total_loss / n_batches, total_acc / n_batches

define the evaluation function

In [None]:
def evaluate(model, iterator, criterion):
  """Compute the mean loss and accuracy of `model` over `iterator`.

  Each batch must expose `text` as a (token_ids, lengths) pair and the targets
  as `Sentiment`. No gradients are computed and no weights are updated.

  Note: the model is switched to eval mode and left there; `train` puts it
  back into training mode at the start of every training pass.

  Returns:
    (mean loss, mean accuracy), both averaged over batches.
  """
  epoch_loss = 0
  epoch_acc = 0
  # Disable dropout so the metrics are deterministic and reflect the model
  # actually used for inference.
  model.eval()
  with torch.no_grad():
    for batch in iterator:
      text, text_lengths = batch.text
      # The model returns (batch, 1); drop the trailing dim to match the labels.
      preds = model(text, text_lengths).squeeze(1)
      loss = criterion(preds, batch.Sentiment)
      acc = binary_accuracy(preds, batch.Sentiment)
      epoch_loss += loss.item()
      epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [None]:
import time
def epoch_time(start, end):
  """Split the time elapsed between two timestamps (in seconds) into (minutes, seconds).

  Both parts are truncated to integers.
  """
  elapsed = end - start
  whole_minutes = int(elapsed / 60)
  leftover_seconds = int(elapsed - whole_minutes * 60)
  return whole_minutes, leftover_seconds

model training

In [None]:
N_epochs = 10
best_loss = float('inf')
for epoch in range(N_epochs):

  start = time.time()

  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

  end = time.time()

  epoch_minutes, epoch_seconds = epoch_time(start, end)

  # Checkpoint only when the validation loss improves, so the saved weights
  # are the best-validating ones rather than the last epoch's.
  if valid_loss < best_loss:
    best_loss = valid_loss
    torch.save(model.state_dict(), 'tut2-model.pt')

  print(f'epoch_no : {epoch+1:02} | Epoch Time: {epoch_minutes}m {epoch_seconds}s')
  # '\t' followed by the full word: the previous '\train' and '\valid' were
  # parsed as tab/vertical-tab escapes that swallowed the first letter.
  print(f'\ttrain_loss : {train_loss:.2f} | train_acc: {train_acc*100:.2f}%')
  print(f'\tvalid_loss: {valid_loss:.2f} |  valid_acc: {valid_acc*100:.2f}%')

epoch_no : 01 | Epoch Time: 0m 9s
	rain_loss : 0.32 | train_acc: 86.21%
alid_loss: 0.41 |  valid_acc: 81.54%
epoch_no : 02 | Epoch Time: 0m 9s
	rain_loss : 0.31 | train_acc: 86.83%
alid_loss: 0.44 |  valid_acc: 81.65%
epoch_no : 03 | Epoch Time: 0m 9s
	rain_loss : 0.29 | train_acc: 87.41%
alid_loss: 0.43 |  valid_acc: 81.02%
epoch_no : 04 | Epoch Time: 0m 9s
	rain_loss : 0.28 | train_acc: 88.14%
alid_loss: 0.44 |  valid_acc: 81.46%
epoch_no : 05 | Epoch Time: 0m 9s
	rain_loss : 0.27 | train_acc: 88.68%
alid_loss: 0.46 |  valid_acc: 81.11%
epoch_no : 06 | Epoch Time: 0m 9s
	rain_loss : 0.25 | train_acc: 89.29%
alid_loss: 0.48 |  valid_acc: 80.93%
epoch_no : 07 | Epoch Time: 0m 9s
	rain_loss : 0.24 | train_acc: 89.79%
alid_loss: 0.52 |  valid_acc: 81.07%
epoch_no : 08 | Epoch Time: 0m 9s
	rain_loss : 0.23 | train_acc: 90.34%
alid_loss: 0.49 |  valid_acc: 80.69%
epoch_no : 09 | Epoch Time: 0m 9s
	rain_loss : 0.22 | train_acc: 90.71%
alid_loss: 0.52 |  valid_acc: 80.37%
epoch_no :

test accuracy

In [None]:
# Reload the best-validating checkpoint; map_location lets a checkpoint saved
# on a GPU runtime load on whatever device is active now.
model.load_state_dict(torch.load('tut2-model.pt', map_location=device))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
# '.2f' sets the precision; the previous '3f' was only a minimum field width.
print(f'test_loss:{test_loss:.3f} | test_acc:{test_acc*100:.2f}%')

test_loss:0.412 | test_acc:81.743968%


testing the user input

In [None]:
nlp = spacy.load('en_core_web_sm')
def predict(model, sentence):
  """Return sigmoid(logit) for a single raw sentence, as a float in (0, 1).

  The sentence is tokenized with the spaCy tokenizer, mapped to vocabulary
  indices through `Text.vocab`, and fed to the model as a batch of one.

  Raises:
    ValueError: if the sentence yields no tokens (the model cannot pack an
      empty sequence).
  """
  model.eval()
  tokenized = [t.text for t in nlp.tokenizer(sentence)]
  if not tokenized:
    raise ValueError('sentence must contain at least one token')
  indexed = [Text.vocab.stoi[t] for t in tokenized]
  length_tensor = torch.LongTensor([len(indexed)])
  # Shape (seq_len, 1): a batch holding just this sentence.
  tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    preds = torch.sigmoid(model(tensor, length_tensor))
  return preds.item()


positive tweet

In [None]:
# Score a sample sentence; the displayed value is the sigmoid of the model's logit.
predict(model," almost appreciate everyone")

0.8462774157524109

negative tweet

In [None]:
# Score a second sample sentence; the displayed value is the sigmoid of the model's logit.
predict(model,"paaain left chest heart gosh feel sick")

0.0036441341508179903