<a href="https://colab.research.google.com/github/ajitjadhav10/UMBC/blob/main/DATA%20690%20NLP/DATA_690_NLP_HW_4_PART_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This exercise will allow you to use PyTorch. Refer to the DATA 690 Resources.

1. Import the required libraries.
2. Load the dataset containing a set of 1,000 product reviews from Amazon, which are
paired with a label of 0 (for negative reviews) or 1 (for positive reviews). Get the data at
https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences.
3. Separate the data into two variables: one containing the reviews and the other containing
the labels. Remove the punctuation from the reviews.
4. Create a variable containing the vocabulary of the entire set of reviews.
5. Additionally, create a dictionary that maps each word to an integer, where the words will
be the keys and the integers will be the values.
6. Encode the reviews data by replacing each word in a review for its paired integer.
7. Create a class containing the architecture of the network.
8. Make sure that you include an embedding layer. Initialize the model using 64 embedding
dimensions and 128 neurons for 3 LSTM layers.
9. Define the loss function, an optimization algorithm, and the number of epochs to train for.
For example, you can use binary cross-entropy loss as the loss function, the Adam
optimizer, and train for 10 epochs.
10. Create a for loop that goes through the different epochs and through every single
review individually. For each review, perform a prediction, calculate the loss function, and
update the parameters of the network. Additionally, calculate the accuracy of the network
over that training data.
11. Plot the progress of the loss function and accuracy over time.

In [142]:
#importing the required libraries

import pandas as pd
from string import punctuation
import numpy as np
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim
import json
import nltk

In [143]:
#loading the data

data_1 = pd.read_csv('amazon_cells_labelled.txt',delimiter='\t', header=None)

data_1.columns = ["Review", "Sentiment"]

data_1.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [144]:
#Create a variable containing the vocabulary of the entire set of reviews.

nltk.download('punkt')

def word_vocab(data):
    text = list(data_1['Review'].values)
    clean_text = []
    for t in text:
        clean_text.append(t.translate(str.maketrans('', '', punctuation)).lower().rstrip())
    tokenized = [word_tokenize(x) for x in clean_text]
    all_text = []
    for tokens in tokenized:
        for t in tokens:
            all_text.append(t)
    return tokenized, set(all_text)

reviews, vocab = word_vocab(data)

reviews[0]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['so',
 'there',
 'is',
 'no',
 'way',
 'for',
 'me',
 'to',
 'plug',
 'it',
 'in',
 'here',
 'in',
 'the',
 'us',
 'unless',
 'i',
 'go',
 'by',
 'a',
 'converter']

In [145]:
#Additionally, create a dictionary that maps each word to an integer, where the words will be the keys and the integers will be the values.

def vocab_dictionaries(words):
    word_to_int_dict = {w:i+1 for i, w in enumerate(words)}
    int_to_word_dict = {i:w for w, i in word_to_int_dict.items()}
    return word_to_int_dict, int_to_word_dict

word_to_int_dict, int_to_word_dict = vocab_dictionaries(vocab)

int_to_word_dict

{1: 'plastic',
 2: 'which',
 3: 'attractive',
 4: 'listening',
 5: 'satisfied',
 6: 'that',
 7: 'due',
 8: 'factor',
 9: 'sex',
 10: 'touches',
 11: 'v325i',
 12: 'leaf',
 13: 'completely',
 14: 'ride',
 15: 'sources',
 16: 'unsatisfactory',
 17: 'another',
 18: 'blackberry',
 19: 'chinese',
 20: 'ac',
 21: 'latest',
 22: 'accessory',
 23: 'fairly',
 24: 'beautiful',
 25: 'garbage',
 26: 'cellphone',
 27: 'else',
 28: 'bar',
 29: 'works',
 30: 'sorry',
 31: 'microsofts',
 32: 'slide',
 33: 'tones',
 34: '1',
 35: 'methe',
 36: 'lost',
 37: 'material',
 38: 'making',
 39: 'piece',
 40: 'thru',
 41: 'encourage',
 42: 'worked',
 43: 'most',
 44: 'right',
 45: 'tool',
 46: 'small',
 47: 'razr',
 48: 'scary',
 49: 'someone',
 50: 'fun',
 51: 'cutouts',
 52: 'designs',
 53: 'address',
 54: 'tries',
 55: '10',
 56: 'fire',
 57: 'made',
 58: 'peachykeen',
 59: 'penny',
 60: 'engineered',
 61: 'here',
 62: 'times',
 63: 'despite',
 64: 'wit',
 65: 'bars',
 66: 'plantronics',
 67: 'needed',
 68:

In [146]:
word_to_int_dict

{'plastic': 1,
 'which': 2,
 'attractive': 3,
 'listening': 4,
 'satisfied': 5,
 'that': 6,
 'due': 7,
 'factor': 8,
 'sex': 9,
 'touches': 10,
 'v325i': 11,
 'leaf': 12,
 'completely': 13,
 'ride': 14,
 'sources': 15,
 'unsatisfactory': 16,
 'another': 17,
 'blackberry': 18,
 'chinese': 19,
 'ac': 20,
 'latest': 21,
 'accessory': 22,
 'fairly': 23,
 'beautiful': 24,
 'garbage': 25,
 'cellphone': 26,
 'else': 27,
 'bar': 28,
 'works': 29,
 'sorry': 30,
 'microsofts': 31,
 'slide': 32,
 'tones': 33,
 '1': 34,
 'methe': 35,
 'lost': 36,
 'material': 37,
 'making': 38,
 'piece': 39,
 'thru': 40,
 'encourage': 41,
 'worked': 42,
 'most': 43,
 'right': 44,
 'tool': 45,
 'small': 46,
 'razr': 47,
 'scary': 48,
 'someone': 49,
 'fun': 50,
 'cutouts': 51,
 'designs': 52,
 'address': 53,
 'tries': 54,
 '10': 55,
 'fire': 56,
 'made': 57,
 'peachykeen': 58,
 'penny': 59,
 'engineered': 60,
 'here': 61,
 'times': 62,
 'despite': 63,
 'wit': 64,
 'bars': 65,
 'plantronics': 66,
 'needed': 67,
 'pc

In [148]:
#Encode the reviews data by replacing each word in a review for its paired integer.

def padding_text(tok_rev, seq_len):
    
    reviews = []
    
    for review in tok_rev:
        if len(review) >= seq_len:
            reviews.append(review[:seq_len])
        else:
            reviews.append(['']*(seq_len-len(review)) + review)
        
    return np.array(reviews)

padded_sentences = padding_text(reviews, seq_len = 50)

padded_sentences[0]

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', 'so', 'there',
       'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here',
       'in', 'the', 'us', 'unless', 'i', 'go', 'by', 'a', 'converter'],
      dtype='<U22')

In [149]:
int_to_word_dict[0] = ''
word_to_int_dict[''] = 0

In [150]:
encoded_sentences = np.array([[word_to_int_dict[word] for word in review] for review in padded_sentences])

encoded_sentences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0, 1855,  775,  569, 1568,
        112,  604, 1472, 1640, 1396,  119,  686,   61,  686, 1177,  341,
        302,  469,  396,  822, 1082,  968])

In [151]:
#Create a class containing the architecture of the network.

class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.8):
        super().__init__()
        
        self.n_vocab = n_vocab  
        self.n_layers = n_layers 
        self.n_hidden = n_hidden 
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                          
        embedded_words = self.embedding(input_words)
        lstm_out, h = self.lstm(embedded_words) 
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden)
        fc_out = self.fc(lstm_out)                  
        sigmoid_out = self.sigmoid(fc_out)              
        sigmoid_out = sigmoid_out.view(batch_size, -1)  
        
        sigmoid_last = sigmoid_out[:, -1]
        
        return sigmoid_last, h
    

In [153]:
#Make sure that you include an embedding layer. Initialize the model using 64 embedding dimensions and 128 neurons for 3 LSTM layers.

n_vocab = len(word_to_int_dict)
n_embed = 64
n_hidden = 128
n_output = 1
n_layers = 3

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

#net.to(device)
print(net)

SentimentLSTM(
  (embedding): Embedding(1906, 64)
  (lstm): LSTM(64, 128, num_layers=3, batch_first=True, dropout=0.8)
  (dropout): Dropout(p=0.8, inplace=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [154]:
labels = np.array([int(x) for x in data['label'].values])

train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2

total = len(encoded_sentences)
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

train_x, train_y = torch.Tensor(encoded_sentences[:train_cutoff]).long(), torch.Tensor(labels[:train_cutoff]).long()
valid_x, valid_y = torch.Tensor(encoded_sentences[train_cutoff : valid_cutoff]).long(), torch.Tensor(labels[train_cutoff : valid_cutoff]).long()
test_x, test_y = torch.Tensor(encoded_sentences[valid_cutoff:]).long(), torch.Tensor(labels[valid_cutoff:])

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

batch_size = 1

train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [155]:
print_every = 2400
step = 0
n_epochs = 3
clip = 5  
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)