<a href="https://colab.research.google.com/github/adityamishra5050/Sentiment-Analysis-of-Company-Reviews-/blob/main/DL_Project_Sentiment_Bilstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
#import jovian
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

In [None]:
# Read the training reviews from the hard-coded Drive path, print the frame's
# (rows, columns) shape and preview the first rows.
reviews = pd.read_csv(
    "/content/drive/MyDrive/Company reviews/train.csv"
)
print(reviews.shape)
reviews.head()

(60000, 3)


Unnamed: 0,Id,Review,Rating
0,0,Very good value and a great tv very happy and ...,5
1,1,After 6 month still can't access my account,3
2,2,I couldn't make an official review on a produc...,1
3,3,"Fantastic! Extremely easy to use website, fant...",5
4,4,So far annoyed as hell with this bt monthly pa...,1


In [None]:
# Keep only the text and label columns and add a whitespace-token count.
# .copy() makes the selection an independent frame, so the column assignments
# made here and in later cells do not write into a slice of the loaded frame
# (the situation pandas reports as SettingWithCopyWarning).
reviews = reviews[['Review', 'Rating']].copy()
# Length = number of whitespace-separated pieces in the raw review text.
reviews['review_length'] = reviews['Review'].apply(lambda x: len(x.split()))
reviews.head()

Unnamed: 0,Review,Rating,review_length
0,Very good value and a great tv very happy and ...,5,18
1,After 6 month still can't access my account,3,8
2,I couldn't make an official review on a produc...,1,92
3,"Fantastic! Extremely easy to use website, fant...",5,32
4,So far annoyed as hell with this bt monthly pa...,1,49


In [None]:
# Shift the 1-5 ratings to zero-based class ids 0-4.
# A rating outside 1-5 raises KeyError in the lookup below.
zero_numbering = {stars: stars - 1 for stars in range(1, 6)}
reviews['Rating'] = reviews['Rating'].apply(lambda stars: zero_numbering[stars])

In [None]:
# Average number of whitespace-separated words per review.
reviews['review_length'].mean()

56.56325

In [None]:
# Tokenization
tok = spacy.load('en_core_web_sm')

# Both patterns are constant, so they are compiled once here instead of being
# rebuilt on every tokenize() call (tokenize runs once per review, several times).
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
# ASCII punctuation, digits, carriage return, tab and newline.
_PUNCT_DIGIT_WS_RE = re.compile('[' + re.escape(string.punctuation) + r'0-9\r\t\n]')


def tokenize(text):
    """Lower-case ``text``, blank out noise characters and split it into tokens.

    Each run of non-ASCII characters is replaced by a single space; every ASCII
    punctuation character, digit, carriage return, tab and newline is replaced
    by a space. The cleaned string is then split with ``tok.tokenizer`` (the
    tokenizer only, not the whole loaded pipeline).

    Args:
        text (str): raw review text.

    Returns:
        list[str]: the ``.text`` of every token the tokenizer produced.
    """
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_WS_RE.sub(" ", ascii_only.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [None]:
# Tally how often each token occurs across all reviews.
counts = Counter()
for review_text in reviews['Review']:
    counts.update(tokenize(review_text))

In [None]:
# Drop every token that was seen fewer than two times, reporting the
# vocabulary size before and after.
print("num_words before:", len(counts))
rare_words = [word for word, freq in counts.items() if freq < 2]
for word in rare_words:
    del counts[word]
print("num_words after:", len(counts))

num_words before: 29235
num_words after: 16303


In [None]:
# Build the vocabulary: id 0 is the empty (padding) token, id 1 is "UNK",
# and the surviving tokens follow in the iteration order of `counts`.
words = ["", "UNK"]
vocab2index = {token: idx for idx, token in enumerate(words)}
for idx, token in enumerate(counts, start=len(words)):
    vocab2index[token] = idx
words.extend(counts)

In [None]:
def encode_sentence(text, vocab2index, N=70):
    """Turn ``text`` into a fixed-size array of vocabulary ids.

    The text is tokenized with ``tokenize``; tokens missing from
    ``vocab2index`` are mapped to the id of ``"UNK"``. The id sequence is cut
    to at most ``N`` entries and right-padded with zeros up to ``N``.

    Args:
        text: raw review text.
        vocab2index: mapping from token to integer id; must contain ``"UNK"``
            whenever the text yields at least one token.
        N: size of the returned id array.

    Returns:
        tuple: ``(encoded, length)`` where ``encoded`` is an integer array of
        size ``N`` and ``length`` is ``min(N, number of tokens)``.
    """
    token_ids = np.array(
        [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    )
    length = min(N, len(token_ids))
    encoded = np.zeros(N, dtype=int)
    encoded[:length] = token_ids[:length]
    return encoded, length

In [None]:
# Encode every review as a 2-element object array: [padded id array, length].
# The two elements have different shapes, so dtype=object is stated explicitly;
# letting NumPy infer a ragged layout is deprecated (a warning on older
# releases) and raises ValueError on NumPy 1.24 and later.
reviews['encoded'] = reviews['Review'].apply(
    lambda x: np.array(encode_sentence(x, vocab2index), dtype=object)
)
reviews.head()

  reviews['encoded'] = reviews['Review'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))


Unnamed: 0,Review,Rating,review_length,encoded
0,Very good value and a great tv very happy and ...,4,18,"[[2, 3, 4, 5, 6, 7, 8, 2, 9, 5, 10, 11, 12, 5,..."
1,After 6 month still can't access my account,2,8,"[[17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0,..."
2,I couldn't make an official review on a produc...,0,92,"[[26, 27, 22, 28, 29, 30, 31, 32, 6, 33, 32, 3..."
3,"Fantastic! Extremely easy to use website, fant...",4,32,"[[87, 39, 88, 89, 62, 90, 91, 39, 87, 92, 93, ..."
4,So far annoyed as hell with this bt monthly pa...,0,49,"[[36, 105, 106, 60, 107, 53, 57, 108, 109, 110..."


In [None]:
# Class balance: how many reviews carry each rating value.
Counter(reviews.loc[:, 'Rating'])

Counter({4: 34679, 2: 1679, 0: 18663, 3: 3350, 1: 1629})

In [None]:
# Hold out 20% of the reviews for validation.
X = list(reviews['encoded'])
y = list(reviews['Rating'])
from sklearn.model_selection import train_test_split
# random_state pins the shuffle so the split (and every metric computed from it)
# is reproducible across kernel restarts; stratify=y keeps the rating
# proportions equal in both parts, which matters because the classes are far
# from balanced.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
class ReviewsDataset(Dataset):
    """Dataset over encoded reviews and their labels.

    ``X[i]`` is indexed as ``X[i][0]`` (a NumPy id array) and ``X[i][1]``
    (the sequence length); ``Y[i]`` is the label of sample ``i``.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # The number of samples is taken from the labels.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(ids as int32 tensor, label, sequence length)`` for ``idx``."""
        token_ids = self.X[idx][0]
        ids_tensor = torch.from_numpy(token_ids.astype(np.int32))
        return ids_tensor, self.y[idx], self.X[idx][1]

In [None]:
# Wrap the two halves of the split as datasets.
train_ds, valid_ds = (
    ReviewsDataset(X_train, y_train),
    ReviewsDataset(X_valid, y_valid),
)

In [None]:
def train_model(model, epochs=10, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, validating after every epoch.

    Training batches are read from the module-level ``train_dl``; after each
    epoch ``validation_metrics`` is run on the module-level ``val_dl``.

    Args:
        model: module called as ``model(x, l)`` whose output is passed to
            ``F.cross_entropy`` together with the integer labels.
        epochs: number of passes over ``train_dl``.
        lr: learning rate given to Adam.
    """
    # Only parameters that require gradients are handed to the optimizer.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            # .long() has to be *called*: the bare attribute is a bound method,
            # and handing that to the model instead of a tensor raises TypeError.
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by batch size so the epoch figure is a
            # per-sample mean even when the last batch is smaller.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        # Report on epochs 1, 6, 11, ... (0-based), i.e. every fifth epoch.
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics(model, valid_dl):
    """Evaluate ``model`` on ``valid_dl`` and return loss, accuracy and RMSE.

    The model is put in eval mode and run without gradient tracking. Inputs
    and labels are cast to int64, the same cast ``train_model`` applies.

    Args:
        model: module called as ``model(x, l)`` returning one score per class.
        valid_dl: iterable of ``(x, y, l)`` batches.

    Returns:
        tuple: ``(loss, accuracy, rmse)``. ``loss`` and ``rmse`` are floats
        averaged over samples (``rmse`` is the batch-size-weighted mean of the
        per-batch RMSE between predicted and true class ids, not a single
        RMSE over the whole set); ``accuracy`` is a 0-dim tensor.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    # Evaluation needs no autograd graph; skipping it saves memory and time.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.long()
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            correct += (pred == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
            # RMSE is computed on the tensors themselves, in float64, with
            # matching shapes; no tensor-to-NumPy conversion is involved.
            batch_rmse = (pred - y).double().pow(2).mean().sqrt().item()
            sum_rmse += batch_rmse*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [None]:
# Loader configuration: one batch size for both loaders; the vocabulary size
# is the number of entries in `words` and is printed for reference.
batch_size = 5000
vocab_size = len(words)
print(vocab_size)
# Only the training loader shuffles its samples.
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(dataset=valid_ds, batch_size=batch_size)

16305


In [None]:
# Build the model that the training calls below operate on.
# NOTE(review): Bidirectionallstm_fixed_len is not defined in any cell visible here, so the
# meaning of the positional arguments (vocab_size, 50, 50, 4) cannot be confirmed. In
# particular, verify whether the trailing 4 is a class count: the ratings are mapped to
# five classes (0-4).
model_fixed4 =  Bidirectionallstm_fixed_len(vocab_size, 50, 50, 4)

In [None]:
from torch.nn.utils.rnn import pad_sequence


In [None]:
def train123_model(model, epochs, lr):
    """Train ``model`` with Adam and ``nn.CrossEntropyLoss`` on ``train_dl``.

    Each batch of the module-level ``train_dl`` is unpacked as
    ``(inputs, labels)`` and the model is called with ``inputs`` only. After
    every epoch the loss of the last batch is printed.

    NOTE(review): ``ReviewsDataset`` yields three items per sample, so this
    loop presumably needs a loader built with a two-item collate function;
    confirm which loader is meant before using it.

    Args:
        model: module called as ``model(inputs)``.
        epochs: number of passes over ``train_dl``.
        lr: learning rate given to Adam.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        for inputs, labels in train_dl:
            # Forward pass and loss for this batch.
            loss = criterion(model(inputs), labels)

            # Clear old gradients, back-propagate, update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Loss of the final batch of the epoch.
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

In [None]:
# Run train_model on model_fixed4 for 5 epochs with a learning rate of 0.01.
train_model(model_fixed4, epochs=5, lr=0.01)

TypeError: ignored

In [None]:
# Run the alternative training loop on model_fixed4, again 5 epochs at lr=0.01.
# NOTE(review): trainbilstm_model is defined in a later cell of this notebook, so this
# call raises NameError on a fresh Restart & Run All unless that cell has been run first.
trainbilstm_model(model_fixed4, epochs=5, lr=0.01)

In [None]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM with a linear layer applied to the last time step.

    Args:
        input_size: feature width of each time step fed to the LSTM.
        hidden_size: hidden width per direction.
        num_layers: number of stacked LSTM layers.
        output_size: width of the linear layer's output.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # Two directions are concatenated, hence twice the hidden width.
        self.fc = nn.Linear(hidden_size*2, output_size)

    def forward(self, x):
        """Run ``x`` (batch-first) through the LSTM and project the last step."""
        # Zero initial hidden/cell state: one slice per layer and direction.
        state_shape = (self.num_layers*2, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device)
        c0 = torch.zeros(state_shape, device=x.device)

        sequence_out, _ = self.lstm(x, (h0, c0))

        # Only the output at the final time step is decoded.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)



In [None]:
# Instantiate BiLSTM with its arguments spelled out.
# NOTE(review): per BiLSTM's signature the third argument is num_layers, so this
# builds 50 stacked layers, and input_size=vocab_size means each time step must
# be a vector of width vocab_size. Confirm both are intended.
model5 = BiLSTM(input_size=vocab_size, hidden_size=50, num_layers=50, output_size=5)

In [None]:
def trainbilstm_model(model, epochs, lr):
    """Train ``model`` with Adam and ``nn.CrossEntropyLoss`` on ``train_dl``.

    Batches of the module-level ``train_dl`` are unpacked as
    ``(inputs, labels, l)`` and the model is called as ``model(inputs, l)``.
    After every epoch the loss of the last batch is printed.

    Args:
        model: module called as ``model(inputs, l)`` returning one score per
            class for every sample.
        epochs: number of passes over ``train_dl``.
        lr: learning rate given to Adam.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # validation_metrics() leaves the model in eval mode; switch back so
        # training-only layers behave correctly.
        model.train()
        for inputs, labels, l in train_dl:
            # Same integer cast that train_model applies to its batches.
            inputs = inputs.long()
            labels = labels.long()

            optimizer.zero_grad()
            outputs = model(inputs, l)

            # CrossEntropyLoss takes class indices directly. This yields the
            # same loss as one-hot targets, without needing the model to expose
            # a `linear` attribute to discover the number of classes.
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # print training loss at the end of each epoch
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")



In [None]:
def Collate_fn(batch, padding_value=0):
    """Collate samples into a padded input tensor and a label tensor.

    Only the first two entries of every sample are used, so samples may carry
    extra trailing fields.

    Args:
        batch: sequence of samples whose item 0 is a 1-D tensor of token ids
            and whose item 1 is an integer label.
        padding_value: id used to pad shorter sequences. Defaults to 0, the id
            of the empty padding token in this notebook's ``vocab2index`` (the
            previously referenced ``word2idx`` is not defined in the cells
            visible here).

    Returns:
        tuple: ``(inputs, labels)`` where ``inputs`` is batch-first and padded
        to the longest sequence in the batch, and ``labels`` is a LongTensor.
    """
    inputs = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    inputs = pad_sequence(inputs, batch_first=True, padding_value=padding_value)
    labels = torch.LongTensor(labels)

    return inputs, labels
