In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [38]:
csv = pd.read_csv(r'../data/tripadvisor_hotel_reviews.csv')

In [39]:
sentiment_dict = {1: "Negative",
                 2: "Neutral",
                 3: "Positive"}

In [40]:
def rating_to_sentiment(rating):
    if rating>3 and rating<=5:
        return 3
    elif rating == 3:
        return 2
    else:
        return 1

In [41]:
tqdm.pandas()
csv['Sentiment'] = csv['Rating'].progress_apply(rating_to_sentiment)

100%|██████████| 20491/20491 [00:00<00:00, 436659.38it/s]


In [42]:
from torchtext.data.functional import to_map_style_dataset
train_iter = [(label,text)for label,text in zip(csv['Sentiment'].to_list(),csv['Review'].to_list())]
train_dataset = to_map_style_dataset(train_iter)

In [43]:
from torch.utils.data.dataset import random_split
num_train = int(len(train_dataset) * 0.95)
split_train_dataset, split_test_dataset = random_split(train_dataset, [num_train, len(train_dataset) - num_train])


In [44]:
# Function to tokenize the text
def yield_tokens(data_iter,tokenizer):
    for _, text in data_iter:
        yield tokenizer(text)

# Build vocabulary from tokens of training set
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(yield_tokens(train_iter,tokenizer), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [45]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define collate_batch function to get single collated tensor for batch in form needed by nn.EmbeddingBag
def collate_batch(batch,tokenizer,vocab):
    # Pipelines for processing text and labels
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1
    
    label_list, text_list, offsets = [], [], [0]
    # Iterate through batch, processing text and adding text, labels and offsets to lists
    for (label, text) in batch:
        label_list.append(label_pipeline(label))
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)  

In [46]:
batch_size = 32
# Create training, validation and test set DataLoaders using custom collate_batch function
train_dataloader = DataLoader(split_train_dataset, batch_size=batch_size,
                              shuffle=True, collate_fn = lambda x: collate_batch(x,tokenizer,vocab))
val_dataloader = DataLoader(split_test_dataset, batch_size=batch_size,
                              shuffle=True, collate_fn = lambda x: collate_batch(x,tokenizer,vocab))



In [47]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(LSTMModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim, sparse=True)
   
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        
        self.fc = nn.Linear(hidden_dim, num_classes)

        self.init_weights()
    
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets).unsqueeze(1)
        lstm_output, (ht, ct) = self.lstm(embedded)
        #print(embedded.shape, self.embedding(text, offsets).shape,ht.shape,ht[-1].shape,self.fc(ht[-1]).shape)
        return self.fc(ht[-1])

# Model parameters
vocab_size = len(vocab)
embedding_dim = 128
hidden_dim = 64
num_classes = 3

# Create the LSTM model, loss function, and optimizer
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1)
#optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training loop
num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    train_acc = 0.0
    model.train()

    for (labels, text, offsets) in tqdm(train_dataloader):
        text = text.to(device)
        labels = labels.to(device)
        offsets = offsets.to(device)
       # print('text.shape:',text.shape)

        optimizer.zero_grad()
        outputs = model(text, offsets)
        
        loss = criterion(outputs, labels)
   
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels).sum().item()
        

    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataset)

 

    print(f'Epoch {epoch+1}/{num_epochs}, train_loss: {train_loss:.4f}, train_acc: {train_acc:.4f}')


100%|██████████| 609/609 [00:05<00:00, 103.93it/s]


Epoch 1/100, train_loss: 0.6750, train_acc: 0.7144


100%|██████████| 609/609 [00:05<00:00, 121.44it/s]


Epoch 2/100, train_loss: 0.5049, train_acc: 0.7732


100%|██████████| 609/609 [00:04<00:00, 128.53it/s]


Epoch 3/100, train_loss: 0.4369, train_acc: 0.7955


100%|██████████| 609/609 [00:04<00:00, 125.10it/s]


Epoch 4/100, train_loss: 0.4068, train_acc: 0.8050


100%|██████████| 609/609 [00:04<00:00, 126.55it/s]


Epoch 5/100, train_loss: 0.3753, train_acc: 0.8118


100%|██████████| 609/609 [00:04<00:00, 128.79it/s]


Epoch 6/100, train_loss: 0.3618, train_acc: 0.8173


100%|██████████| 609/609 [00:04<00:00, 136.94it/s]


Epoch 7/100, train_loss: 0.3450, train_acc: 0.8244


100%|██████████| 609/609 [00:04<00:00, 140.55it/s]


Epoch 8/100, train_loss: 0.3322, train_acc: 0.8283


100%|██████████| 609/609 [00:04<00:00, 141.17it/s]


Epoch 9/100, train_loss: 0.3186, train_acc: 0.8331


100%|██████████| 609/609 [00:04<00:00, 139.56it/s]


Epoch 10/100, train_loss: 0.3119, train_acc: 0.8345


100%|██████████| 609/609 [00:04<00:00, 128.28it/s]


Epoch 11/100, train_loss: 0.3041, train_acc: 0.8379


100%|██████████| 609/609 [00:04<00:00, 123.81it/s]


Epoch 12/100, train_loss: 0.2978, train_acc: 0.8397


100%|██████████| 609/609 [00:04<00:00, 133.51it/s]


Epoch 13/100, train_loss: 0.2883, train_acc: 0.8430


100%|██████████| 609/609 [00:04<00:00, 126.28it/s]


Epoch 14/100, train_loss: 0.2827, train_acc: 0.8477


100%|██████████| 609/609 [00:05<00:00, 111.90it/s]


Epoch 15/100, train_loss: 0.2720, train_acc: 0.8487


100%|██████████| 609/609 [00:05<00:00, 119.66it/s]


Epoch 16/100, train_loss: 0.2673, train_acc: 0.8500


100%|██████████| 609/609 [00:05<00:00, 115.88it/s]


Epoch 17/100, train_loss: 0.2605, train_acc: 0.8533


100%|██████████| 609/609 [00:04<00:00, 130.44it/s]


Epoch 18/100, train_loss: 0.2545, train_acc: 0.8555


100%|██████████| 609/609 [00:04<00:00, 127.75it/s]


Epoch 19/100, train_loss: 0.2521, train_acc: 0.8561


100%|██████████| 609/609 [00:04<00:00, 121.82it/s]


Epoch 20/100, train_loss: 0.2466, train_acc: 0.8601


100%|██████████| 609/609 [00:04<00:00, 122.09it/s]


Epoch 21/100, train_loss: 0.2379, train_acc: 0.8633


100%|██████████| 609/609 [00:04<00:00, 128.19it/s]


Epoch 22/100, train_loss: 0.2331, train_acc: 0.8636


100%|██████████| 609/609 [00:05<00:00, 115.35it/s]


Epoch 23/100, train_loss: 0.2227, train_acc: 0.8682


100%|██████████| 609/609 [00:05<00:00, 119.70it/s]


Epoch 24/100, train_loss: 0.2211, train_acc: 0.8697


100%|██████████| 609/609 [00:05<00:00, 121.54it/s]


Epoch 25/100, train_loss: 0.2131, train_acc: 0.8720


100%|██████████| 609/609 [00:06<00:00, 100.61it/s]


Epoch 26/100, train_loss: 0.2130, train_acc: 0.8713


100%|██████████| 609/609 [00:05<00:00, 111.11it/s]


Epoch 27/100, train_loss: 0.2056, train_acc: 0.8761


100%|██████████| 609/609 [00:05<00:00, 117.83it/s]


Epoch 28/100, train_loss: 0.2025, train_acc: 0.8765


100%|██████████| 609/609 [00:05<00:00, 116.51it/s]


Epoch 29/100, train_loss: 0.2011, train_acc: 0.8765


100%|██████████| 609/609 [00:04<00:00, 133.14it/s]


Epoch 30/100, train_loss: 0.1918, train_acc: 0.8796


100%|██████████| 609/609 [00:04<00:00, 126.49it/s]


Epoch 31/100, train_loss: 0.1930, train_acc: 0.8804


100%|██████████| 609/609 [00:05<00:00, 113.23it/s]


Epoch 32/100, train_loss: 0.1853, train_acc: 0.8835


100%|██████████| 609/609 [00:05<00:00, 107.00it/s]


Epoch 33/100, train_loss: 0.1778, train_acc: 0.8864


100%|██████████| 609/609 [00:04<00:00, 130.02it/s]


Epoch 34/100, train_loss: 0.1720, train_acc: 0.8880


100%|██████████| 609/609 [00:04<00:00, 133.45it/s]


Epoch 35/100, train_loss: 0.1695, train_acc: 0.8901


100%|██████████| 609/609 [00:04<00:00, 133.04it/s]


Epoch 36/100, train_loss: 0.1670, train_acc: 0.8901


100%|██████████| 609/609 [00:04<00:00, 130.07it/s]


Epoch 37/100, train_loss: 0.1632, train_acc: 0.8919


100%|██████████| 609/609 [00:05<00:00, 120.07it/s]


Epoch 38/100, train_loss: 0.1587, train_acc: 0.8930


100%|██████████| 609/609 [00:04<00:00, 124.59it/s]


Epoch 39/100, train_loss: 0.1570, train_acc: 0.8951


100%|██████████| 609/609 [00:04<00:00, 134.77it/s]


Epoch 40/100, train_loss: 0.1598, train_acc: 0.8930


100%|██████████| 609/609 [00:04<00:00, 132.73it/s]


Epoch 41/100, train_loss: 0.1519, train_acc: 0.8981


100%|██████████| 609/609 [00:04<00:00, 134.19it/s]


Epoch 42/100, train_loss: 0.1513, train_acc: 0.8974


100%|██████████| 609/609 [00:04<00:00, 129.62it/s]


Epoch 43/100, train_loss: 0.1419, train_acc: 0.8994


100%|██████████| 609/609 [00:04<00:00, 136.83it/s]


Epoch 44/100, train_loss: 0.1402, train_acc: 0.9007


100%|██████████| 609/609 [00:05<00:00, 104.86it/s]


Epoch 45/100, train_loss: 0.1386, train_acc: 0.9024


100%|██████████| 609/609 [00:06<00:00, 101.39it/s]


Epoch 46/100, train_loss: 0.1399, train_acc: 0.9023


100%|██████████| 609/609 [00:06<00:00, 101.21it/s]


Epoch 47/100, train_loss: 0.1267, train_acc: 0.9052


100%|██████████| 609/609 [00:06<00:00, 89.72it/s] 


Epoch 48/100, train_loss: 0.1418, train_acc: 0.9045


100%|██████████| 609/609 [00:05<00:00, 117.23it/s]


Epoch 49/100, train_loss: 0.1254, train_acc: 0.9071


100%|██████████| 609/609 [00:05<00:00, 103.33it/s]


Epoch 50/100, train_loss: 0.1176, train_acc: 0.9123


100%|██████████| 609/609 [00:04<00:00, 133.23it/s]


Epoch 51/100, train_loss: 0.1262, train_acc: 0.9087


100%|██████████| 609/609 [00:04<00:00, 122.96it/s]


Epoch 52/100, train_loss: 0.1217, train_acc: 0.9123


100%|██████████| 609/609 [00:04<00:00, 133.43it/s]


Epoch 53/100, train_loss: 0.1111, train_acc: 0.9124


100%|██████████| 609/609 [00:04<00:00, 124.01it/s]


Epoch 54/100, train_loss: 0.1088, train_acc: 0.9139


100%|██████████| 609/609 [00:05<00:00, 108.90it/s]


Epoch 55/100, train_loss: 0.1120, train_acc: 0.9143


100%|██████████| 609/609 [00:05<00:00, 111.00it/s]


Epoch 56/100, train_loss: 0.1125, train_acc: 0.9153


100%|██████████| 609/609 [00:05<00:00, 111.98it/s]


Epoch 57/100, train_loss: 0.1026, train_acc: 0.9159


100%|██████████| 609/609 [00:04<00:00, 123.16it/s]


Epoch 58/100, train_loss: 0.0983, train_acc: 0.9192


100%|██████████| 609/609 [00:05<00:00, 111.55it/s]


Epoch 59/100, train_loss: 0.1056, train_acc: 0.9166


100%|██████████| 609/609 [00:04<00:00, 132.02it/s]


Epoch 60/100, train_loss: 0.0838, train_acc: 0.9232


100%|██████████| 609/609 [00:04<00:00, 139.87it/s]


Epoch 61/100, train_loss: 0.0974, train_acc: 0.9185


100%|██████████| 609/609 [00:04<00:00, 138.44it/s]


Epoch 62/100, train_loss: 0.0988, train_acc: 0.9192


100%|██████████| 609/609 [00:04<00:00, 142.88it/s]


Epoch 63/100, train_loss: 0.0895, train_acc: 0.9224


100%|██████████| 609/609 [00:04<00:00, 138.90it/s]


Epoch 64/100, train_loss: 0.0860, train_acc: 0.9234


100%|██████████| 609/609 [00:04<00:00, 135.72it/s]


Epoch 65/100, train_loss: 0.0859, train_acc: 0.9225


100%|██████████| 609/609 [00:05<00:00, 118.35it/s]


Epoch 66/100, train_loss: 0.0774, train_acc: 0.9257


100%|██████████| 609/609 [00:05<00:00, 114.92it/s]


Epoch 67/100, train_loss: 0.0816, train_acc: 0.9276


100%|██████████| 609/609 [00:05<00:00, 115.29it/s]


Epoch 68/100, train_loss: 0.0774, train_acc: 0.9284


100%|██████████| 609/609 [00:04<00:00, 130.51it/s]


Epoch 69/100, train_loss: 0.0794, train_acc: 0.9280


100%|██████████| 609/609 [00:04<00:00, 121.83it/s]


Epoch 70/100, train_loss: 0.0808, train_acc: 0.9291


100%|██████████| 609/609 [00:05<00:00, 120.27it/s]


Epoch 71/100, train_loss: 0.0732, train_acc: 0.9287


100%|██████████| 609/609 [00:04<00:00, 128.21it/s]


Epoch 72/100, train_loss: 0.0643, train_acc: 0.9312


100%|██████████| 609/609 [00:04<00:00, 124.88it/s]


Epoch 73/100, train_loss: 0.0743, train_acc: 0.9297


100%|██████████| 609/609 [00:06<00:00, 100.35it/s]


Epoch 74/100, train_loss: 0.0612, train_acc: 0.9315


100%|██████████| 609/609 [00:05<00:00, 118.32it/s]


Epoch 75/100, train_loss: 0.0730, train_acc: 0.9304


100%|██████████| 609/609 [00:05<00:00, 111.66it/s]


Epoch 76/100, train_loss: 0.0716, train_acc: 0.9327


100%|██████████| 609/609 [00:07<00:00, 80.66it/s] 


Epoch 77/100, train_loss: 0.0621, train_acc: 0.9318


100%|██████████| 609/609 [00:06<00:00, 93.86it/s] 


Epoch 78/100, train_loss: 0.0635, train_acc: 0.9324


100%|██████████| 609/609 [00:05<00:00, 107.29it/s]


Epoch 79/100, train_loss: 0.0723, train_acc: 0.9309


100%|██████████| 609/609 [00:05<00:00, 112.74it/s]


Epoch 80/100, train_loss: 0.0553, train_acc: 0.9341


100%|██████████| 609/609 [00:05<00:00, 119.28it/s]


Epoch 81/100, train_loss: 0.0646, train_acc: 0.9346


100%|██████████| 609/609 [00:05<00:00, 107.79it/s]


Epoch 82/100, train_loss: 0.0437, train_acc: 0.9398


100%|██████████| 609/609 [00:04<00:00, 124.70it/s]


Epoch 83/100, train_loss: 0.0606, train_acc: 0.9357


100%|██████████| 609/609 [00:05<00:00, 118.32it/s]


Epoch 84/100, train_loss: 0.0465, train_acc: 0.9387


100%|██████████| 609/609 [00:04<00:00, 127.81it/s]


Epoch 85/100, train_loss: 0.0599, train_acc: 0.9350


100%|██████████| 609/609 [00:04<00:00, 127.81it/s]


Epoch 86/100, train_loss: 0.0552, train_acc: 0.9353


100%|██████████| 609/609 [00:05<00:00, 106.83it/s]


Epoch 87/100, train_loss: 0.0515, train_acc: 0.9383


100%|██████████| 609/609 [00:05<00:00, 111.86it/s]


Epoch 88/100, train_loss: 0.0458, train_acc: 0.9395


100%|██████████| 609/609 [00:05<00:00, 105.38it/s]


Epoch 89/100, train_loss: 0.0460, train_acc: 0.9392


100%|██████████| 609/609 [00:05<00:00, 119.42it/s]


Epoch 90/100, train_loss: 0.0504, train_acc: 0.9394


100%|██████████| 609/609 [00:05<00:00, 113.65it/s]


Epoch 91/100, train_loss: 0.0383, train_acc: 0.9403


100%|██████████| 609/609 [00:05<00:00, 111.71it/s]


Epoch 92/100, train_loss: 0.0396, train_acc: 0.9418


100%|██████████| 609/609 [00:06<00:00, 94.80it/s] 


Epoch 93/100, train_loss: 0.0363, train_acc: 0.9420


100%|██████████| 609/609 [00:06<00:00, 90.88it/s] 


Epoch 94/100, train_loss: 0.0683, train_acc: 0.9348


100%|██████████| 609/609 [00:05<00:00, 111.17it/s]


Epoch 95/100, train_loss: 0.0389, train_acc: 0.9417


100%|██████████| 609/609 [00:05<00:00, 108.01it/s]


Epoch 96/100, train_loss: 0.0355, train_acc: 0.9428


100%|██████████| 609/609 [00:05<00:00, 116.25it/s]


Epoch 97/100, train_loss: 0.0295, train_acc: 0.9444


100%|██████████| 609/609 [00:05<00:00, 108.96it/s]


Epoch 98/100, train_loss: 0.0501, train_acc: 0.9419


100%|██████████| 609/609 [00:05<00:00, 117.90it/s]


Epoch 99/100, train_loss: 0.0265, train_acc: 0.9454


100%|██████████| 609/609 [00:05<00:00, 117.26it/s]

Epoch 100/100, train_loss: 0.0527, train_acc: 0.9380





In [48]:
import torch

# Assuming `model` is your trained PyTorch model
torch.save(model.state_dict(), 'model_LSTM.pth')


In [51]:
loaded_model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_classes)
loaded_model.load_state_dict(torch.load('model_LSTM.pth'))
loaded_model.eval()


LSTMModel(
  (embedding): EmbeddingBag(64280, 128, mode='mean')
  (lstm): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)

In [79]:
import torch
from sklearn.metrics import classification_report

def evaluate(dataloader, model):
    model.eval()
    all_labels, all_predictions = [], []

    with torch.no_grad():
        for idx, (label, text, offset) in enumerate(dataloader):
            predict = model(text, offset)
            predicted_labels = predict.argmax(1)
            all_labels.extend(label.tolist())
            all_predictions.extend(predicted_labels.tolist())

    report = classification_report(all_labels, all_predictions)
    return report
accu_test = evaluate(val_dataloader,model)
print(accu_test)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       145
           1       0.97      0.88      0.92       128
           2       0.98      1.00      0.99       752

    accuracy                           0.98      1025
   macro avg       0.98      0.95      0.96      1025
weighted avg       0.98      0.98      0.98      1025

