In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm



In [3]:
csv = pd.read_csv(r'../data/tripadvisor_hotel_reviews.csv')

In [178]:
sentiment_dict = {1: "Negative",
                 2: "Neutral",
                 3: "Positive"}

In [179]:
def rating_to_sentiment(rating):
    if rating>3 and rating<=5:
        return 3
    elif rating == 3:
        return 2
    else:
        return 1

In [180]:
tqdm.pandas()
csv['Sentiment'] = csv['Rating'].progress_apply(rating_to_sentiment)

100%|██████████| 20491/20491 [00:00<00:00, 487426.53it/s]


In [182]:
from torchtext.data.functional import to_map_style_dataset
train_iter = [(label,text)for label,text in zip(csv['Sentiment'].to_list(),csv['Review'].to_list())]
train_dataset = to_map_style_dataset(train_iter)

In [183]:
from torch.utils.data.dataset import random_split
num_train = int(len(train_dataset) * 0.95)
split_train_dataset, split_valid_dataset = random_split(train_dataset, [num_train, len(train_dataset) - num_train])


In [184]:
# Function to tokenize the text
def yield_tokens(data_iter,tokenizer):
    for _, text in data_iter:
        yield tokenizer(text)

# Build vocabulary from tokens of training set
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(yield_tokens(train_iter,tokenizer), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [185]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define collate_batch function to get single collated tensor for batch in form needed by nn.EmbeddingBag
def collate_batch(batch,tokenizer,vocab):
    # Pipelines for processing text and labels
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1
    
    label_list, text_list, offsets = [], [], [0]
    # Iterate through batch, processing text and adding text, labels and offsets to lists
    for (label, text) in batch:
        label_list.append(label_pipeline(label))
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)  

In [186]:
batch_size = 32
# Create training, validation and test set DataLoaders using custom collate_batch function
train_dataloader = DataLoader(split_train_dataset, batch_size=batch_size,
                              shuffle=True, collate_fn = lambda x: collate_batch(x,tokenizer,vocab))
val_dataloader = DataLoader(split_valid_dataset, batch_size=batch_size,
                              shuffle=True, collate_fn = lambda x: collate_batch(x,tokenizer,vocab))


# Set up dict for dataloaders to use in training
train_dataloaders = {'train':train_dataloader,'val':val_dataloader}

# Store size of training and validation sets
dataset_sizes = {'train':len(split_train_dataset),'val':len(split_valid_dataset)}

In [221]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(LSTMModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim, sparse=True)
   
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        
        self.fc = nn.Linear(hidden_dim, num_classes)

        self.init_weights()
    
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets).unsqueeze(1)
        lstm_output, (ht, ct) = self.lstm(embedded)
        #print(embedded.shape, self.embedding(text, offsets).shape,ht.shape,ht[-1].shape,self.fc(ht[-1]).shape)
        return self.fc(ht[-1])

# Model parameters
vocab_size = len(vocab)
embedding_dim = 128
hidden_dim = 64
num_classes = 3

# Create the LSTM model, loss function, and optimizer
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1)
# Training loop
num_epochs = 100

for epoch in range(num_epochs):
    train_loss = 0.0
    train_acc = 0.0
    model.train()

    for (labels, text, offsets) in tqdm(train_dataloader):
        text = text.to(device)
        labels = labels.to(device)
        offsets = offsets.to(device)
       # print('text.shape:',text.shape)

        optimizer.zero_grad()
        outputs = model(text, offsets)
        
        loss = criterion(outputs, labels)
   
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels).sum().item()
        

    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataset)

    val_loss = 0.0
    val_acc = 0.0
    model.eval()

    val_loss /= len(val_dataloader)
 

    print(f'Epoch {epoch+1}/{num_epochs}, train_loss: {train_loss:.4f}, train_acc: {train_acc:.4f}')


100%|██████████| 609/609 [00:05<00:00, 111.96it/s]


Epoch 1/100, train_loss: 0.6678, train_acc: 0.7165


100%|██████████| 609/609 [00:04<00:00, 122.51it/s]


Epoch 2/100, train_loss: 0.5043, train_acc: 0.7734


100%|██████████| 609/609 [00:04<00:00, 134.37it/s]


Epoch 3/100, train_loss: 0.4346, train_acc: 0.7955


100%|██████████| 609/609 [00:04<00:00, 139.34it/s]


Epoch 4/100, train_loss: 0.4013, train_acc: 0.8067


100%|██████████| 609/609 [00:04<00:00, 130.94it/s]


Epoch 5/100, train_loss: 0.3757, train_acc: 0.8128


100%|██████████| 609/609 [00:04<00:00, 136.49it/s]


Epoch 6/100, train_loss: 0.3548, train_acc: 0.8222


100%|██████████| 609/609 [00:04<00:00, 129.12it/s]


Epoch 7/100, train_loss: 0.3404, train_acc: 0.8239


100%|██████████| 609/609 [00:04<00:00, 122.22it/s]


Epoch 8/100, train_loss: 0.3332, train_acc: 0.8280


100%|██████████| 609/609 [00:04<00:00, 129.31it/s]


Epoch 9/100, train_loss: 0.3204, train_acc: 0.8318


100%|██████████| 609/609 [00:04<00:00, 138.05it/s]


Epoch 10/100, train_loss: 0.3138, train_acc: 0.8324


100%|██████████| 609/609 [00:04<00:00, 130.07it/s]


Epoch 11/100, train_loss: 0.2985, train_acc: 0.8398


100%|██████████| 609/609 [00:04<00:00, 125.87it/s]


Epoch 12/100, train_loss: 0.2962, train_acc: 0.8413


100%|██████████| 609/609 [00:04<00:00, 139.26it/s]


Epoch 13/100, train_loss: 0.2868, train_acc: 0.8447


100%|██████████| 609/609 [00:04<00:00, 145.14it/s]


Epoch 14/100, train_loss: 0.2826, train_acc: 0.8452


100%|██████████| 609/609 [00:04<00:00, 130.28it/s]


Epoch 15/100, train_loss: 0.2745, train_acc: 0.8488


100%|██████████| 609/609 [00:04<00:00, 126.90it/s]


Epoch 16/100, train_loss: 0.2680, train_acc: 0.8489


100%|██████████| 609/609 [00:04<00:00, 136.76it/s]


Epoch 17/100, train_loss: 0.2607, train_acc: 0.8527


100%|██████████| 609/609 [00:04<00:00, 131.96it/s]


Epoch 18/100, train_loss: 0.2570, train_acc: 0.8528


100%|██████████| 609/609 [00:05<00:00, 120.90it/s]


Epoch 19/100, train_loss: 0.2525, train_acc: 0.8540


100%|██████████| 609/609 [00:04<00:00, 136.63it/s]


Epoch 20/100, train_loss: 0.2449, train_acc: 0.8606


100%|██████████| 609/609 [00:04<00:00, 128.84it/s]


Epoch 21/100, train_loss: 0.2383, train_acc: 0.8597


100%|██████████| 609/609 [00:04<00:00, 130.64it/s]


Epoch 22/100, train_loss: 0.2314, train_acc: 0.8638


100%|██████████| 609/609 [00:04<00:00, 144.52it/s]


Epoch 23/100, train_loss: 0.2296, train_acc: 0.8646


100%|██████████| 609/609 [00:04<00:00, 132.76it/s]


Epoch 24/100, train_loss: 0.2249, train_acc: 0.8656


100%|██████████| 609/609 [00:04<00:00, 140.58it/s]


Epoch 25/100, train_loss: 0.2187, train_acc: 0.8702


100%|██████████| 609/609 [00:04<00:00, 140.45it/s]


Epoch 26/100, train_loss: 0.2122, train_acc: 0.8713


100%|██████████| 609/609 [00:04<00:00, 136.59it/s]


Epoch 27/100, train_loss: 0.2109, train_acc: 0.8715


100%|██████████| 609/609 [00:04<00:00, 141.89it/s]


Epoch 28/100, train_loss: 0.2058, train_acc: 0.8734


100%|██████████| 609/609 [00:04<00:00, 134.48it/s]


Epoch 29/100, train_loss: 0.1948, train_acc: 0.8778


100%|██████████| 609/609 [00:04<00:00, 131.18it/s]


Epoch 30/100, train_loss: 0.1976, train_acc: 0.8780


100%|██████████| 609/609 [00:04<00:00, 131.79it/s]


Epoch 31/100, train_loss: 0.1916, train_acc: 0.8792


100%|██████████| 609/609 [00:04<00:00, 143.11it/s]


Epoch 32/100, train_loss: 0.1889, train_acc: 0.8830


100%|██████████| 609/609 [00:04<00:00, 143.33it/s]


Epoch 33/100, train_loss: 0.1883, train_acc: 0.8807


100%|██████████| 609/609 [00:04<00:00, 140.04it/s]


Epoch 34/100, train_loss: 0.1781, train_acc: 0.8859


100%|██████████| 609/609 [00:04<00:00, 141.98it/s]


Epoch 35/100, train_loss: 0.1747, train_acc: 0.8868


100%|██████████| 609/609 [00:04<00:00, 142.25it/s]


Epoch 36/100, train_loss: 0.1808, train_acc: 0.8861


100%|██████████| 609/609 [00:04<00:00, 137.05it/s]


Epoch 37/100, train_loss: 0.1646, train_acc: 0.8914


100%|██████████| 609/609 [00:04<00:00, 136.66it/s]


Epoch 38/100, train_loss: 0.1653, train_acc: 0.8903


100%|██████████| 609/609 [00:04<00:00, 139.13it/s]


Epoch 39/100, train_loss: 0.1580, train_acc: 0.8936


100%|██████████| 609/609 [00:04<00:00, 134.86it/s]


Epoch 40/100, train_loss: 0.1613, train_acc: 0.8940


100%|██████████| 609/609 [00:04<00:00, 122.20it/s]


Epoch 41/100, train_loss: 0.1523, train_acc: 0.8950


100%|██████████| 609/609 [00:04<00:00, 136.24it/s]


Epoch 42/100, train_loss: 0.1529, train_acc: 0.8949


100%|██████████| 609/609 [00:04<00:00, 142.63it/s]


Epoch 43/100, train_loss: 0.1451, train_acc: 0.8972


100%|██████████| 609/609 [00:04<00:00, 143.57it/s]


Epoch 44/100, train_loss: 0.1370, train_acc: 0.9018


100%|██████████| 609/609 [00:04<00:00, 141.53it/s]


Epoch 45/100, train_loss: 0.1337, train_acc: 0.9043


100%|██████████| 609/609 [00:04<00:00, 142.98it/s]


Epoch 46/100, train_loss: 0.1387, train_acc: 0.9031


100%|██████████| 609/609 [00:04<00:00, 140.63it/s]


Epoch 47/100, train_loss: 0.1390, train_acc: 0.9033


100%|██████████| 609/609 [00:04<00:00, 142.95it/s]


Epoch 48/100, train_loss: 0.1283, train_acc: 0.9083


100%|██████████| 609/609 [00:04<00:00, 134.51it/s]


Epoch 49/100, train_loss: 0.1379, train_acc: 0.9045


100%|██████████| 609/609 [00:04<00:00, 136.91it/s]


Epoch 50/100, train_loss: 0.1199, train_acc: 0.9090


100%|██████████| 609/609 [00:04<00:00, 139.60it/s]


Epoch 51/100, train_loss: 0.1234, train_acc: 0.9083


100%|██████████| 609/609 [00:04<00:00, 140.25it/s]


Epoch 52/100, train_loss: 0.1143, train_acc: 0.9119


100%|██████████| 609/609 [00:04<00:00, 144.44it/s]


Epoch 53/100, train_loss: 0.1224, train_acc: 0.9097


100%|██████████| 609/609 [00:04<00:00, 138.41it/s]


Epoch 54/100, train_loss: 0.1011, train_acc: 0.9167


100%|██████████| 609/609 [00:04<00:00, 126.35it/s]


Epoch 55/100, train_loss: 0.1146, train_acc: 0.9146


100%|██████████| 609/609 [00:04<00:00, 135.12it/s]


Epoch 56/100, train_loss: 0.1140, train_acc: 0.9136


100%|██████████| 609/609 [00:04<00:00, 144.88it/s]


Epoch 57/100, train_loss: 0.0975, train_acc: 0.9181


100%|██████████| 609/609 [00:04<00:00, 127.96it/s]


Epoch 58/100, train_loss: 0.0958, train_acc: 0.9182


100%|██████████| 609/609 [00:06<00:00, 93.58it/s] 


Epoch 59/100, train_loss: 0.0954, train_acc: 0.9192


100%|██████████| 609/609 [00:04<00:00, 135.29it/s]


Epoch 60/100, train_loss: 0.0956, train_acc: 0.9181


100%|██████████| 609/609 [00:04<00:00, 127.22it/s]


Epoch 61/100, train_loss: 0.0958, train_acc: 0.9204


100%|██████████| 609/609 [00:04<00:00, 129.21it/s]


Epoch 62/100, train_loss: 0.0876, train_acc: 0.9213


100%|██████████| 609/609 [00:04<00:00, 131.68it/s]


Epoch 63/100, train_loss: 0.1101, train_acc: 0.9185


100%|██████████| 609/609 [00:04<00:00, 128.68it/s]


Epoch 64/100, train_loss: 0.0875, train_acc: 0.9237


100%|██████████| 609/609 [00:04<00:00, 145.35it/s]


Epoch 65/100, train_loss: 0.1113, train_acc: 0.9185


100%|██████████| 609/609 [00:04<00:00, 147.01it/s]


Epoch 66/100, train_loss: 0.0782, train_acc: 0.9263


100%|██████████| 609/609 [00:04<00:00, 141.23it/s]


Epoch 67/100, train_loss: 0.0840, train_acc: 0.9250


100%|██████████| 609/609 [00:04<00:00, 144.33it/s]


Epoch 68/100, train_loss: 0.0787, train_acc: 0.9247


100%|██████████| 609/609 [00:04<00:00, 136.94it/s]


Epoch 69/100, train_loss: 0.0839, train_acc: 0.9265


100%|██████████| 609/609 [00:04<00:00, 132.23it/s]


Epoch 70/100, train_loss: 0.0795, train_acc: 0.9281


100%|██████████| 609/609 [00:04<00:00, 138.09it/s]


Epoch 71/100, train_loss: 0.0692, train_acc: 0.9294


100%|██████████| 609/609 [00:04<00:00, 137.91it/s]


Epoch 72/100, train_loss: 0.0722, train_acc: 0.9282


100%|██████████| 609/609 [00:04<00:00, 138.26it/s]


Epoch 73/100, train_loss: 0.0719, train_acc: 0.9286


100%|██████████| 609/609 [00:04<00:00, 143.40it/s]


Epoch 74/100, train_loss: 0.0684, train_acc: 0.9308


100%|██████████| 609/609 [00:04<00:00, 147.29it/s]


Epoch 75/100, train_loss: 0.0732, train_acc: 0.9296


100%|██████████| 609/609 [00:04<00:00, 125.38it/s]


Epoch 76/100, train_loss: 0.0684, train_acc: 0.9316


100%|██████████| 609/609 [00:04<00:00, 135.24it/s]


Epoch 77/100, train_loss: 0.0619, train_acc: 0.9327


100%|██████████| 609/609 [00:04<00:00, 141.16it/s]


Epoch 78/100, train_loss: 0.0968, train_acc: 0.9273


100%|██████████| 609/609 [00:04<00:00, 134.13it/s]


Epoch 79/100, train_loss: 0.0564, train_acc: 0.9338


100%|██████████| 609/609 [00:04<00:00, 141.00it/s]


Epoch 80/100, train_loss: 0.0640, train_acc: 0.9331


100%|██████████| 609/609 [00:04<00:00, 136.18it/s]


Epoch 81/100, train_loss: 0.0478, train_acc: 0.9369


100%|██████████| 609/609 [00:04<00:00, 134.58it/s]


Epoch 82/100, train_loss: 0.0954, train_acc: 0.9267


100%|██████████| 609/609 [00:04<00:00, 135.85it/s]


Epoch 83/100, train_loss: 0.0458, train_acc: 0.9385


100%|██████████| 609/609 [00:04<00:00, 132.85it/s]


Epoch 84/100, train_loss: 0.0467, train_acc: 0.9375


100%|██████████| 609/609 [00:04<00:00, 137.74it/s]


Epoch 85/100, train_loss: 0.0502, train_acc: 0.9371


100%|██████████| 609/609 [00:05<00:00, 119.25it/s]


Epoch 86/100, train_loss: 0.0364, train_acc: 0.9411


100%|██████████| 609/609 [00:05<00:00, 119.01it/s]


Epoch 87/100, train_loss: 0.0691, train_acc: 0.9340


100%|██████████| 609/609 [00:04<00:00, 145.90it/s]


Epoch 88/100, train_loss: 0.0529, train_acc: 0.9371


100%|██████████| 609/609 [00:04<00:00, 144.52it/s]


Epoch 89/100, train_loss: 0.0423, train_acc: 0.9402


100%|██████████| 609/609 [00:04<00:00, 140.91it/s]


Epoch 90/100, train_loss: 0.0589, train_acc: 0.9370


100%|██████████| 609/609 [00:04<00:00, 143.42it/s]


Epoch 91/100, train_loss: 0.0491, train_acc: 0.9390


100%|██████████| 609/609 [00:04<00:00, 146.15it/s]


Epoch 92/100, train_loss: 0.0509, train_acc: 0.9379


100%|██████████| 609/609 [00:04<00:00, 146.94it/s]


Epoch 93/100, train_loss: 0.0367, train_acc: 0.9422


100%|██████████| 609/609 [00:04<00:00, 142.78it/s]


Epoch 94/100, train_loss: 0.0502, train_acc: 0.9392


100%|██████████| 609/609 [00:04<00:00, 143.47it/s]


Epoch 95/100, train_loss: 0.0415, train_acc: 0.9423


100%|██████████| 609/609 [00:04<00:00, 141.19it/s]


Epoch 96/100, train_loss: 0.0312, train_acc: 0.9432


100%|██████████| 609/609 [00:04<00:00, 144.94it/s]


Epoch 97/100, train_loss: 0.0526, train_acc: 0.9393


100%|██████████| 609/609 [00:04<00:00, 147.42it/s]


Epoch 98/100, train_loss: 0.0828, train_acc: 0.9296


100%|██████████| 609/609 [00:04<00:00, 146.01it/s]


Epoch 99/100, train_loss: 0.0416, train_acc: 0.9406


100%|██████████| 609/609 [00:04<00:00, 144.94it/s]

Epoch 100/100, train_loss: 0.0300, train_acc: 0.9440





In [None]:
import torch
from sklearn.metrics import classification_report

def evaluate(dataloader, model):
    model.eval()
    all_labels, all_predictions = [], []

    with torch.no_grad():
        for idx, (label, text, offset) in enumerate(dataloader):
            predict = model(text, offset)
            predicted_labels = predict.argmax(1)
            all_labels.extend(label.tolist())
            all_predictions.extend(predicted_labels.tolist())

    report = classification_report(all_labels, all_predictions)
    return report
accu_test = evaluate(val_dataloader,model)
print(accu_test)