### Data processing


In [1]:
import pandas as pd
df = pd.read_csv('amazon.csv')
df = df[:300]


In [2]:
df['Words'] = df['review_body'].apply(lambda x: x.split(" "))
df['Words'] = df['Words'].apply(lambda x: [word.lower() for word in x])
df['Count'] = df['review_body'].apply(lambda x: len(x.split(" ")))
rows_to_delete = df[df['Count'] < 10].index
df.drop(rows_to_delete, inplace=True)

max_seq_len = df.Count.max()
max_seq_len 


1      [lots, of, ads<br, />slow, processing, speed<b...
2      [excellent, unit., , the, versatility, of, thi...
3      [i, bought, this, on, amazon, prime, so, i, en...
6      [this, kindle, works, well, but, the, battery,...
7      [i, really, enjoy, my, new, kindle,, it, is, e...
                             ...                        
292    [replaced, my, first, gen, kindle., it, can, a...
293    [i, had, expected, this, to, have, 4g,, but, i...
294    [it, was, everything, i, wanted, and, more., a...
295    [i, like, having, a, kindle, to, read, and, ge...
296    [i, am, impressed,, this, our, fourth, kindle,...
Name: Words, Length: 197, dtype: object


In [3]:
class dictionary():
    def __init__(self):
        self.word2idx = {"<PAD>" : 0}
        self.idx2word = ["<PAD>"]
    
    def add(self,word):
        word_lower = word.lower()
        if word not in self.word2idx:
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            
    def __len__(self):
        return len(self.word2idx)

In [4]:
dic = dictionary()

for idx,row in df.iterrows():
    for word in row.Words:
        dic.add(word)

### dataset building

In [5]:
from  torch.utils.data import  Dataset, DataLoader
import  torch

class sentimentdataset(Dataset):
    def __init__(self,df,max_seq_len):
            self.data = df.Words.tolist()
            self.label = df.star_rating.tolist()
            self.max_seq_len= max_seq_len
    def __getitem__(self,idx):
        tokens = []
        for word in self.data[idx]:
            tokens.append(dic.word2idx[word])
            
        for  i  in  range(self.max_seq_len-len(tokens)):
            tokens.append(dic.word2idx['<PAD>'])
            
        return torch.tensor(tokens).long(),torch.tensor(self.label[idx]).long()
    def __len__(self):
        return len(self.data)

In [6]:
batch_sizes = 16

train_dataset = sentimentdataset(df,max_seq_len)
train_dataloader = DataLoader(train_dataset,batch_sizes,shuffle = True,drop_last = True)

In [8]:
test_df = df.sample(frac=0.2)
test_dataset = sentimentdataset(test_df,max_seq_len)
test_dataloader = DataLoader(test_dataset,batch_sizes,shuffle = True,drop_last = True)

### LSTM model  building


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LanguageModel(nn.Module):
    def __init__(self, max_seq_len, num_layers, hidden_dim, embedding_dim, vocab_sizes, dropout_rate=0.5):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        
        self.embedding = nn.Embedding(vocab_sizes, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=num_layers,
                            dropout=dropout_rate,
                            batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(hidden_dim * max_seq_len, 5)  
    
    def forward(self, x, hidden, cell):
        
        embedding = self.embedding(x)
        
        output, (h, c) = self.lstm(embedding, (hidden, cell))
        y = self.dropout(output)
        y = self.flatten(y)
        y = torch.softmax(y, dim=1)  
        return y
    
    def init_hidden(self, batch_sizes):
        hidden = torch.zeros(self.num_layers, batch_sizes, self.hidden_dim)
        cell = torch.zeros(self.num_layers, batch_sizes, self.hidden_dim)
        return hidden, cell


### parameter determination

In [10]:
import torch.optim as optim

num_layers = 2
hidden_dim = 512
embedding_dim = 256
vocab_sizes = len(dic)

max_seq_len = 2362
batch_sizes = 16

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LanguageModel(max_seq_len,num_layers,hidden_dim,embedding_dim,vocab_sizes).to(device)
optimizer = optim.Adam(model.parameters(),lr=1e-3)



### model training

In [11]:
def train_model(dataloader):
    model.train()
    total_loss = 0
    for idx, (x, y) in enumerate(dataloader):
        x = x.to(device)
        y = y.to(device)
        hidden, cell = model.init_hidden(batch_sizes)
        hidden = hidden.to(device)
        cell = cell.to(device)
        optimizer.zero_grad()
        
        y_pred = model(x, hidden, cell)
        
        loss = F.cross_entropy(y_pred, y)  
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    TrainLoss = total_loss / len(train_dataloader)
    print("Average loss:", TrainLoss)
    
    model.eval()
    return TrainLoss

### model testing

In [12]:

def test_model(dataloader):
    model.eval()

    total_loss = 0.
    for idx, (x, y) in enumerate(dataloader):
        x = x.to(device)
        y = y.to(device)
        hidden, cell = model.init_hidden(batch_sizes)
        hidden = hidden.to(device)
        cell = cell.to(device)
        y_pred = model(x, hidden, cell)
        loss = F.cross_entropy(y_pred, y)
        total_loss += loss.item()
        
    TestLoss = total_loss/len(test_dataloader)
    print(f"Test loss: {TestLoss}")
    return TestLoss

### model evaluation

In [None]:
import matplotlib.pyplot as plt

num_epoch = 5
train_losses = []
test_losses = []

for i in range(num_epoch):
    print(f"==== Epoch {i+1} ====")
    train_loss = train_model(train_dataloader)
    test_loss = test_model(test_dataloader)
    train_losses.append(train_loss)
    test_losses.append(test_loss)



plt.plot(range(1, num_epoch + 1), train_losses, label='Training Loss')
plt.plot(range(1, num_epoch + 1), test_losses, label='Test Loss')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Test Loss over Epochs')
plt.legend()

plt.show

Duplicate key in file WindowsPath('D:/anaconda/lib/site-packages/matplotlib/mpl-data/matplotlibrc'), line 758 ('font.family :sans-serif')
Duplicate key in file WindowsPath('D:/anaconda/lib/site-packages/matplotlib/mpl-data/matplotlibrc'), line 759 ('font.sans-serif :SimHei')
Duplicate key in file WindowsPath('D:/anaconda/lib/site-packages/matplotlib/mpl-data/matplotlibrc'), line 760 ('axes.unicode_minus :False')


==== Epoch 1 ====
Average loss: 12.553682724634806
Test loss: 12.553681373596191
==== Epoch 2 ====
Average loss: 12.553683121999105
Test loss: 12.553681373596191
==== Epoch 3 ====
Average loss: 12.553683916727701
Test loss: 12.55368185043335
==== Epoch 4 ====
Average loss: 12.553683519363403
Test loss: 12.553682804107666
==== Epoch 5 ====
Average loss: 12.55367922782898
Test loss: 12.553680896759033
