### 数据处理

In [125]:
import pandas as pd
df = pd.read_csv('amazon.csv')
df = df[:300]


In [165]:
df['Words'] = df['review_body'].apply(lambda x: x.split(" "))
df['Words'] = df['Words'].str.lower()
df['Words'] = df['Words'].astype(str)
df['Count'] = df['review_body'].apply(lambda x: len(x.split(" ")))
rows_to_delete = df[df['Count'] < 10].index
df.drop(rows_to_delete, inplace=True)

max_seq_len = df.Count.max()
max_seq_len 

553

In [166]:
class dictionary():
    def __init__(self):
        self.word2idx = {"<PAD>" : 0}
        self.idx2word = ["<PAD>"]
    
    def add(self,word):
        word_lower = word.lower()
        if word not in self.word2idx:
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            
    def __len__(self):
        return len(self.word2idx)

In [167]:
dic = dictionary()

for idx,row in df.iterrows():
    for word in row.Words:
        dic.add(word)

### 数据集的构建

In [168]:
from  torch.utils.data import  Dataset, DataLoader
class sentimentdataset(Dataset):
    def __init__(self,df,max_seq_len):
            self.data = df.Words.tolist()
            self.label = df.star_rating.tolist()
            self.max_seq_len= max_seq_len
    def __getitem__(self,idx):
        tokens = []
        for word in self.data[idx]:
            tokens.append(dic.word2idx[word])
            
        for  i  in  range(self.max_seq_len-len(tokens)):
            tokens.append(dic.word2idx['<PAD>'])
            
        return torch.tensor(tokens).long(),torch.tensor(self.label[idx]).long()
    def __len__(self):
        return len(self.data)

In [169]:
batch_sizes = 16

dataset = sentimentdataset(df,max_seq_len)
dataloader = DataLoader(dataset,batch_sizes,shuffle = True)

### LSTM模型构建


In [170]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LanguageModel(nn.Module):
    def __init__(self, max_seq_len, num_layers, hidden_dim, embedding_dim, vocab_sizes, dropout_rate=0.5):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        
        self.embedding = nn.Embedding(vocab_sizes, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=num_layers,
                            dropout=dropout_rate,
                            batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(hidden_dim * max_seq_len, 5)  # Output dimension is 5 for multi-class classification
    
    def forward(self, x, hidden, cell):
        embedding = self.embedding(x)
        output, (h, c) = self.lstm(embedding, (hidden, cell))
        y = self.dropout(output)
        y = self.flatten(y)
        y = torch.softmax(y, dim=1)  # Use softmax instead of sigmoid for multi-class classification
        return y
    
    def init_hidden(self, batch_sizes):
        hidden = torch.zeros(self.num_layers, batch_sizes, self.hidden_dim)
        cell = torch.zeros(self.num_layers, batch_sizes, self.hidden_dim)
        return hidden, cell


### 参数设定

In [171]:
import torch.optim as optim

num_layers = 2
hidden_dim = 512
embedding_dim = 256
vocab_sizes = len(dic)

max_seq_len = 2362
batch_sizes = 16

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Languagemodel(max_seq_len,num_layers,hidden_dim,embedding_dim,vocab_sizes).to(device)
optimizer = optim.Adam(model.parameters(),lr=1e-3)



### 模型训练

In [172]:
def train_model():
    model.train()
    total_loss = 0
    for idx, (x, y) in enumerate(dataloader):
        x = x.to(device)
        y = y.to(device)
        hidden, cell = model.init_hidden(batch_sizes)
        hidden = hidden.to(device)
        cell = cell.to(device)
        optimizer.zero_grad()
        
        y_pred = model(x, hidden, cell)
        
        loss = F.cross_entropy(y_pred, y)  
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    print("Average loss:", average_loss)
    
    model.eval()


In [173]:
train_model()

RuntimeError: Expected hidden[0] size (2, 5, 512), got [2, 16, 512]

In [174]:

def test_model(model, dataloader):
    model.eval()

    total_loss = 0.
    for idx, (x, y) in enumerate(dataloader):
        x = x.to(device)
        y = y.to(device)
        hidden, cell = model.init_hidden(batch_sizes)
        hidden = hidden.to(device)
        cell = cell.to(device)
        y_pred = model(x)
        loss = F.cross_entropy(y_pred, y)
        total_loss += loss.item()
    print(f"Test loss: {total_loss/len(test_dataset)}")