In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
import os
import logging
logging.basicConfig(
    level=logging.WARN,stream=sys.stdout,format='%(levelname)s:%(message)s')

VOCAB_SIZE = 15000

In [None]:
# 编写LSTM模型代码
class TinyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes=2):
        super(TinyLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # 输入门i_t
        self.W_xi = nn.Parameter(torch.randn(input_size, hidden_size))
        self.W_hi = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.b_i = nn.Parameter(torch.zeros(hidden_size))
        # 遗忘门f_t
        self.W_xf = nn.Parameter(torch.randn(input_size, hidden_size))
        self.W_hf = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.b_f = nn.Parameter(torch.zeros(hidden_size))
        # 输出门o_t
        self.W_xo = nn.Parameter(torch.randn(input_size, hidden_size))
        self.W_ho = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.b_o = nn.Parameter(torch.zeros(hidden_size))
        # 候选单元g_t
        self.W_xg = nn.Parameter(torch.randn(input_size, hidden_size))
        self.W_hg = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.b_g = nn.Parameter(torch.zeros(hidden_size))

        #   初始化参数
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(self, x, h_0=None, c_0=None):
        batch_size, seq_len, _ = x.size()
        if h_0 is None:
            h_0 = torch.zeros(batch_size,self.hidden_size, device=x.device)
        if c_0 is None:
            c_0 = torch.zeros(batch_size, self.hidden_size, device=x.device)
        h_pre, c_pre = h_0, c_0
        h_all = []
        for t in range(seq_len):
            x_t = x[:,t, :] # 获取当前时间步的输入
            i_t = torch.sigmoid(x_t @ self.W_xi + h_pre @ self.W_hi + self.b_i)
            f_t = torch.sigmoid(x_t @ self.W_xf + h_pre @ self.W_hf + self.b_f)
            o_t = torch.sigmoid(x_t @ self.W_xo + h_pre @ self.W_ho + self.b_o)
            g_t = torch.tanh(x_t @ self.W_xg + h_pre @ self.W_hg + self.b_g)
            c_t = f_t * c_pre + i_t * g_t
            h_t = o_t * torch.tanh(c_t)
            h_pre, c_pre = h_t, c_t
            h_all.append(h_t.unsqueeze(1))
        output = torch.cat(h_all, dim=1)  # (batch_size, seq_len, hidden_size)
        return output, h_t, c_t
    
class TinyLSTMClassifier(nn.Module):
    def __init__(self, vocab_size=VOCAB_SIZE, embed_dim=64, hidden_size=128, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = TinyLSTM(embed_dim, hidden_size)  # 你手写的LSTM
        # self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)  # PyTorch内置LSTM
        self.fc = nn.Linear(hidden_size, num_classes)   # 如果用拼接，hidden_size*2

    def forward(self, x):
        # x: (batch, seq_len)
        emb = self.embedding(x)  # (batch, seq_len, embed_dim)
        output, h_pre, c_pre = self.lstm(emb)
        # 常用方式1：只用最后一个隐藏状态
        logits = self.fc(h_pre)
        # 常用方式2：池化
        # pooled = output.mean(dim=1)
        # logits = self.fc(pooled)
        # 结合方式
        # features = torch.cat([h_pre, pooled], dim=1)
        # logits = self.fc(features)
        return logits


In [None]:
# 编写GCNN模型代码
class GCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes):
        super(GCNN, self).__init__()

        self.embedding_table = nn.Embedding(vocab_size, embedding_dim)
        nn.init.xavier_uniform_(self.embedding_table.weight)

        self.conv_A_1 = nn.Conv1d(embedding_dim, 64, 15, stride=7)
        self.conv_B_1 = nn.Conv1d(embedding_dim, 64, 15, stride=7)

        self.conv_A_2 = nn.Conv1d(64, 64, 15, stride=7)
        self.conv_B_2 = nn.Conv1d(64, 64, 15, stride=7)

        self.output_linear1 = nn.Linear(64, 128)
        self.output_linear2 = nn.Linear(128, num_classes)

    def forward(self, word_index):
        # 定义GCN网络的算子操作流程，基于句子单词ID输入得到分类logits输出
        
        # 1.通过word_index得到word_embedding
        # word_index shape:[batch_size, seq_len]
        word_embedding = self.embedding_table(word_index)
    
        # word_embedding shape:[batch_size, seq_len, embedding_dim]
        word_embedding = word_embedding.permute(0, 2, 1)
        # 2.编写第一层1D门卷积模块
        A = self.conv_A_1(word_embedding)
        B = self.conv_B_1(word_embedding)
        H = A * torch.sigmoid(B)

        A = self.conv_A_2(H)
        B = self.conv_B_2(H)
        H = A * torch.sigmoid(B)

        # 3.池化并经过全连接层
        pool_output = torch.mean(H, dim=-1)
        linear1_output = self.output_linear1(pool_output)
        logits = self.output_linear2(linear1_output)
        return logits

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size = VOCAB_SIZE, embed_dim=64,num_class=2):
        super(TextClassificationModel, self).__init__()
        # 
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, token_index):
        embedded = self.embedding(token_index)
        return self.fc(embedded)

In [15]:
import os
import torch
from torch import nn
from d2l import torch as d2l

# build IMDB DataLoader
BATCH_SIZE = 64


#@save
d2l.DATA_HUB['aclImdb'] = (
    'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
    '01ada507287d82875905620988597833ad4e0903')

data_dir = d2l.download_extract('aclImdb', 'aclImdb')

In [16]:
#@save
def read_imdb(data_dir, is_train):
    """读取IMDb评论数据集文本序列和标签"""
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test',
                                   label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels

train_data = read_imdb(data_dir, is_train=True)
print('训练集数目：', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('标签：', y, 'review:', x[0:60])

训练集数目： 25000
标签： 1 review: For a movie that gets no respect there sure are a lot of mem
标签： 1 review: Bizarre horror movie filled with famous faces but stolen by 
标签： 1 review: A solid, if unremarkable film. Matthau, as Einstein, was won


In [None]:
#@save
def load_data_imdb(batch_size, num_steps=500):
    """返回数据迭代器和IMDb评论数据集的词表"""
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    vocab = d2l.Vocab(train_tokens, min_freq=5)
    train_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])),
                                batch_size)
    test_iter = d2l.load_array((test_features, torch.tensor(test_data[1])),
                               batch_size,
                               is_train=False)
    return train_iter, test_iter, vocab

In [None]:
def train(train_data,test_data, model,optimizer,num_epoch,lr,
          log_step_interval, save_step_interval,eval_step_interval,save_path,resume=""):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f'device:{device}')
    start_epoch = 0
    start_step = 0
    if resume:
        checkpoint = torch.load(resume, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        start_step = checkpoint['step']
        logging.info(f"Resuming from epoch {start_epoch}, step {start_step}")
    for epoch_index in range(start_epoch, num_epoch):
        ema_loss = 0
        num_batches = len(train_data)
        for batch_index, (token_index,target ) in enumerate(train_data):
            print(f'epoch_index:{epoch_index} batch_index:{batch_index}')
            token_index = token_index.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            step = num_batches * epoch_index + batch_index + 1
            logits = model(token_index)
            bce_loss = F.binary_cross_entropy(torch.sigmoid(logits), F.one_hot(target, num_classes=2).to(torch.float32))
            ema_loss = ema_loss * 0.9 + bce_loss * 0.1
            bce_loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if step % log_step_interval == 0:
                logging.info(f"Epoch {epoch_index}, Step {step}, Loss: {ema_loss.item():.4f}")
            if epoch_index == num_epoch - 1 or step % save_step_interval == 0:
                checkpoint = {
                    'epoch': epoch_index,
                    'step': step,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()
                }
                torch.save(checkpoint, os.path.join(save_path, f"checkpoint_epoch_{epoch_index}_step_{step}.pt"))
                logging.info(f"Checkpoint saved at epoch {epoch_index}, step {step}")
            if step % eval_step_interval == 0:
                logging.info(f"Evaluating at step {step}...")
                # Evaluate the model on the test data       
                model.eval()
                ema_loss_eval = 0
                total_acc_account = 0
                total_account = 0
                for eval_batch_index,( eval_token_index,eval_target) in enumerate(test_data):
                    total_account += eval_target.shape[0]
                    eval_logits = model(eval_token_index)
                    total_acc_account += (torch.argmax(eval_logits, dim=1) == eval_target).sum().item()
                    eval_bce_loss = F.binary_cross_entropy(torch.sigmoid(eval_logits), F.one_hot(eval_target, num_classes=2).to(torch.float32))
                    ema_loss_eval = ema_loss_eval * 0.9 + eval_bce_loss * 0.1
                print(f"Evaluation Loss: {ema_loss_eval.item():.4f}, Accuracy: {total_acc_account / total_account:.4f}")
                logging.info(f"Evaluation Loss: {ema_loss_eval.item():.4f}, Accuracy: {total_acc_account / total_account:.4f}")
                model.train()

In [None]:

# Load data
train_iter, test_iter, vocab = load_data_imdb(BATCH_SIZE)

In [None]:



# Initialize model
# model = TextClassificationModel(vocab_size=len(vocab), embed_dim=64, num_class=2)
    # Evaluation Loss: 0.7483, Accuracy: 0.5403
    # Evaluation Loss: 0.6055, Accuracy: 0.7680
    # Evaluation Loss: 0.5356, Accuracy: 0.8023
    # Evaluation Loss: 0.3907, Accuracy: 0.8227
    # Evaluation Loss: 0.4177, Accuracy: 0.8483
    # Evaluation Loss: 0.3278, Accuracy: 0.8553
    # Evaluation Loss: 0.3279, Accuracy: 0.8632
    # Evaluation Loss: 0.3185, Accuracy: 0.8691
    # Evaluation Loss: 0.2957, Accuracy: 0.8717
    # Evaluation Loss: 0.3145, Accuracy: 0.8761
    # Evaluation Loss: 0.2690, Accuracy: 0.8767
    # Evaluation Loss: 0.3092, Accuracy: 0.8798
    # Evaluation Loss: 0.2458, Accuracy: 0.8780
    # Evaluation Loss: 0.3051, Accuracy: 0.8811
    # Evaluation Loss: 0.2951, Accuracy: 0.8830
model = TinyLSTMClassifier(vocab_size=len(vocab), embed_dim=64, num_classes=2,hidden_size=128)
# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training parameters
num_epoch = 20
log_step_interval = 100
save_step_interval = 500
eval_step_interval = 500
save_path = "./checkpoints"

if not os.path.exists(save_path):
    os.makedirs(save_path)

# Train the model
train(train_iter, test_iter, model, optimizer, num_epoch, 0.001,
        log_step_interval, save_step_interval, eval_step_interval, save_path)

In [None]:
# 通过保存的模型进行预测
def predict(model, text, vocab, max_length=500):
    model.eval()
    tokens = d2l.tokenize([text], token='word')
    # 将tokens转换为索引
    indices = [vocab[token] for token in tokens[0]]
    # 如果长度不足 max_length，则用 vocab['<pad>'] 填充
    if len(indices) < max_length:
        indices += [vocab['<pad>']] * (max_length - len(indices))
    # 如果长度超过 max_length，则截断
    indices = indices[:max_length]
    indices = torch.tensor(indices).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        logits = model(indices)
        prediction = torch.argmax(logits, dim=1).item()
    return "Positive" if prediction == 1 else "Negative"
if __name__ == "__main__":
    # Example usage
    model_path = "./checkpoints/checkpoint_epoch_19_step_7500.pt"
    model = TextClassificationModel(vocab_size=len(vocab), embed_dim=64, num_class=2)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(torch.load(model_path, map_location=device)['model_state_dict'])
    model.to(device)
    # example_text = "The movie is boring, and I didn't love it"
    example_text = "someone said the movie was boring, but I really enjoyed it"
    prediction = predict(model, example_text, vocab)
    print(f"Prediction for '{example_text}': {prediction}")