In [None]:
# Mount Google Drive at /content/drive; every dataset, embedding and checkpoint
# path used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
import gensim
import jieba
import pandas as pd

In [None]:
# Special vocabulary tokens: filler used to reach a fixed sequence length, and
# the stand-in for words missing from the embedding vocabulary.
PAD, UNK = '<PAD>', '<UNK>'

# Sentiment label string -> integer class id used as the training target.
label_dic = dict(positive=1, negative=2, neutral=0)

def read_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def build_dataset(file_path, mode, wv, padding_size = 32):
    """Tokenise a JSON corpus into fixed-length lists of vocabulary indices.

    Every record's 'content' string is segmented with jieba, truncated or
    right-padded with the PAD token to exactly ``padding_size`` tokens, and
    mapped through ``wv.word2idx`` (unknown words map to the UNK index).

    Args:
        file_path: Path of a JSON file containing a list of records. Each
            record has a 'content' key and, in 'train' mode, a 'label' key
            whose value is a key of ``label_dic``.
        mode: 'train' to also collect labels, 'test' to collect text only.
        wv: Object exposing a ``word2idx`` mapping that contains UNK and PAD.
        padding_size: Number of tokens kept per sample.

    Returns:
        ``(text, labels)`` in 'train' mode, otherwise just ``text``. ``text``
        is a list of index lists of length ``padding_size``; ``labels`` is a
        list of integer class ids.

    Raises:
        AssertionError: If ``mode`` is neither 'train' nor 'test'.
        KeyError: If a record lacks a required key, a label is unknown, or
            UNK is missing from ``wv.word2idx``.
    """
    assert mode in ['train', 'test']
    with_labels = mode == 'train'
    records = read_json(file_path)
    # Loop-invariant lookups are resolved once instead of once per record.
    word2idx = wv.word2idx
    unk_idx = word2idx[UNK]
    text, labels = [], []
    for record in tqdm(records):
        # Truncate first, then pad; padding is a no-op when already full length.
        words = list(jieba.cut(record['content']))[:padding_size]
        words += [PAD] * (padding_size - len(words))
        text.append([word2idx.get(word, unk_idx) for word in words])
        if with_labels:
            labels.append(label_dic[record['label']])
    if with_labels:
        return text, labels
    return text

class Mydataset(Dataset):
    """Fixed-length sentiment dataset backed by ``build_dataset``.

    In 'train' mode each item is an ``(indices, label)`` pair; in any other
    mode each item is the index tensor alone.
    """

    def __init__(self, file_path, mode, wv):
        built = build_dataset(file_path, mode, wv)
        if mode == 'train':
            token_ids, targets = built
            assert len(token_ids) == len(targets)
            self.y = torch.tensor(targets)
        else:
            token_ids = built
        self.mode = mode
        # Every sample has the same length, so the whole split fits one tensor.
        self.x = torch.tensor(token_ids)
        self.len = len(self.x)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        sample = self.x[index]
        if self.mode != 'train':
            return sample
        return sample, self.y[index]

In [None]:
class word2vec():
  """Pretrained word vectors extended with '<UNK>' and '<PAD>' entries.

  Attributes set by the constructor:
    vocab_size: number of pretrained words plus the two special tokens.
    word2idx: mapping word -> row index ('<UNK>' and '<PAD>' take the last two rows).
    vocab: list of words ordered by row index.
    vector_size: embedding dimension reported by the loaded model.
    vectors: torch tensor of the pretrained rows followed by two rows that
      both hold the mean of the pretrained vectors.
  """
  def __init__(self, pretrained_w2v_path = '/content/drive/MyDrive/情感分类实验数据2023/sgns.weibo.bigram-char'):
    keyed = gensim.models.KeyedVectors.load_word2vec_format(
        pretrained_w2v_path, binary = False, encoding = 'utf-8')
    base_size = len(keyed.index_to_key)
    self.vector_size = keyed.vector_size
    # Both containers are the loaded model's own objects and are extended in place.
    self.word2idx = keyed.key_to_index
    self.vocab = keyed.index_to_key
    for offset, token in enumerate(('<UNK>', '<PAD>')):
      self.word2idx[token] = base_size + offset
      self.vocab.append(token)
    self.vocab_size = base_size + 2
    # The special tokens share one embedding: the centroid of the pretrained vectors.
    centroid = keyed.vectors.mean(axis=0).reshape(1, -1)
    self.vectors = torch.tensor(
        np.concatenate((keyed.vectors, centroid, centroid), axis=0))

In [None]:
class LSTM(nn.Module):
  """Bidirectional stacked-LSTM classifier on top of frozen pretrained embeddings.

  Args:
    w2vmodel: Object exposing ``vectors`` (2-D float tensor of embeddings) and
      ``vector_size`` (embedding dimension).
    drop_rate: Dropout applied between stacked LSTM layers.
    hidden_size: Hidden units per direction (default 32, as before).
    num_layers: Number of stacked LSTM layers (default 3, as before).
    num_classes: Size of the output logit vector (default 3, as before).

  With the default arguments the module layout and ``state_dict`` keys/shapes
  are identical to the previous hard-coded version.
  """
  def __init__(self, w2vmodel, drop_rate = 0.3, hidden_size = 32, num_layers = 3, num_classes = 3):
    super(LSTM, self).__init__()
    self.embedding = nn.Embedding.from_pretrained(w2vmodel.vectors, freeze=True)
    embedding_dim = w2vmodel.vector_size
    self.LSTM = nn.LSTM(
        embedding_dim,
        hidden_size,
        num_layers=num_layers,
        bidirectional=True,
        batch_first=True,
        # Inter-layer dropout only exists when there is more than one layer.
        dropout=drop_rate if num_layers > 1 else 0.0,
    )
    # Forward and backward states are concatenated, hence 2 * hidden_size.
    self.Fc = nn.Linear(2 * hidden_size, num_classes)

  def forward(self, x):
    """Map a ``(batch, seq_len)`` tensor of token indices to ``(batch, num_classes)`` logits."""
    output = self.embedding(x)
    output, _ = self.LSTM(output)
    # Classify from the features at the last time step.
    # NOTE(review): with right-padded batches this position is padding — confirm intended.
    output = self.Fc(output[:, -1, :])
    return output

In [None]:
# Load the pretrained embeddings from the default path and build the
# word -> index vocabulary shared by every dataset and model below.
w2v = word2vec()

In [None]:
# Classifier for the fixed-length (padded to 32 tokens) experiment.
model = LSTM(w2v)

In [None]:
# Training split, tokenised and padded/truncated to a fixed length.
train_data = Mydataset('/content/drive/MyDrive/情感分类实验数据2023/public-data/train_data/train_data.json', mode='train', wv=w2v)
# Reshuffle every epoch: iterating the file in its stored order on every epoch
# gives the optimizer the same batch sequence each time.
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

  0%|          | 0/8606 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.259 seconds.
DEBUG:jieba:Loading model cost 1.259 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.
100%|██████████| 8606/8606 [00:03<00:00, 2234.73it/s]


In [None]:
import torch.nn.functional as Fun

def train(model, train_iter, epochs=150, lr=0.001):
    """Fit ``model`` on ``train_iter`` with Adam and a cross-entropy loss.

    For every epoch an ``Epoch [i/epochs]`` header is printed, followed by
    the loss of that epoch's first batch.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class logits.
        train_iter: Iterable of ``(inputs, labels)`` batches; it is iterated
            once per epoch, so it must be re-iterable (e.g. a ``DataLoader``).
        epochs: Number of passes over ``train_iter`` (default 150, as before).
        lr: Adam learning rate (default 0.001, as before).
    """
    # Switch to training mode so dropout is active.
    model.train()
    # Adam optimizer over the model's parameters.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        print("Epoch [{}/{}]".format(epoch+1, epochs))
        for i, (trains, labels) in enumerate(train_iter):
            # Clear gradients left over from the previous step before the new pass.
            optimizer.zero_grad()
            outputs = model(trains)
            loss = Fun.cross_entropy(outputs, labels)
            if i == 0:
                print(loss.item())
            loss.backward()
            optimizer.step()

In [None]:
# Train the fixed-length model on the padded training loader.
train(model, train_loader)

In [None]:
# Persist only the learned weights (state_dict) to Drive.
torch.save(model.state_dict(), '/content/drive/MyDrive/情感分类实验数据2023/LSTM.pt')

In [None]:
# Re-create the architecture and restore the weights from the LSTM.pt checkpoint.
model = LSTM(w2v)
model.load_state_dict(torch.load('/content/drive/MyDrive/情感分类实验数据2023/LSTM.pt'))

In [None]:
# The validation split is labelled, so it is read with mode='train' (the mode
# in which Mydataset also yields labels).
dev_data = Mydataset(
    file_path='/content/drive/MyDrive/情感分类实验数据2023/public-data/eval_data/eval_data.json',
    mode='train',
    wv=w2v,
)
dev_loader = DataLoader(dataset=dev_data, batch_size=16, shuffle=False)

100%|██████████| 2000/2000 [00:00<00:00, 4527.08it/s]


In [None]:
# Unlabelled test split; order is preserved so predictions line up with the file.
test_data = Mydataset(
    file_path='/content/drive/MyDrive/情感分类实验数据2023/public-data/test_data/test.json',
    mode='test',
    wv=w2v,
)
test_loader = DataLoader(dataset=test_data, batch_size=16, shuffle=False)

100%|██████████| 3000/3000 [00:00<00:00, 4289.23it/s]


In [None]:
import torch.nn.functional as Fun

def dev(model, dev_iter):
  """Evaluate ``model`` on ``dev_iter`` and print accuracy and summed loss.

  Args:
    model: ``nn.Module`` mapping a batch of inputs to per-class logits.
    dev_iter: Iterable of ``(inputs, labels)`` batches.

  Prints ``acc`` (correct predictions divided by the number of samples
  actually iterated; 0.0 when there are none) and ``total_loss`` (the sum of
  the per-batch mean cross-entropy losses).
  """
  model.eval()
  loss_total, correct, seen = 0.0, 0, 0
  with torch.no_grad():
    for text, label in tqdm(dev_iter):
      outputs = model(text)
      # Accumulate as a Python float so the reported total covers every batch.
      loss_total += Fun.cross_entropy(outputs, label).item()
      predictions = torch.max(outputs.cpu(), 1)[1]
      correct += (predictions == label.cpu()).sum().item()
      # Count samples from the batches themselves instead of a global dataset.
      seen += label.size(0)
  acc = correct / seen if seen else 0.0
  print(f'\nacc:{acc}, total_loss:{loss_total}')

In [None]:
# Report accuracy and loss of the fixed-length model on the validation loader.
dev(model, dev_loader)

100%|██████████| 125/125 [00:00<00:00, 264.35it/s]


acc:0.7355, total_loss:1.831783413887024





In [None]:
def test(model, test_iter, save_path):
  model.eval()
  predict_all, predict_result = np.array([], dtype=int), []
  with torch.no_grad():
    for trains in tqdm(test_iter):
      outputs = model(trains)
      predict_all = np.append(predict_all, torch.max(outputs.cpu().data, 1)[1].numpy())
  for i, label in enumerate(predict_all):
    predict_result.append([i + 1, label])
  predict = pd.DataFrame(predict_result)
  predict.to_csv(save_path, index = False, header = False)

In [None]:
# Write the fixed-length model's test-set predictions to a CSV on Drive.
test(model, test_loader, '/content/drive/MyDrive/情感分类实验数据2023/predict_LSTM.csv')

100%|██████████| 188/188 [00:00<00:00, 262.03it/s]


In [None]:
def build_dataset_no_padding(file_path, mode, wv):
    """Tokenise a JSON corpus into variable-length index tensors (no padding).

    Every record's 'content' string is segmented with jieba and mapped through
    ``wv.word2idx`` (unknown words map to the UNK index). Sequences keep their
    natural length; padding is left to the DataLoader's collate function.

    Args:
        file_path: Path of a JSON file containing a list of records. Each
            record has a 'content' key and, in 'train' mode, a 'label' key
            whose value is a key of ``label_dic``.
        mode: 'train' to also collect labels, 'test' to collect text only.
        wv: Object exposing a ``word2idx`` mapping that contains UNK.

    Returns:
        ``(text, labels)`` in 'train' mode, otherwise just ``text``. ``text``
        is a list of 1-D ``torch.long`` tensors; ``labels`` is a list of
        integer class ids.

    Raises:
        AssertionError: If ``mode`` is neither 'train' nor 'test'.
        KeyError: If a record lacks a required key, a label is unknown, or
            UNK is missing from ``wv.word2idx``.
    """
    assert mode in ['train', 'test']
    with_labels = mode == 'train'
    records = read_json(file_path)
    # Loop-invariant lookups are resolved once instead of once per record.
    word2idx = wv.word2idx
    unk_idx = word2idx[UNK]
    text, labels = [], []
    for record in tqdm(records):
        ids = [word2idx.get(word, unk_idx) for word in jieba.cut(record['content'])]
        # Explicit dtype: an empty sentence would otherwise become a float32
        # tensor, which cannot be used as embedding indices.
        text.append(torch.tensor(ids, dtype=torch.long))
        if with_labels:
            labels.append(label_dic[record['label']])
    if with_labels:
        return text, labels
    return text

class Mydataset_no_padding(Dataset):
    """Variable-length sentiment dataset backed by ``build_dataset_no_padding``.

    Items are 1-D index tensors of differing lengths, so a DataLoader over
    this dataset needs a collate function that pads each batch. In 'train'
    mode each item is an ``(indices, label)`` pair; in any other mode each
    item is the index tensor alone.
    """
    def __init__(self, file_path, mode, wv):
        if mode == 'train':
            self.x, self.y = build_dataset_no_padding(file_path, mode, wv)
            assert len(self.x) == len(self.y)
            self.y = torch.tensor(self.y)
        else:
            # Use the un-padded builder here as well: the padded builder
            # returns plain lists of ints, which pad_sequence cannot batch and
            # which would not match the preprocessing of the 'train' branch.
            self.x = build_dataset_no_padding(file_path, mode, wv)
        self.mode = mode
        self.len = len(self.x)
    def __len__(self):
        return self.len
    def __getitem__(self, index):
        if self.mode == 'train':
            return self.x[index], self.y[index]
        else:
            return self.x[index]

In [None]:
def collate_fn(train_data, padding_value=0):
  """Batch ``(indices, label)`` pairs, right-padding the index tensors.

  Args:
    train_data: Sequence of ``(1-D index tensor, label)`` pairs.
    padding_value: Index used to fill shorter sequences (default 0, as before).
      NOTE(review): 0 is also the index of a real vocabulary entry; to pad
      with the dedicated '<PAD>' row, bind its index, e.g.
      ``functools.partial(collate_fn, padding_value=pad_index)``.

  Returns:
    ``(txt, labels)``: ``txt`` has shape ``(batch, longest_len)`` and
    ``labels`` is a 1-D tensor with one entry per sample.
  """
  txt, label = zip(*train_data)
  txt = nn.utils.rnn.pad_sequence(txt, batch_first=True, padding_value=padding_value)
  return txt, torch.tensor(label)

In [None]:
# Training split with variable-length sequences; collate_fn pads each batch
# to the length of its longest sample.
no_padding_train_data = Mydataset_no_padding('/content/drive/MyDrive/情感分类实验数据2023/public-data/train_data/train_data.json', 'train', w2v)
# Reshuffle every epoch: iterating the file in its stored order on every epoch
# gives the optimizer the same batch sequence each time.
no_padding_train_loader = DataLoader(no_padding_train_data, batch_size=16, shuffle=True, collate_fn=collate_fn)

100%|██████████| 8606/8606 [00:02<00:00, 3037.41it/s]


In [None]:
# Fresh, untrained model for the variable-length (per-batch padded) experiment.
no_padding_model = LSTM(w2v)

In [None]:
# Train on batches that collate_fn pads to each batch's longest sequence.
train(no_padding_model, no_padding_train_loader)

Epoch [1/150]
1.1696312427520752
Epoch [2/150]
0.7966829538345337
Epoch [3/150]
0.5051431059837341
Epoch [4/150]
0.5211045742034912
Epoch [5/150]
0.6621553301811218
Epoch [6/150]
0.4631933569908142
Epoch [7/150]
0.3347940146923065
Epoch [8/150]
0.27229049801826477
Epoch [9/150]
0.33727675676345825
Epoch [10/150]
0.1788896918296814
Epoch [11/150]
0.1550033837556839
Epoch [12/150]
0.13392925262451172
Epoch [13/150]
0.11539328098297119
Epoch [14/150]
0.07143248617649078
Epoch [15/150]
0.07877279818058014
Epoch [16/150]
0.09784678369760513
Epoch [17/150]
0.04177502170205116
Epoch [18/150]
0.02188311330974102
Epoch [19/150]
0.023423045873641968
Epoch [20/150]
0.01709747686982155
Epoch [21/150]
0.018516259267926216
Epoch [22/150]
0.057692863047122955
Epoch [23/150]
0.04360463097691536
Epoch [24/150]
0.1422058492898941
Epoch [25/150]
0.011003856547176838
Epoch [26/150]
0.01336454413831234
Epoch [27/150]
0.01953195407986641
Epoch [28/150]
0.017946526408195496
Epoch [29/150]
0.00923092197626829

In [None]:
# Persist only the learned weights (state_dict) of the variable-length model.
torch.save(no_padding_model.state_dict(), '/content/drive/MyDrive/情感分类实验数据2023/NO_PADDING_LSTM.pt')

In [None]:
# Validation split preprocessed the same way as the data no_padding_model was
# trained on (variable-length sequences padded per batch by collate_fn). The
# fixed-length Mydataset would instead feed 32-token, '<PAD>'-filled inputs.
# mode='train' is used because the split is labelled.
no_padding_dev_data = Mydataset_no_padding('/content/drive/MyDrive/情感分类实验数据2023/public-data/eval_data/eval_data.json', 'train', w2v)
no_padding_dev_loader = DataLoader(no_padding_dev_data, batch_size=16, shuffle=False, collate_fn=collate_fn)

100%|██████████| 2000/2000 [00:00<00:00, 5001.49it/s]


In [None]:
# Report accuracy and loss of the variable-length model on its validation loader.
dev(no_padding_model, no_padding_dev_loader)

100%|██████████| 125/125 [00:00<00:00, 231.26it/s]


acc:0.703, total_loss:1.45534086227417





In [None]:
def collate_fn2(train_data, padding_value=0):
  """Batch unlabelled index tensors, right-padding them to a common length.

  Args:
    train_data: Sequence of 1-D index tensors.
    padding_value: Index used to fill shorter sequences (default 0, as before).
      NOTE(review): 0 is also the index of a real vocabulary entry; to pad
      with the dedicated '<PAD>' row, bind its index, e.g.
      ``functools.partial(collate_fn2, padding_value=pad_index)``.

  Returns:
    Tensor of shape ``(batch, longest_len)``.
  """
  txt = nn.utils.rnn.pad_sequence(train_data, batch_first=True, padding_value=padding_value)
  return txt

In [None]:
# Unlabelled test split for the variable-length model.
# NOTE(review): this builds the fixed-length Mydataset (32 tokens, '<PAD>'-filled),
# while no_padding_model was trained on Mydataset_no_padding batches — the
# preprocessing differs between training and prediction; confirm this is intended.
no_padding_test_data = Mydataset('/content/drive/MyDrive/情感分类实验数据2023/public-data/test_data/test.json', 'test', w2v)
no_padding_test_loader = DataLoader(no_padding_test_data, batch_size=16, shuffle=False, collate_fn=collate_fn2)

100%|██████████| 3000/3000 [00:00<00:00, 5053.98it/s]


In [None]:
# Write the variable-length model's test-set predictions to a CSV on Drive.
test(no_padding_model, no_padding_test_loader, '/content/drive/MyDrive/情感分类实验数据2023/predict_np_LSTM.csv')

100%|██████████| 188/188 [00:01<00:00, 155.62it/s]


In [None]:
#18:35 start