In [None]:
!pip install jieba

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive; every data, embedding and checkpoint
# path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
import gensim
import jieba
import pandas as pd

In [None]:
# Special tokens: PAD fills sequences shorter than the padding size,
# UNK is the fallback for words missing from the embedding vocabulary.
PAD = '<PAD>'
UNK = '<UNK>'

# Maps the string labels stored in the JSON data to integer class ids.
label_dic = {'positive': 1, 'negative': 2, 'neutral': 0}

def read_json(file_path):
    """Decode the UTF-8 JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by parsing the file's contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def build_dataset(file_path, mode, wv, padding_size = 32):
    """Tokenize a JSON corpus and turn every sample into fixed-length vocabulary indices.

    Args:
        file_path: Path to a JSON file that ``read_json`` decodes into an
            iterable of dicts. Every dict needs a 'content' string; in
            'train' mode it also needs a 'label' that is a key of ``label_dic``.
        mode: 'train' to return labels alongside the text, 'test' for text only.
        wv: Object exposing ``word2idx``, a dict from token to index. It must
            contain ``UNK``; any token missing from it (including ``PAD`` if
            absent) is mapped to the ``UNK`` index.
        padding_size: Number of tokens kept per sample. Longer samples are
            truncated, shorter ones are right-padded with ``PAD``.

    Returns:
        ``(text, labels)`` in 'train' mode and ``text`` in 'test' mode, where
        ``text`` is a list of index lists of length ``padding_size`` and
        ``labels`` is a list of integer class ids.

    Raises:
        AssertionError: If ``mode`` is neither 'train' nor 'test'.
        KeyError: If ``UNK`` is missing from ``wv.word2idx`` or a label is
            not a key of ``label_dic``.
    """
    assert mode in ['train', 'test']
    with_labels = mode == 'train'

    # Loop-invariant lookups are resolved once instead of once per sample.
    word2idx = wv.word2idx
    unk_idx = word2idx[UNK]

    text, labels = [], []
    for dic in tqdm(read_json(file_path)):
        if with_labels:
            labels.append(label_dic[dic['label']])
        # Truncate first; the padding count is then zero for long samples.
        words = list(jieba.cut(dic['content']))[:padding_size]
        words += [PAD] * (padding_size - len(words))
        text.append([word2idx.get(word, unk_idx) for word in words])

    if with_labels:
        return text, labels
    return text

class Mydataset(Dataset):
    """Dataset wrapping the index sequences returned by ``build_dataset``.

    Items are ``(x, y)`` pairs when the dataset was created in 'train' mode
    and a bare ``x`` tensor otherwise.
    """

    def __init__(self, file_path, mode, wv):
        """Load ``file_path`` through ``build_dataset`` and store the result as tensors.

        Args:
            file_path: JSON file forwarded to ``build_dataset``.
            mode: 'train' (samples with labels) or 'test' (samples only).
            wv: Vocabulary object forwarded to ``build_dataset``.
        """
        built = build_dataset(file_path, mode, wv)
        if mode == 'train':
            self.x, self.y = built
            # Every sample must come with exactly one label.
            assert len(self.x) == len(self.y)
            self.y = torch.tensor(self.y)
        else:
            self.x = built
        self.mode = mode
        self.x = torch.tensor(self.x)
        self.len = len(self.x)

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return ``(x, y)`` in 'train' mode, otherwise just ``x``."""
        sample = self.x[index]
        if self.mode != 'train':
            return sample
        return sample, self.y[index]

In [None]:
class word2vec():
  """Pretrained word vectors extended with '<UNK>' and '<PAD>' entries.

  After construction the instance exposes:
    vocab_size  -- number of tokens, including the two special tokens
    word2idx    -- dict mapping token -> row index in ``vectors``
    vocab       -- list mapping row index -> token
    vector_size -- dimensionality of a single vector
    vectors     -- torch tensor with one row per token
  """
  def __init__(self, pretrained_w2v_path = '/content/drive/MyDrive/情感分类实验数据2023/sgns.weibo.bigram-char'):
    """Load text-format word2vec vectors and append the special tokens.

    Args:
      pretrained_w2v_path: Path to a UTF-8, non-binary word2vec file.
    """
    w2vmodel = gensim.models.KeyedVectors.load_word2vec_format(pretrained_w2v_path, binary = False, encoding = 'utf-8')
    base_size = len(w2vmodel.index_to_key)
    self.vector_size = w2vmodel.vector_size

    # The loader's own containers are extended in place, so '<UNK>' and
    # '<PAD>' take the two indices right after the pretrained vocabulary.
    self.word2idx = w2vmodel.key_to_index
    self.word2idx.update({'<UNK>': base_size, '<PAD>': base_size + 1})
    self.vocab = w2vmodel.index_to_key
    self.vocab.extend(['<UNK>', '<PAD>'])
    self.vocab_size = base_size + 2

    # Both special tokens use the mean pretrained vector. A single
    # concatenate avoids copying the full matrix twice and computing the
    # mean twice, as the previous chained np.append calls did.
    mean_row = w2vmodel.vectors.mean(axis=0).reshape(1, -1)
    self.vectors = torch.tensor(np.concatenate((w2vmodel.vectors, mean_row, mean_row), axis=0))

In [None]:
class CNN(nn.Module):
  """Convolutional text classifier over frozen pretrained embeddings.

  Four parallel convolutions with window heights 1-4 span the full embedding
  width; each is max-pooled over the sequence, the pooled features are
  concatenated, passed through dropout and projected to 3 class logits.
  """
  def __init__(self, w2vmodel, drop_rate = 0.3, conv_channel = 4, train_path = '/content/drive/MyDrive/情感分类实验数据2023/public-data/train_data/train_data.json',
               eval_path = '/content/drive/MyDrive/情感分类实验数据2023/public-data/eval_data/eval_data.json',
               test_path = '/content/drive/MyDrive/情感分类实验数据2023/public-data/test_data/test.json'):
    """Build the layers.

    Args:
      w2vmodel: Object with ``vectors`` (float tensor, one row per token) and
        ``vector_size`` (embedding width).
      drop_rate: Probability handed to ``nn.Dropout``.
      conv_channel: Output channels of each of the four convolutions.
      train_path, eval_path, test_path: Not used by the model; kept so that
        existing callers passing them continue to work.
    """
    super(CNN, self).__init__()
    self.embedding = nn.Embedding.from_pretrained(w2vmodel.vectors, freeze=True)
    embedding_dim = w2vmodel.vector_size
    # Layer attribute names are part of the saved state_dict keys; keep them.
    self.Conv1 = nn.Conv2d(1, conv_channel, (1, embedding_dim))
    self.Conv2 = nn.Conv2d(1, conv_channel, (2, embedding_dim))
    self.Conv3 = nn.Conv2d(1, conv_channel, (3, embedding_dim))
    self.Conv4 = nn.Conv2d(1, conv_channel, (4, embedding_dim))
    self.Dropout = nn.Dropout(drop_rate)
    self.Fc = nn.Linear(conv_channel * 4, 3)

  def forward(self, input):
    """Return class logits of shape (batch, 3).

    Args:
      input: Integer index tensor of shape (batch, seq_len). ``seq_len`` must
        be at least 4 so the widest convolution window fits.
    """
    # (batch, seq_len, dim) -> (batch, 1, seq_len, dim): one input channel.
    embeds = self.embedding(input).unsqueeze(1)

    # Each convolution yields (batch, channels, positions, 1); max over the
    # positions axis, then drop the trailing singleton width axis.
    pooled = [
        torch.max(conv(embeds), dim = 2)[0].squeeze(-1)
        for conv in (self.Conv1, self.Conv2, self.Conv3, self.Conv4)
    ]
    conv_out = torch.cat(pooled, dim=1)

    # Fix: feed the dropped-out features to the classifier. Previously the
    # dropout result was discarded and Fc received conv_out directly, so
    # dropout never had any effect during training.
    output = self.Dropout(conv_out)
    return self.Fc(output)


In [None]:
# Load the pretrained embeddings from the default Drive path.
w2v = word2vec()

In [None]:
# Instantiate the classifier on top of the loaded embeddings.
model = CNN(w2v)

In [None]:
# Number of rows in the embedding matrix: the pretrained vocabulary plus the
# appended '<UNK>' and '<PAD>' rows.
len(w2v.vectors)

195199

In [None]:
# Tokenize the labelled training file and batch it for training.
# NOTE(review): shuffle=False feeds batches in file order every epoch;
# training loaders usually shuffle -- confirm this is intentional.
train_data = Mydataset('/content/drive/MyDrive/情感分类实验数据2023/public-data/train_data/train_data.json', mode='train', wv=w2v)
train_loader = DataLoader(train_data, batch_size=16, shuffle=False)

100%|██████████| 8606/8606 [00:01<00:00, 4912.66it/s]


In [None]:
import torch.nn.functional as Fun

def train(model, train_iter, epochs=150, lr=0.001):
    """Train ``model`` in place with Adam and cross-entropy loss.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to class logits.
        train_iter: Re-iterable yielding ``(inputs, labels)`` batches; it is
            traversed once per epoch.
        epochs: Number of passes over ``train_iter`` (default 150).
        lr: Adam learning rate (default 0.001).

    Prints the epoch header and the loss of the first batch of every epoch.
    """
    # Enable dropout (training mode).
    model.train()
    # Set up the Adam optimizer.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        print("Epoch [{}/{}]".format(epoch+1, epochs))
        for i, (trains, labels) in enumerate(train_iter):
            # Clear gradients left over from the previous step before the
            # new forward/backward pass.
            optimizer.zero_grad()
            outputs = model(trains)
            loss = Fun.cross_entropy(outputs, labels)
            if i == 0:
                print(loss.item())
            loss.backward()
            optimizer.step()

In [None]:
# Run the training loop on the training loader; the loss of the first batch
# of each epoch is printed below.
train(model, train_loader)

Epoch [1/150]
0.01083025336265564
Epoch [2/150]
0.008436683565378189
Epoch [3/150]
0.01412936206907034
Epoch [4/150]
0.011085019446909428
Epoch [5/150]
0.007467744406312704
Epoch [6/150]
0.009604339487850666
Epoch [7/150]
0.010187134146690369
Epoch [8/150]
0.010816633701324463
Epoch [9/150]
0.01376034040004015
Epoch [10/150]
0.008771408349275589
Epoch [11/150]
0.009532655589282513
Epoch [12/150]
0.010728029534220695
Epoch [13/150]
0.01007876731455326
Epoch [14/150]
0.010083887726068497
Epoch [15/150]
0.014406884089112282
Epoch [16/150]
0.01081971637904644
Epoch [17/150]
0.010185056366026402
Epoch [18/150]
0.009118973277509212
Epoch [19/150]
0.009724986739456654
Epoch [20/150]
0.009506264701485634
Epoch [21/150]
0.012435771524906158
Epoch [22/150]
0.009844368323683739
Epoch [23/150]
0.010964572429656982
Epoch [24/150]
0.010279026813805103
Epoch [25/150]
0.00913620088249445
Epoch [26/150]
0.009834842756390572
Epoch [27/150]
0.009561119601130486
Epoch [28/150]
0.009828882291913033
Epoch [

In [None]:
# Persist the trained weights (state_dict only) to Drive.
torch.save(model.state_dict(), '/content/drive/MyDrive/情感分类实验数据2023/CNN.pt')

In [None]:
# Rebuild a fresh model and restore the weights saved above.
model = CNN(w2v)
model.load_state_dict(torch.load('/content/drive/MyDrive/情感分类实验数据2023/CNN.pt'))

<All keys matched successfully>

In [None]:
# Tokenize the unlabelled test file; order is preserved (shuffle=False) so
# predictions line up with the input rows.
test_data = Mydataset('/content/drive/MyDrive/情感分类实验数据2023/public-data/test_data/test.json', mode='test', wv=w2v)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

100%|██████████| 3000/3000 [00:00<00:00, 3624.13it/s]


In [None]:
# The eval file is loaded with mode='train' because that is the mode in which
# Mydataset yields (text, label) pairs, which the evaluation loop needs.
dev_data = Mydataset('/content/drive/MyDrive/情感分类实验数据2023/public-data/eval_data/eval_data.json', mode='train', wv=w2v)
dev_loader = DataLoader(dev_data, batch_size=16, shuffle=False)

100%|██████████| 2000/2000 [00:00<00:00, 4631.37it/s]


In [None]:
import torch.nn.functional as Fun

def dev(model, dev_iter):
  """Evaluate ``model`` on labelled batches and print accuracy and total loss.

  Args:
    model: ``nn.Module`` mapping a batch of inputs to class logits.
    dev_iter: Iterable yielding ``(text, label)`` batches.

  Prints ``acc`` (correct predictions divided by the number of samples
  actually iterated) and ``total_loss`` (sum of the per-batch mean
  cross-entropy losses). An empty iterator reports 0.0 for both.
  """
  model.eval()
  loss_total, correct, seen = 0.0, 0, 0
  with torch.no_grad():
    for text, label in tqdm(dev_iter):
      outputs = model(text)
      loss_total += Fun.cross_entropy(outputs, label).item()
      predicted = outputs.argmax(dim=1)
      correct += (predicted == label).sum().item()
      # Count samples from the batches themselves instead of relying on a
      # global dataset variable.
      seen += label.size(0)
  accuracy = correct / seen if seen else 0.0
  # Fix: report the accumulated loss; previously only the last batch's loss
  # was printed under the 'total_loss' label.
  print(f'\nacc:{accuracy}, total_loss:{loss_total}')

In [None]:
# Evaluate the restored model on the labelled eval set.
dev(model, dev_loader)

100%|██████████| 125/125 [00:00<00:00, 314.73it/s]


acc:0.7345, total_loss:1.3434284925460815





In [None]:
def test(model, test_iter, output_path = '/content/drive/MyDrive/情感分类实验数据2023/predict.csv'):
  """Predict a class for every test sample and write the results to a CSV file.

  Args:
    model: ``nn.Module`` mapping a batch of inputs to class logits.
    test_iter: Iterable yielding input batches (no labels).
    output_path: Destination CSV path. Defaults to the original Drive location.

  The CSV has no header and no index column; each row is
  ``<1-based sample number>,<predicted class id>`` in iteration order.
  """
  model.eval()
  batch_predictions = []
  with torch.no_grad():
    for trains in tqdm(test_iter):
      outputs = model(trains)
      batch_predictions.append(outputs.argmax(dim=1).cpu().numpy())

  # Join all batches once; growing an array with np.append inside the loop
  # re-copies every earlier prediction on each iteration.
  if batch_predictions:
    predict_all = np.concatenate(batch_predictions)
  else:
    predict_all = np.array([], dtype=int)

  predict_result = [[i + 1, label] for i, label in enumerate(predict_all)]
  predict = pd.DataFrame(predict_result)
  predict.to_csv(output_path, index = False, header = False)

In [None]:
# Generate predictions for the test set and write them to the CSV on Drive.
test(model, test_loader)

100%|██████████| 188/188 [00:00<00:00, 318.28it/s]
