In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# from embed import Embedding

In [2]:
# embed_file = 'sgns.sogou.word'

EMBED_DIM = 300

def is_valid(seg):
    for w in seg:
        if not ('\u4e00' <= w and w <= '\u9fff'):
            return False
    return True


class Embed:
    # def __init__(self, file_path='sgns.sogou.word'):
    def __init__(self, file_path='../data/sgns.sogou.word'):
        self.idx_seg = ['unk']
        self.seg_idx = {'unk': 0}
        self.idx_emb = [[0.0 for i in range(EMBED_DIM)]]
        
        with open(file_path, 'r') as f:
            for idx, line in enumerate(f.readlines(), start=1):
                emb = line.split()[1:]
                seg = line.split()[0]
                # print(emb, seg)
                if is_valid(seg) and (seg not in self.seg_idx):
                    self.idx_seg.append(seg)
                    self.seg_idx[seg] = idx
                    self.idx_emb.append([float(i) for i in emb])
    def embed(self, seg):
        if seg in self.seg_idx:
            return self.seg_idx[seg]
        else:
            return self.seg_idx['unk']

In [3]:
# s = Embed()

# (s.seg_idx[','])
# (s.seg_idx['的'])
# s.embed('，')
# s.embed('我国')

In [None]:
VOCAB_SIZE = 364182


class SeqRNN(nn.Module):
    '''
    vocab_size:词向量维度
    hidden_size:隐藏单元数量决定输出长度
    output_size:输出类别为8，维数为1
    '''

    def __init__(self, vocab_size=300, hidden_size=10, output_size=8, pretrained_embed=Embed().idx_emb):
        super(SeqRNN, self).__init__()
        self.embed_dim = vocab_size
        self.embed = nn.Embedding(VOCAB_SIZE, self.embed_dim)
        self.vocab_size = vocab_size  # 这个为词向量的维数300
        self.hidden_size = hidden_size  # 隐藏单元数
        self.output_size = output_size  # 最后输出size

        self.rnn = nn.RNN(self.vocab_size, self.hidden_size,
                          batch_first=True, dropout=0.5)
        self.linear = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input):
        input = self.embed(input)
        # print(input)
        # print('embeded size:', input.shape)
        h0 = torch.zeros(1, 1, self.hidden_size)
        # print('h0 size:', h0.shape)
        output, hidden = self.rnn(input, h0)
        output = output[:, -1, :]
        output = self.linear(output)
        output = torch.nn.functional.softmax(output, dim=1)
        return output

# rnn_model = SeqRNN()
# cnn_model = TextCNN()

In [21]:
import torch 
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import json
from torch.utils.data import Dataset, DataLoader
from scipy.stats import pearsonr
from sklearn.metrics import f1_score
import random


weightFile='./pkl/rnn_weight'
train_file='../data/train_dic.json'
test_file='../data/test_dic.json'


with open(train_file,'r') as f:
    train_dic = json.load(f)
with open(test_file,'r') as f:
    test_dic=json.load(f)


EPOCH=20
BATCH_SIZE=64
lr=0.001
max_len=len(train_dic['label'])

class trainset(Dataset):
    def __init__(self):
        self.textdata=torch.LongTensor(train_dic['indexed_text'])
        self.labeldata=torch.LongTensor(train_dic['emo'])

    def __len__(self):
        return len(self.textdata)

    def __getitem__(self,index):
        return self.textdata[index],self.labeldata[index]

class validset(Dataset):
    def __init__(self):
        self.textdata=torch.LongTensor(test_dic['indexed_text'])
        self.labeldata=torch.LongTensor(test_dic['emo'])

    def __len__(self):
        return len(self.textdata)

    def __getitem__(self,index):
        return self.textdata[index],self.labeldata[index]

text = trainset()
textloader = DataLoader(dataset=text,batch_size=BATCH_SIZE,shuffle=True)

In [24]:
from tqdm import tqdm
model = SeqRNN()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
textloader = DataLoader(dataset=text,batch_size=1,shuffle=True)
cnt = 0
calloss = nn.CrossEntropyLoss()


for epoch in range(2):
    aveloss = 0
    batchnum = 0
    for text, label in tqdm(textloader):
        batchnum += 1
        optimizer.zero_grad()
        out = model(text)
        loss = calloss(out, label)
        loss.backward()
        aveloss += loss.item()
        optimizer.step()
    aveloss /= batchnum
    print('Epoch:', epoch, 'aveloss:', aveloss)
    torch.save(model.state_dict(), weightFile+str(epoch)+'.pkl')

100%|██████████| 2342/2342 [22:42<00:00,  1.72it/s] 


Epoch: 0 aveloss: 1.9216297582924418


100%|██████████| 2342/2342 [14:58<00:00,  2.61it/s]


Epoch: 1 aveloss: 1.8602517210551555


In [1]:
# FOR TEST

test = validset()
testloader = DataLoader(dataset=test, batch_size=1, shuffle=False)
testmodel = SeqRNN()
# opt=torch.optim.Adam(testmodel.parameters(),lr=LR)

correct = 0
total = 0
epoch = 8
coef = 0

ground = list()
pred = list()

testmodel.load_state_dict(torch.load(weightFile+str(0)+'.pkl'))
for text, label in tqdm(testloader):
    testmodel.eval()
    out = testmodel(text)
    for ind in range(len(out)):
        v0 = test_dic['label'][ind][1:]
        ol = []
        for i in range(len(out[ind])):
            ol.append(float(out[ind][i]))
        c = pearsonr(ol, v0)
        coef += c[0]
    prediction = torch.argmax(out, 1)
    ground.append(label)
    pred.append(prediction)
    correct += (prediction == label).sum().float()
    total += len(label)
v = np.array(test_dic['emo'])
print(correct)
print(total)
print('acc:', correct.item()/total)
print(coef)
print('Coef:', coef/total)

# tensor(1217.)
# 2228
# acc: 0.546229802513465
# 717.9179559345431
# Coef: 0.3222252944050912
# F-score: 0.18830698287220027
# F-score: 0.29171621217657023
# F-score: 0.24558080808080807
# F-score: 0.1971957671957672
# F-score: 0.13852813852813853
# 0.2035984339260584

In [48]:
tot = 0
cnt = 0

pred_l = list()
true_l = list()

for i, j in zip(ground, pred):
    pred_l.append(i.item())
    true_l.append(j.item())
    tot += f1_score(i.data,j.data,average='macro')
    cnt += 1
print('acc:', tot / cnt)

# epoch 1
# acc: 0.46005385996409337

# epoch 0
# acc: 0.45601436265709155

acc: 0.45601436265709155


In [49]:
print('micro f1:', f1_score(pred_l,true_l,average='micro'))
print('macro f1:', f1_score(pred_l,true_l,average='macro'))
print('pearson:', pearsonr(pred_l, true_l))

# epoch 1
# micro f1: 0.46005385996409337
# macro f1: 0.09927234898735894
# pearson: (0.04629288575370093, 0.028885338186686736)

micro f1: 0.45601436265709155
macro f1: 0.09096386015493027
pearson: (0.03383621855677044, 0.11033532616353672)


In [None]:
for i, j in zip(ground, pred):
    print('F-score:',f1_score(i.data,j.data,average='micro'))
    tot += f1_score(i.data,j.data,average='micro')
    cnt += 1
print(tot / cnt)

In [None]:
for i, j in zip(ground, pred):
    print('F-score:',f1_score(i.data,j.data,average='macro'))
    tot += f1_score(i.data,j.data,average='macro')
    cnt += 1
print(tot / cnt)