In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# from embed import Embedding

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

EMBED_DIM = 300

def is_valid(seg):
    for w in seg:
        if not ('\u4e00' <= w and w <= '\u9fff'):
            return False
    return True


class Embed:
    # def __init__(self, file_path='sgns.sogou.word'):
    def __init__(self, file_path='sgns.sogou.word'):
        self.idx_seg = ['unk']
        self.seg_idx = {'unk': 0}
        self.idx_emb = [[0.0 for i in range(EMBED_DIM)]]
        
        with open(file_path, 'r') as f:
            for idx, line in enumerate(f.readlines(), start=1):
                emb = line.split()[1:]
                seg = line.split()[0]
                # print(emb, seg)
                if is_valid(seg) and (seg not in self.seg_idx):
                    self.idx_seg.append(seg)
                    self.seg_idx[seg] = idx
                    self.idx_emb.append([float(i) for i in emb])
    def embed(self, seg):
        if seg in self.seg_idx:
            return self.seg_idx[seg]
        else:
            return self.seg_idx['unk']

In [3]:
s = Embed()

In [5]:
import torch 
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import json
from torch.utils.data import Dataset, DataLoader
from scipy.stats import pearsonr
from sklearn.metrics import f1_score
import random


weightFile='weight'
train_file='../git/train_dic.json'
test_file='../git/test_dic.json'


EPOCH=2
BATCH_SIZE=64
lr=0.001

with open(train_file,'r') as f:
    train_dic = json.load(f)
with open(test_file,'r') as f:
    test_dic=json.load(f)

max_len=len(train_dic['label'])
class trainset(Dataset):
    def __init__(self):
        self.textdata=torch.LongTensor(train_dic['indexed_text'])
        self.labeldata=torch.LongTensor(train_dic['emo'])

    def __len__(self):
        return len(self.textdata)

    def __getitem__(self,index):
        return self.textdata[index],self.labeldata[index]

class validset(Dataset):
    def __init__(self):
        self.textdata=torch.LongTensor(test_dic['indexed_text'])
        self.labeldata=torch.LongTensor(test_dic['emo'])

    def __len__(self):
        return len(self.textdata)

    def __getitem__(self,index):
        return self.textdata[index],self.labeldata[index]

text = trainset()
textloader = DataLoader(dataset=text,batch_size=BATCH_SIZE,shuffle=True)

In [7]:
VOCAB_SIZE = 364182

# CUDA


class SeqRNN(nn.Module):
    '''
    vocab_size:词向量维度
    hidden_size:隐藏单元数量决定输出长度
    output_size:输出类别为8，维数为1
    '''

    def __init__(self, vocab_size=300, hidden_size=10, output_size=8, pretrained_embed=Embed().idx_emb):
        super(SeqRNN, self).__init__()
        self.embed_dim = vocab_size
        self.embed = nn.Embedding(VOCAB_SIZE, self.embed_dim)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.rnn = nn.RNN(self.vocab_size, self.hidden_size,
                          batch_first=True, dropout=0.5)
        self.linear = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input):
        input = self.embed(input)
        # print(input)
        # print('embeded size:', input.shape)
        h0 = torch.zeros(1, 1, self.hidden_size)
        h0 = h0.to(device)
        # print('h0 size:', h0.shape)
        output, hidden = self.rnn(input, h0)
        output = output[:, -1, :]
        output = self.linear(output)
        output = torch.nn.functional.softmax(output, dim=1)
        return output


# rnn_model = SeqRNN()
# cnn_model = TextCNN()

In [None]:
from tqdm import tqdm

device = torch.device("cuda")
textloader = DataLoader(dataset=text, batch_size=1, shuffle=True)

model = SeqRNN()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model = model.to(device)

cnt = 0
calloss = nn.CrossEntropyLoss()

for epoch in range(2):
    aveloss = 0
    batchnum = 0
    for text, label in tqdm(textloader):
        text = text.to(device)
        label = label.to(device)
        batchnum += 1
        optimizer.zero_grad()
        out = model(text)
        loss = calloss(out, label)
        loss.backward()
        aveloss += loss.item()
        optimizer.step()
    aveloss /= batchnum
    print('Epoch:', epoch, 'aveloss:', aveloss)
    torch.save(model.state_dict(), weightFile+str(epoch)+'.pkl')

In [None]:
import torch 
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import json
from torch.utils.data import Dataset, DataLoader
from scipy.stats import pearsonr
from sklearn.metrics import f1_score
import random


weightFile='weight'
train_file='../git/train_dic.json'
test_file='../git/test_dic.json'


EPOCH=2
BATCH_SIZE=64
lr=0.001

with open(train_file,'r') as f:
    train_dic = json.load(f)
with open(test_file,'r') as f:
    test_dic=json.load(f)

max_len=len(train_dic['label'])


class trainset(Dataset):
    def __init__(self):
        self.textdata=torch.LongTensor(train_dic['indexed_text'])
        self.labeldata=torch.LongTensor(train_dic['emo'])

    def __len__(self):
        return len(self.textdata)

    def __getitem__(self,index):
        return self.textdata[index],self.labeldata[index]

class validset(Dataset):
    def __init__(self):
        self.textdata=torch.LongTensor(test_dic['indexed_text'])
        self.labeldata=torch.LongTensor(test_dic['emo'])

    def __len__(self):
        return len(self.textdata)

    def __getitem__(self,index):
        return self.textdata[index],self.labeldata[index]

text = trainset()
textloader = DataLoader(dataset=text,batch_size=BATCH_SIZE,shuffle=True)

model = TextCNN()
optimizer = torch.optim.Adam(model.parameters(),lr=lr)

cnt=0
calloss = nn.CrossEntropyLoss()

for epoch in range(EPOCH):
    aveloss=0
    batchnum=0
    for text,label in textloader:
        batchnum+=1
        optimizer.zero_grad()
        out=model(text)
        loss=calloss(out,label)
        loss.backward()
        aveloss+=loss.item()
        optimizer.step()
    aveloss/=batchnum
    print('Epoch:',epoch,'aveloss:',aveloss)
    torch.save(model.state_dict(), weightFile+str(epoch)+'.pkl')

In [None]:
test = validset()
testloader = DataLoader(dataset=test, batch_size=BATCH_SIZE, shuffle=False)
testmodel = TextCNN()
# opt=torch.optim.Adam(testmodel.parameters(),lr=LR)

correct = 0
total = 0
epoch = 8
coef = 0

ground = list()
pred = list()

testmodel.load_state_dict(torch.load(weightFile+str(1)+'.pkl'))
for text, label in testloader:
    #     opt.zero_grad()
    testmodel.eval()
    out = testmodel(text)
    for ind in range(len(out)):
        v0 = test_dic['label'][ind][1:]
        ol = []
        for i in range(len(out[ind])):
            ol.append(float(out[ind][i]))
        c = pearsonr(ol, v0)
        coef += c[0]
    prediction = torch.argmax(out, 1)
    ground.append(label)
    pred.append(prediction)
    correct += (prediction == label).sum().float()
    total += len(label)
v = np.array(test_dic['emo'])
print(correct)
print(total)
print('acc:', correct.item()/total)
print(coef)
print('Coef:', coef/total)



tot = 0
cnt = 0
for i, j in zip(ground, pred):
    print('F-score:', f1_score(i.data, j.data, average='macro'))
    tot += f1_score(i.data, j.data, average='macro')
    cnt += 1
print(tot / cnt)

# some logs
# tensor(1217.)
# 2228
# acc: 0.546229802513465
# 717.9179559345431
# Coef: 0.3222252944050912
# F-score: 0.18830698287220027
# F-score: 0.29171621217657023
# F-score: 0.24558080808080807
# F-score: 0.17189314750290358
# F-score: 0.23976608187134504
# F-score: 0.21186521120075932
# F-score: 0.20497154836777481
# F-score: 0.23169482846902203
# F-score: 0.21553586984805803
# F-score: 0.16167247386759584
# F-score: 0.26652014652014655
# F-score: 0.19197994987468672
# F-score: 0.14716242661448145
# F-score: 0.1794213557205301
# F-score: 0.375312518169661
# F-score: 0.16726190476190478
# F-score: 0.16849529780564265
# F-score: 0.2399525027402265
# F-score: 0.14369747899159663
# F-score: 0.1473485946102579
# F-score: 0.23508691147691954
# F-score: 0.21349080172609586
# F-score: 0.15907184791724907
# F-score: 0.20887445887445888
# F-score: 0.13934713934713933
# F-score: 0.19055598779101082
# F-score: 0.1446312410239081
# F-score: 0.20155348363195658
# F-score: 0.19544740973312402
# F-score: 0.26449248073108883
# F-score: 0.21944721944721943
# F-score: 0.1875
# F-score: 0.1971957671957672
# F-score: 0.24056695992179858
# F-score: 0.13852813852813853
# 0.2035984339260584

In [None]:
for i, j in zip(ground, pred):
    print('F-score:',f1_score(i.data,j.data,average='micro'))
    tot += f1_score(i.data,j.data,average='micro')
    cnt += 1
print(tot / cnt)

In [None]:
for i, j in zip(ground, pred):
    print('F-score:',f1_score(i.data,j.data,average='macro'))
    tot += f1_score(i.data,j.data,average='macro')
    cnt += 1
print(tot / cnt)