In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# from embed import Embedding

In [2]:
# embed_file = 'sgns.sogou.word'
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = '2'
# device = torch.device("cuda")

EMBED_DIM = 300

def is_valid(seg):
    for w in seg:
        if not ('\u4e00' <= w and w <= '\u9fff'):
            return False
    return True


class Embed:
    def __init__(self, file_path='../data/sgns.sogou.word'):
        self.idx_seg = ['unk']
        self.seg_idx = {'unk': 0}
        self.idx_emb = [[0.0 for i in range(EMBED_DIM)]]
        
        with open(file_path, 'r') as f:
            for idx, line in enumerate(f.readlines(), start=1):
                emb = line.split()[1:]
                seg = line.split()[0]
                # print(emb, seg)
                if is_valid(seg) and (seg not in self.seg_idx):
                    self.idx_seg.append(seg)
                    self.seg_idx[seg] = idx
                    self.idx_emb.append([float(i) for i in emb])
    def embed(self, seg):
        if seg in self.seg_idx:
            return self.seg_idx[seg]
        else:
            return self.seg_idx['unk']

In [4]:
# s = Embed()

# (s.seg_idx[','])
# (s.seg_idx['的'])
# s.embed('，')
# s.embed('我国')

In [3]:
VOCAB_SIZE = 364182


class TextCNN(nn.Module):
    def __init__(self, channel_out=80, kernel_size=[2,3,4], dropout=0.5, pretrained_embed=Embed().idx_emb):
        super(TextCNN, self).__init__()
        channel_in = 1
        self.classes = 8
        self.embed_dim = 300
        self.embed = nn.Embedding(VOCAB_SIZE, self.embed_dim)
        # self.embed.weight.data.copy_(torch.from_numpy(np.array(pretrained_embed)))
        
        self.conv1 = nn.Conv2d(in_channels=channel_in, out_channels=channel_out, kernel_size=(kernel_size[0], self.embed_dim))
        self.conv2 = nn.Conv2d(in_channels=channel_in, out_channels=channel_out, kernel_size=(kernel_size[1], self.embed_dim))
        self.conv3 = nn.Conv2d(in_channels=channel_in, out_channels=channel_out, kernel_size=(kernel_size[2], self.embed_dim))
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_size) * channel_out, self.classes)
    
    def forward(self, x):
        # x = x.to(device)
        x = self.embed(x)
        x = x.unsqueeze(1)

        x1=self.conv1(x)
        x1=F.relu(x1.squeeze(3))
        x1=F.max_pool1d(x1,x1.size(2)).squeeze(2)

        x2=self.conv2(x)
        x2=F.relu(x2.squeeze(3))
        x2=F.max_pool1d(x2,x2.size(2)).squeeze(2)

        x3=self.conv3(x)
        x3=F.relu(x3.squeeze(3))
        x3=F.max_pool1d(x3,x3.size(2)).squeeze(2)

        x=torch.cat((x1,x2,x3),1)
        x=self.dropout(x)

        logit=self.fc(x)
        return logit

In [6]:
model = TextCNN()

In [7]:
# model(torch.LongTensor([1,1,11,] * 400))

In [5]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import json
from torch.utils.data import Dataset, DataLoader
from scipy.stats import pearsonr
from sklearn.metrics import f1_score
import random


weightFile = './pkl/weight'
train_file = '../data/train_dic.json'
test_file = '../data/test_dic.json'


with open(train_file, 'r') as f:
    train_dic = json.load(f)
with open(test_file, 'r') as f:
    test_dic = json.load(f)


EPOCH = 20
BATCH_SIZE = 64
lr = 0.001
max_len = len(train_dic['label'])


class trainset(Dataset):
    def __init__(self):
        self.textdata = torch.LongTensor(train_dic['indexed_text'])
        self.labeldata = torch.LongTensor(train_dic['emo'])

    def __len__(self):
        return len(self.textdata)

    def __getitem__(self, index):
        return self.textdata[index], self.labeldata[index]


class validset(Dataset):
    def __init__(self):
        self.textdata = torch.LongTensor(test_dic['indexed_text'])
        self.labeldata = torch.LongTensor(test_dic['emo'])

    def __len__(self):
        return len(self.textdata)

    def __getitem__(self, index):
        return self.textdata[index], self.labeldata[index]


def train(EPOCH=20):
    text = trainset()
    textloader = DataLoader(dataset=text, batch_size=BATCH_SIZE, shuffle=True)

    model = TextCNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    cnt = 0
    calloss = nn.CrossEntropyLoss()

    for epoch in range(EPOCH):
        aveloss = 0
        batchnum = 0
        for text, label in textloader:
            batchnum += 1
            optimizer.zero_grad()
            out = model(text)
            loss = calloss(out, label)
            loss.backward()
            aveloss += loss.item()
            optimizer.step()
        aveloss /= batchnum
        print('Epoch:', epoch, 'aveloss:', aveloss)
        torch.save(model.state_dict(), weightFile+str(epoch)+'.pkl')

# train(EPOCH)

# Epoch: 0 aveloss: 2.0004355617471643
# Epoch: 1 aveloss: 1.6682660386369035

In [10]:
# TESTING

test = validset()
testloader = DataLoader(dataset=test, batch_size=BATCH_SIZE, shuffle=False)
testmodel = TextCNN()
# opt=torch.optim.Adam(testmodel.parameters(),lr=LR)

correct = 0
total = 0
epoch = 8
coef = 0


ground = list()
pred = list()
ID = 20
testmodel.load_state_dict(torch.load(weightFile+str(ID - 1)+'.pkl'))
for text, label in testloader:
    testmodel.eval()
    out = testmodel(text)
    for index in range(len(out)):
        v0 = test_dic['label'][index][1:]
        out_list = []
        for i in range(len(out[index])):
            out_list.append(float(out[index][i]))
        c = pearsonr(out_list, v0)
        coef += c[0]
    prediction = torch.argmax(out, 1)
    ground.append(label)
    pred.append(prediction)
    correct += (prediction == label).sum().float()
    total += len(label)
v = np.array(test_dic['emo'])
print('correct cnt:', correct)
print('total cnt:', total)
print('acc:', correct.item()/total)
print('Coef:', coef/total)

macro_tot = 0
micro_tot = 0
cnt = 0
for i, j in zip(ground, pred):
    macro_tot += f1_score(i.data, j.data, average='macro')
    micro_tot += f1_score(i.data, j.data, average='micro')
    cnt += 1
print('f_score macro_ave:', macro_tot / cnt)
print('f_score micro_ave:', micro_tot / cnt)

# tensor(1217.)
# 2228
# acc: 0.546229802513465
# 717.9179559345431
# Coef: 0.3222252944050912
# F-score: 0.18830698287220027
# F-score: 0.29171621217657023
# ...
# F-score: 0.1875
# F-score: 0.24056695992179858
# F-score: 0.13852813852813853
# 0.2035984339260584

correct cnt: tensor(1290.)
total cnt: 2228
acc: 0.5789946140035906
Coef: 0.2927596610542403
f_score macro_ave: 0.27336545967632814
f_score micro_ave: 0.5785714285714286
