In [60]:
from gensim.models import KeyedVectors
from utils import *
from dataset import *
import os
import torch
import torch.utils.data
import numpy as np
from model import *
from config import *
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, precision_score
import numpy as np
import logging
from torch.utils.tensorboard import SummaryWriter
import time

In [5]:
tokens = get_tokens(dir='../data/csv/without_name/')

token2id, id2token = get_token_id(tokens)

dfs = read_data(dir='../data/csv/without_name/')
x_train, y_train = get_input(dfs[0], token2id)
x_test, y_test = get_input(dfs[1], token2id)
x_valid, y_valid = get_input(dfs[1], token2id)
x_train, x_test, x_valid = padding(x_train), padding(x_test), padding(x_valid)

100%|██████████| 44974/44974 [00:17<00:00, 2518.53it/s]
100%|██████████| 5621/5621 [00:02<00:00, 2427.35it/s]
100%|██████████| 5624/5624 [00:02<00:00, 2530.56it/s]
100%|██████████| 44974/44974 [00:18<00:00, 2469.69it/s]
100%|██████████| 5621/5621 [00:02<00:00, 2357.62it/s]
100%|██████████| 5621/5621 [00:02<00:00, 2188.04it/s]


In [6]:
config = Config()

training_set = dataset(x_train, y_train)
train_loader = torch.utils.data.DataLoader(training_set, batch_size=config.batch_size, shuffle=True, num_workers=4)

valid_set = dataset(x_valid, y_valid)
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=config.batch_size, num_workers=4)

device = config.device

In [7]:
wv = KeyedVectors.load_word2vec_format('../data/sgns.wiki.bigram-char', binary=False)

In [8]:
embedding_weights = torch.zeros((len(token2id)+1, config.embed))
for token in token2id:
    index = token2id[token]
    embedding_weights[index, :] = torch.from_numpy(wv[token]) if token in wv else torch.rand(config.embed)
embedding_weights = embedding_weights.to(device)

In [63]:
def eval(model, data_loader):
    model.eval()
    Y_true, Y_pre, hits = np.array([]), np.array([]), 0
    for x, y in data_loader:
        #print(y.shape)
        Y_true = np.concatenate((Y_true, y.detach().numpy()))
        x, y = x.to(device), y.to(device)
        output = model(x)
        y_pre = torch.argmax(output, dim=1).detach()
        Y_pre = np.concatenate((Y_pre, y_pre.cpu().detach().numpy()))
        hits += torch.sum(y==y_pre)

    #return Y_true, Y_pre
    matrix = confusion_matrix(Y_true, Y_pre)
    score  = precision_score(Y_true, Y_pre, labels=[0, 1, 2], average='micro')
    return hits, score, matrix

In [66]:
# training
config.num_epochs = 200
config.learning_rate = 5e-4

model = TextCNN(embedding_weights, config)
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate)
model.train()
writer = SummaryWriter(log_dir=config.log_path + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))
step, eval_fre, log_fre, total_loss, best_hits = 1, 1, 100, 0, 0

LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(filename='../runs/my.log', level=logging.DEBUG, format=LOG_FORMAT)

for epoch in range(1, config.num_epochs+1):
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        model.zero_grad()
        output = model(x)
        loss = F.cross_entropy(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        #print(total_loss)
        if step % log_fre == 0:
            logging.info('after {} steps, mean loss is {}'.format(step, round(total_loss/log_fre, 4)))
            writer.add_scalar('Loss/train-loss', total_loss/log_fre, step)
            total_loss = 0

        step += 1

    if epoch % eval_fre == 0:
        tmp_hits, score, _ = eval(model, valid_loader)
        if tmp_hits > best_hits:
            best_hits = tmp_hits
            torch.save(model.state_dict(), '../runs/best_model'.format(tmp_hits))
        #torch.save(model.state_dict(), '../runs/model_epoch={}'.format(epoch))
        writer.add_scalar('Loss/valid-micro_precision', score, epoch)
        writer.add_scalar('hits/hits', tmp_hits, epoch)

In [69]:
test_set = dataset(x_test, y_test)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=config.batch_size, num_workers=4)

best_model = TextCNN(embedding_weights, config)
best_model.load_state_dict(torch.load('../runs/best_model'))
best_model = best_model.to(device)

In [70]:
best_model.eval()
_, _, score = eval(best_model, test_loader)
print(score)

[[1768    6    4]
 [  12 1872    6]
 [  49   32 1872]]
