In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append(os.path.dirname(os.path.abspath('')))
from salt_bert.make_preprocessed_data import tokenization
import torch
import os
from huggingface_from_pretraining.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from huggingface_from_pretraining.modeling import *
from huggingface_from_pretraining.optimization import BertAdam, warmup_linear
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
import random
import numpy as np
import logging
from tqdm import tqdm, trange

logger = logging.getLogger()
logger.setLevel(logging.INFO)
streamHandler = logging.StreamHandler()
# fileHandler = logging.FileHandler('./test.log')
logger.addHandler(streamHandler)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
do_train = True
do_eval = True
train_batch_size = 24
eval_batch_size = 16
gradient_accumulation_steps = 3
num_train_epochs = 50
warmup_proportion = 0.1
fp16 = False
learning_rate = 5e-5
local_rank = -1
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
train_batch_size = train_batch_size // gradient_accumulation_steps
output_dir = './model_output'

In [3]:
# fine-tuning Data 로드해주세요

main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   str_ymd   731 non-null    object
 1   year      731 non-null    int64 
 2   mon       731 non-null    int64 
 3   day       731 non-null    int64 
 4   day_kor   731 non-null    object
 5   weekend   731 non-null    object
 6   holiday   731 non-null    object
 7   str_duty  731 non-null    object
dtypes: int64(3), object(5)
memory usage: 45.8+ KB


In [4]:
# train, test를 나누고 하는 것은 직접 해주셔야 합니다
train_df_rate = math.trunc(len(tot_df)*train_rate)
train_df = tot_df[:train_df_rate- (train_df_rate%(day_range))]
test_df = tot_df[train_df.index[-1]+1:]
test_df_rate = len(test_df)- (len(test_df)%day_range)
test_df = test_df[:test_df_rate]

In [5]:
tokenizer = tokenization.FullTokenizer("./vocab.list", do_lower_case=False)

In [6]:
train_feats = train_df['str_duty_x'].values.reshape(-1,7)
train_feats_list = list(map(lambda x:' '.join(x),train_feats))
train_labels = train_df['str_duty_y'].values.reshape(-1,7)
train_labels_list = list(map(lambda x:' '.join(x),train_labels))
test_feats = test_df['str_duty_x'].values.reshape(-1,7)
test_feats_list = list(map(lambda x:' '.join(x),test_feats))
test_labels = test_df['str_duty_y'].values.reshape(-1,7)
test_labels_list = list(map(lambda x:' '.join(x),test_labels))

In [7]:
num_train_optimization_steps = None
if do_train:
    num_train_optimization_steps = int(
        len(train_feats) / train_batch_size / gradient_accumulation_steps) * num_train_epochs

In [8]:
state_dict = None
state_dict = torch.load( os.path.join('./model_output/', 'pytorch_model.bin') )
cache_dir = "" if "" else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(-1))

In [9]:
model = BertForTimeSeriesClassification.from_pretrained('./model_output/', state_dict=state_dict, cache_dir=cache_dir, num_labels=4)

loading archive file ./model_output/
Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 7
}

Weights of BertForTimeSeriesClassification not initialized from pretrained model: ['lstm_classifier.weight_ih_l0', 'lstm_classifier.weight_hh_l0', 'lstm_classifier.bias_ih_l0', 'lstm_classifier.bias_hh_l0', 'output_dense.weight', 'output_dense.bias', 'time_distributed.module.weight', 'time_distributed.module.bias']
Weights from pretrained model not used in BertForTimeSeriesClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() and not False else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(device, n_gpu, bool(-1 != -1), None))
model.to(device)

device: cuda n_gpu: 1, distributed training: False, 16-bits training: None


BertForTimeSeriesClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(7, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out

In [11]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [12]:
optimizer = BertAdam(optimizer_grouped_parameters,
                    lr=3e-5,
                    warmup=0.1,
                    t_total=num_train_optimization_steps)

In [13]:
import pickle
label_list = np.unique(train_feats)
if not os.path.isfile(os.path.join('.','label.list')):
    with open( os.path.join('.', 'label.list'), 'wb') as f:
        pickle.dump(label_list, f)
else:
    with open(os.path.join('.', 'label.list'), 'rb') as f:
        label_list = pickle.load(f)
logger.info("category set : %s", label_list)

category set : ['D' 'E' 'N' 'O']


In [14]:
class InputFeatures(object):
	"""A single set of features of data."""

	def __init__(self, input_ids, input_mask, label_id):
		self.input_ids = input_ids
		self.input_mask = input_mask
		self.label_id = label_id

In [15]:
def convert_examples_to_features(examples, train_labels, label_list, max_seq_length, tokenizer):
	"""Loads a data file into a list of `InputBatch`s."""

	label_map = {label : i for i, label in enumerate(label_list)}

	features = []
	for (ex_index, example) in enumerate(examples):

		input_ids = tokenizer.convert_tokens_to_ids(example)

		# The mask has 1 for real tokens and 0 for padding tokens. Only real
		# tokens are attended to.
		input_mask = [1] * len(input_ids)

		# Zero-pad up to the sequence length.
		padding = [0] * (max_seq_length - len(input_ids))
		input_ids += padding
		input_mask += padding

		assert len(input_ids) == max_seq_length
		assert len(input_mask) == max_seq_length
		#print(ex_index)		
		label_id = list(map(lambda x:label_map[x],train_labels[ex_index]))
		if ex_index < 5:
			logger.info("*** Example ***")
			logger.info("tokens: %s" % " ".join(
					[str(x) for x in example]))
			logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
			logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
			logger.info("label: %s (id = %s)" % (train_labels[ex_index], label_id))

		features.append(
				InputFeatures(input_ids=input_ids,
							  input_mask=input_mask,
							  label_id=label_id))
	return features

In [16]:
def accuracy(out, labels):
	labels = labels.reshape(-1)
	outputs = np.argmax(out, axis=2).reshape(-1)
	return np.sum(outputs == labels)

In [60]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0

max_seq_length = 7
if do_train:
	train_features = convert_examples_to_features(train_feats, train_labels, label_list, max_seq_length, tokenizer)
	logger.info("***** Running training *****")
	logger.info("  Num examples = %d", len(train_feats))
	logger.info("  Batch size = %d", train_batch_size)
	logger.info("  Num steps = %d", num_train_optimization_steps)
	all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
	all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
	all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
	train_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids)
	train_sampler = RandomSampler(train_data)
	train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

	if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
		eval_features = convert_examples_to_features(test_feats, test_labels, label_list, max_seq_length, tokenizer)
		logger.info("***** Running evaluation *****")
		logger.info("  Num examples = %d", len(test_feats))
		logger.info("  Batch size = %d", eval_batch_size)
		all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
		all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
		all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
		eval_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids)
		# Run prediction for full data
		eval_sampler = SequentialSampler(eval_data)
		eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

	for epoch_i in trange(int(num_train_epochs), desc="Epoch"):
		model.train()
		tr_loss = 0
		nb_tr_examples, nb_tr_steps = 0, 0
		
		for step, batch in enumerate(train_dataloader):
			batch = tuple(t.to(device) for t in batch)
			input_ids, input_mask, label_ids = batch
			hidden = torch.zeros(1, batch[0].size()[0], 256, requires_grad=True, device=device)
			cell = torch.zeros(1, batch[0].size()[0], 256, requires_grad=True, device=device)
			loss = model(input_ids, input_mask, label_ids, (hidden,cell))
			if n_gpu > 1:
				loss = loss.mean() # mean() to average on multi-gpu.
			if gradient_accumulation_steps > 1:
				loss = loss / gradient_accumulation_steps

			if fp16:
				optimizer.backward(loss)
			else:
				loss.backward()

			tr_loss += loss.item()
			nb_tr_examples += input_ids.size(0)
			nb_tr_steps += 1
			if (step + 1) % gradient_accumulation_steps == 0:
				if fp16:
					# modify learning rate with special warm up BERT uses
					# if fp16 is False, BertAdam is used that handles this automatically
					lr_this_step = learning_rate * warmup_linear(global_step/num_train_optimization_steps, warmup_proportion)
					for param_group in optimizer.param_groups:
						param_group['lr'] = lr_this_step
				optimizer.step()
				optimizer.zero_grad()
				global_step += 1
			
			if nb_tr_steps != 0 and nb_tr_steps % 100 == 0 :
				logger.info ( '[train]\t%d\t%d\t%f' % (nb_tr_steps, nb_tr_examples, tr_loss / nb_tr_examples*1.0) )

		if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
			model.eval()
			eval_accuracy = 0.0
			nb_eval_examples = 0

			for input_ids, input_mask, label_ids in eval_dataloader:
				input_ids = input_ids.to(device)
				input_mask = input_mask.to(device)
				label_ids = label_ids.to(device)

				with torch.no_grad():
					logits = model(input_ids, input_mask)

				logits = logits.detach().cpu().numpy()
				label_ids = label_ids.to('cpu').numpy()
				
				eval_accuracy += accuracy(logits, label_ids)

				nb_eval_examples = (input_ids.size(0)*input_ids.size(1)) + nb_eval_examples
			eval_accuracy = eval_accuracy / nb_eval_examples
			result = {'eval_accuracy': eval_accuracy,
						'global_step': global_step}
			logger.info("***** Eval results *****")
			for key in sorted(result.keys()):
				logger.info("  %s = %s", key, str(result[key]))
				
				
		if output_dir != None and epoch_i%25==0:
			model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
			output_model_file = os.path.join(output_dir, "pytorch_model_e%d.bin" % epoch_i)
			torch.save(model_to_save.state_dict(), output_model_file)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


*** Example ***
tokens: O E E O E E E
input_ids: 6 4 4 6 4 4 4
input_mask: 1 1 1 1 1 1 1
label: ['N' 'N' 'O' 'O' 'D' 'D' 'D'] (id = [2, 2, 3, 3, 0, 0, 0])
*** Example ***
tokens: N N O O D D D
input_ids: 5 5 6 6 3 3 3
input_mask: 1 1 1 1 1 1 1
label: ['N' 'N' 'O' 'O' 'O' 'O' 'E'] (id = [2, 2, 3, 3, 3, 3, 1])
*** Example ***
tokens: N N O O O O E
input_ids: 5 5 6 6 6 6 4
input_mask: 1 1 1 1 1 1 1
label: ['E' 'O' 'D' 'D' 'O' 'N' 'N'] (id = [1, 3, 0, 0, 3, 2, 2])
*** Example ***
tokens: E O D D O N N
input_ids: 4 6 3 3 6 5 5
input_mask: 1 1 1 1 1 1 1
label: ['O' 'O' 'E' 'E' 'E' 'O' 'D'] (id = [3, 3, 1, 1, 1, 3, 0])
*** Example ***
tokens: O O E E E O D
input_ids: 6 6 4 4 4 6 3
input_mask: 1 1 1 1 1 1 1
label: ['D' 'O' 'O' 'O' 'D' 'N' 'N'] (id = [0, 3, 3, 3, 0, 2, 2])
***** Running training *****
  Num examples = 47
  Batch size = 8
  Num steps = 50
*** Example ***
tokens: E N N O O N N
input_ids: 4 5 5 6 6 5 5
input_mask: 1 1 1 1 1 1 1
label: ['O' 'O' 'O' 'D' 'D' 'D' 'E'] (id = [3, 3, 3, 

torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])

***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 2





Epoch:   2%|▏         | 1/50 [00:01<00:51,  1.06s/it]

torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])


***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 4
Epoch:   4%|▍         | 2/50 [00:01<00:31,  1.52it/s]

torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])


***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 6
Epoch:   6%|▌         | 3/50 [00:01<00:24,  1.90it/s]

torch.Size([7, 7, 256])
torch.Size([3, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([7, 7, 256])


***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 8
Epoch:   8%|▊         | 4/50 [00:02<00:21,  2.15it/s]

torch.Size([3, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])

***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 10
Epoch:  10%|█         | 5/50 [00:02<00:19,  2.30it/s]


torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])

***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 12
Epoch:  12%|█▏        | 6/50 [00:02<00:18,  2.41it/s]


torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])

***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 14
Epoch:  14%|█▍        | 7/50 [00:03<00:17,  2.50it/s]


torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])

***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 16
Epoch:  16%|█▌        | 8/50 [00:03<00:16,  2.55it/s]


torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])

***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 18
Epoch:  18%|█▊        | 9/50 [00:04<00:15,  2.58it/s]


torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])

***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 20
Epoch:  20%|██        | 10/50 [00:04<00:15,  2.60it/s]


torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])


***** Eval results *****


torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])


  eval_accuracy = 0.42857142857142855
  global_step = 22
Epoch:  22%|██▏       | 11/50 [00:04<00:15,  2.58it/s]

torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([7, 7, 256])
torch.Size([3, 7, 256])

***** Eval results *****
  eval_accuracy = 0.42857142857142855
  global_step = 24
Epoch:  24%|██▍       | 12/50 [00:05<00:14,  2.57it/s]


torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])
torch.Size([8, 7, 256])

Epoch:  24%|██▍       | 12/50 [00:05<00:17,  2.17it/s]


torch.Size([8, 7, 256])
torch.Size([7, 7, 256])





KeyboardInterrupt: 

In [65]:
# 예측 테스트

input_tokens=['D','E','N','O','O','E','E']
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
input_ids = torch.tensor([input_ids], dtype=torch.long)
input_mask = torch.tensor([[1,1,1,1,1,1,1]], dtype=torch.long)
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)

with torch.no_grad():
    logits = model(input_ids, input_mask)

logits = logits.detach().cpu().numpy()
# label_ids = label_ids.to('cpu').numpy()
print(list(map(lambda x:label_list[x],np.argmax(logits,axis=2).reshape(-1))))

torch.Size([1, 7, 256])
['O', 'O', 'O', 'O', 'O', 'O', 'O']


하기 내용들은, 값을 테스트해보기 위해 사용한 블락들입니다.

In [82]:
np.savetxt('./test.txt',model.bert.embeddings(input_ids).detach().cpu().numpy().reshape(-1,768))

In [43]:
embed = model.bert.embeddings(input_ids).detach().cpu().numpy().reshape(-1,768)

In [53]:
pd.DataFrame(np.corrcoef(embed[:4]))

Unnamed: 0,0,1,2,3
0,1.0,-0.054612,-0.009569,-0.082023
1,-0.054612,1.0,-0.056569,-0.012214
2,-0.009569,-0.056569,1.0,0.000704
3,-0.082023,-0.012214,0.000704,1.0


In [57]:
sequence_output,_ = model.bert(input_ids,input_mask,False)
sequence_output = sequence_output.detach()

In [58]:
sequence_output

tensor([[[ 1.7252, -2.1268,  0.7987,  ..., -1.3268,  1.3964,  2.2400],
         [ 1.7252, -2.1268,  0.7987,  ..., -1.3268,  1.3964,  2.2400],
         [ 1.7252, -2.1268,  0.7987,  ..., -1.3268,  1.3964,  2.2400],
         ...,
         [ 1.7252, -2.1268,  0.7987,  ..., -1.3268,  1.3964,  2.2400],
         [ 1.7252, -2.1268,  0.7987,  ..., -1.3268,  1.3964,  2.2400],
         [ 1.7252, -2.1268,  0.7987,  ..., -1.3268,  1.3964,  2.2400]]],
       device='cuda:0')

In [63]:
np.corrcoef(np.array([[0.8, 0.3, 0.1],
[0.7, 0.5, 0.1],
[0.1, 0.2, 0.8]]))

array([[ 1.        ,  0.9078413 , -0.80583738],
       [ 0.9078413 ,  1.        , -0.97986371],
       [-0.80583738, -0.97986371,  1.        ]])