In [1]:
!pip install transformers



In [2]:
import tensorflow as tf
import torch

from transformers import BertTokenizer, AlbertTokenizer, RobertaTokenizer
from transformers import BertForSequenceClassification, AlbertForSequenceClassification, RobertaForSequenceClassification, AdamW, BertConfig, AlbertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

Using TensorFlow backend.


In [3]:
# Load the training and test sets with pandas. Both files are read as
# tab-separated text and the column names are supplied explicitly.
train = pd.read_csv("bert_cleaned_friends_train.txt", names=['label', 'utterance'], sep='\t')
test = pd.read_csv("test.txt", names=['utterance'], sep='\t')

# Show (rows, columns) of each frame, one per line.
print(train.shape, test.shape, sep='\n')

(10558, 2)
(3296, 1)


In [4]:
# Wrap every training utterance (coerced to str) in "[CLS] ... [SEP]" markers.
# NOTE(review): these markers are inserted as plain text; the tokenizer loaded
# later in this notebook is RobertaTokenizer ('roberta-base') -- confirm that it
# treats them as special tokens, or whether its own start/end tokens were intended.
sentences = ["[CLS] " + str(utterance) + " [SEP]" for utterance in train['utterance']]

In [5]:
# Convert the label column to a pandas categorical in place; the following cells
# read its categories and integer codes through the `.cat` accessor.
train['label'] = train['label'].astype('category')

In [6]:
# Lookup table from integer class code to label name, in category order.
mapping = {code: name for code, name in enumerate(train['label'].cat.categories)}
mapping

{0: 'anger',
 1: 'disgust',
 2: 'fear',
 3: 'joy',
 4: 'neutral',
 5: 'non-neutral',
 6: 'sadness',
 7: 'surprise'}

In [7]:
# Replace the categorical labels with their integer codes (this overwrites the
# column, so the cell cannot be re-run without reloading `train`), then expose
# the codes as a 'long' NumPy array for conversion to tensors later.
train['label'] = train['label'].cat.codes
labels = train['label'].values.astype('long')
labels

array([4, 4, 4, ..., 7, 4, 5])

In [8]:
# Load the pretrained 'roberta-base' tokenizer and split every training
# sentence into tokens.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
tokenized_texts = list(map(tokenizer.tokenize, sentences))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [9]:
# Fixed sequence length used for padding / truncation.
MAX_LEN = 80

# Map tokens to vocabulary ids, then pad or truncate every sequence at the end
# ("post") so each row has exactly MAX_LEN entries.
# NOTE(review): the padding value is pad_sequences' default (not overridden
# here), while the embedding table printed for this model reports
# padding_idx=1 -- confirm the default is the intended pad id.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

In [10]:
# Build one attention mask per training sequence: 1.0 for real tokens, 0.0 for
# padding.
#
# The mask is derived from each sequence's real (post-truncation) length rather
# than from `token_id > 0`. The id test silently assumes that id 0 is never a
# real token; the embedding table printed for this model reports padding_idx=1,
# so id 0 is not this vocabulary's padding id. Because the ids were padded and
# truncated at the end, the first min(len(tokens), row width) positions of every
# row are exactly the real tokens.
assert len(tokenized_texts) == len(input_ids), \
    "tokenized_texts and input_ids are out of sync; re-run the preceding cells in order"

attention_masks = []

for tokens in tokenized_texts:
    n_real = min(len(tokens), input_ids.shape[1])
    seq_mask = [1.0] * n_real + [0.0] * (input_ids.shape[1] - n_real)
    attention_masks.append(seq_mask)

In [11]:
# Split token ids, labels and attention masks into training (90%) and
# validation (10%) sets. All three are passed to a single train_test_split call
# so they are partitioned with the same row indices; this replaces a second
# split that relied on repeating random_state (and a dummy target) to keep the
# masks aligned with the inputs.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(input_ids,
                                                   labels,
                                                   attention_masks,
                                                   random_state=2018,
                                                   test_size=0.1)

# Convert every split to a PyTorch tensor.
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [12]:
batch_size = 32

# Bundle (input ids, attention mask, label) triples for each split.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order, validation batches in dataset order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

In [13]:
# Utterance column of the unlabeled test set.
test_sentences = test.utterance

In [14]:
# Wrap every test utterance (coerced to str) in the same "[CLS] ... [SEP]"
# markers that were added to the training sentences.
test_sentences = ["[CLS] " + str(utterance) + " [SEP]" for utterance in test_sentences]

In [15]:
# Load the 'roberta-base' tokenizer again and split every test sentence into
# tokens. This rebinds `tokenized_texts`, which held the training tokens.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
tokenized_texts = list(map(tokenizer.tokenize, test_sentences))

In [16]:
# Fixed sequence length used for padding / truncation (same value as for training).
MAX_LEN = 80

# Map test tokens to vocabulary ids, then pad or truncate every sequence at the
# end ("post") so each row has exactly MAX_LEN entries. This rebinds
# `input_ids`, which held the training ids.
# NOTE(review): the padding value is pad_sequences' default (not overridden
# here), while the embedding table printed for this model reports
# padding_idx=1 -- confirm the default is the intended pad id.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

In [17]:
# Build one attention mask per test sequence: 1.0 for real tokens, 0.0 for
# padding.
#
# The mask is derived from each sequence's real (post-truncation) length rather
# than from `token_id > 0`. The id test silently assumes that id 0 is never a
# real token; the embedding table printed for this model reports padding_idx=1,
# so id 0 is not this vocabulary's padding id. Because the ids were padded and
# truncated at the end, the first min(len(tokens), row width) positions of every
# row are exactly the real tokens.
assert len(tokenized_texts) == len(input_ids), \
    "tokenized_texts and input_ids are out of sync; re-run the preceding cells in order"

attention_masks = []

for tokens in tokenized_texts:
    n_real = min(len(tokens), input_ids.shape[1])
    seq_mask = [1.0] * n_real + [0.0] * (input_ids.shape[1] - n_real)
    attention_masks.append(seq_mask)

In [18]:
# Convert the padded test ids and their attention masks to PyTorch tensors.
test_inputs, test_masks = torch.tensor(input_ids), torch.tensor(attention_masks)

In [19]:
batch_size = 4

# The test set has no labels, so each item is an (input ids, attention mask) pair.
test_data = TensorDataset(test_inputs, test_masks)
# Iterate in dataset order. Predictions are later written out one per line in
# the order this loader yields them, so a shuffling sampler would leave the
# output lines unaligned with the rows of the test file.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [20]:
# Ask TensorFlow for the GPU device name and abort the notebook unless it is
# exactly '/device:GPU:0'.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [21]:
# Choose the torch device: fall back to the CPU when CUDA is unavailable,
# otherwise use the GPU and report how many there are and the name of GPU 0.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [22]:
class STLR(torch.optim.lr_scheduler._LRScheduler):
    """Cyclic, piecewise-linear learning-rate schedule.

    (The name is presumably short for "slanted triangular learning rate".)

    Within each cycle of ``steps_per_cycle`` steps, the factor applied to every
    base learning rate rises linearly from 1 to its peak at ``turning_point``
    and then falls linearly back towards 1. In cycle ``c`` (0-based) the peak
    factor is ``1 + decay**c * (max_mul - 1)``, so with ``decay == 1`` every
    cycle peaks at ``max_mul`` times the base learning rate.

    Args:
        optimizer: Wrapped optimizer.
        max_mul: Peak multiple of the base learning rate in the first cycle.
        ratio: Ratio of the falling part of a cycle to the rising part; the
            turning point is ``steps_per_cycle // (ratio + 1)``.
        steps_per_cycle: Number of scheduler steps in one cycle.
        decay: Per-cycle factor applied to the bump above the base rate.
        last_epoch: Index of the last step, forwarded to the base class.
    """

    def __init__(self, optimizer, max_mul, ratio, steps_per_cycle, decay=1, last_epoch=-1):
        # Stored as the bump above 1 so get_lr can return base_lr * (1 + multiplier).
        self.max_mul = max_mul - 1
        # Clamp to at least one step: when ratio + 1 > steps_per_cycle the floor
        # division yields 0 and get_lr would divide by zero on the first step.
        self.turning_point = max(1, steps_per_cycle // (ratio + 1))
        self.steps_per_cycle = steps_per_cycle
        self.decay = decay
        # The base-class constructor performs the initial step, which calls
        # get_lr, so every attribute above must be set before this call.
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the learning rate of every parameter group for the current step."""
        # Position inside the current cycle, and the decay accumulated so far.
        residual = self.last_epoch % self.steps_per_cycle
        multiplier = self.decay ** (self.last_epoch // self.steps_per_cycle)
        if residual <= self.turning_point:
            # Rising edge: 0 at the start of the cycle, full bump at the turning point.
            multiplier *= self.max_mul * (residual / self.turning_point)
        else:
            # Falling edge: full bump at the turning point, towards 0 at the cycle end.
            multiplier *= self.max_mul * (
                (self.steps_per_cycle - residual) /
                (self.steps_per_cycle - self.turning_point))
        return [lr * (1 + multiplier) for lr in self.base_lrs]

In [23]:
# Load pretrained 'roberta-base' with a sequence-classification head sized for
# the 8 label classes.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=8)
# Move the model to the `device` selected earlier instead of calling .cuda()
# unconditionally, so the model lives on the same device the training and
# evaluation loops move their batches to (including the CPU fallback).
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [24]:
# Number of passes over the training data, and the resulting number of
# optimizer steps (one per training batch per epoch).
epochs = 10
total_steps = len(train_dataloader) * epochs

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Linear learning-rate schedule over all training steps, with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [25]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of integer class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / expected.size

In [26]:
def format_time(elapsed):
    """Format a duration in seconds as a ``datetime.timedelta`` string (e.g. ``0:04:10``).

    The duration is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Fix the random seeds for reproducibility.
# NOTE(review): seeding here only covers randomness from this point on (e.g.
# the random order of training batches); anything randomised in earlier cells,
# such as model creation, is not affected -- consider seeding before those cells.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Start from clean gradients.
model.zero_grad()

for epoch_i in range(epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    # Sum of per-batch losses for this epoch.
    total_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress report every 500 batches (skipping the very first one).
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Move the batch to the selected device and unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Labels are passed in, so the first output is taken as the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Reset gradients for the next batch.
        model.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Accuracy is accumulated weighted by batch size and divided by the number
    # of examples, so a smaller final batch does not get the same weight as a
    # full one (as it would when averaging per-batch accuracies).
    eval_accuracy, nb_eval_examples = 0, 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No labels are passed, so the first output is taken as the logits;
        # gradients are not needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        batch_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

    print("  Accuracy: {0:.4f}".format(eval_accuracy / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...

  Average training loss: 1.31
  Training epcoh took: 0:04:10

Running Validation...
[4 4 4 3 4 4 4 4 7 4 4 7 7 4 4 4 3 4 7 7 7 4 4 3 4 7 4 4 4 5 3 4]
[5 4 4 7 3 5 4 4 3 4 4 4 7 3 4 4 4 4 3 4 3 5 4 4 3 5 4 5 5 3 4 3]
[6 4 4 4 4 4 6 4 4 4 3 4 7 5 7 4 5 4 4 4 3 4 5 5 4 5 5 3 7 4 7 7]
[4 3 4 5 4 4 4 4 4 4 5 7 4 4 4 4 7 4 4 4 4 7 4 4 3 4 3 7 3 4 3 4]
[4 7 7 4 4 4 4 3 7 3 7 4 4 4 4 4 4 4 4 7 5 4 4 4 6 4 4 3 4 6 4 5]
[4 7 3 4 4 3 3 4 4 4 4 4 3 4 3 4 4 7 5 4 4 3 5 3 4 4 4 7 5 4 7 5]
[3 7 4 7 4 3 5 4 3 5 3 4 4 4 4 3 5 4 4 5 4 5 4 4 7 3 4 4 4 4 4 5]
[4 4 4 4 4 5 4 5 4 7 7 4 4 3 4 3 4 4 4 6 7 4 3 4 4 4 5 3 7 4 6 5]
[3 4 3 4 4 4 4 4 4 4 4 4 3 7 4 3 4 4 4 4 4 6 4 4 4 7 4 4 4 4 4 5]
[4 4 4 4 4 4 4 4 4 4 3 7 4 5 4 4 4 4 4 3 3 4 3 4 4 3 4 4 3 5 4 4]
[6 4 4 7 4 4 4 3 3 3 4 4 7 4 7 4 4 3 4 4 4 3 7 4 7 7 5 4 3 3 7 4]
[4 4 3 4 5 7 4 3 6 7 7 4 4 3 4 4 7 4 4 4 4 4 7 4 5 4 4 4 4 4 7 7]
[4 4 4 4 5 4 7 4 7 4 5 5 4 4 4 4 7 4 4 4 7 4 5 4 5 4 4 4 4 7 4 0]
[3 4 4 4 4 4 3 3 3 3 3 4 3 3 7 6 5 3 4 7 7 4

In [None]:
# Predict a label for every test example and write the label names to
# result.txt, one per line, in the order test_dataloader yields them.
t0 = time.time()

model.eval()

# Open with "w" rather than "a": in append mode every re-run of this cell adds
# another full copy of the predictions to the end of the existing file.
with open('result.txt', "w") as f:
    for step, batch in enumerate(test_dataloader):
        # Progress report every 100 batches (skipping the very first one).
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

        # Move the batch to the selected device; test batches carry no labels.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch

        # No labels are passed, so the first output is taken as the logits;
        # gradients are not needed for inference.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0].detach().cpu().numpy()

        for logit in logits:
            # Highest-scoring class id, translated back to its label name.
            label = int(np.argmax(logit))
            emotion = mapping[label]
            print(emotion)
            f.write(emotion + '\n')