# Friends 감정분석 with ELECTRA and Data Augmentation(EmoLex 감성사전)
참고(링크) : http://sentiment.nrc.ca/lexicons-for-research/

**[Friends 감정]**<br>
분노, 혐오, 슬픔, 기쁨, 중립, 비중립, 놀라움, 공포의 8가지 감정<br>
'anger' 'disgust' 'sadness' 'joy' 'neutral' 'non-neutral' 'surprise' 'fear'<br>

**[EmoLex 감성사전]**<br>
positive, trust, anticipation, joy, fear, surprise, disgust, negative, sadness<br>
이중 joy, fear, surprise, disgust, sadness 5가지 감정 발췌<br>

# 준비
라이브러리, 파라미터 세팅

In [1]:
import tensorflow as tf
import torch

from transformers import ElectraTokenizer, ElectraForSequenceClassification
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
import random
import time
import datetime
import json
import re
import os

In [2]:
# ELECTRA 토크나이저 설정
TOKENIZER_Small = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
TOKENIZER_Base = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
TOKENIZER_Large = ElectraTokenizer.from_pretrained('google/electra-large-discriminator')

# ELECTRA 모델 설정
MODEL_Small = ElectraForSequenceClassification.from_pretrained('google/electra-small-generator', num_labels=8)
MODEL_Base = ElectraForSequenceClassification.from_pretrained('google/electra-base-generator', num_labels=8)
MODEL_Large = ElectraForSequenceClassification.from_pretrained('google/electra-large-generator', num_labels=8)

DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TEST_SIZE = 0.2
RANDOM_SEED = 42

def electra_param(maxlen=85,epochnum=7,batchsize=32,modelsize='small'):
    global MAX_LEN
    global EPOCHS
    global BAT_SZ
    global MOD_SZ
    global TOKENIZER_CHOICE
    global MODEL_CHOICE
    
    MAX_LEN = maxlen
    EPOCHS = epochnum
    BAT_SZ = batchsize
    MOD_SZ = modelsize
    
    if modelsize == 'small':
        TOKENIZER_CHOICE = TOKENIZER_Small
        MODEL_CHOICE = MODEL_Small
    elif modelsize == 'base':
        TOKENIZER_CHOICE = TOKENIZER_Base
        MODEL_CHOICE = MODEL_Base
    elif modelsize == 'large':
        TOKENIZER_CHOICE = TOKENIZER_Large
        MODEL_CHOICE = MODEL_Large
    else:
        TOKENIZER_CHOICE = TOKENIZER_Small
        MODEL_CHOICE = MODEL_Small
    
    print("MAX_LEN :", MAX_LEN)
    print("EPOCHS :", EPOCHS)
    print("BAT_SZ :", BAT_SZ)
    print()
    print("TOKENIZER :", TOKENIZER_CHOICE)
    print()
    print("MODEL :", MODEL_CHOICE)

Some weights of the model checkpoint at google/electra-small-generator were not used when initializing ElectraForSequenceClassification: ['generator_predictions.LayerNorm.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.weight', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-generator and are newly initializ

## ===> 아래 함수에서 주요 파라미터 설정하여 수행할 것! <===

In [3]:
# 주요 편집 대상 파라미터 (MAX_LEN, EPOCHS, BAT_SZ, TOKENIZER, MODEL)
electra_param(maxlen=85,epochnum=9,batchsize=32,modelsize='large')

MAX_LEN : 85
EPOCHS : 9
BAT_SZ : 32

TOKENIZER : PreTrainedTokenizer(name_or_path='google/electra-large-discriminator', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

MODEL : ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=1024, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in

In [4]:
# 디바이스 설정
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 4 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [5]:
# GPU 할당 변경하기
GPU_NUM = 2 # 원하는 GPU 번호 입력
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device) # change allocation of current GPU
print ('Current cuda device ', torch.cuda.current_device()) # check

Current cuda device  2


# 데이터 로드

In [6]:
def jsonToDf(file_name):
    with open(file_name, encoding = 'utf-8', mode = 'r') as file:
        json_array = json.load(file)
  
    result = pd.DataFrame.from_dict(json_array[0])

    is_first = True
    for array in json_array:
        if is_first:
            is_first = False
            continue
    
        temp_df = pd.DataFrame.from_dict(array)
        result = result.append(temp_df, ignore_index = True)

    return result

In [7]:
train = jsonToDf(DATA_IN_PATH+'friends_train.json')  # 학습용
dev = jsonToDf(DATA_IN_PATH+'friends_dev.json')  # 검증용
test = pd.read_csv(DATA_IN_PATH+'en_data.csv')  # 테스트(캐글) 데이터

In [8]:
print(train.shape)
print(dev.shape)
print(test.shape)

(10561, 4)
(1178, 4)
(1623, 5)


In [9]:
train.head()

Unnamed: 0,speaker,utterance,emotion,annotation
0,Chandler,also I was the point person on my companys tr...,neutral,4100000
1,The Interviewer,You mustve had your hands full.,neutral,5000000
2,Chandler,That I did. That I did.,neutral,5000000
3,The Interviewer,So lets talk a little bit about your duties.,neutral,5000000
4,Chandler,My duties? All right.,surprise,2000030


In [10]:
dev.head()

Unnamed: 0,speaker,utterance,emotion,annotation
0,Phoebe,"Oh my God, hes lost it. Hes totally lost it.",non-neutral,2120
1,Monica,What?,surprise,1000130
2,Ross,"Or! Or, we could go to the bank, close our acc...",neutral,3000200
3,Chandler,Youre a genius!,joy,500000
4,Joey,"Aww, man, now we wont be bank buddies!",sadness,40100


In [11]:
test.head()

Unnamed: 0,id,i_dialog,i_utterance,speaker,utterance
0,0,0,0,Phoebe,"Alright, whadyou do with him?"
1,1,0,1,Monica,Oh! You're awake!
2,2,0,2,Joey,Then you gotta come clean with Ma! This is not...
3,3,0,3,Mr. Tribbiani,"Yeah, but this is"
4,4,0,4,Joey,I don't wanna hear it! Now go to my room!


In [12]:
# # 'id' 컬럼 신규
# train['id'] = [i for i in range(len(train))]
# dev['id'] = [i for i in range(len(dev))]

# 'speaker', 'annotation', 'i_dialog', 'i_utterance' 컬럼 drop
train.drop('speaker', axis=1, inplace=True)
train.drop('annotation', axis=1, inplace=True)
dev.drop('speaker', axis=1, inplace=True)
dev.drop('annotation', axis=1, inplace=True)
test.drop('i_dialog', axis=1, inplace=True)
test.drop('i_utterance', axis=1, inplace=True)
test.drop('speaker', axis=1, inplace=True)

# 컬럼 순서 바꾸기
train = train[['utterance','emotion']]
dev = dev[['utterance','emotion']]

In [13]:
print(train.shape)
print(dev.shape)
print(test.shape)

(10561, 2)
(1178, 2)
(1623, 2)


In [14]:
train.head()

Unnamed: 0,utterance,emotion
0,also I was the point person on my companys tr...,neutral
1,You mustve had your hands full.,neutral
2,That I did. That I did.,neutral
3,So lets talk a little bit about your duties.,neutral
4,My duties? All right.,surprise


In [15]:
dev.head()

Unnamed: 0,utterance,emotion
0,"Oh my God, hes lost it. Hes totally lost it.",non-neutral
1,What?,surprise
2,"Or! Or, we could go to the bank, close our acc...",neutral
3,Youre a genius!,joy
4,"Aww, man, now we wont be bank buddies!",sadness


In [16]:
test.head()

Unnamed: 0,id,utterance
0,0,"Alright, whadyou do with him?"
1,1,Oh! You're awake!
2,2,Then you gotta come clean with Ma! This is not...
3,3,"Yeah, but this is"
4,4,I don't wanna hear it! Now go to my room!


In [17]:
train['emotion'].value_counts()

neutral        4752
non-neutral    2017
joy            1283
surprise       1220
anger           513
sadness         351
disgust         240
fear            185
Name: emotion, dtype: int64

### 추가 데이터 로드 (EmoLex 감성사전)
NRC-Emotion-Lexicon-Wordlevel : http://sentiment.nrc.ca/lexicons-for-research/<br>

**[EmoLex 감성사전]**<br>
positive, trust, anticipation, joy, fear, surprise, disgust, negative, sadness 감정 라벨링이 되어 있으며, 이중 **joy, fear, surprise, disgust, sadness** 5가지 감정 발췌하여 기본 데이터에 추가

In [18]:
# 추가 데이터 로드
df_tmp = pd.read_csv("./data_in/NRC-Emotion-Lexicon-Wordlevel/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt", sep='\t', header=None, engine='python')
df_tmp.reset_index(drop=False, inplace=False)
df_tmp.head()

Unnamed: 0,0,1,2
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0


In [19]:
len(df_tmp)

141820

In [20]:
df_tmp = df_tmp[(df_tmp != 0).all(1)]
df_tmp = df_tmp.reset_index(drop=True)
df_emolex = df_tmp.copy()
df_emolex.columns = [['utterance','emotion','label']]
df_emolex.drop('label', axis=1, inplace=True)
df_emolex.head(10)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,utterance,emotion
0,abacus,trust
1,abandon,fear
2,abandon,negative
3,abandon,sadness
4,abandoned,anger
5,abandoned,fear
6,abandoned,negative
7,abandoned,sadness
8,abandonment,anger
9,abandonment,fear


In [21]:
len(df_emolex)

13901

In [22]:
list_word = list(df_emolex.iloc[:,0])
list_emotion = list(df_emolex.iloc[:,1])

train_plus_tmp = pd.DataFrame( data={"utterance": list_word, "emotion": list_emotion} )
train_plus_tmp

Unnamed: 0,utterance,emotion
0,abacus,trust
1,abandon,fear
2,abandon,negative
3,abandon,sadness
4,abandoned,anger
...,...,...
13896,zest,anticipation
13897,zest,joy
13898,zest,positive
13899,zest,trust


In [23]:
joy = train_plus_tmp[train_plus_tmp['emotion']=='joy']
fear = train_plus_tmp[train_plus_tmp['emotion']=='fear']
surprise = train_plus_tmp[train_plus_tmp['emotion']=='surprise']
disgust = train_plus_tmp[train_plus_tmp['emotion']=='disgust']
sadness = train_plus_tmp[train_plus_tmp['emotion']=='sadness']

In [24]:
plus_result_tmp2 = pd.concat([train,joy,fear,surprise,disgust,sadness], axis=0, ignore_index=True)
train = plus_result_tmp2.copy()
train

Unnamed: 0,utterance,emotion
0,also I was the point person on my companys tr...,neutral
1,You mustve had your hands full.,neutral
2,That I did. That I did.,neutral
3,So lets talk a little bit about your duties.,neutral
4,My duties? All right.,surprise
...,...,...
15504,wretched,sadness
15505,wrinkled,sadness
15506,wrongdoing,sadness
15507,wrongful,sadness


In [25]:
print(train.shape)
print(dev.shape)
print(test.shape)

(15509, 2)
(1178, 2)
(1623, 2)


In [26]:
# train_neutral = train[train['emotion']=='neutral']
# train_neutral = train_neutral.sample(n=2500)  # neutral 편향을 낮춰주기
# train_non_neutral = train[train['emotion']=='non-neutral']
# train_joy = train[train['emotion']=='joy']
# train_surprise = train[train['emotion']=='surprise']
# train_anger = train[train['emotion']=='anger']
# train_sadness = train[train['emotion']=='sadness']
# train_disgust = train[train['emotion']=='disgust']
# train_fear = train[train['emotion']=='fear']

# train = pd.concat([train_neutral, train_joy, train_sadness, train_fear, train_anger, \
#                    train_non_neutral, train_surprise, train_disgust], axis=0)
# print(train.shape)

In [27]:
train['emotion'].value_counts()

neutral        4752
non-neutral    2017
joy            1972
surprise       1754
fear           1661
sadness        1542
disgust        1298
anger           513
Name: emotion, dtype: int64

In [28]:
train.head()

Unnamed: 0,utterance,emotion
0,also I was the point person on my companys tr...,neutral
1,You mustve had your hands full.,neutral
2,That I did. That I did.,neutral
3,So lets talk a little bit about your duties.,neutral
4,My duties? All right.,surprise


In [29]:
dev.head()

Unnamed: 0,utterance,emotion
0,"Oh my God, hes lost it. Hes totally lost it.",non-neutral
1,What?,surprise
2,"Or! Or, we could go to the bank, close our acc...",neutral
3,Youre a genius!,joy
4,"Aww, man, now we wont be bank buddies!",sadness


In [30]:
test.head()

Unnamed: 0,id,utterance
0,0,"Alright, whadyou do with him?"
1,1,Oh! You're awake!
2,2,Then you gotta come clean with Ma! This is not...
3,3,"Yeah, but this is"
4,4,I don't wanna hear it! Now go to my room!


In [31]:
# \x92 이외의 숫자, 특수문자 제거 등 적용 시 성능이 좋지 않기에 수행 제외

# \x92 제거
train['utterance'] = train['utterance'].apply( lambda x : re.sub('[\x92]',"'",str(x)) )
dev['utterance'] = dev['utterance'].apply( lambda x : re.sub('[\x92]',"'",str(x)) )
test['utterance'] = test['utterance'].apply( lambda x : re.sub('[\x92]',"'",str(x)) )

# # 영문자 이외 제거
# train['utterance'] = train['utterance'].apply( lambda x : re.sub('[^a-zA-Z0-9]',' ',str(x)) )
# dev['utterance'] = dev['utterance'].apply( lambda x : re.sub('[^a-zA-Z0-9]',' ',str(x)) )
# test['utterance'] = test['utterance'].apply( lambda x : re.sub('[^a-zA-Z0-9]',' ',str(x)) )

# # 숫자 제거
# train['utterance'] = train['utterance'].apply( lambda x : re.sub(r"\d+", " ", x) )
# dev['utterance'] = dev['utterance'].apply( lambda x : re.sub(r"\d+", " ", x) )
# test['utterance'] = test['utterance'].apply( lambda x : re.sub(r"\d+", " ", x) )

# # 특수문자 제거(정규 표현식 수행)
# train['utterance'] = train['utterance'].str.replace("[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》\x92]"," ")
# dev['utterance'] = dev['utterance'].str.replace("[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》\x92]"," ")
# test['utterance'] = test['utterance'].str.replace("[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》\x92]"," ")

# 데이터 전처리

In [32]:
def getInputsAndLabels(dataset):
    data = dataset.copy(deep=True)
    #data['utterance'] = data['utterance'].str.lower()

    utterances = data['utterance']
    utterances = ["[CLS] " + str(utterance) + " [SEP]" for utterance in utterances]
  
    encoder = LabelEncoder()
    labels = data['emotion'].values
    encoder.fit(labels)
    labels = encoder.transform(labels)

    tokenizer = TOKENIZER_CHOICE
    #tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
    tokenized_texts = [tokenizer.tokenize(utterance) for utterance in utterances]

    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    return input_ids, labels, attention_masks

In [33]:
def getInputsFromTest(dataset):
    data = dataset.copy(deep=True)
    #data['utterance'] = data['utterance'].str.lower()

    utterances = data['utterance']
    utterances = ["[CLS] " + str(utterance) + " [SEP]" for utterance in utterances]
  
    tokenizer = TOKENIZER_CHOICE
    #tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
    tokenized_texts = [tokenizer.tokenize(utterance) for utterance in utterances]

    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    return input_ids, attention_masks

In [34]:
def getIndex(dataset):
    data = dataset.copy(deep = True)
    input_index = data.id.tolist()
    return torch.tensor(input_index)

In [35]:
train_inputs, train_labels, train_masks = getInputsAndLabels(train)

dev_inputs, dev_labels, dev_masks = getInputsAndLabels(dev)

test_inputs, test_masks = getInputsFromTest(test)  # 지우지 말것

In [36]:
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

dev_inputs = torch.tensor(dev_inputs)
dev_labels = torch.tensor(dev_labels)
dev_masks = torch.tensor(dev_masks)

test_index = getIndex(test)
test_inputs = torch.tensor(test_inputs)
test_masks = torch.tensor(test_masks)

In [37]:
batch_size = BAT_SZ

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

dev_data = TensorDataset(dev_inputs, dev_masks, dev_labels)
dev_sampler = SequentialSampler(dev_data)
dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=batch_size)

test_data = TensorDataset(test_index, test_inputs, test_masks)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# 모델 구현

In [38]:
model = MODEL_CHOICE
#model = ElectraForSequenceClassification.from_pretrained('google/electra-large-generator', num_labels=8)
model.cuda()

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=1024, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linea

In [39]:
# from adabelief_pytorch import AdaBelief

# optimizer = AdaBelief(model.parameters(), lr=2e-5, eps=1e-8)

# epochs = EPOCHS

# total_steps = len(train_dataloader) * epochs

# # 학습률을 조금씩 감소시키는 스케줄러 생성
# scheduler = get_linear_schedule_with_warmup(optimizer, 
#                                             num_warmup_steps = 0,
#                                             num_training_steps = total_steps)

In [40]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,  # 2e-5
                  eps = 1e-8  # 1e-8
                )

epochs = EPOCHS

total_steps = len(train_dataloader) * epochs

# 학습률을 조금씩 감소시키는 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

# 모델 학습

In [41]:
from sklearn.metrics import f1_score

# 정확도 계산 함수
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def getF1Score(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return f1_score(labels_flat, pred_flat, average = None)

In [42]:
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [43]:
seed_val = RANDOM_SEED
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
             
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)

        loss = outputs[0]
        total_loss += loss.item()


        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.5f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    eval_loss, eval_accuracy, eval_f1 = 0, 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴 from 검증셋(Original Test Dataset)
    for batch in dev_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():     
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
     
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        # tmp_eval_f1 = getF1Score(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        # eval_f1 += tmp_eval_f1
        nb_eval_steps += 1

    print("  Accuracy: {0:.5f}".format(eval_accuracy/nb_eval_steps))
    #print("  F1: {0:.2f}".format(eval_f1/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...

  Average training loss: 1.76552
  Training epcoh took: 0:00:47

Running Validation...
  Accuracy: 0.45868
  Validation took: 0:00:01

Training...

  Average training loss: 1.47077
  Training epcoh took: 0:00:47

Running Validation...
  Accuracy: 0.50026
  Validation took: 0:00:01

Training...

  Average training loss: 1.35050
  Training epcoh took: 0:00:47

Running Validation...
  Accuracy: 0.51819
  Validation took: 0:00:01

Training...

  Average training loss: 1.26908
  Training epcoh took: 0:00:47

Running Validation...
  Accuracy: 0.54964
  Validation took: 0:00:01

Training...

  Average training loss: 1.20513
  Training epcoh took: 0:00:47

Running Validation...
  Accuracy: 0.55743
  Validation took: 0:00:01

Training...

  Average training loss: 1.15463
  Training epcoh took: 0:00:47

Running Validation...
  Accuracy: 0.56672
  Validation took: 0:00:01

Training...

  Average training loss: 1.11198
  Training epcoh took: 0:00:47

Running Validation...
  Accuracy:

# (참고) 제출 파일 생성

In [44]:
tmp_test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1)
test_result = test.copy(deep = True)
#test_result = test_result.drop(columns = ['i_dialog', 'i_utterance', 'speaker'])
test_result['Predicted'] = 'default'

encoder = LabelEncoder()
labels = train['emotion'].values
encoder.fit(labels)
labels = encoder.transform(labels)

for step, batch in enumerate(tmp_test_dataloader):
    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_index, b_input_ids, b_input_mask = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    idx = b_index.item()
    test_result['Predicted'][idx] = encoder.classes_[np.argmax(logits)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [45]:
test_result.tail()

Unnamed: 0,id,utterance,Predicted
1618,1618,Nooo.,non-neutral
1619,1619,"Hi, Kate!",joy
1620,1620,"Hi, Lauren.",neutral
1621,1621,"Hi, Lauren.",neutral
1622,1622,"Hi, pig!",joy


In [46]:
test_result = test_result.drop(columns = ['utterance'])
test_result.columns = ['Id','Predicted']

In [47]:
test_result.tail()

Unnamed: 0,Id,Predicted
1618,1618,non-neutral
1619,1619,joy
1620,1620,neutral
1621,1621,neutral
1622,1622,joy


In [48]:
# 해당 경로가 없으면 생성
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

SAVE_NM = "FRIENDS_ELECTRA_EmoLexSWD_"+str(MOD_SZ)+"_MAXLN"+str(MAX_LEN)+"_BATSZ"+str(BAT_SZ)+"_EPOCH"+str(EPOCHS)+".csv"

# csv 파일 생성
test_result.to_csv(DATA_OUT_PATH + SAVE_NM, index = False)  # 앙상블 조합 및 캐글 제출 용도

### 캐글 제출 결과
**[2020.12.12~]**<br>
**from_pretrained('google/electra-small-generator')**<br>
**MAXLEN 85, epochs 12 => 0.57090**<br>

**from_pretrained('google/electra-base-generator')**<br>
**MAXLEN 85, epochs 12 => 0.57090**<br>

**from_pretrained('google/electra-large-generator')**<br>
**MAXLEN 85, epochs 12 => 0.57090<br>
MAXLN85_BATSZ32_EPOCH12 => 0.57583<br>
MAXLN85_BATSZ32_EPOCH12 => 0.57336**