# Friends Emotionlines sentiment analysis using Transformer

참고자료: https://github.com/Parkchanjun/KU-NLP-2020-1/blob/master/%5B4%5D%20Transformer%EB%A5%BC_%EC%9D%B4%EC%9A%A9%ED%95%9C_%EA%B0%90%EC%A0%95%EB%B6%84%EC%84%9D_%EC%98%81%EC%96%B4.ipynb


## Multi-Head Attention


In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention layer.

    The input is linearly projected into queries, keys and values, each
    projection is split into ``num_heads`` heads of size
    ``embed_dim // num_heads``, attention is computed per head, and the
    heads are concatenated and passed through a final linear layer.

    Args:
        embed_dim: Size of the embedding (last) dimension of the input.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        # Width of each individual head.
        self.projection_dim = embed_dim // num_heads
        # Query/key/value are not used directly: each goes through its own
        # learned linear projection before attention is computed.
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        # Linear layer applied to the concatenated heads.
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(softmax(Q K^T / sqrt(d_k)) V, attention_weights)``."""
        logits = tf.matmul(query, key, transpose_b=True)
        # d_k is the size of the last axis of `key`, cast to float for sqrt.
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        weights = tf.nn.softmax(logits / tf.math.sqrt(depth), axis=-1)
        return tf.matmul(weights, value), weights

    def separate_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, -1, heads, head_dim), then swap axes 1 and 2."""
        split_shape = (batch_size, -1, self.num_heads, self.projection_dim)
        # The -1 axis absorbs the sequence length.
        return tf.transpose(tf.reshape(x, split_shape), perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs``.

        ``inputs`` is expected to be (batch_size, seq_len, embed_dim); the
        result is reshaped back to (batch_size, -1, embed_dim) before the
        final projection.
        """
        batch_size = tf.shape(inputs)[0]
        # Project first (query, key, value order), then split into heads.
        projections = [
            dense(inputs)
            for dense in (self.query_dense, self.key_dense, self.value_dense)
        ]
        query, key, value = (
            self.separate_heads(projected, batch_size) for projected in projections
        )
        context, _ = self.attention(query, key, value)
        # Undo the head/sequence axis swap before merging the heads.
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (batch_size, -1, self.embed_dim))
        return self.combine_heads(merged)



## Transformer Layer


In [3]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention plus feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: Size of the embedding dimension of the input and output.
        num_heads: Number of attention heads passed to the attention layer.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor fed to the attention layer and the first residual.
            training: Forwarded to both dropout layers. Defaults to ``None``
                so the block can be called without the argument, as the
                model-building code does.

        Returns:
            The layer-normalized output of the second residual connection.
        """
        attn_output = self.att(inputs)  # multi-head attention sub-layer
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + layer norm
        ffn_output = self.ffn(out1)  # feed-forward sub-layer
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + layer norm



## 임베딩 Layer


In [4]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of entries in the token-embedding table.
        emded_dim: Output size of both embeddings (parameter name kept as-is
            for compatibility with existing callers).
    """

    def __init__(self, maxlen, vocab_size, emded_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=emded_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=emded_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Position indices 0 .. (size of the last axis of x) - 1.
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

## 데이터 다운



In [5]:
from google.colab import drive
drive.mount('/content/drive')
# Open the URL shown after 'Go to this URL in a browser: ', grant access,
# then paste the authorization code into the prompt below and press Enter.
# The friends files are large, so the data is stored on Google Drive and loaded from there.
# The string literal below is not executed as code; it lists the download links
# for the friends files (version with unicode characters removed).
"""
friends파일(유니코드 제거ver)
friends_test.json  https://drive.google.com/file/d/10rCWTr2OMoiw3JJDja-9e1kwoETvoDmF/view?usp=sharing, 
friends_train.json     https://drive.google.com/file/d/1XMtL50HI3h4X-3a7hIKw9s4G9VdevDIQ/view?usp=sharing, 
friends_dev.json     https://drive.google.com/file/d/1gdx-PO4vDmO7ngI5Z05QZs3etOcui-cg/view?usp=sharing
"""
# Download the files from the links above, unzip them, and upload them to your own Google Drive.

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


'\nfriends파일(유니코드 제거ver)\nfriends_test.json  https://drive.google.com/file/d/10rCWTr2OMoiw3JJDja-9e1kwoETvoDmF/view?usp=sharing, \nfriends_train.json     https://drive.google.com/file/d/1XMtL50HI3h4X-3a7hIKw9s4G9VdevDIQ/view?usp=sharing, \nfriends_dev.json     https://drive.google.com/file/d/1gdx-PO4vDmO7ngI5Z05QZs3etOcui-cg/view?usp=sharing\n'

## 데이터셋 준비


In [6]:
import os
import numpy as np
import nltk
import json
from keras.utils import to_categorical  # helper that makes one-hot encoding easy
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Download the 'punkt' tokenizer data (presumably required by nltk.word_tokenize below).
nltk.download('punkt')


Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
def tokenize():
    """Return a dict mapping each emotion label to its integer class id.

    Ids are assigned in list order starting from 0; a label that appears
    more than once keeps the id of its first occurrence.
    """
    sentiment = ['neutral', 'joy', 'sadness', 'fear', 'anger', 'surprise', 'disgust', 'non-neutral']

    # dict.fromkeys drops repeated labels while preserving first-seen order.
    return {label: class_id for class_id, label in enumerate(dict.fromkeys(sentiment))}

def read_dataset(dataset_type):
    """Load utterances and emotion labels from a JSON dataset file.

    The file is expected to hold a sequence of dialogues, each a sequence of
    records with ``'utterance'`` and ``'emotion'`` keys.

    Args:
        dataset_type: Path of the JSON file to read (opened as UTF-8).

    Returns:
        A tuple ``(labels, sentences, max_seq_len)``: the integer class id of
        every utterance (looked up in the module-level ``sentiment2index``),
        the raw utterance strings, and the largest ``nltk.word_tokenize``
        token count seen over all utterances.
    """
    with open(dataset_type, "r", encoding="utf-8") as file_handler:
        dialogues = json.load(file_handler)

    labels = []
    sentences = []
    max_seq_len = 0
    for dialogue in dialogues:
        for turn in dialogue:
            utterance = turn['utterance']
            sentences.append(utterance)

            utterance_tokens = nltk.word_tokenize(utterance)
            # The label is the first token of the tokenized emotion string.
            emotion_tokens = nltk.word_tokenize(turn['emotion'])
            labels.append(sentiment2index[emotion_tokens[0]])

            if len(utterance_tokens) > max_seq_len:
                max_seq_len = len(utterance_tokens)

    return labels, sentences, max_seq_len


sentiment2index = tokenize()
# Put the paths of the friends_*.json files uploaded to Google Drive here:
# in Colab, open the drive folder in the left-hand file browser, right-click
# the file, choose "Copy path", and paste it below.
TRAIN_LABELS, TRAIN_SENTENCES, TRAIN_MAX_SEQ_LEN = read_dataset("/content/drive/My Drive/Colab Notebooks/Friends/friends_train.json")  # read the training data
TEST_LABELS, TEST_SENTENCES, TEST_MAX_SEQ_LEN = read_dataset("/content/drive/My Drive/Colab Notebooks/Friends/friends_test.json")  # read the test data
print(len(TRAIN_LABELS))
print(len(TEST_LABELS))
MAX_SEQUENCE_LEN = max(TRAIN_MAX_SEQ_LEN, TEST_MAX_SEQ_LEN)  # longest utterance across train and test

# One-hot encode the labels. num_classes is pinned to the size of the label
# table so train and test always get the same width, even if a split happens
# not to contain the highest class id.
TRAIN_LABELS = to_categorical(TRAIN_LABELS, num_classes=len(sentiment2index))
TEST_LABELS = to_categorical(TEST_LABELS, num_classes=len(sentiment2index))


print("Train : ", len(TRAIN_SENTENCES))
for train_label, train_sent in zip(TRAIN_LABELS, TRAIN_SENTENCES[0:30]):
  print(train_label, ':' ,train_sent)

print()
print("Test : ", len(TEST_SENTENCES))
for test_label, test_sent in zip(TEST_LABELS, TEST_SENTENCES[0:10]):
  print(test_label, ':' ,test_sent)

print("MAX_SEQUENCE_LEN", MAX_SEQUENCE_LEN)

10561
2764
Train :  10561
[1. 0. 0. 0. 0. 0. 0. 0.] : also I was the point person on my companys transition from the KL-5 to GR-6 system.
[1. 0. 0. 0. 0. 0. 0. 0.] : You mustve had your hands full.
[1. 0. 0. 0. 0. 0. 0. 0.] : That I did. That I did.
[1. 0. 0. 0. 0. 0. 0. 0.] : So lets talk a little bit about your duties.
[0. 0. 0. 0. 0. 1. 0. 0.] : My duties?  All right.
[1. 0. 0. 0. 0. 0. 0. 0.] : Now youll be heading a whole division, so youll have a lot of duties.
[1. 0. 0. 0. 0. 0. 0. 0.] : I see.
[1. 0. 0. 0. 0. 0. 0. 0.] : But therell be perhaps 30 people under you so you can dump a certain amount on them.
[1. 0. 0. 0. 0. 0. 0. 0.] : Good to know.
[1. 0. 0. 0. 0. 0. 0. 0.] : We can go into detail
[0. 0. 0. 1. 0. 0. 0. 0.] : No dont I beg of you!
[1. 0. 0. 0. 0. 0. 0. 0.] : All right then, well have a definite answer for you on Monday, but I think I can say with some confidence, youll fit in well here.
[0. 0. 0. 0. 0. 1. 0. 0.] : Really?!
[1. 0. 0. 0. 0. 0. 0. 0.] : Absolutely.  Y

In [8]:
# Read the CSV files for the Kaggle submission.
import pandas as pd
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Friends/en_data.csv",encoding= 'unicode_escape')
sample = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Friends/en_sample.csv",encoding= 'unicode_escape')

# Utterances to classify, taken from the 'utterance' column as a plain list.
caggle = data.utterance.tolist()


## 데이터 전처리

In [10]:
tokenizer = Tokenizer(num_words=None,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' , lower=True, char_level=False) # create the tokenizer
tokenizer.fit_on_texts(TRAIN_SENTENCES) # build the vocabulary from the training sentences only
TRAIN_SEQUENCES = tokenizer.texts_to_sequences(TRAIN_SENTENCES) # convert words to ids
TEST_SEQUENCES = tokenizer.texts_to_sequences(TEST_SENTENCES) # convert words to ids
caggle_sequences = tokenizer.texts_to_sequences(caggle)
# +1 presumably accounts for id 0, which the padded outputs below use as filler — TODO confirm
VOCAB_SIZE = len(tokenizer.word_index) + 1

print(TRAIN_SENTENCES[0])
print(TRAIN_SEQUENCES[0])
print(caggle[0])
print(caggle_sequences[0])

# NOTE(review): MAX_SEQUENCE_LEN was measured with nltk.word_tokenize in
# read_dataset, while these sequences come from the Keras Tokenizer; the two may
# count tokens differently — confirm no sequence is longer than MAX_SEQUENCE_LEN.
x_caggle = pad_sequences(caggle_sequences, padding='post', maxlen=MAX_SEQUENCE_LEN)
x_train = pad_sequences(TRAIN_SEQUENCES, padding='post', maxlen=MAX_SEQUENCE_LEN) # apply padding (post or pre?)
x_test = pad_sequences(TEST_SEQUENCES, padding='post', maxlen=MAX_SEQUENCE_LEN) # apply padding
print("PAD_SEQUENCES COMPLETES")
print(x_train[0])
print(MAX_SEQUENCE_LEN)
print(x_caggle[0])

# The string literal below is not executed as code; it keeps an alternative padding call.
"""
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)#패딩
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)#패딩
"""

also I was the point person on my companys transition from the KL-5 to GR-6 system.
[409, 1, 35, 3, 760, 440, 29, 20, 2880, 2881, 127, 3, 2882, 1153, 4, 2883, 900, 901]
Alright, whadyou do with him?
[397, 26, 36, 73]
PAD_SEQUENCES COMPLETES
[ 409    1   35    3  760  440   29   20 2880 2881  127    3 2882 1153
    4 2883  900  901    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]
91
[397  26  36  73   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 

'\nx_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)#패딩\nx_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)#패딩\n'

## 데이터 구성 확인

In [23]:
# Count the samples in each emotion class: one count per column of the
# one-hot label matrix, so the number of classes is taken from the data
# instead of being hard-coded. .tolist() yields plain Python ints so the
# printed form stays a regular list.
emotion = np.count_nonzero(np.asarray(TRAIN_LABELS) == 1, axis=0).tolist()
print(emotion)

emotion = np.count_nonzero(np.asarray(TEST_LABELS) == 1, axis=0).tolist()
print(emotion)
# The neutral class has far more samples than any of the other emotion classes.


[4752, 1283, 351, 185, 513, 1220, 240, 2017]
[1287, 304, 85, 32, 161, 286, 68, 541]


## 모델 구축


In [11]:
embed_dim = 32  # Embedding size for each token (the paper uses 512)
num_heads = 2  # Number of attention heads (the paper uses 8)
ff_dim = 32  # Hidden layer size in feed forward network inside transformer (the paper uses 2048)
maxlen = MAX_SEQUENCE_LEN  # model input length = longest tokenized utterance found while reading the data



hello


## 학습


In [22]:
# Build the classifier: embedding -> one transformer block -> pooling -> dense head.
inputs = layers.Input(shape=(maxlen,)) # model input: one padded id sequence per sample
embedding_layer = TokenAndPositionEmbedding(maxlen, VOCAB_SIZE, embed_dim) # create the embedding layer
x = embedding_layer(inputs)  # token + position embedding
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim) # create the transformer block
x = transformer_block(x) # transformer
x = layers.GlobalAveragePooling1D()(x) # average pooling
x = layers.Dropout(0.5)(x) # dropout
x = layers.Dense(20, activation="relu")(x) # feed-forward layer
x = layers.Dropout(0.5)(x) # dropout
outputs = layers.Dense(8, activation="softmax")(x) # softmax over the 8 emotion labels

model = keras.Model(inputs=inputs, outputs=outputs) # create the model

model.compile("adam", "CategoricalCrossentropy", metrics=["accuracy"])

# The string literal below is not executed as code; it keeps an unused
# early-stopping / checkpoint callback configuration.
"""
callback_list = [
  keras.callbacks.EarlyStopping(
    monitor='val_accuracy', # 모델의 검증 정확도 모니터링
    patience=2, # 3 에포크보다 더 길게 향상되지 않으면 중단
  ),
  keras.callbacks.ModelCheckpoint(
    filepath='my_model.h5', # 저장
    monitor='val_accuracy',
    save_best_only=True, # 가장 좋은 모델
  )
]
"""

# NOTE(review): validation_data here and model.evaluate below both use
# x_test / TEST_LABELS, so the validation and test figures come from the same data.
history = model.fit(
    x_train, TRAIN_LABELS, batch_size=64, epochs=7, validation_data=(x_test, TEST_LABELS)
)

# Print the model summary
model.summary() 

# Measure performance
test_loss,test_acc=model.evaluate(x_test,TEST_LABELS)
print("Test_acc: ",test_acc)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 91)]              0         
_________________________________________________________________
token_and_position_embedding (None, 91, 32)            195840    
_________________________________________________________________
transformer_block_9 (Transfo (None, 91, 32)            6464      
_________________________________________________________________
global_average_pooling1d_9 ( (None, 32)                0         
_________________________________________________________________
dropout_38 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_78 (Dense)             (None, 20)                660       
_______________________________________________________

## 예측 (for 캐글 Leaderboard)

In [13]:

# Predict an emotion for every Kaggle utterance and write the result to CSV.
# Only non-default arguments are passed to predict; the original call spelled
# out default values only, so the behavior is unchanged.
predictions = model.predict(x_caggle, verbose=0)

sentiment = ['neutral', 'joy', 'sadness', 'fear', 'anger', 'surprise', 'disgust', 'non-neutral']
# argmax over the class axis gives the predicted class id of each utterance,
# which is then mapped back to its label string.
y_caggle = [sentiment[class_id] for class_id in np.argmax(predictions, axis=1)]
print(y_caggle)

# Name the prediction column 'Expected' explicitly. The original passed
# index='Expected', but `index` is a boolean switch, so the column header was
# written as 0 instead.
# NOTE(review): the row index is still written without a header label; if the
# leaderboard expects a named id column, pass index_label — confirm against
# the sample submission file.
dataframe = pd.DataFrame({'Expected': y_caggle})
dataframe.to_csv("/content/drive/My Drive/Colab Notebooks/Friends/en_sample.csv", index=True, header=True)


['neutral', 'non-neutral', 'surprise', 'neutral', 'non-neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'non-neutral', 'non-neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'surprise', 'sadness', 'surprise', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'joy', 'neutral', 'neutral', 'neutral', 'non-neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'non-neutral', 'neutral', 'non-neutral', 'neutral', 'neutral', 'surprise', 'neutral', 'neutral', 'joy', 'neutral', 'neutral', 'neutral', 'surprise', 'neutral', 'non-neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'surprise', 'neutral', 'non-neutral', 'neutral', 'joy', 'neutral', 'non-neutral', 'neutral', 'neutral', 'non-neutral', 'neutral', 'surprise', 'neutral', 'non-neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'non-neutral', 'neutral', 'neutral', 'neutral', 'non-neutral', 'neutral', 'neutral', 'joy', 'neutral', 'neutral', 'non-neutral', 'j

"\nimport csv\nf = open('/content/drive/My Drive/Colab Notebooks/Friends/en_data.csv', 'r', encoding='utf-8')\nrdr = csv.reader(f)\n\nsentences=[]\nfor line in rdr:\n  sentence = line[4]\n  print(sentence)\nf.close\n"