In [1]:
import pandas as pd
from pyvi import ViTokenizer
import re
import nltk
import seaborn as sns
from matplotlib import pyplot

# 1. Import data

In [2]:
train_filename = "train_nor_811.xlsx"
valid_filename = "valid_nor_811.xlsx"
test_filename = "test_nor_811.xlsx"
train_data = pd.read_excel(train_filename, engine = "openpyxl")
valid_data = pd.read_excel(valid_filename, engine = "openpyxl")
test_data = pd.read_excel(test_filename, engine = "openpyxl")

In [3]:
from sklearn.preprocessing import LabelEncoder
def file_processing(data):
    data.drop(columns = {"Unnamed: 0"}, axis = 1, inplace = True)
    data["emotion_encode"] = data["Emotion"]
    encoder = LabelEncoder()
    data.emotion_encode = encoder.fit_transform(data.Emotion)
    return data

In [4]:
train_data = file_processing(train_data)
valid_data = file_processing(valid_data)
test_data = file_processing(test_data)

# 2. Data visualization

# 3. Data preprocessing

In [5]:
def remove_duplicate(word):
    prev_char = ""
    clean_word = ""
    for character in word:
        if(character != prev_char):
            clean_word += character
            prev_char = character
    return clean_word

In [7]:
def deEmojify(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def normalize_sentences(sentences):
    punc_lst = {'.', ',', '...', '-', '“', '”', ':', '(', ')', '"', '!', '&', ';', '?', '*', ']', '>', '…', '’',"``","''", "=", "%", "^", "@", "<", ">"}
    confusing_words = {"per"}
    acronym_word = {
        "ko" : "không",
        "k" : "không",
        "z" : "vậy",
        "v" : "vậy",
        "dzậy" : "vậy",
        "dậy": "vậy",
        "t" : "tao",
        "m" : "mày",
        "sgk" : "sách_giáo_khoa",
        "zi" : "vậy",
        "dth" : "dễ_thương",
        "dume": "đụ mẹ"
    }
    
    clean_sentences = []
    
    # remove punctuation and lowercase
    for sent in sentences:
        
        # remove emojis
        sent = deEmojify(sent)
        
        sent = nltk.word_tokenize(sent)
        temp = []
        for word in sent:
            word = word.lower()
            word = remove_duplicate(word)
            if (word in punc_lst or word in confusing_words):
                continue
            elif(word in acronym_word):
                temp.append(acronym_word[word])
            elif(word.isdigit()):
                temp.append("<NUM>")
            else:
                temp.append(word)
        # remove whitespace
        sent = ' '.join(temp)
        
        clean_sentences.append(sent)
        
    return clean_sentences

In [8]:
def normalize_dataset(data):
    sentences = []
    for i in range(len(data)):
        sentences.append(ViTokenizer.tokenize(data.Sentence[i]))
    
    sentences = normalize_sentences(sentences)
    encode_tags = data.emotion_encode
    
    # remove empty sentences
    for idx, sent in enumerate(sentences):
        if sent.strip() == "":
            del sentences[idx]
            del encode_tags[idx]
    
    return sentences, encode_tags

In [9]:
train_clean_sentences, train_encode_tags = normalize_dataset(train_data)
valid_clean_sentences, valid_encode_tags = normalize_dataset(valid_data)
test_clean_sentences, test_encode_tags = normalize_dataset(test_data)

# 4. Model architecture

In [10]:
from tensorflow.keras.layers import Flatten
from tensorflow.keras import layers, activations , models , preprocessing , utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

### Tokenizer

In [113]:
MAX_LEN = 150

tokenizer = Tokenizer(lower = True, split = ' ')
tokenizer.fit_on_texts(train_clean_sentences + valid_clean_sentences)

X_train = tokenizer.texts_to_sequences(train_clean_sentences)
X_train = pad_sequences(X_train, MAX_LEN, padding='post', truncating='post')

X_valid = tokenizer.texts_to_sequences(valid_clean_sentences)
X_valid = pad_sequences(X_valid, MAX_LEN, padding='post', truncating='post')

In [114]:
vocab_size = len(tokenizer.word_index) + 1

### Word2Vec

In [108]:
from gensim.models import Word2Vec
import numpy as np
MAX_LEN = 150
w2v_model = Word2Vec(train_clean_sentences + valid_clean_sentences, min_count = 1, size = MAX_LEN)

In [109]:
def convert2vec(sentence):
    _sum = np.array([0]*MAX_LEN)
    for word in sentence:
        if not(word in w2v_model.wv.vocab):
            continue
        vec = w2v_model.wv[word]
        _sum = _sum + vec
    return _sum/len(sentence)

In [110]:
X_train = []
for sent in train_clean_sentences:
    if len(sent) == 0:
        X_train.append(np.array([0]*MAX_LEN))
    else:
        X_train.append(convert2vec(sent))
X_train = np.array(X_train)

X_valid = []
for sent in valid_clean_sentences:
    if len(sent) == 0:
        X_valid.append(np.array([0]*MAX_LEN))
    else:
        X_valid.append(convert2vec(sent))
X_valid = np.array(X_valid)

## 4.1 CNN Model

In [71]:
embedding_dim = 128

inputs = layers.Input(shape=( MAX_LEN , ))
embedding = layers.Embedding(vocab_size, embedding_dim, input_length=MAX_LEN)(inputs)

cnn1 = layers.Conv1D(filters=100, kernel_size=1, activation='relu')(embedding)
cnn1 = layers.MaxPooling1D(pool_size=2)(cnn1)
cnn1 = Flatten()(cnn1)

cnn2 = layers.Conv1D(filters=100, kernel_size=2, activation='relu')(embedding)
cnn2 = layers.MaxPooling1D(pool_size=2)(cnn2)
cnn2 = Flatten()(cnn2)

outputs = layers.Concatenate()([cnn1,cnn2])

outputs = layers.Dense(28, activation='tanh')(outputs)
outputs = layers.Dense(14, activation='tanh')(outputs)
outputs = layers.Dense(7, activation='softmax')(outputs)
model=models.Model(inputs,outputs)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# model.summary()

In [66]:
y_train = tf.keras.utils.to_categorical(train_encode_tags)
y_valid = tf.keras.utils.to_categorical(valid_encode_tags)

In [72]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1)
history = model.fit(X_train, y_train,
                    epochs = 10,
                    callbacks = [es],
                    validation_data=(X_valid, y_valid),
                    batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 00003: early stopping


## 4.2 LSTM Model

In [32]:
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense

model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_dim, input_length=MAX_LEN))
# model2.add(SpatialDropout1D(0.2))
model2.add(LSTM(128))
model2.add(Dense(128, activation='sigmoid'))
model2.add(Dense(7, activation='softmax'))
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [33]:
y_train_lstm = tf.keras.utils.to_categorical(train_encode_tags)
y_valid_lstm = tf.keras.utils.to_categorical(valid_encode_tags)

history2 = model2.fit(X_train, y_train_lstm,
                    epochs = 20,
                    callbacks = [es],
                    validation_data=(X_valid, y_valid_lstm),
                    batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 00002: early stopping


## 4. BiLSTM model

In [115]:
bi_model = Sequential()
bi_model.add(Embedding(vocab_size, embedding_dim, input_length=MAX_LEN))
bi_model.add(SpatialDropout1D(0.2))
bi_model.add(layers.Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
bi_model.add(Dense(7, activation='softmax'))
bi_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [116]:
bi_history = bi_model.fit(X_train, y_train_lstm,
                    epochs=10,
                    callbacks = [es],
                    validation_data=(X_valid, y_valid_lstm),
                    batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 00004: early stopping


In [49]:
bi_model.save("bi_model.h5")

## 4.3 Decision Tree Model + TF IDF vectorize

In [74]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [101]:
# TF IDF vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(use_idf = True, max_features=1800)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(train_clean_sentences)

tfidf_vectorizer_vectors_test = tfidf_vectorizer.fit_transform(test_clean_sentences)

In [102]:
tfidf_vectorizer_vectors.shape

(5547, 1800)

In [103]:
tree_model = DecisionTreeClassifier()
Y_DT = np.array(train_encode_tags)

kf = KFold(n_splits=10)
for train_index, test_index in kf.split(tfidf_vectorizer_vectors):
    X_train_DT, X_test_DT = tfidf_vectorizer_vectors[train_index], tfidf_vectorizer_vectors[test_index]
    y_train_DT, y_test_DT = Y_DT[train_index], Y_DT[test_index]
    
    tree_model.fit(X_train_DT, y_train_DT)

In [104]:
y_pred_DT = tree_model.predict(tfidf_vectorizer_vectors_test)

In [105]:
print(classification_report(test_encode_tags, y_pred_DT))

              precision    recall  f1-score   support

           0       0.14      0.07      0.10        40
           1       0.18      0.38      0.24       132
           2       0.33      0.22      0.26       193
           3       0.09      0.02      0.04        46
           4       0.19      0.16      0.18       129
           5       0.17      0.16      0.17       116
           6       0.06      0.05      0.06        37

    accuracy                           0.20       693
   macro avg       0.17      0.15      0.15       693
weighted avg       0.21      0.20      0.19       693



## 4.4 Linear SVC + TF IDF vectorize

In [85]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

In [106]:
linear_svc = LinearSVC()
clf = linear_svc.fit(tfidf_vectorizer_vectors, train_encode_tags)

svc_model = CalibratedClassifierCV(base_estimator=linear_svc, cv="prefit")

svc_model.fit(tfidf_vectorizer_vectors, train_encode_tags)
y_pred_DT = svc_model.predict(tfidf_vectorizer_vectors_test)

In [107]:
print(classification_report(test_encode_tags, y_pred_DT))

              precision    recall  f1-score   support

           0       0.12      0.07      0.09        40
           1       0.24      0.33      0.28       132
           2       0.27      0.35      0.30       193
           3       0.00      0.00      0.00        46
           4       0.19      0.16      0.17       129
           5       0.17      0.14      0.15       116
           6       0.00      0.00      0.00        37

    accuracy                           0.22       693
   macro avg       0.14      0.15      0.14       693
weighted avg       0.19      0.22      0.20       693

