In [None]:
# !pip install httplib2==0.15.0 #Requires restart on colab
# !pip install --user flair
# !pip install transformers

In [None]:
import re
import pandas as pd
import numpy as np
import transformers

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import matplotlib.pyplot as plt
import seaborn as sn
from tabulate import tabulate

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, hamming_loss, classification_report, confusion_matrix, multilabel_confusion_matrix

In [None]:
import time
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

#Setup

- add texts and labels
- set preprocessing parameters
- set model parameters

In [131]:
#DATA

original_train_sentences = []    #List of texts
original_train_labels = []      #List of labels

#PREPROCESSING
DEIDENTIFY = True     #True -> replaces URLs, emails, and usernames with reserved tokens
EMOPRESERVE = True    #True -> adds emoticons and emojis to tokenizer vocabulary and prevents them from being affected by further text cleaning
TEXTCLEAN = False     #True -> removes or isolates specific punctuations and expands contractions
TOKEN_TYPE = "ws"     #wp -> wordpiece tokenization; ws -> word split

#GCN Parameters
EDGE = 2                                # 0:d2w 1:d2w+w2w 2:d2w+w2w+d2d
NODE = 2 if TOKEN_TYPE == "ws" else 1   # 0:one-hot #1:BERT  #2:GLOVE
NUM_LAYERS = 2
HIDDEN_DIM = 200
DROP_OUT = 0.5
LR = 0.02
WEIGHT_DECAY = 0
EARLY_STOPPING = 10
GCN_EPOCHS = 200

#BERT Parameters
BERT_EPOCHS = 200
BERT_DROPOUT = 0.5
BERT_LR = 1e-05
BERT_EARLYSTOP = 10
MAX_LENGTH = 256

#General Parameters
BATCH_SIZE = 32

#Load Resources

In [132]:
#Load list of emoticons
with open("resources/TextEmoticonList.txt", "r") as file:
  emoticonList = file.read().split("\n")

#Remove emoticons with spaces in-between
emoticonList = [emoticon for emoticon in emoticonList if len(emoticon.split(" ")) == 1]

#Remove one character emoticons
emoticonList = [emoticon for emoticon in emoticonList if len(emoticon) > 1]

print(len(emoticonList))
print(emoticonList[:10])

2609
[':)', ':-)', ':]', ':-]', ':3', ':-3', ':>', ':->', '8)', '8-)']


In [133]:
#Load list of emojis
emojiList = pd.read_csv("resources/Emojis-Grid view.csv")
emojiList = emojiList[emojiList["Emoji"] != "C"]
emojiList = emojiList["Emoji"].tolist()

#Unicode versions
emojiList_uni = [emoji.encode('unicode-escape').decode('ASCII') for emoji in emojiList]

print(len(emojiList))
print(emojiList[:10])
print(emojiList_uni[:10])

1627
['😀', '😁', '😂', '🤣', '😃', '😄', '😅', '😆', '😉', '😊']
['\\U0001f600', '\\U0001f601', '\\U0001f602', '\\U0001f923', '\\U0001f603', '\\U0001f604', '\\U0001f605', '\\U0001f606', '\\U0001f609', '\\U0001f60a']


# Load Lexicons

##EmoLex

In [134]:
emoLex = pd.read_csv("_LEXICONS/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt", delimiter = "\t", names = ["Term", "AffectCategory", "AssociationFlag"])
print("Emotions:", emoLex["AffectCategory"].unique())

#Check for punctuations on lexicon
print("Punctuations:", [text for text in emoLex["Term"].unique().astype(str) if len(re.findall("[^#\w]", text)) > 0])

#Check for stopwords
stop = stopwords.words("english")
print("Stop words:", set(emoLex["Term"].unique().astype(str)) & set(stop))

Emotions: ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'negative' 'positive'
 'sadness' 'surprise' 'trust']
Punctuations: []
Stop words: {'haven', 'don'}


In [135]:
emoLex = pd.pivot(emoLex.loc[emoLex["Term"].notna()], index = "Term", columns = "AffectCategory", values = "AssociationFlag")

emoLex_labels = ["anger", "disgust", "fear", "sadness", "surprise", "negative", "other"] #removed none
emoLex_otherLabels = [x for x in emoLex.columns.values if x not in emoLex_labels]

emoLex["other"] = (emoLex[emoLex_otherLabels].sum(axis = 1) > 0).astype(int)
emoLex["labels"] = [np.array(x) for x in emoLex[emoLex_labels].values.tolist()]

emoLex_allTokens = emoLex.index.values
emoLex_allEmotions = np.stack(emoLex["labels"].values)

emoLex.head()

AffectCategory,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,other,labels
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
aback,0,0,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0]"
abacus,0,0,0,0,0,0,0,0,0,1,1,"[0, 0, 0, 0, 0, 0, 1]"
abandon,0,0,0,1,0,1,0,1,0,0,0,"[0, 0, 1, 1, 0, 1, 0]"
abandoned,1,0,0,1,0,1,0,1,0,0,0,"[1, 0, 1, 1, 0, 1, 0]"
abandonment,1,0,0,1,0,1,0,1,1,0,0,"[1, 0, 1, 1, 1, 1, 0]"


##TEC

In [136]:
tec = pd.read_csv("_LEXICONS/NRC-Hashtag-Emotion-Lexicon-v0.2.txt", delimiter = "\t", names = ["AffectCategory", "Term", "Score"])
print("Emotions:", tec["AffectCategory"].unique())

#Check for punctuations on lexicon
print("Punctuations:", [text for text in tec["Term"].unique().astype(str) if len(re.findall("[^#\w]", text)) > 0])

#Check for stopwords
stop = stopwords.words("english")
print("Stop words:", set(tec["Term"].unique().astype(str)) & set(stop))

Emotions: ['anticipation' 'fear' 'anger' 'trust' 'surprise' 'sadness' 'joy'
 'disgust']
Punctuations: ['25/1', 'spring/summer', '20/20', '85%', 'r&amp', '9/11', '12/12/12', '30%', 'a$ap', '$2', 'love/hate', '25%', '+1', '50%', '95%', '40%', '$1', '60%', '^_^', '$20', '=d', '2%', '5%', '99%', '$200', 'and/or', '$100', '#&lt', '$10', '$50', '20%', 'w/', 'm&amp', '1%', '1/2', '12%', '#a&amp', '3%', 'b&amp', '10%', '4%', '6%', '*sigh*', 'w/out', 'h&amp', '=/', '//', '70%', 'w/o', '_&lt', '\\\\', 'f*ck', 'a&amp', 'sh*t', 'times&lt', 'f**king', 'things&lt', 'boyfriend/girlfriend', '^mj', 'them&lt', 'f***ing', 'b*tch', 'night&lt', 'f******', 'f*cking', 'f**k', 'back&lt', '$30', 'f***', 'bf/gf', '$300', 'school&lt', 'sky+', 'at&amp', 'today&lt', '$15', 'time&lt', 'him/her', 'day&lt', '24/7', '$80', '1/4', 'it&lt', '3/4', '80%', 'me&lt', 'b+', '&lt', '$25', 'me&amp', '9%', '98%', 'you&lt', '$60', '2/3', 'he/she', 'b/c', '$5', '$40', '75%', '$4', 'basic/glitter', 'standard&amp', 'sms/dm/', 'foll

In [137]:
scoreThreshold = 0.5

tec["Value"] = (tec["Score"] > scoreThreshold).astype(int)
tec = pd.pivot(tec.loc[tec["Term"].notna()], index = "Term", columns = "AffectCategory", values = "Value").fillna(0).astype(int)

tec_labels = ["anger", "disgust", "fear", "sadness", "surprise", "other"] #removed none
tec_otherLabels = [x for x in tec.columns.values if x not in tec_labels]

tec["other"] = (tec[tec_otherLabels].sum(axis = 1) > 0).astype(int)
tec["labels"] = [np.array(x) for x in tec[tec_labels].values.tolist()]

tec_allTokens = tec.index.values
tec_allEmotions = np.stack(tec["labels"].values)

tec.head()

AffectCategory,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,other,labels
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
#&lt,0,0,0,0,1,0,0,0,1,"[0, 0, 0, 0, 0, 1]"
#1,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
#100thingsaboutme,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
#100thingsthatmakemehappy,0,0,0,0,1,0,0,0,1,"[0, 0, 0, 0, 0, 1]"
#121212concert,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 1, 0]"


##SenticNet

In [138]:
senticNet = pd.read_csv("_LEXICONS/senticnet_tsv.txt", delimiter = "\t", keep_default_na = False)

senticNet["PRIMARY EMOTION"] = senticNet["PRIMARY EMOTION"].str.replace("#", "")
senticNet.loc[senticNet["PRIMARY EMOTION"].isin(["enthusiasm", "eagerness", "responsiveness"]), "MainEmotion"] = "eagerness"
senticNet.loc[senticNet["PRIMARY EMOTION"].isin(["calmness", "bliss", "serenity"]), "MainEmotion"] = "calmness"
senticNet.loc[senticNet["PRIMARY EMOTION"].isin(["joy", "ecstasy", "contentment"]), "MainEmotion"] = "joy"
senticNet.loc[senticNet["PRIMARY EMOTION"].isin(["pleasantness", "acceptance", "delight"]), "MainEmotion"] = "pleasantness"
senticNet.loc[senticNet["PRIMARY EMOTION"].isin(["disgust", "dislike", "loathing"]), "MainEmotion"] = "disgust"
senticNet.loc[senticNet["PRIMARY EMOTION"].isin(["sadness", "grief", "melancholy"]), "MainEmotion"] = "sadness"
senticNet.loc[senticNet["PRIMARY EMOTION"].isin(["anger", "annoyance", "rage"]), "MainEmotion"] = "anger"
senticNet.loc[senticNet["PRIMARY EMOTION"].isin(["fear", "anxiety", "terror"]), "MainEmotion"] = "fear"

senticNet["SECONDAY EMOTION"] = senticNet["SECONDAY EMOTION"].str.replace("#", "")
senticNet.loc[senticNet["SECONDAY EMOTION"].isin(["enthusiasm", "eagerness", "responsiveness"]), "SecondEmotion"] = "eagerness"
senticNet.loc[senticNet["SECONDAY EMOTION"].isin(["calmness", "bliss", "serenity"]), "SecondEmotion"] = "calmness"
senticNet.loc[senticNet["SECONDAY EMOTION"].isin(["joy", "ecstasy", "contentment"]), "SecondEmotion"] = "joy"
senticNet.loc[senticNet["SECONDAY EMOTION"].isin(["pleasantness", "acceptance", "delight"]), "SecondEmotion"] = "pleasantness"
senticNet.loc[senticNet["SECONDAY EMOTION"].isin(["disgust", "dislike", "loathing"]), "SecondEmotion"] = "disgust"
senticNet.loc[senticNet["SECONDAY EMOTION"].isin(["sadness", "grief", "melancholy"]), "SecondEmotion"] = "sadness"
senticNet.loc[senticNet["SECONDAY EMOTION"].isin(["anger", "annoyance", "rage"]), "SecondEmotion"] = "anger"
senticNet.loc[senticNet["SECONDAY EMOTION"].isin(["fear", "anxiety", "terror"]), "SecondEmotion"] = "fear"

senticNet["Value"] = 1

In [139]:
temp = pd.concat((senticNet[["CONCEPT", "MainEmotion"]],
                  senticNet.loc[senticNet["SecondEmotion"].notna(), ["CONCEPT", "SecondEmotion"]].rename(columns = {"SecondEmotion": "MainEmotion"}),
                  senticNet[["CONCEPT", "POLARITY VALUE"]].rename(columns = {"POLARITY VALUE": "MainEmotion"})))
temp = temp.drop_duplicates()
temp["Value"] = 1

senticNet = pd.pivot(temp, index = "CONCEPT", columns = "MainEmotion", values = "Value").fillna(0).astype(int)

In [140]:
senticNet_labels = ["anger", "disgust", "fear", "sadness", "negative", "other"] #removed none
senticNet_otherLabels = [x for x in senticNet.columns.values if x not in senticNet_labels]

senticNet["other"] = (senticNet[senticNet_otherLabels].sum(axis = 1) > 0).astype(int)
senticNet["labels"] = [np.array(x) for x in senticNet[senticNet_labels].values.tolist()]

senticNet_allTokens = senticNet.index.values
senticNet_allEmotions = np.stack(senticNet["labels"].values)

senticNet.head()

MainEmotion,anger,calmness,disgust,eagerness,fear,joy,nan,negative,pleasantness,positive,sadness,other,labels
CONCEPT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
( :,0,0,0,0,0,1,1,0,0,1,0,1,"[0, 0, 0, 0, 0, 1]"
( x,0,0,0,0,0,1,1,0,0,1,0,1,"[0, 0, 0, 0, 0, 1]"
(':,0,0,0,0,0,1,1,0,0,1,0,1,"[0, 0, 0, 0, 0, 1]"
(-':,0,0,0,0,0,1,1,0,0,1,0,1,"[0, 0, 0, 0, 0, 1]"
(-:,0,0,0,0,0,1,1,0,0,1,0,1,"[0, 0, 0, 0, 0, 1]"


# Dataset Preparation

In [141]:
train_size = len(original_train_sentences)
test_size = 0

sentences = original_train_sentences

# Preprocess

In [142]:
if TOKEN_TYPE == "wp":
  tokenTitle = "WordPiece"
elif TOKEN_TYPE == "ws":
  tokenTitle = "WordSplit"
else:
  raise Exception("Invalid token type.")

In [143]:
tokenURL = "_URL_"
tokenEmail = "_EMAIL_"
tokenUsername = "_USER_"
reserveTokens = [tokenURL, tokenEmail, tokenUsername]

#CLEANING PROCESS
#- Include emojis and emoticons
#- Replace url, email, and usernames with tokens
#- Remove non-major puncutations and separate them from words with whitespaces
#- Lowercase
def preprocess_str(string):

  #Preclean
  if DEIDENTIFY:
    string = re.sub(r"https?://[^\s]+", tokenURL, string)              #Links
    string = re.sub(r"[\w.+-]+@[\w-]+\.[\w.-]+", tokenEmail, string)   #Email
    string = re.sub(r"@[a-zA-Z0-9_]{2,}", tokenUsername, string)       #Usernames

  #Emoticon/Emoji split
  tokens = [string]
  if EMOPRESERVE:
    allEmo = emoticonList + emojiList + emojiList_uni + reserveTokens
    for emoticon in allEmo:
      if emoticon in string:
        splits = []
        for split in tokens:
          splits.append(re.split(r"(" + re.escape(emoticon) + ")", split))
        tokens = [y.strip() for x in splits for y in x if y != ""]

  for idx in range(len(tokens)):
    if EMOPRESERVE and tokens[idx] in allEmo: #Skip emoticons, emojis
      continue

    if TEXTCLEAN:
      tokens[idx] = re.sub(r"[^A-Za-z0-9(),!?\.\'\`]", " ", tokens[idx])
      tokens[idx] = re.sub(r"\'s", " \'s", tokens[idx])
      tokens[idx] = re.sub(r"\'ve", " \'ve", tokens[idx])
      tokens[idx] = re.sub(r"n\'t", " n\'t", tokens[idx])
      tokens[idx] = re.sub(r"\'re", " \'re", tokens[idx])
      tokens[idx] = re.sub(r"\'d", " \'d", tokens[idx])
      tokens[idx] = re.sub(r"\'ll", " \'ll", tokens[idx])
      tokens[idx] = re.sub(r",", " , ", tokens[idx])
      tokens[idx] = re.sub(r"!", " ! ", tokens[idx])
      tokens[idx] = re.sub(r"\(", " ( ", tokens[idx])
      tokens[idx] = re.sub(r"\)", " ) ", tokens[idx])
      tokens[idx] = re.sub(r"\?", " ? ", tokens[idx])
      tokens[idx] = re.sub(r"\.", " . ", tokens[idx])
      tokens[idx] = re.sub(r"\s{2,}", " ", tokens[idx])

    #Lower case and strip by default
    tokens[idx] = tokens[idx].lower().strip()

  return " ".join(tokens)

##Tokenizer

In [144]:
#Load BERT tokenizer
if TOKEN_TYPE == "wp":
  from transformers import BertTokenizer
  tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

  if DEIDENTIFY:
    tokenizer.add_tokens(reserveTokens)

  if EMOPRESERVE:
    tokenizer.add_tokens(emoticonList + emojiList + emojiList_uni)

In [145]:
def tokenize_str(string):
  if TOKEN_TYPE.lower() == "wp":
    #Use BERT Tokenizer
    return tokenizer.tokenize(string)
  elif TOKEN_TYPE.lower() == "ws":
    return string.split()
  else:
    raise Exception("Unknown value for TOKEN_TYPE")

## Remove less frequent words, tokenize sentences

In [146]:
from tqdm.notebook import tqdm
stop_words = set(stopwords.words('english'))
remove_limit = 5

original_word_freq = {}  # to remove rare words
for sentence in tqdm(sentences):
    temp = preprocess_str(sentence)
    word_list = tokenize_str(temp)[:512] #Manual truncation
    for word in word_list:
        if word in original_word_freq:
            original_word_freq[word] += 1
        else:
            original_word_freq[word] = 1

tokenize_sentences = []
word_list_dict = {}
for sentence in tqdm(sentences):
    temp = preprocess_str(sentence)
    word_list_temp = tokenize_str(temp)[:512] #Manual truncation
    doc_words = []
    for word in word_list_temp:
        #NOTE: Including stopwords
        # if word in original_word_freq and word not in stop_words and original_word_freq[word] >= remove_limit:
        if word in original_word_freq and original_word_freq[word] >= remove_limit:
            doc_words.append(word)
            word_list_dict[word] = 1
    tokenize_sentences.append(doc_words)
word_list = list(word_list_dict.keys())
vocab_length = len(word_list)

#word to id dict
word_id_map = {}
for i in range(vocab_length):
    word_id_map[word_list[i]] = i

#Convert tokens to ids
tokenize_sentences_tokenIds = []
for tokens in tokenize_sentences:
  sent_token_ids = []
  for token in tokens:
    sent_token_ids.append(word_id_map[token])
  tokenize_sentences_tokenIds.append(sent_token_ids)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [147]:
node_size = train_size + vocab_length

## Document-Emotion Label Encoding

In [148]:
#Encode document emotions (multi)
def encodeDocEmotions(lexTokens, lexEmotions, tokenized_sentences):

  if len(lexTokens) != len(lexEmotions):
    raise Exception("Tokens and labels must match in length.")

  labels_doc_emo = []
  for sentence in tokenized_sentences:
    matchTokens = set(lexTokens) & set(sentence)

    sentence_emotions = lexEmotions[np.sum(np.array([lexTokens == token for token in matchTokens]), axis = 0) > 0]
    if len(sentence_emotions) == 0:
      sentence_emotions = np.zeros(lexEmotions.shape[1]).astype(int)
    else:
      sentence_emotions = (np.sum(sentence_emotions, axis = 0) > 0).astype(int)

    labels_doc_emo.append(sentence_emotions)

  return np.stack(labels_doc_emo), lexEmotions.shape[1]

# Model input

In [150]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm

## Build Graph

In [151]:
from math import log
row = []
col = []
weight = []

### word-word: PMI

In [152]:
if EDGE >= 1:
    window_size = 20
    total_W = 0
    word_occurrence = {}
    word_pair_occurrence = {}

    def ordered_word_pair(a, b):
        if a > b:
            return b, a
        else:
            return a, b

    def update_word_and_word_pair_occurrence(q):
        unique_q = list(set(q))
        for i in unique_q:
            try:
                word_occurrence[i] += 1
            except:
                word_occurrence[i] = 1
        for i in range(len(unique_q)):
            for j in range(i+1, len(unique_q)):
                word1 = unique_q[i]
                word2 = unique_q[j]
                word1, word2 = ordered_word_pair(word1, word2)
                try:
                    word_pair_occurrence[(word1, word2)] += 1
                except:
                    word_pair_occurrence[(word1, word2)] = 1


    for ind in tqdm(range(train_size+test_size)):
        words = tokenize_sentences[ind]

        q = []
        # push the first (window_size) words into a queue
        for i in range(min(window_size, len(words))):
            q += [word_id_map[words[i]]]
        # update the total number of the sliding windows
        total_W += 1
        # update the number of sliding windows that contain each word and word pair
        update_word_and_word_pair_occurrence(q)

        now_next_word_index = window_size
        # pop the first word out and let the next word in, keep doing this until the end of the document
        while now_next_word_index<len(words):
            q.pop(0)
            q += [word_id_map[words[now_next_word_index]]]
            now_next_word_index += 1
            # update the total number of the sliding windows
            total_W += 1
            # update the number of sliding windows that contain each word and word pair
            update_word_and_word_pair_occurrence(q)

    for word_pair in word_pair_occurrence:
        i = word_pair[0]
        j = word_pair[1]
        count = word_pair_occurrence[word_pair]
        word_freq_i = word_occurrence[i]
        word_freq_j = word_occurrence[j]
        pmi = log((count * total_W) / (word_freq_i * word_freq_j))
        if pmi <=0:
            continue
        row.append(train_size + i)
        col.append(train_size + j)
        weight.append(pmi)
        row.append(train_size + j)
        col.append(train_size + i)
        weight.append(pmi)


  0%|          | 0/500 [00:00<?, ?it/s]

### doc-word: Tf-idf

In [153]:
#get each word appears in which document
word_doc_list = {}
for word in word_list:
    word_doc_list[word]=[]

for i in range(len(tokenize_sentences)):
    doc_words = tokenize_sentences[i]
    unique_words = set(doc_words)
    for word in unique_words:
        exsit_list = word_doc_list[word]
        exsit_list.append(i)
        word_doc_list[word] = exsit_list

#document frequency
word_doc_freq = {}
for word, doc_list in word_doc_list.items():
    word_doc_freq[word] = len(doc_list)

# term frequency
doc_word_freq = {}

for doc_id in range(len(tokenize_sentences)):
    words = tokenize_sentences[doc_id]
    for word in words:
        word_id = word_id_map[word]
        doc_word_str = str(doc_id) + ',' + str(word_id)
        if doc_word_str in doc_word_freq:
            doc_word_freq[doc_word_str] += 1
        else:
            doc_word_freq[doc_word_str] = 1

In [154]:
for i in range(len(tokenize_sentences)):
    words = tokenize_sentences[i]
    doc_word_set = set()
    for word in words:
        if word in doc_word_set:
            continue
        j = word_id_map[word]
        key = str(i) + ',' + str(j)
        freq = doc_word_freq[key]
        if i < train_size:
            row.append(i)
        else:
            row.append(i + vocab_length)
        col.append(train_size + j)
        idf = log(1.0 * len(tokenize_sentences) / word_doc_freq[word_list[j]])
        weight.append(freq * idf)
        doc_word_set.add(word)

### doc-doc: jaccard

In [155]:
import nltk

if EDGE>=2:
    tokenize_sentences_set = [set(s) for s in tokenize_sentences]
    jaccard_threshold = 0.2
    for i in tqdm(range(len(tokenize_sentences))):
        for j in range(i+1, len(tokenize_sentences)):

            #NOTE: RINA EDIT
            #Jaccard distance is throwing an error when both sets are empty
            if (len(tokenize_sentences_set[i]) == 0) & (len(tokenize_sentences_set[j]) == 0):
              continue

            jaccard_w = 1 - nltk.jaccard_distance(tokenize_sentences_set[i], tokenize_sentences_set[j])
            if jaccard_w > jaccard_threshold:
                if i < train_size:
                    row.append(i)
                else:
                    row.append(i + vocab_length)
                if j < train_size:
                    col.append(j)
                else:
                    col.append(vocab_length + j)
                weight.append(jaccard_w)
                if j < train_size:
                    row.append(j)
                else:
                    row.append(j + vocab_length)
                if i < train_size:
                    col.append(i)
                else:
                    col.append(vocab_length + i)
                weight.append(jaccard_w)

  0%|          | 0/500 [00:00<?, ?it/s]

### Adjacent matrix

In [156]:
import scipy.sparse as sp
adj = sp.csr_matrix((weight, (row, col)), shape=(node_size, node_size))

# build symmetric adjacency matrix
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

In [157]:
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo(), d_inv_sqrt

adj, norm_item = normalize_adj(adj + sp.eye(adj.shape[0]))


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape).to(device)

adj = sparse_mx_to_torch_sparse_tensor(adj)

## Features

In [158]:
#LOAD GLOVE
gloveFilePath = "resources/glove.twitter.27B.100d.txt"

def getGloveModel():
  #Convert Glove format to Word2Vec format
  import gensim
  from gensim.test.utils import datapath, get_tmpfile
  from gensim.models import KeyedVectors
  from gensim.scripts.glove2word2vec import glove2word2vec

  # https://radimrehurek.com/gensim/scripts/glove2word2vec.html
  tmp_file = get_tmpfile("test_word2vec.txt")
  glove2word2vec(gloveFilePath, tmp_file)
  print("Converted glove to word2vec format")

  gloveModel = KeyedVectors.load_word2vec_format(tmp_file)
  gloveDim = gloveModel.vector_size
  print("Loaded pretrained glove model")

  return gloveModel, gloveDim

In [159]:
if NODE == 0:
    features = np.arange(node_size)
    features = torch.FloatTensor(features).to(device)

elif NODE == 1:

    import flair
    from flair.embeddings import TransformerDocumentEmbeddings, TransformerWordEmbeddings
    from flair.data import Sentence

    doc_embedding = TransformerDocumentEmbeddings("bert-base-uncased", fine_tune=False)
    word_embedding = TransformerWordEmbeddings("bert-base-uncased", layers='-1',subtoken_pooling="mean")

    sent_embs = []
    word_embs = {}

    for ind in tqdm(range(train_size+test_size)):
        sent = tokenize_sentences[ind]
        if len(sent) > 0:
          sentence = Sentence(" ".join(sent[:512]),use_tokenizer=False)
          doc_embedding.embed(sentence)
          sent_embs.append(sentence.get_embedding().tolist())
          words = Sentence(" ".join(sent[:512]),use_tokenizer=False)
          word_embedding.embed(words)
          for token in words:
              word = token.text
              embedding = token.embedding.tolist()
              if word not in word_embs:
                  word_embs[word] = embedding
              else:
                  word_embs[word] = np.minimum(word_embs[word], embedding)
        else:
          sent_embs.append([0] * 768)

    word_embs_list = []
    for word in word_list:
      word_embs_list.append(word_embs[word])

    features = sent_embs[:train_size] + word_embs_list + sent_embs[train_size:]


elif NODE == 2:
  gloveModel, gloveDim = getGloveModel()
  gloveVocab = gloveModel.index_to_key

  #SENTENCE EMBEDDINGS
  sentence_emb = []
  for idx in tqdm(range(train_size + test_size)):
    sentence = tokenize_sentences[idx]
    if len(sentence) > 0:
      sentence_wEmb = []
      for idx_token, token in enumerate(sentence):
        if token in gloveVocab:
          sentence_wEmb.append(gloveModel[token])
        else:
          sentence_wEmb.append(np.zeros(gloveDim)) #UNKNOWN: set to 0s

      sentence_emb.append(np.mean(sentence_wEmb, axis = 0)) #SENTENCE EMBEDDING: average
    else:
      #Append 0s if sentence is empty after tokenization
      sentence_emb.append(np.zeros(gloveDim))


  #WORD EMBEDDINGS
  word_embs_list = []
  for word in word_list:
    if word in gloveVocab:
      word_embs_list.append(gloveModel[word])
    else:
      word_embs_list.append(np.zeros(gloveDim))

  features = sentence_emb[:train_size] + word_embs_list + sentence_emb[train_size:]

import scipy.sparse as sp
def preprocess_features(features):
  """Row-normalize feature matrix and convert to tuple representation"""
  rowsum = np.array(features.sum(1))
  r_inv = np.power(rowsum, -1).flatten()
  r_inv[np.isinf(r_inv)] = 0.
  r_mat_inv = sp.diags(r_inv)
  features = r_mat_inv.dot(features)
  return features

features = preprocess_features(sp.csr_matrix(features)).todense()
features = torch.FloatTensor(features).to(device)

  glove2word2vec(gloveFilePath, tmp_file)


Converted glove to word2vec format
Loaded pretrained glove model


  0%|          | 0/500 [00:00<?, ?it/s]

  r_inv = np.power(rowsum, -1).flatten()


# Model

## GCN Layer

In [160]:
class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """

    def __init__(self, in_features, out_features,  drop_out = 0, activation=None, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.zeros(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters(in_features, out_features)
        self.dropout = torch.nn.Dropout(drop_out)
        self.activation =  activation

    def reset_parameters(self,in_features, out_features):
        stdv = np.sqrt(6.0/(in_features+out_features))
        # stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        # if self.bias is not None:
        #     torch.nn.init.zeros_(self.bias)
            # self.bias.data.uniform_(-stdv, stdv)


    def forward(self, input, adj, feature_less = False):
        if feature_less:
            support = self.weight
            support = self.dropout(support)
        else:
            input = self.dropout(input)
            support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            output = output + self.bias
        if self.activation is not None:
            output = self.activation(output)
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

## GCN Model

In [161]:
class GCN(nn.Module):
  def __init__(self, nfeat, nhid, nclass, emb_dim, dropout, n_layers = 2):
    super(GCN, self).__init__()
    self.n_layers = n_layers
    self.gc_list = []
    if n_layers >= 2:
      self.gc1 = GraphConvolution(nfeat, nhid, dropout, activation = nn.ReLU())
      self.gc_list = nn.ModuleList([GraphConvolution(nhid, nhid, dropout, activation = nn.Relu()) for _ in range(self.n_layers-2)])

      #Rina edits:
      #Final GCN output shaped to required embedding dimension
      #Added Linear layer for predictions
      self.gcf = GraphConvolution(nhid, emb_dim, dropout)
      self.l1 = nn.Linear(emb_dim, nclass)
    else:
      self.gc1 = GraphConvolution(nfeat, nhid, dropout)

  def forward(self, x, adj):
    if self.n_layers >= 2:
      x = self.gc1(x, adj, feature_less = True)
      for i in range(self.n_layers - 2):
        x = self.gc_list[i](x, adj)
      x = self.gcf(x, adj)
      x = self.l1(x)
    else:
      x = self.gc1(x, adj, feature_less = True)
    return x

##BERT Model

In [162]:
class BERT(nn.Module):
  def __init__(self, config, nclass, weight_matrix, dropout):
    super(BERT, self).__init__()

    self.l1 = transformers.BertModel(config)

    #Manually set word embeddings
    #https://discuss.pytorch.org/t/set-weights-for-embedding-layer/56097
    vocab_size, embedding_dim = weight_matrix.shape
    self.l1.embeddings.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.l1.embeddings.word_embeddings.weights = nn.Parameter(weight_matrix)

    self.l2 = torch.nn.Dropout(dropout)
    self.l3 = torch.nn.Linear(768, nclass)

  def forward(self, ids, attention_mask, token_type_ids,):
    _, x = self.l1(input_ids = ids,
                   attention_mask = attention_mask,
                   token_type_ids = token_type_ids,
                   return_dict = False)
    x = self.l2(x)
    x = self.l3(x)
    return x

In [163]:
class BERTWPCustomDataset(torch.utils.data.Dataset):

    def __init__(self, seq_ids, targets, attention_masks = None, token_type_ids = None):
      self.seq_ids = seq_ids
      self.attention_masks = attention_masks
      self.token_type_ids = token_type_ids
      self.targets = targets

    def __len__(self):
      return len(self.seq_ids)

    def __getitem__(self, index):

      item = {
            'input_ids': self.seq_ids[index],
            'targets': self.targets[index]
        }
      if self.attention_masks is not None:
        item["attention_masks"] = self.attention_masks[index]

      if self.token_type_ids is not None:
        item["token_type_ids"] = self.token_type_ids[index]

      return item


# Training

In [164]:
#Register hook to get output of 2nd gcn layer
#https://discuss.pytorch.org/t/how-can-i-extract-intermediate-layer-output-from-loaded-cnn-model/77301/2
activation = {}
def get_activation(name, bertOutput = False):
    def hook(model, input, output):
        if bertOutput:
          activation[name] = output[1].detach()
        else:
          activation[name] = output.detach()
    return hook

In [165]:
def generate_train_val_gcn(train_pro = 0.9):
  real_train_size = int(train_pro * train_size)
  val_size = train_size - real_train_size

  idx_train = np.random.choice(range(train_size), real_train_size, replace = False)
  idx_train.sort()
  idx_val = []
  pointer = 0
  for v in range(train_size):
      if pointer<len(idx_train) and idx_train[pointer] == v:
          pointer +=1
      else:
          idx_val.append(v)
  idx_val = np.array(idx_val)

  return idx_train, idx_val#, idx_test

idx_train, idx_val = generate_train_val_gcn()

In [166]:
def train_gcn(gcnModel, goldLabels, gcnOptimizer, gcnEpochs, gcnCriterion, show_result = True):
  start = time.time()
  val_loss = []
  for epoch in range(gcnEpochs):
    t = time.time()
    gcnModel.train()
    gcnOptimizer.zero_grad()
    output = gcnModel(features, adj)
    loss_train = gcnCriterion(output[idx_train], goldLabels[idx_train])
    acc_train = cal_accuracy(output[idx_train], goldLabels[idx_train], multiLabel = True)
    hloss_train = cal_hammingloss(output[idx_train], goldLabels[idx_train])
    loss_train.backward()
    gcnOptimizer.step()

    gcnModel.eval()
    output = gcnModel(features, adj)

    loss_val = gcnCriterion(output[idx_val], goldLabels[idx_val])
    val_loss.append(loss_val.item())
    acc_val = cal_accuracy(output[idx_val], goldLabels[idx_val], multiLabel = True)
    hloss_val = cal_hammingloss(output[idx_val], goldLabels[idx_val])

    if show_result:
      print(  'Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc_train),
              'hloss_train: {:.4f}'.format(hloss_train),
              'loss_val: {:.4f}'.format(loss_val.item()),
              'acc_val: {:.4f}'.format(acc_val),
              'hloss_val: {:.4f}'.format(hloss_val),
              'time: {:.4f}s'.format(time.time() - t))

    if epoch > EARLY_STOPPING and np.min(val_loss[-EARLY_STOPPING:]) > np.min(val_loss[:-EARLY_STOPPING]) :
      if show_result:
        print("Early Stopping...")
      break

  print("Total train time: {:.4f}s".format(time.time() - start) )

In [167]:
import os
def train_bert(model, train_loader, val_loader, optimizer, criterion, epochs, early_stop = None, show_result = True,
               save_model_path = "", upload_freq = 0):
  start = time.time()
  val_loss = []
  for epoch in range(epochs):
    t = time.time()

    #Training
    model.train()
    batch_loss_train = []
    batch_acc_train = []
    batch_hloss_train = []
    for data in train_loader:
      ids = data["input_ids"].to(device)
      goldLabels_train = data["targets"]

      att_mask = data["attention_masks"].to(device) if "attention_masks" in data else None
      token_type_ids = data["token_type_ids"].to(device) if "token_type_ids" in data else None

      optimizer.zero_grad()
      output = model(ids, att_mask, token_type_ids)

      loss_train = criterion(output, goldLabels_train)
      acc_train = cal_accuracy(output, goldLabels_train, multiLabel = True)
      hloss_train = cal_hammingloss(output, goldLabels_train)

      batch_loss_train.append(loss_train.item())
      batch_acc_train.append(acc_train)
      batch_hloss_train.append(hloss_train)

      loss_train.backward()
      optimizer.step()

    #Validation
    model.eval()
    batch_loss_val = []
    batch_acc_val = []
    batch_hloss_val = []
    with torch.no_grad():
      for _, data in enumerate(val_loader, 0):
        ids = data["input_ids"].to(device)
        goldLabels_val = data["targets"]

        att_mask = data["attention_masks"].to(device) if "attention_masks" in data else None
        token_type_ids = data["token_type_ids"].to(device) if "token_type_ids" in data else None

        output = model(ids, att_mask, token_type_ids)

        loss_val = criterion(output, goldLabels_val)
        acc_val = cal_accuracy(output, goldLabels_val, multiLabel = True)
        hloss_val = cal_hammingloss(output, goldLabels_val)

        batch_loss_val.append(loss_val.item())
        batch_acc_val.append(acc_val)
        batch_hloss_val.append(hloss_val)

    val_loss.append(np.mean(batch_loss_val))
    if early_stop != None and epoch > early_stop and np.min(val_loss[-early_stop:]) > np.min(val_loss[:-early_stop]) :
      if show_result:
          print("Early Stopping...")
      break

    if show_result:
      print(  'Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(np.mean(batch_loss_train)),
              'acc_train: {:.4f}'.format(np.mean(batch_acc_train)),
              'hloss_train: {:.4f}'.format(np.mean(batch_hloss_train)),
              'loss_val: {:.4f}'.format(np.mean(batch_loss_val)),
              'acc_val: {:.4f}'.format(np.mean(batch_acc_val)),
              'hloss_val: {:.4f}'.format(np.mean(batch_hloss_val)),
              'time: {:.4f}s'.format(time.time() - t))

    if save_model_path != "":
      os.makedirs(os.path.dirname(save_model_path), exist_ok = True)
      state = {"last_epoch": epoch,
               "val_losses": val_loss,
               "model_state": model.state_dict(),
               "optimizer_state": optimizer.state_dict()}
      torch.save(state, save_model_path)

  print("Total train time: {:.4f}s".format(time.time() - start) )

  #Save final model
  if save_model_path != "":
    os.makedirs(os.path.dirname(save_model_path), exist_ok = True)
    state = {"last_epoch": epoch,
              "val_losses": val_loss,
              "model_state": model.state_dict(),
              "optimizer_state": optimizer.state_dict()}
    torch.save(state, save_model_path)

def test_bert(model, test_loader):
  model.eval()
  with torch.no_grad():
    batch_targets = []
    batch_outputs = []
    batch_pooledOutputs = []
    for _, data in enumerate(test_loader, 0):
      ids = data["input_ids"].to(device)
      targets = data["targets"]

      att_mask = data["attention_masks"].to(device) if "attention_masks" in data else None
      token_type_ids = data["token_type_ids"].to(device) if "token_type_ids" in data else None


      output = model(ids, att_mask, token_type_ids)

      batch_targets.extend(targets.cpu().detach().numpy().tolist())
      batch_outputs.extend(output.cpu().detach().numpy().tolist())
      batch_pooledOutputs.extend(activation["l1.pooled"].cpu().detach().numpy())

  return batch_outputs, batch_targets, np.stack(batch_pooledOutputs)


def build_bert_inputs(sentence_tokenIds, max_len, embeddings):
  pad_id = 0
  cls_id = 1
  sep_id = 2
  max_seq_length = min(max([len(tokens) for tokens in sentence_tokenIds]) + 2, max_len)                   #Add 2 for [CLS] & [SEP]
  input_ids = [list(np.array(x) + 3) for x in sentence_tokenIds]                                          #Adjust ids to add special tokens
  input_ids = [[cls_id] + x + [sep_id] for x in input_ids]                                                #Insert CLS and SEP at beginning and end
  # input_ids = pad_sequences(input_ids, max_seq_length, value = pad_id, padding = "post", truncating = "post")
  input_ids = [input + [pad_id] * max(0, max_seq_length - len(input)) if len(input) < max_seq_length else input[:max_seq_length] for input in input_ids]
  emb_matrix = torch.cat((torch.zeros((3, embeddings.shape[1])),
                          embeddings),
                        dim = 0)

  return np.array(input_ids), emb_matrix, max_seq_length

In [168]:
def cal_accuracy(predictions, labels, multiLabel = False):
  if multiLabel:
    predictions = predictions.cpu().detach().numpy() >= 0.5
  else:
    predictions = torch.argmax(predictions,-1).cpu().tolist()

  labels = labels.cpu().tolist()

  return accuracy_score(predictions, labels)

def cal_hammingloss(predictions, labels):
  predictions = predictions.cpu().detach().numpy() >= 0.5
  labels = labels.cpu().tolist()

  return hamming_loss(predictions, labels)

In [169]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
def evaluate_output(outputs, targets, targetLabels, title = "", show_results = True, multiLabel = False, return_results = False):

    if multiLabel:
      outputs = np.array(outputs) >= 0.5
    else:
      outputs = np.argmax(outputs, axis = 1)

    accuracy = accuracy_score(targets, outputs)
    f1_score_micro = f1_score(targets, outputs, average='micro')
    f1_score_macro = f1_score(targets, outputs, average='macro')
    f1_score_weighted = f1_score(targets, outputs, average="weighted")

    if show_results:
      print()
      print("=" * 50)
      print(title)
      print("=" * 50)
      print("Accuracy Score: %.4f" % (accuracy))
      print("F1 Score (Micro): %.4f" % (f1_score_micro))
      print("F1 Score (Macro): %.4f" % (f1_score_macro))
      print("F1 Score (Weighted): %.4f" % (f1_score_weighted))

      if multiLabel:
        ham_loss = hamming_loss(targets, outputs)
        print("Hamming Loss: %.4f" % (ham_loss))


      print(classification_report(targets, outputs, target_names = targetLabels, digits = 4))

    if return_results:
      results = {"Accuracy": accuracy,
              "F1_Micro": f1_score_micro,
              "F1_Macro": f1_score_macro,
              "F1_Weighted": f1_score_weighted,
              "Class Precision": precision_score(targets, outputs, average = None),
              "Class Recall": recall_score(targets, outputs, average = None),
              "Class F1": f1_score(targets, outputs, average = None)
      }

      return results

In [170]:
def saveWeights(fileName, text, doc_weights, doc_emoLabels, doc_labels, emo_classNames):

  import pickle as pkl
  import os

  os.makedirs(os.path.dirname(fileName), exist_ok = True)
  with open(fileName, "wb") as file:
    pkl.dump({"text": text,
              "doc_embeddings": doc_weights,
              "doc_emoLabels": doc_emoLabels,
              "emo_classNames": emo_classNames,
              "doc_labels": doc_labels
              }, file)

def saveWordEmbeddings(fileName, vocab_map, weights):
  import pickle as pkl
  import os

  if len(vocab_map) != len(weights):
    raise Exception("Vocab and Weights are not similar in shape.")

  os.makedirs(os.path.dirname(fileName), exist_ok = True)
  with open(fileName, "wb") as file:
    pkl.dump({"vocab_map": vocab_map,
             "weights": weights}, file)

In [171]:
def train_GCNBert(save_model_path = "", upload_freq = 0):
  ###
  #GCN
  ###

  #Build model
  gcnModel = GCN(nfeat = node_size, nhid = HIDDEN_DIM, nclass = num_emoClass, emb_dim = 768, dropout = DROP_OUT, n_layers = NUM_LAYERS).to(device)
  optimizer = optim.Adam(gcnModel.parameters(), lr = LR, weight_decay = WEIGHT_DECAY)
  gcnModel.gcf.register_forward_hook(get_activation("gcf"))

  #Training
  train_gcn(gcnModel, emoLabels, optimizer, GCN_EPOCHS, nn.BCEWithLogitsLoss())

  #Evaluate training
  gcnModel.eval()
  output = gcnModel(features, adj)
  evaluate_output(output[:train_size].cpu().detach().numpy(), emoLabels.cpu(), lexClassNames, title = title1, multiLabel = True)

  #Extract embeddings
  # trained_doc_embeddings = activation["gcf"][:train_size].cpu()
  trained_word_embeddings = activation["gcf"][train_size:].cpu()


  ###
  #BERT
  ###

  bertConfig = transformers.BertConfig()
  bert_max_length = bertConfig.max_position_embeddings

  if TOKEN_TYPE == "wp":
    #Align new weights with bert embeddings
    tempBert = transformers.BertModel(bertConfig)
    tempBert.resize_token_embeddings(len(tokenizer))
    emb_matrix = tempBert.embeddings.word_embeddings.weight.data.detach()
    bert_map = tokenizer.get_vocab()
    for wp, idx in word_id_map.items():
      emb_matrix[bert_map[wp]] = trained_word_embeddings[idx]

    #Build BERT inputs
    clean_sentences = [preprocess_str(text) for text in original_train_sentences]
    inputs = tokenizer(clean_sentences, padding = True, truncation = True, max_length = MAX_LENGTH)
    input_ids = np.array(inputs["input_ids"])
    token_type_ids = np.array(inputs["token_type_ids"])
    attention_mask = np.array(inputs["attention_mask"])

    #Build data loader
    train_set = BERTWPCustomDataset(input_ids[idx_train], emoLabels[idx_train], attention_masks = attention_mask[idx_train], token_type_ids = token_type_ids[idx_train])
    val_set = BERTWPCustomDataset(input_ids[idx_val], emoLabels[idx_val], attention_masks = attention_mask[idx_val], token_type_ids = token_type_ids[idx_val])
    test_set = BERTWPCustomDataset(input_ids, emoLabels, attention_masks = attention_mask, token_type_ids = token_type_ids)

  elif TOKEN_TYPE == "ws":
    #Build BERT inputs
    if MAX_LENGTH == None:
      truncate_len = bert_max_length
    else:
      truncate_len = min(MAX_LENGTH, bert_max_length)
    input_ids, emb_matrix, max_seq_length = build_bert_inputs(tokenize_sentences_tokenIds, truncate_len, trained_word_embeddings)
    print("Max sequence length:", max_seq_length)

    #Build data loader
    train_set = BERTWPCustomDataset(input_ids[idx_train], emoLabels[idx_train])
    val_set = BERTWPCustomDataset(input_ids[idx_val], emoLabels[idx_val])
    test_set = BERTWPCustomDataset(input_ids, emoLabels)

  else:
    raise Exception("Invalid Token Type.")

  train_loader = torch.utils.data.DataLoader(train_set, **train_params)
  val_loader = torch.utils.data.DataLoader(val_set, **train_params)
  test_loader = torch.utils.data.DataLoader(test_set, **test_params)

  #Build Model
  bertModel = BERT(bertConfig, num_emoClass, emb_matrix, BERT_DROPOUT).to(device)
  bertModel.l1.register_forward_hook(get_activation("l1.pooled", bertOutput = True))
  optimizer = optim.Adam(bertModel.parameters(), lr = BERT_LR)
  criterion = nn.BCEWithLogitsLoss()


  #Train
  train_bert(bertModel, train_loader, val_loader, optimizer, criterion, BERT_EPOCHS, early_stop = BERT_EARLYSTOP,
             save_model_path = save_model_path, upload_freq = upload_freq)

  #Evaluate and extract document embeddings
  outputs, targets, doc_embeddings = test_bert(bertModel, test_loader)
  new_embeddings = bertModel.l1.embeddings.word_embeddings.weight.detach().cpu()
  evaluate_output(outputs, targets, lexClassNames, title = title2, multiLabel = True)

  return doc_embeddings, new_embeddings

#Generate MM-EMOG Embeddings

In [176]:
train_params = {'batch_size': BATCH_SIZE,
          'shuffle': True,
          'num_workers': 0
          }
test_params = {'batch_size': BATCH_SIZE,
          'shuffle': False,
          'num_workers': 0
          }

if TOKEN_TYPE == "wp":
  vocab = tokenizer.get_vocab()
elif TOKEN_TYPE == "ws":
  vocab = {k: v + 3 for k, v in word_id_map.items()} #Adjust by 3 to add BERT tokens
  vocab["[PAD]"] = 0
  vocab["[CLS]"] = 1
  vocab["[SEP]"] = 2
else:
  raise Exception("Invalid token type.")

##EmoLex

In [178]:
lexName = "EmoLex"
title1 = "TextGCN x " + lexName
title2 = "TextGCN->BERT x " + lexName
texts = original_train_sentences

lexTokens = emoLex_allTokens
lexLabels = emoLex_allEmotions
lexClassNames = emoLex_labels

#Populate document multilabel emotions
emoLabels, num_emoClass = encodeDocEmotions(lexTokens, lexLabels, tokenize_sentences)
original_emoLabels_train = emoLabels[:train_size]
emoLabels = torch.FloatTensor(emoLabels).to(device)

doc_embeddings, word_embeddings = train_GCNBert()

# saveWeights("./_OUTPUT/DocEmbeddings_" + tokenTitle + "Embeddings.pkl", texts, doc_embeddings, lexLabels, original_train_labels, lexClassNames)   #Document embeddings
saveWordEmbeddings("./_OUTPUT/MMEMOG_" + tokenTitle + "Embeddings_" + lexName + ".pkl", vocab, word_embeddings)

Epoch: 0001 loss_train: 0.6900 acc_train: 0.2667 hloss_train: 0.3403 loss_val: 0.6190 acc_val: 0.2600 hloss_val: 0.3086 time: 0.0102s
Epoch: 0002 loss_train: 0.6266 acc_train: 0.2667 hloss_train: 0.3403 loss_val: 0.8035 acc_val: 0.1000 hloss_val: 0.2686 time: 0.0096s
Epoch: 0003 loss_train: 0.8483 acc_train: 0.1156 hloss_train: 0.2952 loss_val: 0.5622 acc_val: 0.1000 hloss_val: 0.2686 time: 0.0094s
Epoch: 0004 loss_train: 0.5889 acc_train: 0.1156 hloss_train: 0.2956 loss_val: 0.6123 acc_val: 0.2600 hloss_val: 0.3000 time: 0.0096s
Epoch: 0005 loss_train: 0.6213 acc_train: 0.2556 hloss_train: 0.3387 loss_val: 0.6294 acc_val: 0.2600 hloss_val: 0.3086 time: 0.0095s
Epoch: 0006 loss_train: 0.6332 acc_train: 0.2600 hloss_train: 0.3413 loss_val: 0.6081 acc_val: 0.2600 hloss_val: 0.3086 time: 0.0095s
Epoch: 0007 loss_train: 0.6109 acc_train: 0.2667 hloss_train: 0.3403 loss_val: 0.5757 acc_val: 0.2600 hloss_val: 0.3086 time: 0.0094s
Epoch: 0008 loss_train: 0.5806 acc_train: 0.2667 hloss_train: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 0001 loss_train: 0.6128 acc_train: 0.1354 hloss_train: 0.3259 loss_val: 0.5333 acc_val: 0.2448 hloss_val: 0.2800 time: 2.2102s
Epoch: 0002 loss_train: 0.5758 acc_train: 0.1583 hloss_train: 0.2914 loss_val: 0.5625 acc_val: 0.0781 hloss_val: 0.2897 time: 2.2164s
Epoch: 0003 loss_train: 0.5741 acc_train: 0.1812 hloss_train: 0.3012 loss_val: 0.5403 acc_val: 0.1649 hloss_val: 0.2865 time: 2.2207s
Epoch: 0004 loss_train: 0.5674 acc_train: 0.2375 hloss_train: 0.3039 loss_val: 0.5213 acc_val: 0.2014 hloss_val: 0.2718 time: 2.2286s
Epoch: 0005 loss_train: 0.5860 acc_train: 0.1729 hloss_train: 0.3259 loss_val: 0.5306 acc_val: 0.2292 hloss_val: 0.2366 time: 2.2338s
Epoch: 0006 loss_train: 0.5652 acc_train: 0.2458 hloss_train: 0.2821 loss_val: 0.5027 acc_val: 0.2448 hloss_val: 0.2753 time: 2.2374s
Epoch: 0007 loss_train: 0.5570 acc_train: 0.2354 hloss_train: 0.2908 loss_val: 0.5124 acc_val: 0.1615 hloss_val: 0.2550 time: 2.2382s
Epoch: 0008 loss_train: 0.5655 acc_train: 0.1917 hloss_train: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##TEC

In [179]:
lexName = "TEC"
title1 = "TextGCN x " + lexName
title2 = "TextGCN->BERT x " + lexName
texts = original_train_sentences

lexTokens = tec_allTokens
lexLabels = tec_allEmotions
lexClassNames = tec_labels

#Populate document multilabel emotions
emoLabels, num_emoClass = encodeDocEmotions(lexTokens, lexLabels, tokenize_sentences)
original_emoLabels_train = emoLabels[:train_size]
emoLabels = torch.FloatTensor(emoLabels).to(device)

doc_embeddings, word_embeddings = train_GCNBert()

# saveWeights("./_OUTPUT/DocEmbeddings_" + tokenTitle + "Embeddings" +  ".pkl", texts, doc_embeddings, lexLabels, original_train_labels, lexClassNames)
saveWordEmbeddings("./_OUTPUT/MMEMOG_" + tokenTitle + "Embeddings_" + lexName + ".pkl", vocab, word_embeddings)

Epoch: 0001 loss_train: 0.6939 acc_train: 0.2311 hloss_train: 0.2285 loss_val: 0.6230 acc_val: 0.2400 hloss_val: 0.2233 time: 0.0100s
Epoch: 0002 loss_train: 0.6182 acc_train: 0.2311 hloss_train: 0.2285 loss_val: 0.6457 acc_val: 0.2400 hloss_val: 0.2233 time: 0.0094s
Epoch: 0003 loss_train: 0.6312 acc_train: 0.2311 hloss_train: 0.2285 loss_val: 0.5429 acc_val: 0.2400 hloss_val: 0.2233 time: 0.0091s
Epoch: 0004 loss_train: 0.5193 acc_train: 0.2311 hloss_train: 0.2285 loss_val: 0.5543 acc_val: 0.2400 hloss_val: 0.2233 time: 0.0091s
Epoch: 0005 loss_train: 0.5269 acc_train: 0.2311 hloss_train: 0.2285 loss_val: 0.5674 acc_val: 0.2400 hloss_val: 0.2233 time: 0.0092s
Epoch: 0006 loss_train: 0.5412 acc_train: 0.2311 hloss_train: 0.2285 loss_val: 0.5455 acc_val: 0.2400 hloss_val: 0.2233 time: 0.0091s
Epoch: 0007 loss_train: 0.5184 acc_train: 0.2311 hloss_train: 0.2285 loss_val: 0.5239 acc_val: 0.2400 hloss_val: 0.2233 time: 0.0091s
Epoch: 0008 loss_train: 0.4925 acc_train: 0.2311 hloss_train: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 0001 loss_train: 0.5390 acc_train: 0.2104 hloss_train: 0.2424 loss_val: 0.5428 acc_val: 0.2604 hloss_val: 0.2190 time: 2.2293s
Epoch: 0002 loss_train: 0.4979 acc_train: 0.2771 hloss_train: 0.2142 loss_val: 0.5099 acc_val: 0.2726 hloss_val: 0.2190 time: 2.2318s
Epoch: 0003 loss_train: 0.5083 acc_train: 0.2167 hloss_train: 0.2299 loss_val: 0.5109 acc_val: 0.2361 hloss_val: 0.2231 time: 2.2259s
Epoch: 0004 loss_train: 0.5062 acc_train: 0.2500 hloss_train: 0.2243 loss_val: 0.5006 acc_val: 0.2361 hloss_val: 0.2231 time: 2.2303s
Epoch: 0005 loss_train: 0.4921 acc_train: 0.2479 hloss_train: 0.2198 loss_val: 0.4933 acc_val: 0.2483 hloss_val: 0.2190 time: 2.2352s
Epoch: 0006 loss_train: 0.4962 acc_train: 0.2458 hloss_train: 0.2250 loss_val: 0.5005 acc_val: 0.2604 hloss_val: 0.2170 time: 2.2412s
Epoch: 0007 loss_train: 0.5151 acc_train: 0.2188 hloss_train: 0.2444 loss_val: 0.4879 acc_val: 0.2483 hloss_val: 0.2231 time: 2.2444s
Epoch: 0008 loss_train: 0.5022 acc_train: 0.2208 hloss_train: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##SenticNet

In [180]:
lexName = "SenticNet"
title1 = "TextGCN x " + lexName
title2 = "TextGCN->BERT x " + lexName
texts = original_train_sentences

lexTokens = senticNet_allTokens
lexLabels = senticNet_allEmotions
lexClassNames = senticNet_labels

#Populate document multilabel emotions
emoLabels, num_emoClass = encodeDocEmotions(lexTokens, lexLabels, tokenize_sentences)
original_emoLabels_train = emoLabels[:train_size]
emoLabels = torch.FloatTensor(emoLabels).to(device)

doc_embeddings, word_embeddings = train_GCNBert()

# saveWeights("./_OUTPUT/DocEmbeddings_" + tokenTitle + "Embeddings" +  ".pkl", texts, doc_embeddings, lexLabels, original_train_labels, lexClassNames)
saveWordEmbeddings("./_OUTPUT/MMEMOG_" + tokenTitle + "Embeddings_" + lexName + ".pkl", vocab, word_embeddings)

Epoch: 0001 loss_train: 0.6891 acc_train: 0.3533 hloss_train: 0.2744 loss_val: 0.5905 acc_val: 0.5600 hloss_val: 0.1700 time: 0.0102s
Epoch: 0002 loss_train: 0.6137 acc_train: 0.3533 hloss_train: 0.2744 loss_val: 0.4793 acc_val: 0.2200 hloss_val: 0.1967 time: 0.0098s
Epoch: 0003 loss_train: 0.6613 acc_train: 0.2444 hloss_train: 0.2678 loss_val: 0.4243 acc_val: 0.5600 hloss_val: 0.1700 time: 0.0094s
Epoch: 0004 loss_train: 0.5405 acc_train: 0.3533 hloss_train: 0.2744 loss_val: 0.5022 acc_val: 0.5600 hloss_val: 0.1700 time: 0.0093s
Epoch: 0005 loss_train: 0.5483 acc_train: 0.3533 hloss_train: 0.2744 loss_val: 0.5265 acc_val: 0.5600 hloss_val: 0.1700 time: 0.0094s
Epoch: 0006 loss_train: 0.5547 acc_train: 0.3533 hloss_train: 0.2744 loss_val: 0.4906 acc_val: 0.5600 hloss_val: 0.1700 time: 0.0093s
Epoch: 0007 loss_train: 0.5324 acc_train: 0.3533 hloss_train: 0.2744 loss_val: 0.4333 acc_val: 0.5600 hloss_val: 0.1700 time: 0.0094s
Epoch: 0008 loss_train: 0.5106 acc_train: 0.3533 hloss_train: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 0001 loss_train: 0.5427 acc_train: 0.3000 hloss_train: 0.2701 loss_val: 0.4014 acc_val: 0.5833 hloss_val: 0.1612 time: 2.3469s
Epoch: 0002 loss_train: 0.5515 acc_train: 0.3104 hloss_train: 0.2885 loss_val: 0.4231 acc_val: 0.5469 hloss_val: 0.1713 time: 2.3539s
Epoch: 0003 loss_train: 0.5354 acc_train: 0.3500 hloss_train: 0.2660 loss_val: 0.4221 acc_val: 0.5226 hloss_val: 0.1875 time: 2.3554s
Epoch: 0004 loss_train: 0.5357 acc_train: 0.3750 hloss_train: 0.2601 loss_val: 0.4102 acc_val: 0.6181 hloss_val: 0.1296 time: 2.3596s
Epoch: 0005 loss_train: 0.5361 acc_train: 0.3479 hloss_train: 0.2767 loss_val: 0.4061 acc_val: 0.5955 hloss_val: 0.1571 time: 2.3581s
Epoch: 0006 loss_train: 0.5089 acc_train: 0.3979 hloss_train: 0.2510 loss_val: 0.4105 acc_val: 0.5747 hloss_val: 0.1490 time: 2.3586s
Epoch: 0007 loss_train: 0.5122 acc_train: 0.3729 hloss_train: 0.2479 loss_val: 0.3852 acc_val: 0.5747 hloss_val: 0.1409 time: 2.3598s
Epoch: 0008 loss_train: 0.5174 acc_train: 0.3875 hloss_train: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
