In [None]:
# !pip install --user flair   #Needs restart on colab
# !pip install transformers
# !pip install --quiet optuna

In [None]:
import pickle
import numpy as np
import pandas as pd

import re
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn
from tabulate import tabulate

import torch

In [None]:
from torch import cuda
# Use the GPU when torch can see one, otherwise fall back to the CPU.
# NOTE: `device` is a plain string here; the label-encoding cell later rebinds it to a torch.device.
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

# Setup

In [None]:
#DATA
# Raw texts and labels of the train/test splits. They are empty in this template;
# later cells derive train_size/test_size from their lengths, so fill them before running on.
original_train_sentences = []
original_train_labels = []
original_test_sentences = []
original_test_labels = []

#PREPROCESSING
DEIDENTIFY = True     #True -> replaces URLs, emails, and usernames with reserved tokens
EMOPRESERVE = True    #True -> isolates emoticons/emojis/reserved tokens in preprocess_str and shields them from cleaning and lowercasing
TEXTCLEAN = False     #True -> blanks out characters outside a small whitelist, splits off contraction suffixes ('s, n't, ...) and isolates , ! ( ) ? .
TOKEN_TYPE = "wp"     #wp -> wordpiece tokenization; ws -> word split (NOTE(review): not referenced by the cells visible here)

#MMEMOG
# Pickle file with the custom (MMEMOG) embeddings; read in the "Load custom embeddings" cell.
mmemogFile = "_OUTPUT/MMEMOG_WordPieceEmbeddings_EmoLex.pkl"

#GCN Parameters
EDGE = 2 # 0:d2w 1:d2w+w2w 2:d2w+w2w+d2d
NODE = 3 # 0:one-hot #1:BERT #2:Glove #3:Custom(MMEMOG)
EARLY_STOPPING = 10   # size (in epochs) of the recent validation-loss window used by the early-stopping test
NUM_EPOCHS = 200      # upper bound on training epochs

#Save Paths
session_name = "TestModel"
save_output_path = "_OUTPUT/" + session_name + ".csv" #Predictions

# Load Resources

In [None]:
#LOAD GLOVE
gloveFileName = "resources/glove.twitter.27B.100d.txt"

def getGloveModel():
  """Load the GloVe vectors stored at ``gloveFileName`` through gensim.

  The GloVe text file is first converted to word2vec text format in a
  temporary file, then loaded as ``KeyedVectors``.

  Returns:
    (model, dim): the loaded ``KeyedVectors`` and its vector size.
  """
  # gensim is imported lazily so it is only required when GloVe features are requested
  from gensim.models import KeyedVectors
  from gensim.scripts.glove2word2vec import glove2word2vec
  from gensim.test.utils import get_tmpfile

  # https://radimrehurek.com/gensim/scripts/glove2word2vec.html
  word2vec_path = get_tmpfile("test_word2vec.txt")
  glove2word2vec(gloveFileName, word2vec_path)
  print("Converted glove to word2vec format")

  vectors = KeyedVectors.load_word2vec_format(word2vec_path)
  vector_dim = vectors.vector_size
  print("Loaded pretrained glove model")

  return vectors, vector_dim

In [None]:
#Load list of emoticons (one entry per line)
with open("resources/TextEmoticonList.txt", "r") as file:
  rawEmoticons = file.read().split("\n")

#Keep only emoticons without an inner space that are longer than one character
emoticonList = [
  emoticon
  for emoticon in rawEmoticons
  if len(emoticon.split(" ")) == 1 and len(emoticon) > 1
]

print(len(emoticonList))
print(emoticonList[:10])

In [None]:
#Load list of emojis, dropping rows whose "Emoji" cell is the literal "C"
emojiTable = pd.read_csv("resources/Emojis-Grid view.csv")
emojiList = emojiTable.loc[emojiTable["Emoji"] != "C", "Emoji"].tolist()

#Unicode versions (ASCII text of the unicode-escape encoding)
emojiList_uni = [glyph.encode('unicode-escape').decode('ASCII') for glyph in emojiList]

print(len(emojiList))
print(emojiList[:10])
print(emojiList_uni[:10])

# Dataset Preparation

In [None]:
# Number of documents per split; used throughout to offset graph node indices.
train_size = len(original_train_sentences)
test_size = len(original_test_sentences)
# Train documents come first, test documents after them.
sentences = original_train_sentences + original_test_sentences

# Preprocess

In [None]:
# Placeholder tokens substituted by preprocess_str when DEIDENTIFY is on.
tokenURL = "_URL_"
tokenEmail = "_EMAIL_"
tokenUsername = "_USER_"
# Also added to the tokenizer vocabulary in the tokenizer cell.
reserveTokens = [tokenURL, tokenEmail, tokenUsername]

#CLEANING PROCESS
#- Include emojis and emoticons
#- Replace url, email, and usernames with tokens
#- Remove non-major punctuation and separate the remaining punctuation from words with whitespace
#- Lowercase
def preprocess_str(string):
  """Clean one raw text according to the notebook-level preprocessing flags.

  Steps:
    1. DEIDENTIFY: replace URLs, e-mail addresses and @usernames with the
       reserved placeholder tokens.
    2. EMOPRESERVE: cut the text around every known emoticon / emoji /
       reserved token so each becomes its own fragment.
    3. For every fragment that is not a preserved token: optionally apply
       the TEXTCLEAN punctuation rules, then lowercase and strip.

  Args:
    string: raw input text.

  Returns:
    The fragments joined by single spaces.
  """

  #Preclean
  if DEIDENTIFY:
    string = re.sub(r"https?://[^\s]+", tokenURL, string)              #Links
    string = re.sub(r"[\w.+-]+@[\w-]+\.[\w.-]+", tokenEmail, string)   #Email
    string = re.sub(r"@[a-zA-Z0-9_]{2,}", tokenUsername, string)       #Usernames

  #Emoticon/Emoji split
  tokens = [string]
  preserved = frozenset()  #stays empty when EMOPRESERVE is off, so nothing is skipped below
  if EMOPRESERVE:
    allEmo = emoticonList + emojiList + emojiList_uni + reserveTokens
    preserved = frozenset(allEmo)  #constant-time membership for the per-fragment check
    for emoticon in allEmo:
      if emoticon in string:
        #The capturing group keeps the emoticon itself as a fragment
        pattern = r"(" + re.escape(emoticon) + ")"
        pieces = [piece.strip() for token in tokens for piece in re.split(pattern, token)]
        #Filter AFTER stripping so whitespace-only fragments do not survive as empty tokens
        tokens = [piece for piece in pieces if piece != ""]

  for idx in range(len(tokens)):
    if tokens[idx] in preserved: #Skip emoticons, emojis, reserved tokens
      continue

    if TEXTCLEAN:
      tokens[idx] = re.sub(r"[^A-Za-z0-9(),!?\.\'\`]", " ", tokens[idx])
      tokens[idx] = re.sub(r"\'s", " \'s", tokens[idx])
      tokens[idx] = re.sub(r"\'ve", " \'ve", tokens[idx])
      tokens[idx] = re.sub(r"n\'t", " n\'t", tokens[idx])
      tokens[idx] = re.sub(r"\'re", " \'re", tokens[idx])
      tokens[idx] = re.sub(r"\'d", " \'d", tokens[idx])
      tokens[idx] = re.sub(r"\'ll", " \'ll", tokens[idx])
      tokens[idx] = re.sub(r",", " , ", tokens[idx])
      tokens[idx] = re.sub(r"!", " ! ", tokens[idx])
      tokens[idx] = re.sub(r"\(", " ( ", tokens[idx])
      tokens[idx] = re.sub(r"\)", " ) ", tokens[idx])
      tokens[idx] = re.sub(r"\?", " ? ", tokens[idx])
      tokens[idx] = re.sub(r"\.", " . ", tokens[idx])
      tokens[idx] = re.sub(r"\s{2,}", " ", tokens[idx])

    #Lower case and strip by default
    tokens[idx] = tokens[idx].lower().strip()

  return " ".join(tokens)

## Tokenizer

In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Register the de-identification placeholders and every emoticon/emoji as extra tokens,
# presumably so the tokenizer keeps each of them as a single token instead of word pieces.
tokenizer.add_tokens(reserveTokens)
tokenizer.add_tokens(emoticonList + emojiList + emojiList_uni)

## Remove less frequent words (stopwords are kept), tokenize sentences

In [None]:
from tqdm.notebook import tqdm
remove_limit = 5  # tokens seen fewer times than this over the whole corpus are dropped

# Preprocess and tokenize every sentence exactly once; the token lists are reused
# below for filtering instead of running the tokenizer a second time.
raw_token_lists = []
original_word_freq = {}  # to remove rare words
for sentence in tqdm(sentences):
    temp = preprocess_str(sentence)
    word_list = tokenizer.tokenize(temp)[:512] #Use BertTokenizer; NOTE: manual truncation
    raw_token_lists.append(word_list)
    for word in word_list:
        original_word_freq[word] = original_word_freq.get(word, 0) + 1

tokenize_sentences = []   # per-document token lists after rare-word filtering
word_list_dict = {}       # insertion-ordered set of the kept tokens
for word_list_temp in raw_token_lists:
    #NOTE: Including stopwords
    doc_words = [word for word in word_list_temp if original_word_freq[word] >= remove_limit]
    for word in doc_words:
        word_list_dict[word] = 1
    tokenize_sentences.append(doc_words)
word_list = list(word_list_dict.keys())
vocab_length = len(word_list)

#word to id dict
word_id_map = {word: word_id for word_id, word in enumerate(word_list)}

In [None]:
# Graph nodes are laid out as [train documents | vocabulary words | test documents].
node_size = train_size + vocab_length + test_size

## Label Encoding

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# The label vocabulary is taken from the training labels only.
unique_labels=np.unique(original_train_labels)

num_class = len(unique_labels)
lEnc = LabelEncoder()
lEnc.fit(unique_labels)

print(unique_labels)
print(lEnc.transform(unique_labels))

train_labels = lEnc.transform(original_train_labels)
test_labels = lEnc.transform(original_test_labels)

import torch
# Rebinds `device` (a plain string in the earlier setup cell) to a torch.device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One label per document, train documents first and test documents after them
# (there are no entries for the word nodes).
labels = train_labels.tolist()+test_labels.tolist()
labels = torch.LongTensor(labels).to(device)

# Model input

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm

## Build Graph

In [None]:
from math import log
# Edge triplets of the graph: the following cells append (row[k], col[k], weight[k]) entries,
# and the adjacency cell turns them into a sparse matrix.
row = []
col = []
weight = []

### word-word: PMI

In [None]:
if EDGE >= 1:
    window_size = 20
    total_W = 0                  # number of sliding windows seen over all documents
    word_occurrence = {}         # word id -> number of windows containing the word
    word_pair_occurrence = {}    # (smaller id, larger id) -> number of windows containing both

    def ordered_word_pair(a, b):
        """Return the two ids in ascending order so (a, b) and (b, a) share one key."""
        if a > b:
            return b, a
        else:
            return a, b

    def update_word_and_word_pair_occurrence(q):
        """Count one window: each distinct word id and each distinct unordered pair in ``q``."""
        unique_q = list(set(q))
        for i in unique_q:
            # explicit default instead of a bare `except:` that would hide unrelated errors
            word_occurrence[i] = word_occurrence.get(i, 0) + 1
        for i in range(len(unique_q)):
            for j in range(i+1, len(unique_q)):
                pair = ordered_word_pair(unique_q[i], unique_q[j])
                word_pair_occurrence[pair] = word_pair_occurrence.get(pair, 0) + 1


    for ind in tqdm(range(train_size+test_size)):
        words = tokenize_sentences[ind]

        # the first window holds up to (window_size) leading words;
        # it is counted even when the document is empty
        q = [word_id_map[w] for w in words[:window_size]]
        total_W += 1
        update_word_and_word_pair_occurrence(q)

        # pop the first word out and let the next word in, keep doing this until the end of the document
        for now_next_word_index in range(window_size, len(words)):
            q.pop(0)
            q.append(word_id_map[words[now_next_word_index]])
            total_W += 1
            update_word_and_word_pair_occurrence(q)

    # PMI(i, j) = log( p(i, j) / (p(i) * p(j)) ) with probabilities estimated over windows;
    # only positive PMI values become (symmetric) word-word edges.
    for word_pair in word_pair_occurrence:
        i = word_pair[0]
        j = word_pair[1]
        count = word_pair_occurrence[word_pair]
        word_freq_i = word_occurrence[i]
        word_freq_j = word_occurrence[j]
        pmi = log((count * total_W) / (word_freq_i * word_freq_j))
        if pmi <=0:
            continue
        # word nodes start right after the train documents
        row.append(train_size + i)
        col.append(train_size + j)
        weight.append(pmi)
        row.append(train_size + j)
        col.append(train_size + i)
        weight.append(pmi)


### doc-word: Tf-idf

In [None]:
#Map every vocabulary word to the positions of the documents that contain it
word_doc_list = {word: [] for word in word_list}
for doc_idx, doc_tokens in enumerate(tokenize_sentences):
    for word in set(doc_tokens):
        word_doc_list[word].append(doc_idx)

#document frequency
word_doc_freq = {word: len(doc_ids) for word, doc_ids in word_doc_list.items()}

# term frequency, keyed by the string "<document position>,<word id>"
doc_word_freq = {}
for doc_idx, doc_tokens in enumerate(tokenize_sentences):
    for word in doc_tokens:
        doc_word_str = str(doc_idx) + ',' + str(word_id_map[word])
        doc_word_freq[doc_word_str] = doc_word_freq.get(doc_word_str, 0) + 1

In [None]:
# Add one document->word edge per distinct word of each document, weighted by tf * idf.
num_docs = len(tokenize_sentences)
for doc_idx, doc_tokens in enumerate(tokenize_sentences):
    # test documents sit after the vocabulary block in the node ordering
    doc_node = doc_idx if doc_idx < train_size else doc_idx + vocab_length
    # dict.fromkeys drops repeated words while keeping first-occurrence order
    for word in dict.fromkeys(doc_tokens):
        word_idx = word_id_map[word]
        freq = doc_word_freq[str(doc_idx) + ',' + str(word_idx)]
        idf = log(1.0 * num_docs / word_doc_freq[word_list[word_idx]])
        row.append(doc_node)
        col.append(train_size + word_idx)
        weight.append(freq * idf)

### doc-doc: jaccard

In [None]:
import nltk

if EDGE>=2:
    tokenize_sentences_set = [set(s) for s in tokenize_sentences]
    jaccard_threshold = 0.2

    def _doc_node_index(doc_idx):
        """Map a document position to its graph node id (test documents follow the vocabulary)."""
        return doc_idx if doc_idx < train_size else doc_idx + vocab_length

    num_docs = len(tokenize_sentences)
    for i in tqdm(range(num_docs)):
        set_i = tokenize_sentences_set[i]
        for j in range(i+1, num_docs):
            set_j = tokenize_sentences_set[j]

            #NOTE: RINA EDIT
            #Jaccard distance is throwing an error when both sets are empty
            if len(set_i) == 0 and len(set_j) == 0:
              continue

            jaccard_w = 1 - nltk.jaccard_distance(set_i, set_j)
            if jaccard_w <= jaccard_threshold:
                continue

            # similar enough: add the edge in both directions
            node_i = _doc_node_index(i)
            node_j = _doc_node_index(j)
            row.append(node_i)
            col.append(node_j)
            weight.append(jaccard_w)
            row.append(node_j)
            col.append(node_i)
            weight.append(jaccard_w)

### Adjacency matrix

In [None]:
import scipy.sparse as sp
# Assemble the (node_size x node_size) sparse adjacency matrix from the accumulated edge triplets.
adj = sp.csr_matrix((weight, (row, col)), shape=(node_size, node_size))

# build symmetric adjacency matrix
# Wherever adj.T > adj the entry is replaced by the transposed one, i.e. each (i, j)
# ends up holding the larger of adj[i, j] and adj[j, i].
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

In [None]:
def normalize_adj(adj):
    """Symmetrically normalize an adjacency matrix: D^-1/2 * A^T * D^-1/2.

    Args:
        adj: square matrix accepted by ``scipy.sparse.coo_matrix``.

    Returns:
        (normalized, d_inv_sqrt): the normalized matrix in COO format and the
        1-D vector of inverse square-root row sums (0 where a row sums to 0).
    """
    coo = sp.coo_matrix(adj)
    degree = np.array(coo.sum(1))
    inv_sqrt = np.power(degree, -0.5).flatten()
    # rows that sum to zero give inf; treat them as isolated (scale 0)
    d_inv_sqrt = np.where(np.isinf(inv_sqrt), 0., inv_sqrt)
    scale = sp.diags(d_inv_sqrt)
    column_scaled = coo.dot(scale)
    normalized = column_scaled.transpose().dot(scale)
    return normalized.tocoo(), d_inv_sqrt

# Add self-loops (identity) before normalizing; `norm_item` receives the second value
# returned by normalize_adj (the inverse square-root row sums).
adj, norm_item = normalize_adj(adj + sp.eye(adj.shape[0]))


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor on ``device``.

    Args:
        sparse_mx: any scipy sparse matrix (converted to COO internally).

    Returns:
        A torch sparse COO tensor with the same shape and non-zero entries,
        moved to the notebook-level ``device``.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    # indices must be a (2, nnz) int64 tensor: first row = row ids, second row = column ids
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse.FloatTensor(indices, values, shape) is the legacy constructor;
    # torch.sparse_coo_tensor is the supported factory for the same tensor.
    return torch.sparse_coo_tensor(indices, values, shape).to(device)

# Replace the scipy matrix with its torch sparse counterpart (placed on `device`).
adj = sparse_mx_to_torch_sparse_tensor(adj)

## Features

In [None]:
import scipy.sparse as sp
def generate_features(word1_emb_type = None, word2_emb_type = None, emo_map = None, emo_matrix = None):
  """Build the node feature tensor selected by the notebook-level ``NODE`` setting.

  Rows follow the graph node order: train documents, vocabulary words, test documents.

  NODE values:
    0: feature-less placeholder (``arange(node_size)``).
    1: BERT embeddings obtained through flair (document + word embeddings).
    2: GloVe vectors; a document is the mean of its token vectors.
    3: custom embeddings from ``generate_custom_word_embeddings``; documents as in 2.

  Args:
    word1_emb_type, word2_emb_type, emo_map, emo_matrix: only used when
      NODE == 3, forwarded to ``generate_custom_word_embeddings``.

  Returns:
    A float tensor on ``device``; row-normalized for NODE 1-3.

  Raises:
    Exception: if NODE is not one of 0-3.
  """
  def preprocess_features(features):
      """Row-normalize a sparse feature matrix (rows that sum to zero stay all-zero)."""
      rowsum = np.array(features.sum(1))
      r_inv = np.power(rowsum, -1).flatten()
      r_inv[np.isinf(r_inv)] = 0.
      r_mat_inv = sp.diags(r_inv)
      features = r_mat_inv.dot(features)
      return features

  if NODE == 0:
      features = np.arange(node_size)
      features = torch.FloatTensor(features).to(device)

  elif NODE == 1:

    from flair.embeddings import TransformerDocumentEmbeddings, TransformerWordEmbeddings
    from flair.data import Sentence
    doc_embedding = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=False)
    word_embedding = TransformerWordEmbeddings('bert-base-uncased', layers='-1',subtoken_pooling="mean")

    sent_embs = []
    word_embs = {}

    for ind in tqdm(range(train_size+test_size)):
        sent = tokenize_sentences[ind]
        if len(sent) > 0:
          sentence = Sentence(" ".join(sent[:512]),use_tokenizer=False)
          doc_embedding.embed(sentence)
          sent_embs.append(sentence.get_embedding().tolist())
          words = Sentence(" ".join(sent[:512]),use_tokenizer=False)
          word_embedding.embed(words)
          for token in words:
              word = token.text
              embedding = token.embedding.tolist()
              if word not in word_embs:
                  word_embs[word] = embedding
              else:
                  # a word seen in several contexts keeps the element-wise minimum
                  word_embs[word] = np.minimum(word_embs[word], embedding)
        else:
          # empty document after filtering: zero vector of BERT-base width
          sent_embs.append([0] * 768)

    word_embs_list = []
    for word in word_list:
      word_embs_list.append(word_embs[word])

    features = sent_embs[:train_size] + word_embs_list + sent_embs[train_size:]

    features = preprocess_features(sp.csr_matrix(features)).todense()
    features = torch.FloatTensor(features).to(device)

  elif NODE == 2 or NODE == 3:
    if NODE == 2:
      gloveModel, gloveDim = getGloveModel()

      # KeyedVectors supports `token in model` and `model[token]`, so it serves directly as
      # the token -> vector lookup used below. (The previous code never defined
      # emb_weights_map in this branch, which raised NameError.)
      emb_weights_map = gloveModel
      emb_dim = gloveDim
    else:
      emb_weights, emb_vocab_map = generate_custom_word_embeddings(word1_emb_type, word2_emb_type, emo_map, emo_matrix)
      emb_weights = emb_weights.detach().numpy()
      emb_weights_map = {k: emb_weights[i] for k, i in emb_vocab_map.items()}
      emb_dim = emb_weights.shape[-1]

    #SENTENCE EMBEDDINGS
    sentence_emb = []
    for idx in tqdm(range(train_size + test_size)):
      sentence = tokenize_sentences[idx]
      if len(sentence) > 0:
        sentence_wEmb = []
        for token in sentence:
          if token in emb_weights_map:
            sentence_wEmb.append(emb_weights_map[token])
          else:
            sentence_wEmb.append(np.zeros(emb_dim)) #UNKNOWN: set to 0s

        sentence_emb.append(np.mean(sentence_wEmb, axis = 0)) #SENTENCE EMBEDDING: average
      else:
        #Append 0s if sentence is empty after tokenization
        sentence_emb.append(np.zeros(emb_dim))

    #WORD EMBEDDINGS
    word_embs_list = []
    for word in word_list:
      if word in emb_weights_map:
        word_embs_list.append(emb_weights_map[word])
      else:
        word_embs_list.append(np.zeros(emb_dim))

    features = sentence_emb[:train_size] + word_embs_list + sentence_emb[train_size:]
    features = preprocess_features(sp.csr_matrix(features)).todense()
    features = torch.FloatTensor(features).to(device)

  else:
    raise Exception("No node feature selected.")

  return features

In [None]:
from transformers import BertTokenizer, BertModel

def align_custom_weights(custom_vocab_map, custom_weights, base_type = "bert"):
  """Pair every word's base embedding with its custom embedding by concatenation.

  Args:
    custom_vocab_map: dict mapping word -> row index into ``custom_weights``.
    custom_weights: tensor of custom embeddings, indexed by the values of
      ``custom_vocab_map``.
    base_type: "bert" uses the bert-base-uncased input embedding table as the
      base; "random" uses seeded uniform random values shaped like ``custom_weights``.

  Returns:
    (final_vocab_map, final_weights): the word -> row map and the tensor made of
    ``[base_weights | new_weights]`` concatenated along the last dimension.

  Raises:
    Exception: if the number of tokens added to the tokenizer differs from the
      number of out-of-vocabulary words, if a word resolves to id 100, or if
      ``base_type`` is not recognized.
  """

  special_tokens = []
  if base_type == "bert":

    # NOTE: this local `tokenizer` shadows the notebook-level tokenizer inside the function.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    bertModel = BertModel.from_pretrained("bert-base-uncased")

    bert_map = tokenizer.get_vocab()  # not used further in this function
    bert_vocab = tokenizer.get_vocab().keys()
    # Detached copy of BERT's input word-embedding table.
    bert_weights = bertModel.embeddings.word_embeddings.weight.clone().detach()

    #Determine out of vocabulary words, get word piece ids
    oov = [w for w in custom_vocab_map.keys() if w not in bert_vocab]
    # [1:-1] drops the first and last ids (presumably [CLS] and [SEP] - confirm)
    oov_word_piece = [tokenizer(w)["input_ids"][1:-1] for w in oov]
    oov_word_piece_map = {w: oov_word_piece[i] for i, w in enumerate(oov)}

    #Add oov to tokenizer vocab
    new_vocab_len = tokenizer.add_tokens(oov)
    final_vocab_map = tokenizer.get_vocab()
    special_tokens = tokenizer.special_tokens_map.values()
    print("Added tokens:", new_vocab_len)
    if len(oov) != new_vocab_len:
      raise Exception("OOV does not match added vocab")

    #Resize bert_weights to add new tokens
    # New rows start as zeros and are filled in by the loop below.
    base_weights = torch.cat((bert_weights, torch.zeros((new_vocab_len, bert_weights.shape[-1]))), dim = 0)

    #Assign word piece mean as bert weights to new tokens
    for w in oov:
      idx = tokenizer.convert_tokens_to_ids(w.lower())
      if idx == 100: #Unknown (presumably the [UNK] id of bert-base-uncased - confirm)
        raise Exception("Encountered unknown word:", w)

      # Word pieces were looked up BEFORE the word was added to the tokenizer.
      base_weights[idx] = torch.mean(bert_weights[oov_word_piece_map[w]], dim = 0)

    ##Clone base_weights to initialize weights to be concatenated
    # Words without a custom vector therefore keep their base vector in both halves.
    new_weights = base_weights.clone()

    ##Set new weights
    for w in custom_vocab_map:
      ##Skip special tokens
      if w in special_tokens:
        continue

      ##NOTE: lower to match bert implementation
      idx = tokenizer.convert_tokens_to_ids(w.lower())
      if idx == 100: #Unknown
        raise Exception("Encountered unknown word:", w)

      if w in oov:
        # NOTE(review): `w` has already been added to the tokenizer at this point, so this
        # lookup may return just the new token id and leave base_weights[idx] unchanged - confirm.
        word_piece_ids = tokenizer(w, return_token_type_ids = False, return_attention_mask = False)["input_ids"][1:-1]
        base_weights[idx] = torch.mean(base_weights[word_piece_ids], dim = 0)

      new_weights[idx] = custom_weights[custom_vocab_map[w]]
  elif base_type == "random":
    # Fixed seed so the random base half is reproducible between calls.
    torch.manual_seed(0)
    base_weights = torch.rand(custom_weights.shape)
    final_vocab_map = custom_vocab_map
    new_weights = custom_weights

  else:
    raise Exception("Base type (%s) not recognized." % (base_type))

  #Concatenate weights per word
  final_weights = torch.cat((base_weights, new_weights), dim = -1)

  return final_vocab_map, final_weights

In [None]:
#Generate Word Embedding variables
def generate_custom_word_embeddings(word1_emb_type, word2_emb_type, emoWordEmb_map, emoWordEmb_weights):
  """Select or assemble the word-embedding table and its vocabulary map.

  Cases:
    - word2_emb_type == "emo": align the custom embeddings with the
      ``word1_emb_type`` base via ``align_custom_weights``.
    - word2_emb_type is None and word1_emb_type == "bert": plain
      bert-base-uncased input embeddings and vocabulary.
    - word2_emb_type is None otherwise: the custom map and weights unchanged.

  Returns:
    (final_weights, final_vocab_map)

  Raises:
    Exception: for any other ``word2_emb_type``.
  """
  #Reject unsupported combinations up front
  if word2_emb_type != "emo" and word2_emb_type is not None:
    raise Exception("Not yet implemented")

  if word2_emb_type == "emo":
    print("Aligning word embeddings: %s and %s" % (word1_emb_type, word2_emb_type))
    final_vocab_map, final_weights = align_custom_weights(emoWordEmb_map, emoWordEmb_weights, base_type = word1_emb_type)
  elif word1_emb_type == "bert":
    bertTokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    bertModel = BertModel.from_pretrained("bert-base-uncased")
    final_vocab_map = bertTokenizer.get_vocab()
    final_weights = bertModel.embeddings.word_embeddings.weight.detach()
  else:
    final_vocab_map, final_weights = emoWordEmb_map, emoWordEmb_weights

  print("Word1 emb type:", word1_emb_type)
  print("Word2 emb type:", word2_emb_type)
  print("Final weight shape:", final_weights.shape)

  return final_weights, final_vocab_map

# Model

## GCN Layer

In [None]:
import math

import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


class GraphConvolution(Module):
    """
    Single graph-convolution layer in the spirit of https://arxiv.org/abs/1609.02907:
    output = activation(adj @ (dropout(input) @ weight) + bias).
    """

    def __init__(self, in_features, out_features,  drop_out = 0, activation=None, bias=True):
        """
        Args:
            in_features: width of the incoming node features.
            out_features: width of the produced node representations.
            drop_out: dropout probability applied before the linear map.
            activation: optional callable applied to the layer output.
            bias: when False no bias term is registered.
        """
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # storage only; the actual values are drawn in reset_parameters
        self.weight = Parameter(torch.empty(in_features, out_features, dtype=torch.float32))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.zeros(1, out_features))
        self.reset_parameters(in_features, out_features)
        self.dropout = torch.nn.Dropout(drop_out)
        self.activation =  activation

    def reset_parameters(self,in_features, out_features):
        """Draw the weight uniformly from [-b, b] with b = sqrt(6 / (in + out)); bias is untouched."""
        bound = np.sqrt(6.0/(in_features+out_features))
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)

    def forward(self, input, adj, feature_less = False):
        """
        Args:
            input: dense node feature matrix; ignored when ``feature_less`` is True.
            adj: (sparse) adjacency matrix multiplied from the left.
            feature_less: use the weight matrix itself as the transformed
                features, i.e. behave as if the input were the identity.
        """
        if feature_less:
            support = self.dropout(self.weight)
        else:
            support = torch.mm(self.dropout(input), self.weight)

        output = torch.spmm(adj, support)
        if self.bias is not None:
            output = output + self.bias
        if self.activation is None:
            return output
        return self.activation(output)

    def __repr__(self):
        return f"{self.__class__.__name__} ({self.in_features} -> {self.out_features})"

## GCN Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class GCN(nn.Module):
    """Stack of GraphConvolution layers producing one score vector per graph node.

    Args:
        nfeat: input width of the first layer. In feature-less mode this must
            equal the number of graph nodes, because the first layer's weight
            matrix is used directly as the node representation.
        nhid: hidden width.
        nclass: number of output classes.
        dropout: dropout probability used in every layer.
        n_layers: total number of layers; values below 2 give a single
            nfeat -> nclass layer.
        feature_less: when True (default, the original behavior) the first
            layer ignores the ``x`` passed to ``forward``. Pass False, together
            with ``nfeat = x.shape[1]``, to feed real node features to the model.

    NOTE(review): with the default, the tensor built by generate_features never
    influences the model - confirm this is intended.
    """

    def __init__(self, nfeat, nhid, nclass, dropout, n_layers = 2, feature_less = True):
        super(GCN, self).__init__()
        self.n_layers = n_layers
        self.feature_less = feature_less
        self.gc_list = []
        if n_layers >= 2:
            self.gc1 = GraphConvolution(nfeat, nhid, dropout, activation = nn.ReLU())
            self.gc_list = nn.ModuleList([GraphConvolution(nhid, nhid, dropout, activation = nn.ReLU()) for _ in range(self.n_layers-2)])
            self.gcf = GraphConvolution(nhid, nclass, dropout)
        else:
            self.gc1 = GraphConvolution(nfeat, nclass, dropout)

    def forward(self, x, adj):
        """Return unnormalized class scores (no softmax) for every node."""
        x = self.gc1(x, adj, feature_less = self.feature_less)
        if self.n_layers >= 2:
            for hidden_layer in self.gc_list:
                x = hidden_layer(x, adj)
            x = self.gcf(x, adj)
        return x

In [None]:
def cal_accuracy(predictions,labels):
    """Fraction of rows whose arg-max class equals the label.

    Args:
        predictions: tensor of per-class scores, classes on the last dimension.
        labels: tensor of integer class ids, one per row of ``predictions``.

    Returns:
        Accuracy as a Python float in [0, 1]; 0.0 when there are no rows
        (previously this raised ZeroDivisionError).
    """
    pred = torch.argmax(predictions,-1).cpu()
    lab = labels.cpu()
    total = len(pred)
    if total == 0:
        return 0.0
    # integer count divided in Python keeps the exact value of the former loop
    return (pred == lab).sum().item() / total

# Training

## Initialize model

In [None]:
import torch.optim as optim


# criterion = nn.CrossEntropyLoss()

# model = GCN(nfeat=node_size, nhid=HIDDEN_DIM, nclass=num_class, dropout=DROP_OUT,n_layers=NUM_LAYERS).to(device)
# optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

## Training and Validating

In [None]:
import time

def train_model(show_result = True):
    """Train the notebook-level ``model`` full-batch with early stopping.

    Relies on the globals ``model``, ``optimizer``, ``criterion``, ``features``,
    ``adj``, ``labels``, ``idx_train``, ``idx_val``, ``NUM_EPOCHS`` and
    ``EARLY_STOPPING``. The model is updated in place; nothing is returned.

    Args:
        show_result: print per-epoch metrics and the early-stopping notice.
    """
    val_loss = []
    for epoch in range(NUM_EPOCHS):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        output= model(features, adj)
        loss_train = criterion(output[idx_train], labels[idx_train])
        acc_train = cal_accuracy(output[idx_train], labels[idx_train])
        loss_train.backward()
        optimizer.step()

        # Validation pass: no gradients are needed, so skip building the autograd graph.
        model.eval()
        with torch.no_grad():
            output = model(features, adj)
            loss_val = criterion(output[idx_val], labels[idx_val])
        val_loss.append(loss_val.item())
        acc_val = cal_accuracy(output[idx_val], labels[idx_val])
        if show_result:
            print(  'Epoch: {:04d}'.format(epoch+1),
                    'loss_train: {:.4f}'.format(loss_train.item()),
                    'acc_train: {:.4f}'.format(acc_train),
                    'loss_val: {:.4f}'.format(loss_val.item()),
                    'acc_val: {:.4f}'.format(acc_val),
                    'time: {:.4f}s'.format(time.time() - t))

        # Stop when the best loss of the last EARLY_STOPPING epochs is worse than
        # the best loss seen before that window.
        if epoch > EARLY_STOPPING and np.min(val_loss[-EARLY_STOPPING:]) > np.min(val_loss[:-EARLY_STOPPING]) :
            if show_result:
                print("Early Stopping...")
            break

In [None]:
def generate_train_val(train_pro=0.9):
    """Randomly split the train documents into train/validation node indices.

    Args:
        train_pro: fraction of the ``train_size`` documents kept for training.

    Returns:
        (idx_train, idx_val, idx_test): sorted numpy array of training
        positions, list of the remaining positions in ascending order, and the
        numpy array of test node ids (which follow the vocabulary nodes).
    """
    n_fit = int(train_pro*train_size)

    idx_train = np.sort(np.random.choice(train_size, n_fit, replace=False))
    # everything not drawn for training becomes validation
    chosen = set(idx_train.tolist())
    idx_val = [position for position in range(train_size) if position not in chosen]

    idx_test = np.arange(train_size+vocab_length, node_size)
    return idx_train, idx_val, idx_test

In [None]:
def test_model():
    """Score the test nodes with the notebook-level ``model``.

    Returns:
        (test_labels, scores): the encoded gold test labels and a numpy array
        with the model outputs for the ``idx_test`` nodes.
    """
    model.eval()
    all_scores = model(features, adj)
    test_scores = all_scores[idx_test].detach().cpu().numpy()

    return test_labels, test_scores

## Evaluation

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
def evaluate_output(outputs, targets, targetLabels, session_name = "", show_results = True, multiLabel = False, return_results = False):
    """Compute (and optionally print) classification metrics.

    Args:
        outputs: per-class scores, one row per sample. Arg-max over axis 1 is
            used for single-label data; a 0.5 threshold for multi-label data.
        targets: gold labels in the format expected by scikit-learn.
        targetLabels: class names for the classification report.
        session_name: title printed above the metrics.
        show_results: print the metrics and the classification report.
        multiLabel: treat ``outputs`` as independent per-class scores.
        return_results: return the metrics as a dict.

    Returns:
        A dict of metrics when ``return_results`` is True, otherwise None.
    """

    if multiLabel:
        outputs = np.array(outputs) >= 0.5
    else:
        outputs = np.argmax(outputs, axis = 1)

    accuracy = accuracy_score(targets, outputs)
    f1_score_micro = f1_score(targets, outputs, average='micro')
    f1_score_macro = f1_score(targets, outputs, average='macro')
    f1_score_weighted = f1_score(targets, outputs, average="weighted")

    if show_results:
        print()
        print("=" * 50)
        print(session_name)
        print("=" * 50)
        print("Accuracy Score: %.4f" % (accuracy))
        print("F1 Score (Micro): %.4f" % (f1_score_micro))
        print("F1 Score (Macro): %.4f" % (f1_score_macro))
        print("F1 Score (Weighted): %.4f" % (f1_score_weighted))

        if multiLabel:
            # hamming_loss was never imported by the notebook, so this branch raised NameError
            from sklearn.metrics import hamming_loss
            ham_loss = hamming_loss(targets, outputs)
            print("Hamming Loss: %.4f" % (ham_loss))


        print(classification_report(targets, outputs, target_names = targetLabels, digits = 4))

    if return_results:
        results = {"Accuracy": accuracy,
                "F1_Micro": f1_score_micro,
                "F1_Macro": f1_score_macro,
                "F1_Weighted": f1_score_weighted,
                "Class Precision": precision_score(targets, outputs, average = None),
                "Class Recall": recall_score(targets, outputs, average = None),
                "Class F1": f1_score(targets, outputs, average = None)
        }

        return results

In [None]:
def save_predictions(path, ids, texts, goldLabels, classes, probs, preds):
  """Write per-sample predictions to a CSV file, sorted by id.

  Args:
    path: destination CSV path; missing parent directories are created.
    ids: index values, one per sample.
    texts: input texts, one per sample.
    goldLabels: gold labels, one per sample.
    classes: class names; column ``classes[i]`` receives ``probs[:, i]``.
    probs: 2-D array of per-class scores (samples x classes).
    preds: predicted labels, one per sample.
  """
  #Create the parent directory only when the path has one:
  #os.makedirs("") raises for a bare file name such as "preds.csv"
  out_dir = os.path.dirname(path)
  if out_dir:
    os.makedirs(out_dir, exist_ok = True)

  saveOutput = pd.DataFrame({"Text": texts,
                             "Label": goldLabels},
                             index = ids)

  for i, c in enumerate(classes):
    saveOutput[c] = probs[:, i]

  saveOutput["Prediction"] = preds
  saveOutput = saveOutput.sort_index()
  saveOutput.to_csv(path)

# Tuning

In [None]:
import optuna
def objective(trial):
  """Optuna objective: train a fresh GCN with sampled hyper-parameters.

  Samples dropout, learning rate and weight decay; the hidden size (200) and
  the number of layers (2) are fixed. Uses the notebook-level ``features``,
  ``adj``, ``labels``, ``idx_train`` and ``idx_val``.

  Args:
    trial: the optuna trial supplying the hyper-parameters.

  Returns:
    Validation accuracy of the last epoch that was run.

  Raises:
    optuna.exceptions.TrialPruned: when the pruner stops the trial.
  """
  #Generate model

  tune_dropout = trial.suggest_categorical("dropout", [0.01, 0.05, 0.1, 0.5])
  tune_layers = 2 #trial.suggest_int("num_layers", 2, 5)
  tune_hidden = 200 #trial.suggest_int("num_hidden", 100, 500, step = 100)

  tune_model = GCN(nfeat=node_size, nhid=tune_hidden, nclass=num_class, dropout=tune_dropout, n_layers=tune_layers).to(device)
  criterion = nn.CrossEntropyLoss()

  #Generate optimizers
  tune_lr = trial.suggest_float("lr", 0.01, 0.05, step = 0.01)
  tune_decay = trial.suggest_categorical("weight_decay", [0, 0.005, 0.05])
  optimizer = optim.Adam(tune_model.parameters(), lr=tune_lr, weight_decay=tune_decay)

  #Training
  val_loss = []
  for epoch in range(NUM_EPOCHS):
    tune_model.train()
    optimizer.zero_grad()
    output = tune_model(features, adj)
    loss_train = criterion(output[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer.step()

    #Validation pass: gradients are not needed
    tune_model.eval()
    with torch.no_grad():
      output = tune_model(features, adj)
      loss_val = criterion(output[idx_val], labels[idx_val])
    val_loss.append(loss_val.item())
    acc_val = cal_accuracy(output[idx_val], labels[idx_val])

    #Same early-stopping rule as train_model
    if epoch > EARLY_STOPPING and np.min(val_loss[-EARLY_STOPPING:]) > np.min(val_loss[:-EARLY_STOPPING]) :
        break


    #Record accuracy
    trial.report(acc_val, epoch)

    # Handle pruning based on the intermediate value.
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

  return acc_val


# Mental Health Classification

In [None]:
#Load custom embeddings
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load MMEMOG files that you produced yourself or otherwise trust.
with open(mmemogFile, "rb") as file:
  content = pickle.load(file)

# The pickle is expected to hold a mapping with "vocab_map" and "weights" entries.
emoWordEmb_map = content["vocab_map"]
emoWordEmb_weights = content["weights"]

In [None]:
# Build the node features; with NODE == 3 this aligns BERT embeddings with the custom ("emo") embeddings.
# NOTE(review): GCN.forward calls its first layer with feature_less=True, so these features
# do not appear to reach the model - confirm this is intended.
features = generate_features("bert", "emo", emo_map = emoWordEmb_map, emo_matrix = emoWordEmb_weights)
features.shape

## Tune

In [None]:
# One fixed train/validation split is shared by all trials (objective() reads these globals).
idx_train, idx_val, idx_test = generate_train_val()
study = optuna.create_study(direction = "maximize")
# Seed the search with a known configuration.
# NOTE(review): objective() hard-codes the hidden size and layer count instead of sampling
# them, so "num_hidden" and "num_layers" are not consumed by any suggest_* call there.
study.enqueue_trial({"num_hidden": 200,
                     "num_layers": 2,
                     "dropout": 0.5,
                     "lr": 0.02,
                     "weight_decay": 0})
study.optimize(objective, n_trials = 50)

In [None]:
# Report the best trial and copy its hyper-parameters into the names used by the run cells.
best_trial = study.best_trial
best_params = best_trial.params
print("BEST:", best_trial.value)
print("Params:")
for key, value in best_params.items():
  print("    {}: {}".format(key, value))

# hidden_dim and num_layers repeat the values fixed inside objective(); the rest come from the study.
hidden_dim = 200
dropout = best_params["dropout"]
num_layers = 2
learn_rate = best_params["lr"]
weight_decay = best_params["weight_decay"]

## 1-Run

In [None]:
# Fresh random train/validation split for this run.
idx_train, idx_val, idx_test = generate_train_val()

# train_model() and test_model() read `criterion`, `model` and `optimizer` as globals.
criterion = nn.CrossEntropyLoss()
model = GCN(nfeat=node_size, nhid=hidden_dim, nclass=num_class, dropout=dropout,n_layers=num_layers).to(device)
optimizer = optim.Adam(model.parameters(), lr=learn_rate, weight_decay=weight_decay)
train_model()

# NOTE: `probs` holds the raw model outputs; GCN.forward applies no softmax.
gold, probs = test_model()

preds = np.argmax(probs, axis = -1)
results = evaluate_output(probs, gold, lEnc.classes_.astype(str), session_name, return_results = True)
# Rows of the saved CSV are indexed by the graph node ids of the test documents (idx_test).
save_predictions(save_output_path,
                np.array(idx_test),
                original_test_sentences,
                lEnc.inverse_transform(gold),
                lEnc.classes_,
                np.array(probs),
                lEnc.inverse_transform(preds))

## 10-Run

In [None]:
test_results = []

# Defined here so this cell does not depend on the 1-Run cell having been executed first.
criterion = nn.CrossEntropyLoss()

for t in tqdm(range(10)):
  # new random train/validation split and a freshly initialized model for every repetition
  idx_train, idx_val, idx_test = generate_train_val()

  model = GCN(nfeat=node_size, nhid=hidden_dim, nclass=num_class, dropout=dropout,n_layers=num_layers).to(device)
  optimizer = optim.Adam(model.parameters(), lr=learn_rate, weight_decay=weight_decay)
  train_model(show_result = False)

  gold, probs = test_model()

  preds = np.argmax(probs, axis = -1)
  results = evaluate_output(probs, gold, lEnc.classes_.astype(str), session_name, show_results = False, return_results = True)

  test_results.append(results)

In [None]:
#Collate results: metric name -> list with one value (or per-class array) per run
result_average = {k: [] for k in test_results[0].keys()}
for r in test_results:
  for k in result_average:
    result_average[k].append(r[k])

#Average results
print("=" * 50)
print("10-run average")
print("=" * 50)
tab = []      # per-class means, one row per "Class ..." metric
tab_sd = []   # matching per-class standard deviations
header = []   # row labels taken from the metric names ("Precision", "Recall", "F1")
for k, item in result_average.items():
  if k.split(" ")[0] == "Class":
    header.append(k.split(" ")[1])
    tab.append(np.mean(item, axis = 0))
    tab_sd.append(np.std(item, axis = 0))
  else:
    print("%s: %.4f ± %.4f" % (k, np.mean(item, axis = 0), np.std(item, axis = 0)))

#One row per per-class metric actually collected (instead of a hard-coded 3); the shape is
#passed positionally because the `newshape` keyword is deprecated in recent NumPy releases
class_sd = np.reshape(["%.4f ± %.4f" % (z, a) for x, y in zip(tab, tab_sd) for z, a in zip(x, y)], (len(tab), num_class))
print("\n", tabulate(np.hstack(([[h] for h in header], class_sd)), headers = lEnc.classes_))