# GoEmotions

# Load Packages

In [1]:
import sys
import sklearn
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from functools import partial
import PIL
import PIL.Image
import pandas as pd

# %tensorflow_version 2.x
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
import tensorflow as tf
#import tensorflow_models as tfm
from tensorflow.keras import layers

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_fscore_support

from tensorflow.keras.initializers import TruncatedNormal
from tensorflow import keras
from keras import backend as K

from transformers import BertTokenizer, AutoTokenizer, TFAutoModel, BertModel, BertConfig, AutoModel, AdamW, TFBertForSequenceClassification, InputExample, InputFeatures, TFBertModel, BertConfig, BertTokenizerFast
from tokenizers import BertWordPieceTokenizer
from tensorflow.keras.optimizers import Adam

import collections
import re
import unicodedata
import emoji
import contractions

np.random.seed(42) # note that you must use the same seed to ensure consistentcy in your training/validation/testing
tf.random.set_seed(42)

# Import GoEmotions Dataset & Benchmarks
**Load SSEC Tweets Benchmark**

In [2]:
# import SSEC tweets
ssec6 = pd.read_csv('SSECTweets0.66.csv')
col_names = [c for c in ssec6.columns if c!="comment_text" ]
ssec6['num_label'] = ssec6[col_names].sum(axis=1)
ssec6 = ssec6.loc[ssec6.num_label==1].drop('num_label', axis=1).reset_index(drop=True)

In [3]:
# import Emo Tweets
emo = pd.read_csv('EmoTweets.csv')
# col_names = [c for c in emo.columns if c!='comment_text']
# emo['num_label'] = emo[col_names].sum(axis=1)
# emo.loc[emo.num_label==1]
ssec6 = emo

In [4]:
ds = tfds.load('goemotions')

2022-12-07 01:50:48.914315: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-07 01:50:49.401328: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 78973 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-80GB, pci bus id: 0000:4e:00.0, compute capability: 8.0


In [5]:
# convert to dataframe for inspection
df = tfds.as_dataframe(ds['train'])

In [6]:
df.head()

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,comment_text,confusion,curiosity,desire,...,love,nervousness,neutral,optimism,pride,realization,relief,remorse,sadness,surprise
0,False,False,False,False,False,False,"b""It's just wholesome content, from questionab...",False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,True,False,False,False,False,False,b'This is actually awesome.',False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,"b""People really spend more than $10 in an app ...",True,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,b'I grew up on the other side of Ama but live ...,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,"b'What the problem? I mean, steak? Good. Dough...",False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
#function to subset the dataset to single label entries only
def remove_multilabels(ds, split):
    df = tfds.as_dataframe(ds[split])
#     df = df.drop('grief', axis=1) # drop grief column 
    col_names = [c for c in df.columns if c!="comment_text" ]
    df['sum'] = df[col_names].sum(axis=1)
    df = df[df['sum']==1]
    #ds = tf.data.Dataset.from_tensor_slices(dict(df))
    #return ds
    return df

In [8]:
#subset data to single label entries only
ds_train = remove_multilabels(ds, 'train')
ds_valid = remove_multilabels(ds, 'validation')
ds_test = remove_multilabels(ds, 'test')

In [9]:
# single label examples
print(ds_train.shape[0]+ds_valid.shape[0]+ds_test.shape[0])

45446


# Preprocessing Data

In [10]:
#convert byte to string
pd.options.mode.chained_assignment = None

ds_train['comment_text'] = ds_train['comment_text'].str.decode("utf-8")
ds_valid['comment_text'] = ds_valid['comment_text'].str.decode("utf-8")
ds_test['comment_text'] = ds_test['comment_text'].str.decode("utf-8")

In [11]:
# Create list of emotions for each dataset

#GoEmotions emotions taxonomy - 28 labels
GE_taxonomy = [c for c in ds_train if c not in ['comment_text', 'sum']] 

# SSEC taxonomy - 8 labels
ssec_taxonomy = [c for c in ssec6.columns if c != 'comment_text']

In [12]:
#number of labels in our dataset
num_labels = len(GE_taxonomy)

ssec_num_labels = len(ssec_taxonomy)

In [13]:
#compute max length of samples
full_text = pd.concat([ds_train['comment_text'], ds_test['comment_text'], ds_valid['comment_text']])
max_length = full_text.apply(lambda x: len(x.split())).max()
max_length

32

In [14]:
ssec_full_text = ssec6['comment_text']
ssec_max_length = ssec_full_text.apply(lambda x: len(x.split())).max()
ssec_max_length

31

**Importing the BERT Base Model**

In [15]:
# Importing BERT pre-trained model and tokenizer
model_name = 'bert-base-uncased'
#model_name = 'bert-large-uncased'

config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
transformer_model = TFBertForSequenceClassification.from_pretrained(model_name, config = config)

2022-12-07 01:51:39.145867: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
#SUBSET
# Building a preprocessing function to clean text
def preprocess_corpus(x):

  # Adding a space between words and punctation
  x = re.sub( r'([a-zA-Z\[\]])([,;.!?])', r'\1 \2', x)
  x = re.sub( r'([,;.!?])([a-zA-Z\[\]])', r'\1 \2', x)
  
  # Demojize
  x = emoji.demojize(x)
  
  # Expand contraction
  x = contractions.fix(x)
  
  # Lower
  x = x.lower()

  # #correct some acronyms/typos/abbreviations  
  # x = re.sub(r"lmao", "laughing my ass off", x)  
  # x = re.sub(r"amirite", "am i right", x)
  # x = re.sub(r"\b(tho)\b", "though", x)
  # x = re.sub(r"\b(ikr)\b", "i know right", x)
  # x = re.sub(r"\b(ya|u)\b", "you", x)
  # x = re.sub(r"\b(eu)\b", "europe", x)
  # x = re.sub(r"\b(da)\b", "the", x)
  # x = re.sub(r"\b(dat)\b", "that", x)
  # x = re.sub(r"\b(dats)\b", "that is", x)
  # x = re.sub(r"\b(cuz)\b", "because", x)
  # x = re.sub(r"\b(fkn)\b", "fucking", x)
  # x = re.sub(r"\b(tbh)\b", "to be honest", x)
  # x = re.sub(r"\b(tbf)\b", "to be fair", x)
  # x = re.sub(r"faux pas", "mistake", x)
  # x = re.sub(r"\b(btw)\b", "by the way", x)
  # x = re.sub(r"\b(bs)\b", "bullshit", x)
  # x = re.sub(r"\b(kinda)\b", "kind of", x)
  # x = re.sub(r"\b(bruh)\b", "bro", x)
  # x = re.sub(r"\b(w/e)\b", "whatever", x)
  # x = re.sub(r"\b(w/)\b", "with", x)
  # x = re.sub(r"\b(w/o)\b", "without", x)
  # x = re.sub(r"\b(doj)\b", "department of justice", x)
  
  # #replace some words with multiple occurences of a letter, example "coooool" turns into --> cool
  x = re.sub(r"\b(j+e{2,}z+e*)\b", "jeez", x)
  x = re.sub(r"\b(co+l+)\b", "cool", x)
  x = re.sub(r"\b(g+o+a+l+)\b", "goal", x)
  x = re.sub(r"\b(s+h+i+t+)\b", "shit", x)
  x = re.sub(r"\b(o+m+g+)\b", "omg", x)
  x = re.sub(r"\b(w+t+f+)\b", "wtf", x)
  x = re.sub(r"\b(w+h+a+t+)\b", "what", x)
  x = re.sub(r"\b(y+e+y+|y+a+y+|y+e+a+h+)\b", "yeah", x)
  x = re.sub(r"\b(w+o+w+)\b", "wow", x)
  x = re.sub(r"\b(w+h+y+)\b", "why", x)
  x = re.sub(r"\b(s+o+)\b", "so", x)
  x = re.sub(r"\b(f)\b", "fuck", x)
  x = re.sub(r"\b(w+h+o+p+s+)\b", "whoops", x)
  x = re.sub(r"\b(ofc)\b", "of course", x)
  x = re.sub(r"\b(the us)\b", "usa", x)
  x = re.sub(r"\b(gf)\b", "girlfriend", x)
  x = re.sub(r"\b(hr)\b", "human ressources", x)
  x = re.sub(r"\b(mh)\b", "mental health", x)
  x = re.sub(r"\b(idk)\b", "i do not know", x)
  x = re.sub(r"\b(gotcha)\b", "i got you", x)
  x = re.sub(r"\b(y+e+p+)\b", "yes", x)
  x = re.sub(r"\b(a*ha+h[ha]*|a*ha +h[ha]*)\b", "haha", x)
  x = re.sub(r"\b(o?l+o+l+[ol]*)\b", "lol", x)
  x = re.sub(r"\b(o*ho+h[ho]*|o*ho +h[ho]*)\b", "ohoh", x)
  x = re.sub(r"\b(o+h+)\b", "oh", x)
  x = re.sub(r"\b(a+h+)\b", "ah", x)
  x = re.sub(r"\b(u+h+)\b", "uh", x)
  x = re.sub(r"\b(a+n+d+)\b", "and", x)

  # # Handling emojis
  x = re.sub(r"<3", " love_heart ", x)
  x = re.sub(r"xd", " smiling_face_with_open_mouth_and_tightly_closed_eyes ", x)
  x = re.sub(r":\)", " smiling_face ", x)
  x = re.sub(r"^_^", " smiling_face ", x)
  x = re.sub(r"\*_\*", " star_struck ", x)
  x = re.sub(r":\(", " frowning_face ", x)
  x = re.sub(r":\^\(", " frowning_face ", x)
  x = re.sub(r";\(", " frowning_face ", x)
  x = re.sub(r":\/",  " confused_face", x)
  x = re.sub(r";\)",  " wink", x)
  x = re.sub(r">__<",  " unamused ", x)
  x = re.sub(r"\b([xo]+x*)\b", " xoxo ", x)
  x = re.sub(r"\b(n+a+h+)\b", "nah", x)

  # # Handling special cases of text
  # x = re.sub(r"h a m b e r d e r s", "hamberders", x)
  # x = re.sub(r"b e n", "ben", x)
  # x = re.sub(r"s a t i r e", "satire", x)
  # x = re.sub(r"y i k e s", "yikes", x)
  # x = re.sub(r"s p o i l e r", "spoiler", x)
  # x = re.sub(r"thankyou", "thank you", x)
  # x = re.sub(r"a^r^o^o^o^o^o^o^o^n^d", "around", x)

  # Remove special characters and numbers replace by space + remove double space
  x = re.sub(r"\b([.]{3,})"," dots ", x)
  x = re.sub(r"[^A-Za-z!?_]+"," ", x)
  x = re.sub(r"\b([s])\b *","", x)
  x = re.sub(r" +"," ", x)
  x = x.strip()

  return x

In [17]:
# Applying the preprocessing function on the dataset
ds_train["clean_text"] = ds_train["comment_text"].apply(preprocess_corpus)
ds_valid["clean_text"] = ds_valid["comment_text"].apply(preprocess_corpus)
ds_test["clean_text"] = ds_test["comment_text"].apply(preprocess_corpus)

# Preview of data
display(ds_train[['comment_text', 'clean_text']].sample(5))

Unnamed: 0,comment_text,clean_text
2104,"Wow your sister is in abusive relationship, co...",wow your sister is in abusive relationship cool
30853,If you're blaming her for the design of the mo...,if you are blaming her for the design of the m...
10854,Being compared to [NAME] is a compliment becau...,being compared to name is a compliment because...
8734,They just dont want to go to school. Source:am...,they just do not want to go to school source a...
5755,I have trouble even remembering what true happ...,i have trouble even remembering what true happ...


In [18]:
# apply preprocessing
ssec6['clean_text'] = ssec6['comment_text'].apply(preprocess_corpus)

**SSEC Train-Test Split**  
For the SSEC Benchmark, create 5 train-test splits. Each train split should contain 600 samples. The test set will contain the remaining samples.

In [19]:
np.random.seed(101)

train_idx_list = []
test_idx_list = []

for i in range(5):
    train_idx = list(np.random.choice(len(ssec6),
                                     size=600,
                                     replace=False))
    test_idx = list(set(ssec6.index)-set(train_idx))
    
    train_idx_list.append(train_idx)
    test_idx_list.append(test_idx)

In [20]:
# seed check s/b 2064
train_idx_list[0][10]

442

In [21]:
# create initial split
ssec_train = ssec6.loc[train_idx_list[0]]
ssec_test = ssec6.loc[test_idx_list[0]]

**Define Function to Construct TensorFlow Model**

In [22]:
# function for creating BERT based model
def create_model(nb_labels, max_length, name):
  #nb_labels is the number of labels in our data

  # Load the MainLayer
  bert = transformer_model.layers[0]

  # Build the model inputs
  input_ids = layers.Input(shape=(max_length,), name='input_ids', dtype='int32')
  attention_mask = layers.Input(shape=(max_length,), name='attention_mask', dtype='int32')
  token_type_ids = layers.Input(shape=(max_length,), name='token_type_ids', dtype='int32') #original code called this variable token_ids
  inputs = {'input_ids': input_ids, 'attention_mask': attention_mask, 'token_type_ids': token_type_ids}

  # Load the Transformers BERT model as a layer in a Keras model
  bert_model = bert(inputs)[1]
  dropout = layers.Dropout(config.hidden_dropout_prob, name='pooled_output')
  pooled_output = dropout(bert_model, training=False)

  # Then build the model output
  emotion = layers.Dense(units=nb_labels, activation="sigmoid", kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='emotion')(pooled_output) #possibly change activation to softmax
  #emotion = layers.Dense(units=nb_labels, activation="softmax", kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='emotion')(pooled_output) #possibly change activation to softmax
  outputs = emotion

  # And combine it all in a model object
  model = tf.keras.models.Model(inputs=inputs, outputs=outputs, name=name)

  return model

## The Experiment
With the datasets set up, we now conduct the transfer learning experiment. There are two objectives. First, we want to show that GoEmotions is beneficial for transfer learning tasks when there isn't much data available in the target domain. To measure this, we train a BERT model with just the benchmark data. We compare performance against a BERT model pre-trained with GoEmotions data and then fine-tuned with the benchmark data.

Second, we want to determine the optimal number of epochsfor pre-training with BERT. We pre-train with GoEmotions for 4, 8, and 12 epochs. Then we fine-tune with the benchmark data for 3-epochs. We evaluate the performance on the benchmark test set.

**Baseline** *Finetuning with just the benchmark data for 3 epochs*


In [23]:
# print baseline model summary for reference
tf.keras.backend.clear_session()
baseline_model = create_model(ssec_num_labels, max_length, "BERT_Model")
baseline_model.summary()

Model: "BERT_Model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 32)]         0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 32)]         0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 32)]         0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['attention_mask[0][0]',         
                                thPoolingAndCrossAt               'input_ids[0][0]',     

In [24]:
import math

In [25]:
# Function for calculating multilabel class weights
def calculating_class_weights(y_true):
    number_dim = np.shape(y_true)[1]
    weights = np.empty([number_dim, 2])
    for i in range(number_dim):
        #compute_class_weight(class_weight, *, classes, y)
        weights[i] = compute_class_weight('balanced', classes = [0.,1.], y = y_true[:, i])
    return weights



In [26]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_proba.shape[0]):
        for j in range(y_pred_proba.shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [27]:
# Custom loss function for multilabel
def get_weighted_loss(weights):
    def weighted_loss(y_true, y_pred):
        #paper used sigmoid cross entropy loss function to support multi-label classification
        return K.mean((weights[:,0]**(1-y_true))*(weights[:,1]**(y_true))*K.binary_crossentropy(y_true, y_pred), axis=-1)
        #return K.mean((weights[:,0]**(1-y_true))*(weights[:,1]**(y_true))*tf.keras.metrics.sparse_categorical_crossentropy(y_true, y_pred), axis=-1)
    
    return weighted_loss

In [28]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary", zero_division=0)
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro", zero_division=0)
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [29]:
def fit_predict_baseline_model(train_split, test_split, epochs=3, name="BERT_Model"):
    
    tf.keras.backend.clear_session()
    
    baseline_model = create_model(ssec_num_labels, ssec_max_length, name)
    
    # create features and targets for each split
    ssec_X_train = train_split['clean_text']
    ssec_y_train = train_split.loc[:,ssec_taxonomy].values.astype('float')

    ssec_X_test = test_split['clean_text']
    ssec_y_test = test_split.loc[:, ssec_taxonomy].values.astype('float')
    
    # Tokenizing train data
    ssec_train_token = tokenizer(text = ssec_X_train.to_list(),
                                add_special_tokens = True,
                                max_length = ssec_max_length,
                                truncation = True,
                                padding = 'max_length', 
                                return_tensors = 'tf',
                                return_token_type_ids = True,
                                return_attention_mask = True,
                                verbose = True)
    
    # Tokenizing test data
    ssec_test_token = tokenizer(text = ssec_X_test.to_list(),
                               add_special_tokens = True,
                               max_length = ssec_max_length,
                               truncation = True,
                               padding = 'max_length', 
                               return_tensors = 'tf',
                               return_token_type_ids = True,
                               return_attention_mask = True,
                               verbose = True)
    
    # Creating BERT compatible inputs with Input Ids, attention masks and token Ids 
    ssec_train_BERT = {'input_ids': ssec_train_token['input_ids'], 'attention_mask': ssec_train_token['attention_mask'],'token_type_ids': ssec_train_token['token_type_ids']}
    ssec_test_BERT = {'input_ids': ssec_test_token['input_ids'], 'attention_mask': ssec_test_token['attention_mask'],'token_type_ids': ssec_test_token['token_type_ids']}
    
    # -- Model Training --
    batchsize = 16
    learning_rate = 5.e-05
#     epochs = epochs
    
    # Creating TF tensors
    ssec_train_tensor = tf.data.Dataset.from_tensor_slices((ssec_train_BERT, ssec_y_train)).shuffle(len(ssec_train_BERT)).batch(batchsize)
    ssec_test_tensor = tf.data.Dataset.from_tensor_slices((ssec_test_BERT, ssec_y_test)).shuffle(len(ssec_test_BERT)).batch(batchsize)

    optimizer = Adam(learning_rate = learning_rate,)

    class_weights = calculating_class_weights(ssec_y_train)
    loss = get_weighted_loss(class_weights)

    # Compile the model
    baseline_model.compile(
        optimizer = optimizer,
        loss = loss,
        metrics = ["accuracy","AUC"] 
        )

    # train the model
    history = baseline_model.fit(ssec_train_tensor, 
                        epochs = epochs)
    
    # -- make predictions --
    print("Predicting Test Data")
    ssec_pred_proba = baseline_model.predict(ssec_test_BERT)
    ssec_pred_labels = proba_to_labels(ssec_pred_proba)
    
    return model_eval(ssec_y_test, ssec_pred_labels, ssec_taxonomy).to_dict()

In [30]:
# model_name = 'bert-base-uncased'
# config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
# tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
# transformer_model = TFBertModel.from_pretrained(model_name, config = config)

# result = fit_predict_baseline_model(ssec_train, ssec_test, epochs=4)
# result

In [31]:
# %%capture
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

MACRO_F1 = pd.DataFrame()
RESULTS = {}

# run 5 trials
for i in range(5):
    
    model_name = 'bert-base-uncased'
    config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
    transformer_model = TFBertModel.from_pretrained(model_name, config = config)

    trial_name = f"baseline_{i}"
    
    ssec_train = ssec6.loc[train_idx_list[i]]
    ssec_test = ssec6.loc[test_idx_list[i]]
    
    result = fit_predict_baseline_model(ssec_train, ssec_test, epochs=8, name=trial_name)
    
    df = pd.DataFrame({'trial_name':[trial_name],
                       'GoEpochs':[0],
                       'F1':[result['F1']['MACRO-AVERAGE']]})
    
    RESULTS[trial_name] = result
    
    MACRO_F1 = pd.concat([MACRO_F1,df])

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Predicting Test Data
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Predicting Test Data
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Predicting Test Data
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Predicting Test Data
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Predicting Test Data


In [32]:
MACRO_F1

Unnamed: 0,trial_name,GoEpochs,F1
0,baseline_0,0,0.09
0,baseline_1,0,0.12
0,baseline_2,0,0.07
0,baseline_3,0,0.1
0,baseline_4,0,0.07


---
**Model4**

In [33]:
def fit_predict_pretrain_model(train_split, test_split, GoEpochs=4, name="BERT_Pretrain_Model"):
    tf.keras.backend.clear_session()

    model4 = create_model(num_labels, max_length, "BERT_Model")

    X_train = ds_train['clean_text']
    y_train = ds_train.loc[:, GE_taxonomy].values.astype(float)

    X_test = ds_test['clean_text']
    y_test = ds_test.loc[:, GE_taxonomy].values.astype(float)
    
    train_token = tokenizer(
        text = X_train.to_list(),
        add_special_tokens = True,
        max_length = max_length,
        truncation = True,
        padding = 'max_length', 
        return_tensors = 'tf',
        return_token_type_ids = True,
        return_attention_mask = True,
        verbose = True)
    
    test_token = tokenizer(
        text = X_test.to_list(),
        add_special_tokens = True,
        max_length = max_length,
        truncation = True,
        padding = 'max_length', 
        return_tensors = 'tf',
        return_token_type_ids = True,
        return_attention_mask = True,
        verbose = True)

    train = {'input_ids': train_token['input_ids'], 'attention_mask': train_token['attention_mask'],'token_type_ids': train_token['token_type_ids']}
    test = {'input_ids': test_token['input_ids'], 'attention_mask': test_token['attention_mask'],'token_type_ids': test_token['token_type_ids']}

    # -- hyperparameters --
    batchsize = 16 #128 #64 #32 #16
    learning_rate = 5.e-05

    train_tensor = tf.data.Dataset.from_tensor_slices((train, y_train)).shuffle(len(train)).batch(batchsize)
    test_tensor = tf.data.Dataset.from_tensor_slices((test, y_test)).shuffle(len(test)).batch(batchsize)
    
    class_weights = calculating_class_weights(y_train)
    
    optimizer = Adam(
    learning_rate = learning_rate,
    )

    loss = get_weighted_loss(class_weights)

    model4.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = ["accuracy","AUC"] 
    )

    history = model4.fit(train_tensor, 
                    epochs = GoEpochs)
    
    # replace output layer
    emo_model4 = keras.Model(inputs=model4.inputs, outputs=model4.layers[-2].output)

    new_output = layers.Dense(units=ssec_num_labels, activation="sigmoid", kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='emotion')(model4.layers[-2].output)
    emo_model4 = keras.Model(inputs = model4.input, outputs=new_output)
    
#     for l in emo_model4.layers[:-2]:
#         l.trainable=False
    
    # process ssec data
    ssec_X_train = train_split['clean_text']
    ssec_y_train = train_split.loc[:,ssec_taxonomy].values.astype('float')

    ssec_X_test = test_split['clean_text']
    ssec_y_test = test_split.loc[:, ssec_taxonomy].values.astype('float')
    
    # Tokenizing train data
    ssec_train_token = tokenizer(text = ssec_X_train.to_list(),
                                add_special_tokens = True,
                                max_length = max_length,
                                truncation = True,
                                padding = 'max_length', 
                                return_tensors = 'tf',
                                return_token_type_ids = True,
                                return_attention_mask = True,
                                verbose = True)
    
    # Tokenizing test data
    ssec_test_token = tokenizer(text = ssec_X_test.to_list(),
                               add_special_tokens = True,
                               max_length = max_length,
                               truncation = True,
                               padding = 'max_length', 
                               return_tensors = 'tf',
                               return_token_type_ids = True,
                               return_attention_mask = True,
                               verbose = True)
    
    # Creating BERT compatible inputs with Input Ids, attention masks and token Ids 
    ssec_train_BERT = {'input_ids': ssec_train_token['input_ids'], 'attention_mask': ssec_train_token['attention_mask'],'token_type_ids': ssec_train_token['token_type_ids']}
    ssec_test_BERT = {'input_ids': ssec_test_token['input_ids'], 'attention_mask': ssec_test_token['attention_mask'],'token_type_ids': ssec_test_token['token_type_ids']}
    
    ssec_train_tensor = tf.data.Dataset.from_tensor_slices((ssec_train_BERT, ssec_y_train)).shuffle(len(ssec_train_BERT)).batch(batchsize)
    ssec_test_tensor = tf.data.Dataset.from_tensor_slices((ssec_test_BERT, ssec_y_test)).shuffle(len(ssec_test_BERT)).batch(batchsize)
    
    optimizer = Adam(
    learning_rate = learning_rate,
    )

    class_weights = calculating_class_weights(ssec_y_train)
    loss = get_weighted_loss(class_weights)

    emo_model4.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = ["accuracy","AUC"] 
    )

    history = emo_model4.fit(ssec_train_tensor, 
                    epochs = 8)
    
    model4_pred_proba = emo_model4.predict(ssec_test_BERT)
    model4_pred_labels = proba_to_labels(model4_pred_proba)
    
    return model_eval(ssec_y_test, model4_pred_labels, ssec_taxonomy).to_dict()

In [34]:
model_name = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

result = fit_predict_pretrain_model(ssec_train, ssec_test, GoEpochs=4)
result

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


{'Precision': {'acceptance': 0.0,
  'admiration': 0.0,
  'amazement': 0.0,
  'anger or annoyance or hostility or fury': 0.0,
  'anticipation or  expectancy or interest': 0.0,
  'calmness or serenity': 0.0,
  'disappointment': 0.0,
  'disgust': 0.0,
  'dislike': 0.0,
  'fear or apprehension or panic or terror': 0.0,
  'hate': 0.0,
  'indifference': 0.0,
  'joy or happiness or elation': 0.0,
  'like': 0.0,
  'sadness or gloominess or grief or sorrow': 0.0,
  'surprise': 0.0,
  'trust': 0.0,
  'uncertainty or indecision or confusion': 0.0,
  'vigilance': 0.0,
  'MACRO-AVERAGE': 0.0},
 'Recall': {'acceptance': 0.0,
  'admiration': 0.0,
  'amazement': 0.0,
  'anger or annoyance or hostility or fury': 0.0,
  'anticipation or  expectancy or interest': 0.0,
  'calmness or serenity': 0.0,
  'disappointment': 0.0,
  'disgust': 0.0,
  'dislike': 0.0,
  'fear or apprehension or panic or terror': 0.0,
  'hate': 0.0,
  'indifference': 0.0,
  'joy or happiness or elation': 0.0,
  'like': 0.0,
  'sadn

In [35]:
# %%capture
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

GOEPOCHS = [4,8,12]

for g in GOEPOCHS:
    print(f"Pretraining BERT with GoEmotions for {g} epochs")
    for i in range(5):
    
        model_name = 'bert-base-uncased'
        config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
        tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
        transformer_model = TFBertModel.from_pretrained(model_name, config = config)

        trial_name = f"{g}epochs_{i}"

        ssec_train = ssec6.loc[train_idx_list[i]]
        ssec_test = ssec6.loc[test_idx_list[i]]

        result = fit_predict_pretrain_model(ssec_train, ssec_test, GoEpochs=g, name=trial_name)

        df = pd.DataFrame({'trial_name':[trial_name],
                           'GoEpochs':[g],
                           'F1':[result['F1']['MACRO-AVERAGE']]})

        RESULTS[trial_name] = result

        MACRO_F1 = pd.concat([MACRO_F1,df])

Pretraining BERT with GoEmotions for 4 epochs
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Pretraining BERT with GoEmotions for 8 epochs
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/

In [55]:
MACRO_F1.to_csv('F1_scores_transfer_learning_emo_600.csv',index=False)

In [37]:
import json

with open("transfer_learning_emo_600_results.json", "w") as outfile:
    json.dump(RESULTS, outfile)

In [56]:
# # %%capture
# from transformers import logging as hf_logging
# hf_logging.set_verbosity_error()

# GOEPOCHS = [8]

# catch = {}
# repeat_num = 3
# for g in GOEPOCHS:
#     print(f"Pretraining BERT with GoEmotions for {g} epochs")
    
#     model_name = 'bert-base-uncased'
#     config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
#     tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
#     transformer_model = TFBertModel.from_pretrained(model_name, config = config)

#     trial_name = f"{g}epochs_{repeat_num}"

#     ssec_train = ssec6.loc[train_idx_list[repeat_num]]
#     ssec_test = ssec6.loc[test_idx_list[repeat_num]]

#     result = fit_predict_pretrain_model(ssec_train, ssec_test, GoEpochs=g, name=trial_name)

#     df = pd.DataFrame({'trial_name':[trial_name],
#                        'GoEpochs':[g],
#                        'F1':[result['F1']['MACRO-AVERAGE']]})
    
#     catch[trial_name] = result

# #     RESULTS[trial_name] = result

# #     MACRO_F1 = pd.concat([MACRO_F1,df])

In [57]:
# catch

In [40]:
# MACRO_F1 = pd.read_csv('F1_scores_transfer_learning_emo.csv')

In [52]:
# MACRO_F1.loc[MACRO_F1.trial_name=='4epochs_1','F1'] = catch['4epochs_1']['F1']['MACRO-AVERAGE']
# MACRO_F1.loc[MACRO_F1.trial_name=='8epochs_3','F1'] = catch['8epochs_3']['F1']['MACRO-AVERAGE']


# # MACRO_F1.to_csv('F1_scores_transfer_learning_emo.csv', index=False)

In [53]:
MACRO_F1.groupby('GoEpochs').agg({'F1':['mean', 'sem']})

Unnamed: 0_level_0,F1,F1
Unnamed: 0_level_1,mean,sem
GoEpochs,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.09,0.009487
4,0.118,0.008602
8,0.11,0.005477
12,0.104,0.007483


In [54]:
MACRO_F1

Unnamed: 0,trial_name,GoEpochs,F1
0,baseline_0,0,0.09
0,baseline_1,0,0.12
0,baseline_2,0,0.07
0,baseline_3,0,0.1
0,baseline_4,0,0.07
0,4epochs_0,4,0.11
0,4epochs_1,4,0.12
0,4epochs_2,4,0.11
0,4epochs_3,4,0.15
0,4epochs_4,4,0.1
