### Gender-Neutral Sentiment Classifier based on BERT-PubMed 


Portions of this code are Copyright 2020 by the TensorFlow Hub Authors and are used in accordance with the Apache 2.0 License

Otherwise Copyright (C) 2023 by the Regents of the University of California and licensed under the Apache License, Version 2.0

In [20]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# Load prerequisite TensorFlow libraries

import os
import re
import shutil

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization 
import matplotlib.pyplot as plt

import numpy as np

tf.get_logger().setLevel('ERROR')

In [2]:
# Define functions to remove gendered pronouns and nouns from the training dataset

pronouns_gendered = ['himself', 'herself', 'him', 'her', 'he', 'she', 'his', 'hers', 'he\'s', 'she\'s',
                     'guys', 'gals', 'man', 'woman', 'guy', 'gal', 'men', 'women']
pronouns_nongendered = ['themself', 'themself', 'them', 'them', 'they', 'they', 'their', 'their', 'they\'re', 'they\'re',
                        'people', 'people', 'person', 'person', 'person', 'person', 'people', 'people']

# Lookup table built from the two parallel lists above: gendered word -> neutral word.
_neutral_by_gendered = dict(zip(pronouns_gendered, pronouns_nongendered))

# A single case-insensitive pattern that matches any gendered word as a whole
# word. Longer alternatives are listed first so that "he's" is tried before "he".
_gendered_word_re = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(w) for w in sorted(pronouns_gendered, key=len, reverse=True))
    + r')\b',
    re.IGNORECASE)


def _neutral_replacement(match):
    """Return the neutral word for a regex match, or the matched text if unmapped."""
    found = match.group(0)
    # .get() rather than [] so an exotic case-folding match can never raise KeyError.
    return _neutral_by_gendered.get(found.lower(), found)


def remove_gender(b_sentence):
    """Replace gendered pronouns and nouns in a sentence with neutral ones.

    Every whole-word, case-insensitive occurrence of an entry of
    ``pronouns_gendered`` is replaced by the entry at the same position in
    ``pronouns_nongendered``. Matching uses word boundaries, so words next to
    punctuation or markup ("him,", "(she)", "he/she", "her!") are replaced
    too, and the surrounding punctuation is kept. Words that merely contain a
    gendered word ("human", "other", "theme") are left alone; parts of
    hyphenated compounds ("man-made") are replaced.

    Args:
        b_sentence: UTF-8 encoded ``bytes`` (a ``str`` is accepted as well).

    Returns:
        UTF-8 encoded ``bytes``. Runs of whitespace are collapsed to one
        space and every token, including the last, is followed by a single
        space. Replacement words are always lower case.

    Raises:
        UnicodeDecodeError: if ``b_sentence`` is not valid UTF-8.
    """
    sentence = b_sentence if isinstance(b_sentence, str) else str(b_sentence, 'utf-8')
    pieces = [_gendered_word_re.sub(_neutral_replacement, token) + ' '
              for token in sentence.split()]
    return bytes(''.join(pieces), 'utf-8')

def rg_array(sentence_array):
    """Apply ``remove_gender`` to every element of ``sentence_array``.

    ``remove_gender`` is wrapped with ``np.vectorize`` and the wrapper is
    called on the array; whatever that call produces is returned unchanged.
    """
    return np.vectorize(remove_gender)(sentence_array)

def rg_tensor(t_string, label):
    """Dataset ``map`` function: neutralise the text and pass the label through.

    Args:
        t_string: string tensor of sentences, handed to ``rg_array`` through
            ``tf.py_function``.
        label: returned untouched.

    Returns:
        ``(neutral_text, label)`` where ``neutral_text`` is a ``tf.string``
        tensor.
    """
    [neutral_text] = tf.py_function(rg_array, [t_string], [tf.string])
    # tf.py_function cannot infer a static shape for its outputs. rg_array
    # maps element by element (np.vectorize), so the result has the same
    # shape as the input; restore it so later stages see a known shape.
    static_shape = getattr(t_string, 'shape', None)
    if static_shape is not None:
        neutral_text.set_shape(static_shape)
    return neutral_text, label

In [None]:
# Process the LMRD/IMDB training data
# Uncomment and run the below if not already downloaded 
# url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

#dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url, untar=True, cache_dir='.', cache_subdir='')
#dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
#train_dir = os.path.join(dataset_dir, 'train')
#remove_dir = os.path.join(train_dir, 'unsup')
#shutil.rmtree(remove_dir)

In [3]:
# Load and process the training and validation datasets

AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# Training subset: 80% of 'aclImdb/train'. class_names is read here, before
# the dataset is mapped.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed)
class_names = raw_train_ds.class_names

# Validation subset: the remaining 20%, selected with the same seed.
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size, validation_split=0.2,
    subset='validation', seed=seed)

# Remove gendered pronouns from both subsets, then cache and prefetch them.
raw_train_ds = raw_train_ds.map(rg_tensor)
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.map(rg_tensor).cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [4]:
# Load BERT-Pubmed from TensorFlow Hub

bert_model_name = 'experts_pubmed'

# Encoder handle for each supported model name.
map_name_to_handle = {'experts_pubmed': 'https://tfhub.dev/google/experts/bert/pubmed/2'}

# Text-preprocessing handle for each supported model name.
map_model_to_preprocess = {'experts_pubmed': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'}

# Resolve both handles for the selected model.
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]
tfhub_handle_encoder = map_name_to_handle[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/google/experts/bert/pubmed/2
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [5]:
# define BERT preprocessing model

bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

# Push one sample sentence through the preprocessor to inspect what it returns.
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

# One line per item: the output keys, the id tensor's shape, then the first
# 12 positions of each output for the sample sentence.
print(
    f'Keys       : {list(text_preprocessed.keys())}',
    f'Shape      : {text_preprocessed["input_word_ids"].shape}',
    f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}',
    f'Input Mask : {text_preprocessed["input_mask"][0, :12]}',
    f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}',
    sep='\n')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [6]:
# define classifier model

def build_classifier_model():
    """Assemble the Keras text classifier.

    A scalar string input goes through the TF-Hub preprocessing layer
    (``tfhub_handle_preprocess``) and the trainable TF-Hub encoder
    (``tfhub_handle_encoder``). The encoder's ``'pooled_output'`` then passes
    through Dropout(0.1) and a one-unit Dense layer with no activation.

    Returns:
        A ``tf.keras.Model`` from the string input to the Dense output.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Raw text -> encoder input dict.
    encoder_inputs = hub.KerasLayer(
        tfhub_handle_preprocess, name='preprocessing')(text_input)

    # Encoder input dict -> encoder outputs; the encoder weights are trainable.
    encoder_outputs = hub.KerasLayer(
        tfhub_handle_encoder, trainable=True, name='BERT_encoder')(encoder_inputs)

    # Classification head on top of the pooled representation.
    pooled = encoder_outputs['pooled_output']
    regularised = tf.keras.layers.Dropout(0.1)(pooled)
    logit = tf.keras.layers.Dense(1, activation=None, name='classifier')(regularised)

    return tf.keras.Model(text_input, logit)


classifier_model = build_classifier_model()

In [7]:
# train model

epochs = 2
init_lr = 3e-5

# The classifier emits an unactivated value, hence from_logits=True.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

# Step counts are derived from the number of batches in train_ds; one tenth
# of the total number of steps is passed as warm-up.
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw')

classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=epochs)

Training model with https://tfhub.dev/google/experts/bert/pubmed/2
Epoch 1/2
Epoch 2/2


In [None]:
# export the trained model

dataset_name = 'imdb_' + bert_model_name

# Any '/' in the name is replaced with '_' before it is used in the path.
saved_model_path = f"./{dataset_name.replace('/', '_')}_BERT-PubMed-GenderNeutral"

# Save the model without its optimizer state.
classifier_model.save(saved_model_path, include_optimizer=False)