In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import Model, models, mixed_precision
from tensorflow.keras.utils import plot_model
import tensorflow_text  # must import even if it is not used, else will have error
import tensorflow_hub as hub
import glob
from sklearn.utils import shuffle

physical_devices = tf.config.experimental.list_physical_devices('GPU')
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)
mixed_precision.set_global_policy('mixed_float16')


# all_files = glob.glob("*MA.csv")
# li = []
# for filename in all_files:
#     df = pd.read_csv(filename, index_col=None, header=0)
#     li.append(df)
#
# df = pd.concat(li, axis=0, ignore_index=True)
# df.dropna(inplace=True)
# df = df[['chapter_name', 'qns']]
# df.to_csv('all_qns.csv', index=False)

df = pd.read_csv('all_qns_multi_label.csv')
labels = df['chapter_name'].str.lower().tolist()
labels = [[*i.split(',')] for i in labels]
classes = []
for i in labels:
    for j in i:
        if j not in classes:
            classes.append(j)
classes_index = {v: i for i, v in enumerate(classes)}
def multi_hot_encode(label):
    label_encoded = [classes_index[x] for x in label]
    label_encoded = tf.reduce_max(tf.one_hot(label_encoded, len(classes)), axis=0)
    return label_encoded

labeled_df = pd.DataFrame()
labels = [multi_hot_encode(i) for i in labels]

def clean(qns):
    import re
    qns = re.sub(r'[^\x00-\x7F]+', ' ', qns)  # clean unicode stuff
    qns = re.sub(r'\d+', ' 0 ', qns)  # replace all numbers with 0
    qns = re.sub(r'_+', ' _ ', qns)  # replace all underscores with single underscore
    qns = qns.split('(Note to students')[0].split('Notes to student')[0].split('Note to students')[0].split('Note to student')[0].split('Notes to students')[0].split('(Separate ')[0]  # strip hints/notes
    qns = qns.strip()
    return qns


qns = df['qns'].apply(clean)

labeled_df['label'] = pd.Series(labels)
labeled_df['qns'] = qns
labeled_df.dropna(inplace=True)
labeled_df.to_csv('multi_labeled_df.csv', index=False)

# shuffle the dataset
X, Y = shuffle(qns, labels)


# labeled_df['label'] = pd.Series(labels)
# labeled_df['qns'] = df['qns'].apply(clean)
# labeled_df.dropna(inplace=True)
# labeled_df.to_csv('multi_labeled_df.csv', index=False)
# labeled_df = labeled_df.sample(frac=1).reset_index(drop=True)  # shuffle

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3080 Ti, compute capability 8.6


In [12]:
# an accuracy metrics that excludes negative classes for each sample
def positive_accuracy(y_true, y_pred):
    thresh = 0.5
    positive = y_true * y_pred
    total_score = tf.reduce_sum(tf.cast(tf.greater_equal(positive, thresh), tf.float32))
    acc = total_score / tf.reduce_sum(y_true)
    return acc

# weighted BCE
def weighted_binary_crossentropy(pos_weight=1.):

    # y_pred is the raw output of the logits layer
    def _weighted_binary_crossentropy(y_true, y_pred):
        return tf.keras.backend.mean(tf.nn.weighted_cross_entropy_with_logits(labels=y_true, logits=y_pred, pos_weight=pos_weight), axis=-1)
    
    return _weighted_binary_crossentropy

loss = weighted_binary_crossentropy(pos_weight=15.)

opt = tf.keras.optimizers.Adam(learning_rate=1e-3)
batch_size = 128
epochs = 15
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_positive_accuracy', min_delta=0, patience=10, verbose=1,
                                     mode='auto', baseline=None, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_positive_accuracy', factor=0.1, patience=3, verbose=1)
]

In [3]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessing_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
encoder_inputs = preprocessing_layer(text_input)
encoder = hub.KerasLayer('https://tfhub.dev/google/experts/bert/wiki_books/qnli/2', trainable=False)
outputs = encoder(encoder_inputs)
x = outputs['sequence_output']
x = Bidirectional(LSTM(512, return_sequences=True))(x)
x = LayerNormalization()(x)
x = SpatialDropout1D(0.5)(x)
x = Bidirectional(LSTM(256, return_sequences=False))(x)
x = LayerNormalization()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
xOut = Dense(len(classes), activation=None)(x)
model = Model(text_input, xOut)
model.compile(loss=loss, optimizer=opt, metrics=['accuracy', positive_accuracy])

In [13]:
plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True, to_file='lstm_model.png')
model.summary()
model.fit(np.array(X), np.array(Y), batch_size=batch_size, epochs=epochs, callbacks=callbacks, verbose=1, use_multiprocessing=True, validation_split=0.2)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['input_1[0][0]']                
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

<keras.callbacks.History at 0x2bfc76823a0>

In [14]:
model.save('multi_label_lstm.h5')

In [3]:
models.load_model('multi_label_lstm')
question = 'Billy is selling his penis for 5 cents per centimeter. How many centimeters of penis can you buy with 2 dollars?'
question = clean(question)
sigmoid_out = Activation('sigmoid')(model(tf.constant([question])).numpy()[0])
confidence = tf.where(tf.greater_equal(sigmoid_out, 0.5)).numpy()
class_predicted = [classes[i[0]] for i in confidence]
for class_, conf in zip(class_predicted, confidence):
    print(f'Class predicted: {class_} with confidence {sigmoid_out[conf[0]]*100:.3f}%')
conf_list = {}
for i in range(len(classes)):
    conf_list.update({classes[i]: f'{sigmoid_out[i]*100:.3f}%'})

conf_list

ValueError: Unknown layer: KerasLayer. Please ensure this object is passed to the `custom_objects` argument. See https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object for details.