In [100]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import Model, models, mixed_precision
from tensorflow.keras.utils import plot_model
import tensorflow_text
import tensorflow_hub as hub
import tensorflow_addons as tfa
import glob
from sklearn.utils import shuffle

# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front. Iterating over the device list (rather than indexing element 0) keeps
# this cell runnable on CPU-only machines, where the list is empty, and applies
# the same setting to every visible GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
# Global Keras dtype policy: layers created after this call use mixed_float16.
mixed_precision.set_global_policy('mixed_float16')


# all_files = glob.glob("*MA.csv")
# li = []
# for filename in all_files:
#     df = pd.read_csv(filename, index_col=None, header=0)
#     li.append(df)
#
# df = pd.concat(li, axis=0, ignore_index=True)
# df.dropna(inplace=True)
# df = df[['chapter_name', 'qns']]
# df.to_csv('all_qns.csv', index=False)

# Load the questions together with their comma-separated chapter labels.
df = pd.read_csv('all_qns_multi_label.csv')
# One list of lower-cased chapter names per row.
# NOTE(review): names are not whitespace-stripped after the split, so
# "a, b" yields " b" as a class distinct from "b" - confirm against the data.
labels = [row.split(',') for row in df['chapter_name'].str.lower().tolist()]
# Distinct chapter names in first-seen order (dict keys keep insertion order).
classes = list(dict.fromkeys(name for row in labels for name in row))
# Position of every chapter name inside the multi-hot vector.
classes_index = {name: position for position, name in enumerate(classes)}
def multi_hot_encode(label):
    """Encode a list of class names as one multi-hot vector.

    Args:
        label: iterable of class names; each must be a key of ``classes_index``
            (an unknown name raises ``KeyError``).

    Returns:
        A 1-D tensor of length ``len(classes)`` holding 1 at the position of
        every name in ``label`` and 0 elsewhere.
    """
    positions = [classes_index[name] for name in label]
    # One one-hot row per name; the element-wise max over the rows merges them
    # into a single multi-hot vector.
    one_hot_rows = tf.one_hot(positions, len(classes))
    return tf.reduce_max(one_hot_rows, axis=0)

# Empty frame; the encoded labels and cleaned questions are attached to it later.
labeled_df = pd.DataFrame()
# Replace every list of class names with its multi-hot vector.
labels = list(map(multi_hot_encode, labels))

def clean(qns):
    """Normalise one question string.

    Steps, in order: replace each run of non-ASCII characters with a space,
    replace each run of digits with ' 0 ', collapse repeated underscores into
    one, drop everything from the first note/"(Separate " marker onwards, and
    strip surrounding whitespace.

    Args:
        qns: the raw question text (a ``str``).

    Returns:
        The cleaned question text.
    """
    import re

    text = re.sub(r'[^\x00-\x7F]+', ' ', qns)  # non-ASCII runs -> single space
    text = re.sub(r'\d+', ' 0 ', text)  # every number -> ' 0 '
    text = re.sub(r'_+', '_', text)  # runs of underscores -> one underscore

    # Cut the text at each marker in turn, keeping only what precedes it.
    cut_markers = (
        '(Note to students',
        'Notes to student',
        'Note to students',
        'Note to student',
        'Notes to students',
        '(Separate ',
    )
    for marker in cut_markers:
        text = text.partition(marker)[0]

    return text.strip()


# Normalise every question text with clean().
qns = df['qns'].apply(clean)

# Persist the cleaned questions next to their encoded labels.
labeled_df['label'] = pd.Series(labels)
labeled_df['qns'] = qns
labeled_df.dropna(inplace=True)
labeled_df.to_csv('multi_labeled_df.csv', index=False)

# Shuffle questions and labels in unison. The seed is fixed so that the
# ordering - and therefore the validation subset that fit() later takes from
# the end of the arrays via validation_split - is identical on every run.
SHUFFLE_SEED = 42
X, Y = shuffle(qns, labels, random_state=SHUFFLE_SEED)


# labeled_df['label'] = pd.Series(labels)
# labeled_df['qns'] = df['qns'].apply(clean)
# labeled_df.dropna(inplace=True)
# labeled_df.to_csv('multi_labeled_df.csv', index=False)
# labeled_df = labeled_df.sample(frac=1).reset_index(drop=True)  # shuffle

In [97]:
# weighted BCE
def weighted_binary_crossentropy(pos_weight=1.):
    """Build a weighted binary cross-entropy loss that operates on logits.

    Args:
        pos_weight: value forwarded as ``pos_weight`` to
            ``tf.nn.weighted_cross_entropy_with_logits``.

    Returns:
        A function ``(y_true, y_pred)`` that returns the element-wise weighted
        cross-entropy averaged over the last axis.
    """

    def _weighted_binary_crossentropy(y_true, y_pred):
        # y_pred must be the raw logits, not sigmoid activations.
        per_class_loss = tf.nn.weighted_cross_entropy_with_logits(
            labels=y_true,
            logits=y_pred,
            pos_weight=pos_weight,
        )
        return tf.keras.backend.mean(per_class_loss, axis=-1)

    return _weighted_binary_crossentropy

# Loss on logits with pos_weight=15 applied to the positive targets.
loss = weighted_binary_crossentropy(pos_weight=15.)

opt = tf.keras.optimizers.Adam(learning_rate=1e-3)
batch_size = 128
epochs = 100

# Stop when val_accuracy has not improved for 15 epochs and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=15,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.1 when val_accuracy has plateaued for 12 epochs.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.1,
    patience=12,
    verbose=1,
)
callbacks = [early_stopping, reduce_lr]

In [98]:
# Text classifier: raw strings -> BERT preprocessing -> BERT encoder ->
# two Dense(512) layers with dropout -> one logit per class.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessing_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
encoder_inputs = preprocessing_layer(text_input)
# trainable=False: the encoder weights are not updated during fit().
encoder = hub.KerasLayer('https://tfhub.dev/google/experts/bert/wiki_books/qnli/2', trainable=False)
outputs = encoder(encoder_inputs)
x = outputs['pooled_output']
x = Dropout(0.2)(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.2)(x)
# activation=None: the loss expects raw logits.
# dtype='float32' overrides the global mixed_float16 policy for this layer so
# the logits (and the loss computed from them) are not produced in float16,
# which is numerically fragile for loss computation.
xOut = Dense(len(classes), activation=None, dtype='float32')(x)
model = Model(text_input, xOut)
model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])

In [None]:
# Render the architecture to model.png and print the layer summary.
plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True, to_file='model.png')
model.summary()
# Train on the shuffled data; fit() holds out the last 20% of the arrays for
# validation. use_multiprocessing is omitted: it only applies to generator /
# keras.utils.Sequence inputs, not to in-memory arrays like these.
model.fit(np.array(X), np.asarray(Y), batch_size=batch_size, epochs=epochs, callbacks=callbacks, verbose=1, validation_split=0.2)

In [None]:
model.save('model.h5')
question = 'The height of an isosceles triangle with a base length of 8cm is 3cm. What is the perimeter of a similar triangle with base 4cm'
# The model's output layer has no activation, so it returns raw logits.
# Convert them to per-class probabilities with a sigmoid before thresholding;
# float64 avoids overflow in exp() for strongly negative logits.
logits = model(tf.constant([question])).numpy()[0].astype(np.float64)
sigmoid_out = 1.0 / (1.0 + np.exp(-logits))
# Indices of every class whose probability exceeds the 0.5 decision threshold.
predicted_indices = [idx for idx, prob in enumerate(sigmoid_out) if prob > 0.5]
confidence = [sigmoid_out[idx] for idx in predicted_indices]
class_predicted = [classes[idx] for idx in predicted_indices]
for class_, conf in zip(class_predicted, confidence):
    print(f'Class predicted: {class_} with confidence {conf*100:.3f}%')
# Probability of every class, formatted as a percentage, keyed by class name.
conf_list = {classes[i]: f'{sigmoid_out[i]*100:.3f}%' for i in range(len(classes))}

conf_list