In [3]:
import glob
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub
import tensorflow_text  # imported for its side effect: registers ops used by the TF Hub BERT preprocessor
from tensorflow.keras import Model, models, mixed_precision
from tensorflow.keras.layers import *
from tensorflow.keras.utils import plot_model

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up
# front. Iterating over the device list keeps this cell working on CPU-only
# machines (empty list) and applies the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    # set_memory_growth returns None, so there is nothing worth capturing.
    tf.config.experimental.set_memory_growth(gpu, True)

# Global Keras dtype policy: layers created after this call use mixed_float16.
mixed_precision.set_global_policy('mixed_float16')


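# One-off preprocessing, kept for provenance only (not executed): concatenates
# every "*MA.csv" export and writes the result to 'all_qns.csv'.
# NOTE(review): the notebook below reads 'all_qns_merged_classes.csv', not
# 'all_qns.csv' -- presumably a separate class-merging step sits in between; confirm.
#
# all_files = glob.glob("*MA.csv")
# li = []
# for filename in all_files:
#     df = pd.read_csv(filename, index_col=None, header=0)
#     li.append(df)
#
# df = pd.concat(li, axis=0, ignore_index=True)
# df.dropna(inplace=True)
# df = df[['chapter_name', 'qns']]
# df.to_csv('all_qns.csv', index=False)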

# Load the merged question set. The rest of the notebook reads its
# 'chapter_name' (class name) and 'qns' (question text) columns.
df = pd.read_csv('all_qns_merged_classes.csv')

# Class names in order of first appearance; a name's position is its integer label.
classes = df['chapter_name'].unique()
classes_index = {name: idx for idx, name in enumerate(classes)}

# Rename first so the column keeps its position in the frame, then encode with
# Series.map: a direct lookup that yields an integer column, unlike
# DataFrame.replace with a nested dict (slow, and dependent on implicit
# object -> int downcasting). No in-place mutation, so re-running is safe.
labeled_df = df.rename(columns={'chapter_name': 'label'})
labeled_df['label'] = labeled_df['label'].map(classes_index)


# Phrases that introduce trailing hints/notes; everything from the first
# occurrence onwards is discarded. 'Note to student' and 'Notes to student'
# also match their plural forms, so the plurals need no entries of their own.
# The parenthesised variant is listed first so its '(' is removed as well.
_NOTE_MARKERS = ('(Note to students', 'Notes to student', 'Note to student', '(Separate ')


def clean(qns):
    """Normalise one question string for the classifier.

    Steps, in order:
      1. replace each run of non-ASCII characters with a single space;
      2. replace each run of digits with ' 0 ';
      3. replace each run of underscores with ' _ ';
      4. cut the text at the first hint/note marker (see ``_NOTE_MARKERS``);
      5. strip leading and trailing whitespace.

    Args:
        qns: The raw question text. Non-string values (e.g. ``NaN`` or
            ``None`` from missing CSV cells) are returned unchanged so the
            caller can drop them afterwards instead of hitting a TypeError.

    Returns:
        The cleaned string, or ``qns`` itself when it is not a string.
    """
    if not isinstance(qns, str):
        return qns

    qns = re.sub(r'[^\x00-\x7F]+', ' ', qns)  # clean unicode stuff
    qns = re.sub(r'\d+', ' 0 ', qns)  # replace all numbers with 0
    qns = re.sub(r'_+', ' _ ', qns)  # replace all underscores with single underscore

    # Strip hints/notes: keep only what precedes each marker, applied in order.
    for marker in _NOTE_MARKERS:
        qns = qns.split(marker, 1)[0]

    return qns.strip()


SEED = 42  # fixes the shuffle so the train/validation split is reproducible

# Drop incomplete rows *before* cleaning: clean() runs regexes on its input, so
# a missing question must not reach it. Built as a new frame, not mutated in place.
labeled_df = (
    labeled_df
    .dropna()
    .assign(qns=lambda frame: frame['qns'].apply(clean))
)
# Cleaning can reduce a question to an empty string (e.g. when it consisted only
# of a note); such rows carry no signal, so they are removed too.
labeled_df = labeled_df[labeled_df['qns'] != '']

labeled_df.to_csv('single_labeled_df.csv', index=False)

# Shuffle once with a fixed seed.
labeled_df = labeled_df.sample(frac=1, random_state=SEED).reset_index(drop=True)  # shuffle

# Pin the one-hot width to the number of classes. Inferring it from the largest
# label present would produce a narrower matrix than the model's output layer
# if every row of the highest-numbered class had been dropped above.
labels_onehot = tf.keras.utils.to_categorical(labeled_df['label'].values, num_classes=len(classes))

In [2]:
# Training hyper-parameters.
batch_size = 128
epochs = 100

# from_logits=True: the loss is told its inputs are unnormalised scores.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1)
opt = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Stop after 15 epochs without a val_accuracy improvement and roll back to the
# best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=15,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

# Cut the learning rate by 10x after 12 epochs without a val_accuracy
# improvement, i.e. shortly before early stopping would trigger.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.1,
    patience=12,
    verbose=1,
)

callbacks = [early_stopping, reduce_lr]

In [3]:
# Raw strings in, class logits out: TF Hub preprocessing -> frozen BERT encoder
# -> small dense classification head.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessing_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
encoder_inputs = preprocessing_layer(text_input)

# trainable=False: the encoder weights are not updated; only the head below learns.
encoder = hub.KerasLayer('https://tfhub.dev/google/experts/bert/wiki_books/qnli/2', trainable=False)
outputs = encoder(encoder_inputs)

# Classification head on the pooled sentence representation.
x = outputs['pooled_output']
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(512, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(256, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)

# One logit per class, no activation (the loss is built with from_logits=True).
# dtype='float32' overrides the global mixed_float16 policy for this layer so
# the logits -- and the loss/softmax computed from them -- are not in float16.
xOut = tf.keras.layers.Dense(len(classes), dtype='float32')(x)

model = Model(text_input, xOut)
model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])

In [4]:
# Write the architecture diagram to model.png and print the layer table.
plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True, to_file='model.png')
model.summary()

# validation_split takes the *last* 20% of the arrays as validation data, so
# this relies on labeled_df having been shuffled beforehand.
# use_multiprocessing is omitted: it only applies to generator / Sequence input
# and has no effect on in-memory arrays.
# The History object is kept so the training curves can be inspected later
# instead of being echoed as the cell output.
history = model.fit(
    np.array(labeled_df['qns']),
    labels_onehot,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    verbose=1,
    validation_split=0.2,
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['input_1[0][0]']                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

<keras.callbacks.History at 0x210ac94c340>

In [5]:
# Persist the trained model in HDF5 format.
# NOTE(review): reloading an .h5 file that contains hub.KerasLayer presumably
# needs custom_objects={'KerasLayer': hub.KerasLayer} -- confirm before relying on it.
model.save('model.h5')

# Smoke-test the classifier on one hand-written question.
# NOTE(review): the sample text is kept verbatim so the recorded output below
# stays valid; replace it with a neutral example before sharing this notebook.
question = 'Billy is selling his penis for 5 cents per centimeter. How many centimeters of penis can you buy with 2 dollars'
question = clean(question)

# Run in inference mode (dropout disabled) and turn logits into probabilities.
# The cast keeps the softmax in float32 even if the model emits float16 logits.
logits = model(tf.constant([question]), training=False)
softmax_out = tf.nn.softmax(tf.cast(logits, tf.float32), axis=-1).numpy()

# softmax_out has one row per input question; only one question was passed.
probabilities = softmax_out[0]
best_index = int(np.argmax(probabilities))
class_predicted = classes[best_index]
confidence = probabilities[best_index]
print(f'Class predicted: {class_predicted} with confidence {confidence*100:.3f}%')

# Per-class confidence, formatted as percentages, in class-index order.
conf_list = {name: f'{prob*100:.3f}%' for name, prob in zip(classes, probabilities)}

conf_list

Class predicted: Division with confidence 39.844%


{'Counting and number patterns': '1.356%',
 'Place values': '1.532%',
 'Addition': '8.942%',
 'Subtraction': '4.453%',
 'Comparing': '2.231%',
 'Estimation': '0.802%',
 'Shapes and Geometry': '0.576%',
 'Spatial sense': '0.371%',
 'Data graphs': '2.036%',
 'Measurement': '0.597%',
 'Money': '1.897%',
 'Patterns': '0.925%',
 'Probability and statistics': '3.000%',
 'Sorting, ordering and classifying': '1.093%',
 'Time': '1.410%',
 'Mixed operations': '3.772%',
 'Multiplication and division': '1.241%',
 'Comparing and ordering': '3.372%',
 'Names of numbers': '1.173%',
 'Estimation and rounding': '0.499%',
 'Logical reasoning': '0.257%',
 'Fractions': '0.190%',
 'Geometry': '0.230%',
 'Exercise 1- Learning to interprete': '0.632%',
 'Whole numbers and comparing': '1.218%',
 'Multiplication': '1.321%',
 'Division': '39.844%',
 'Data and graphs': '1.042%',
 'Whole Numbers': '1.524%',
 'Decimals': '1.111%',
 'Challenge': '0.515%',
 'Whole Numbers & Fractions': '0.536%',
 'Average': '0.366%'