## Prepare data and libraries

In [1]:
import numpy as np
import pandas as pd
import keras
import keras_nlp
from sklearn.model_selection import train_test_split

from keras import Input, Model
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Dense, Dropout, LeakyReLU, UnitNormalization, Reshape
from keras.optimizers import RMSprop
from keras.losses import CategoricalCrossentropy
from keras.metrics import F1Score

2024-05-31 00:15:45.193641: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-31 00:15:45.195868: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-31 00:15:45.314661: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-31 00:15:45.759914: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en")
backbone = keras_nlp.models.RobertaBackbone.from_preset("roberta_base_en")
backbone.trainable = False

inputs = Input(shape=(1,), dtype="string", name="sentence")
preprocess = preprocessor(inputs)
embed = backbone(preprocess)
encoder = Model(inputs=inputs, outputs=embed)

Downloading from https://www.kaggle.com/api/v1/models/keras/roberta/keras/roberta_base_en/2/download/preprocessor.json...


In [3]:
cnn = keras.saving.load_model('roberta_cnn.keras')

In [4]:
embed_cnn = cnn(embed)

In [5]:
full_model = Model(inputs=inputs, outputs=embed_cnn)

# Generate Bloom's Taxonomy levels

## STA interviews

In [6]:
levels = ["Knowledge", "Comprehension", "Application", "Analysis", "Synthesis", "Evaluation"]
np.sort(levels)

array(['Analysis', 'Application', 'Comprehension', 'Evaluation',
       'Knowledge', 'Synthesis'], dtype='<U13')

In [7]:
def interview_bloom(number):
    dataset = pd.read_csv(f'e4/int{number}_new.csv', sep=',')
    predictions = full_model.predict(dataset.Question)
    bloom_indices = np.argmax(predictions, axis=1)
    dataset['BT_Level'] = np.sort(levels)[bloom_indices]
    dataset.to_csv(f'e4/int{number}_bloom.csv', sep=',')

In [8]:
for i in range(8):
    interview_bloom(i+1)

2024-05-31 00:16:13.172830: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:933] Skipping loop optimization for Merge node with control input: functional_3_1/roberta_preprocessor_1/roberta_tokenizer_1/RaggedFromUniformRowLength/RowPartitionFromUniformRowLength/assert_greater_equal/Assert/AssertGuard/branch_executed/_107
2024-05-31 00:16:14.158467: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT64 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


## R. Tatman's Question-Answer Dataset

In [9]:
tatman = pd.read_csv('data/tatman/merged.tsv', sep='\t')
tatman

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,was abraham lincoln the sixteenth president of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,was abraham lincoln the sixteenth president of...,yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,did lincoln sign the national banking act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,did lincoln sign the national banking act of 1...,yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,did his mother die of pneumonia ?,no,easy,medium,S08_set3_a4
...,...,...,...,...,...,...
3353,Zebra,what areas do the grevy 's zebras inhabit ?,semi-arid grasslands of ethiopia and northern ...,hard,hard,S10_set1_a9
3354,Zebra,which species of zebra is known as the common ...,"plains zebra ( equus quagga , formerly equus b...",hard,medium,S10_set1_a9
3355,Zebra,which species of zebra is known as the common ...,plains zebra,hard,medium,S10_set1_a9
3356,Zebra,at what age can a zebra breed ?,five or six,hard,medium,S10_set1_a9


In [10]:
predictions_tatman = full_model.predict(tatman.Question)

2024-05-05 00:04:42.223781: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:933] Skipping loop optimization for Merge node with control input: functional_3_1/roberta_preprocessor_1/roberta_tokenizer_1/RaggedFromUniformRowLength/RowPartitionFromUniformRowLength/assert_greater_equal/Assert/AssertGuard/branch_executed/_107
2024-05-05 00:04:43.269910: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT64 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1481s[0m 14s/step


In [None]:
tatman_indices = np.argmax(predictions_tatman, axis=1)
tatman["BT_Level"] = np.sort(levels.unique())[tatman_indices]
tatman.to_csv('data/tatman/tatman_bt.csv', sep=';')

## Question-Answer Jokes

In [46]:
jokes = pd.read_csv('data/jokes/jokes_clean.csv', sep=';')
jokes

Unnamed: 0,Question,Answer
0,did you hear about the native american man tha...,he nearly drown in his own tea pee.
1,what 's the best anti diarrheal prescription ?,mycheexarphlexin
2,what do you call a person who is outside a doo...,matt
3,which star trek character is a member of the m...,jean-luc pickacard
4,what 's the difference between a bullet and a ...,a bullet doesn 't miss harambe
...,...,...
38261,why did the pacifist /b /tard try to calm ever...,he did it for the
38262,why can 't obama poke fun at himself ?,because that would be racist.
38263,why is gambling not allowed in africa ?,because there are too many cheetahs.
38264,what do you call three witches in a hot tub ?,a self-cleaning coven.


In [47]:
jokes_sub = jokes.loc[:4000]

In [48]:
predictions_jokes = full_model.predict(jokes_sub.Question)
jokes_indices = np.argmax(predictions_jokes, axis=1)
jokes_sub["BT_Level"] = np.sort(levels.unique())[jokes_indices]
jokes_sub.to_csv('data/jokes/jokes4k.csv', sep=';')

[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1763s[0m 14s/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jokes_sub["BT_Level"] = np.sort(levels.unique())[jokes_indices]


## Stanford Question Answering Dataset

In [50]:
squad = pd.read_csv('data/squad/squad.csv', sep=';')
squad

Unnamed: 0,Question,Answer
0,to whom did the virgin mary allegedly appear i...,"it is a replica of the grotto at lourdes , fra..."
1,what is in front of the notre dame main buildi...,immediately in front of the main building and ...
2,the basilica of the sacred heart at notre dame...,next to the main building is the basilica of t...
3,what is the grotto at notre dame ?,immediately behind the basilica is the grotto ...
4,what sits on top of the main building at notre...,atop the main building 's gold dome is a golde...
...,...,...
104559,what paved the way for the augsburg confession ?,"despite the disagreements on the eucharist , t..."
104560,how many fraternities are apart of the univers...,there are fifteen fraternities and seven soror...
104561,in what year was the trial of rev. jimmy creech ?,rev. jimmy creech was defrocked after a highly...
104562,what town was actually granted to the huguenot...,"when they arrived , colonial authorities offer..."


In [51]:
squad_sub = squad.loc[:8000]

In [52]:
predictions_squad = full_model.predict(squad_sub.Question)
squad_indices = np.argmax(predictions_squad, axis=1)
squad_sub["BT_Level"] = np.sort(levels.unique())[squad_indices]
squad_sub.to_csv('data/squad/squad8k.csv', sep=';')

[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3445s[0m 14s/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  squad_sub["BT_Level"] = np.sort(levels.unique())[squad_indices]
