In [21]:
import pandas as pd

# SciQ ships as one parquet file per split; each hf:// URL is handed straight
# to pandas.read_parquet. The splits are read in train / validation / test order.
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df_train, df_val, df_test = (
    pd.read_parquet("hf://datasets/allenai/sciq/" + splits[split_name])
    for split_name in ("train", "validation", "test")
)

In [22]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Target text for each row: the question followed by its correct answer.
for frame in (df_train, df_val):
    frame['target'] = frame['question'] + " " + frame['correct_answer']

# A single vocabulary shared by inputs and targets, fitted on training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['support'].tolist() + df_train['target'].tolist())

# Fixed sequence length. Shorter sequences are zero-padded at the end;
# longer ones are cut by pad_sequences using its default truncation side
# ('pre' per the Keras docs, i.e. the leading tokens are dropped).
max_len = 100


def _encode(text_column):
    """Convert a column of strings into a padded (n_rows, max_len) array of token ids."""
    token_ids = tokenizer.texts_to_sequences(text_column.tolist())
    return pad_sequences(token_ids, maxlen=max_len, padding='post')


# Inputs are the support passages, labels are the question+answer targets.
X_train, y_train = _encode(df_train['support']), _encode(df_train['target'])
X_val, y_val = _encode(df_val['support']), _encode(df_val['target'])

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Fix the Python, NumPy and TensorFlow seeds before any weights are created so
# that weight initialisation and batch shuffling start from a known state on a
# fresh "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# +1 because Tokenizer word indices start at 1; id 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

# Per-position model: every input token position emits a softmax over the
# vocabulary (return_sequences=True keeps one RNN output per timestep).
# The input length is declared with an explicit Input layer; the Embedding
# `input_length` argument is deprecated in Keras 3 and left the model unbuilt
# at summary() time.
# NOTE(review): padding id 0 is not masked, so the reported accuracy most
# likely includes padded positions -- confirm before quoting the metric.
model = Sequential([
    tf.keras.Input(shape=(max_len,)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    SimpleRNN(128, return_sequences=True),
    Dense(vocab_size, activation='softmax'),
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()



In [24]:
# Give the integer labels an explicit trailing axis so they line up with the
# model's per-timestep output: (samples, steps) -> (samples, steps, 1).
# Slicing shape[:2] keeps the cell safe to re-run after the axis already exists.
y_train = y_train.reshape(*y_train.shape[:2], 1)
y_val = y_val.reshape(*y_val.shape[:2], 1)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=6,
)

Epoch 1/6
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 90ms/step - accuracy: 0.8013 - loss: 4.5080 - val_accuracy: 0.8537 - val_loss: 1.3902
Epoch 2/6
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 76ms/step - accuracy: 0.8559 - loss: 1.3652 - val_accuracy: 0.8580 - val_loss: 1.3139
Epoch 3/6
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 75ms/step - accuracy: 0.8618 - loss: 1.2309 - val_accuracy: 0.8619 - val_loss: 1.1239
Epoch 4/6
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 75ms/step - accuracy: 0.8658 - loss: 1.0789 - val_accuracy: 0.8627 - val_loss: 1.1167
Epoch 5/6
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 75ms/step - accuracy: 0.8655 - loss: 1.0681 - val_accuracy: 0.8631 - val_loss: 1.1015
Epoch 6/6
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 77ms/step - accuracy: 0.8663 - loss: 1.0287 - val_accuracy: 0.8634 - val_loss: 1.0599


In [25]:
# model.evaluate returns the compiled loss followed by the compiled metrics
# (here: accuracy), measured on the validation split.
val_metrics = model.evaluate(X_val, y_val)
loss, accuracy = val_metrics
print(f'Validation Loss: {loss}', f'Validation Accuracy: {accuracy}', sep='\n')

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.8644 - loss: 1.0605
Validation Loss: 1.0599321126937866
Validation Accuracy: 0.8633899688720703


In [26]:
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score

def generate_qa(context, model, tokenizer, max_len=100):
    """Decode the model's greedy (argmax) prediction for a single context.

    Args:
        context: Passage text to feed to the model.
        model: Object whose ``predict`` accepts the padded id array and returns
            per-position scores over the vocabulary (argmax is taken over the
            last axis and the first batch entry is used).
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``sequences_to_texts``.
        max_len: Length the input id sequence is padded to.

    Returns:
        The decoded text as one string (question followed by answer). The
        string may be empty or a single word when the model predicts mostly
        padding.
    """
    # Tokenize the input context and pad it to the model's input length
    input_seq = tokenizer.texts_to_sequences([context])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')

    # Predict a distribution over the vocabulary for every position
    predicted_seq = model.predict(input_seq)

    # Greedy decoding: most likely token id at each position of the first
    # (only) batch entry
    predicted_tokens = np.argmax(predicted_seq, axis=-1)[0]

    # Decode the ids back to text. The earlier version split this string on
    # the first space and immediately re-joined the two halves, which produced
    # the same text but raised ValueError whenever fewer than two words were
    # decoded; returning the decoded string directly avoids that crash.
    return tokenizer.sequences_to_texts([predicted_tokens])[0]

def compute_rouge_l(reference, candidate):
    """Return the ROUGE-L F-measure of ``candidate`` against ``reference``.

    A stemming RougeScorer is created for the call and only the 'rougeL'
    metric is computed.
    """
    metric_name = 'rougeL'
    all_scores = rouge_scorer.RougeScorer([metric_name], use_stemmer=True).score(reference, candidate)
    return all_scores[metric_name].fmeasure

def compute_bert_score(reference, candidate):
    """Return the BERTScore F1 of ``candidate`` against ``reference`` as a float.

    The pair is scored as a one-element batch with ``lang="en"``; precision
    and recall are discarded and the F1 values are averaged into a scalar.
    """
    _precision, _recall, f1_values = score([candidate], [reference], lang="en", verbose=False)
    return f1_values.mean().item()

def evaluate_context_matching(contexts, model, tokenizer):
    """Score how closely generated question/answer text matches its context.

    For every context a text is generated with ``generate_qa`` and compared to
    the context itself (context = reference, generated text = candidate) using
    ROUGE-L F-measure and BERTScore F1.

    Args:
        contexts: Iterable of passage strings.
        model: Model forwarded to ``generate_qa``.
        tokenizer: Tokenizer forwarded to ``generate_qa``.

    Returns:
        dict with keys 'rouge_l_scores' and 'bert_scores', each mapping to a
        dict holding the 'mean' and 'std' of the per-context scores. Both
        values are NaN when ``contexts`` is empty.
    """
    contexts = list(contexts)
    generated_texts = [generate_qa(context, model, tokenizer) for context in contexts]

    rouge_l_scores = [
        compute_rouge_l(context, generated)
        for context, generated in zip(contexts, generated_texts)
    ]

    if generated_texts:
        # Score every pair in one batched call instead of one call per
        # context, so the BERTScore model is set up once rather than
        # once per item.
        _, _, f1 = score(generated_texts, contexts, lang="en", verbose=False)
        bert_scores = f1.tolist()
    else:
        bert_scores = []

    def _mean_std(values):
        # np.mean/np.std of an empty list emit a RuntimeWarning and give NaN;
        # return the NaNs directly so empty input stays quiet.
        if not values:
            return {'mean': float('nan'), 'std': float('nan')}
        return {'mean': np.mean(values), 'std': np.std(values)}

    return {
        'rouge_l_scores': _mean_std(rouge_l_scores),
        'bert_scores': _mean_std(bert_scores),
    }

# Smoke test: run the context-matching evaluation on one biology passage and
# print the resulting mean/std summary.
frameshift_context = "A frameshift mutation is a deletion or insertion of one or more nucleotides that changes the reading frame of the base sequence. Deletions remove nucleotides, and insertions add nucleotides. Consider the following sequence of bases in RNA:"
contexts = [frameshift_context]

summary = evaluate_context_matching(contexts, model, tokenizer)
print(summary)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 443ms/step


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'rouge_l_scores': {'mean': 0.22222222222222224, 'std': 0.0}, 'bert_scores': {'mean': 0.7658838629722595, 'std': 0.0}}


In [27]:
# Display the raw text the model decodes for a single frameshift-mutation passage.
context = "A frameshift mutation is a deletion or insertion of one or more nucleotides that changes the reading frame of the base sequence. Deletions remove nucleotides, and insertions add nucleotides. Consider the following sequence of bases in RNA:"

generate_qa(context=context, model=model, tokenizer=tokenizer)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


'what is the the the of of of'

In [29]:
# Persist the trained model to disk; the .h5 suffix selects Keras' HDF5 format.
model.save(filepath='rnn_model.h5')



In [32]:
# Offer the saved model as a browser download when running on Google Colab.
# The google.colab package only exists inside Colab, so the import is guarded
# to keep "Run All" working in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('rnn_model.h5')
else:
    print("google.colab is not available; skipping browser download of rnn_model.h5")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>