In [None]:
import os
import json
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TFGPT2LMHeadModel
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Load the full Autocast question dump and the competition test set.
# `with` closes each file handle as soon as the JSON has been parsed.
with open('autocast_questions.json') as questions_file:
    autocast_questions = json.load(questions_file)  # from the Autocast dataset
print(len(autocast_questions))
with open('autocast_competition_test_set.json') as test_set_file:
    test_questions = json.load(test_set_file)
# Ids of the held-out competition questions.
test_ids = [q['id'] for q in test_questions]


In [None]:
# Filter the questions: keep only those that have an answer and are not part
# of the competition test set.
held_out_ids = set(test_ids)  # set lookup instead of rescanning the id list per question
filtered_questions = [
    q for q in autocast_questions
    if q['id'] not in held_out_ids and q['answer'] is not None
]
print(len(filtered_questions))
# Print the number of filtered questions that have the 'choices' key.
print(len([q for q in filtered_questions if 'choices' in q]))

# --- Disabled draft: GPT-2 tokenisation and train/test split -----------------
# NOTE(review): `labels` is not defined anywhere in the visible notebook, so the
# draft below cannot run as written; define it before re-enabling.
# questions = [item['question'] for item in filtered_questions]


# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# encoded_questions = tokenizer(questions, padding=True, truncation=True, max_length=128, return_tensors='tf')

# label_encoder = LabelEncoder()
# encoded_labels = label_encoder.fit_transform(labels)

# # Split the data into training and testing sets
# input_ids_train, input_ids_test, attention_mask_train, attention_mask_test, y_train, y_test = train_test_split(
#     encoded_questions['input_ids'], encoded_questions['attention_mask'], encoded_labels, test_size=0.2, random_state=42
# )


In [None]:
# The two JSON files are already loaded by an earlier cell; only the list of
# competition test-set question ids is rebuilt here.
test_ids = [test_question['id'] for test_question in test_questions]

## Create baseline models outputting random answers

In [None]:
def random_baseline_model(question):
    """Return a random forecast for a single question.

    Args:
        question: Mapping with a 'qtype' key ('t/f', 'mc' or 'num'); 'mc'
            questions must also provide a 'choices' sequence.

    Returns:
        - 't/f': numpy array of 2 random probabilities summing to 1.
        - 'mc':  numpy array with one random probability per choice, summing to 1.
        - 'num': a single random float in [0, 1).
        - any other qtype: None.
    """
    qtype = question['qtype']
    if qtype == 't/f':
        # Normalise so the two entries form a probability distribution, exactly
        # as the multiple-choice branch does; raw draws do not sum to 1.
        probs = np.random.random(size=2)
        return probs / probs.sum()
    elif qtype == 'mc':
        probs = np.random.random(size=len(question['choices']))
        return probs / probs.sum()
    elif qtype == 'num':
        return np.random.random()
    return None  # unrecognised question type


def calibrated_random_baseline_model(question):
    """Return a near-uniform forecast with one randomly favoured option.

    For 't/f' (2 options) and 'mc' (one option per entry of
    question['choices']) every option starts at weight 1, a randomly chosen
    option gets an extra 1e-5, and the weights are normalised to sum to 1.
    'num' questions always get 0.5. Any other qtype yields None.
    """
    kind = question['qtype']
    if kind == 'num':
        return 0.5

    if kind == 't/f':
        n_options = 2
    elif kind == 'mc':
        n_options = len(question['choices'])
    else:
        return None

    # Pick the favoured option by drawing one random number per option.
    favoured = np.argmax(np.random.random(size=n_options))
    forecast = np.ones(n_options)
    forecast[favoured] += 1e-5
    return forecast / forecast.sum()

## Get performance on the Autocast train set

Note that the Autocast dataset also contains the questions from the competition test set. Those questions must be excluded when measuring performance on the train set.

In [None]:
def brier_score(probabilities, answer_probabilities):
    """Return half the summed squared difference between forecast and answer.

    Both arguments must support element-wise subtraction and yield an object
    with a ``.sum()`` method (e.g. numpy arrays of the same shape).
    """
    residual = probabilities - answer_probabilities
    squared_error = residual ** 2
    return squared_error.sum() / 2

In [None]:
# Collect baseline forecasts and ground-truth targets for every answered
# Autocast question outside the competition test set. Without this loop the
# three lists stay empty and the evaluation cell averages nothing (nan).
preds = []
answers = []
qtypes = []
held_out_ids = set(test_ids)  # set lookup instead of rescanning the id list per question
for question in autocast_questions:
    if question['id'] in held_out_ids:  # skipping questions in the competition test set
        continue
    if question['answer'] is None:  # skipping questions without answer
        continue

    qtype = question['qtype']
    if qtype == 't/f':
        # One-hot target over (no, yes); length 2 matches the two-way forecast
        # the baseline models return for t/f questions.
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(2)
        ans[ans_idx] = 1
    elif qtype == 'mc':
        # Answers are letters: 'A' -> index 0, 'B' -> index 1, ...
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
    elif qtype == 'num':
        ans = float(question['answer'])
    else:
        # Unknown question type: skip it so the three lists stay aligned.
        continue

    preds.append(calibrated_random_baseline_model(question))
    answers.append(ans)
    qtypes.append(qtype)

## Evaluate the model

In [None]:
# Score every prediction: Brier score for 't/f' and 'mc' questions, absolute
# error for everything else (numeric questions).
tf_results, mc_results, num_results = [], [], []
brier_buckets = {'t/f': tf_results, 'mc': mc_results}
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype in brier_buckets:
        brier_buckets[qtype].append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

# Per-category means, reported as percentages; the combined metric is their sum.
tf_mean = np.mean(tf_results)
mc_mean = np.mean(mc_results)
num_mean = np.mean(num_results)
print(f"T/F: {tf_mean*100:.2f}, MCQ: {mc_mean*100:.2f}, NUM: {num_mean*100:.2f}")
print(f"Combined Metric: {(tf_mean + mc_mean + num_mean)*100:.2f}")

## Make predictions on test set

In [None]:
# One calibrated-random forecast per competition test question, in file order.
preds = [calibrated_random_baseline_model(test_question) for test_question in test_questions]

In [None]:
# Create the output directory if needed; exist_ok avoids the separate
# exists-then-create check.
os.makedirs('submission', exist_ok=True)

# Serialise the predictions with pickle protocol 2.
with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

# Zip the contents of submission/ into submission.zip one level up.
# The `!` command runs in its own subshell, so the trailing `cd ..` does not
# change the notebook's working directory.
!cd submission && zip ../submission.zip ./* && cd ..

In [None]:
# List the working directory (presumably to confirm submission.zip was written).
!ls

In [None]:
import os
import json
import pickle
import numpy as np
from transformers import TFBertForSequenceClassification, BertTokenizerFast, TextClassificationPipeline
import tensorflow as tf

# Tokenizer shared by both classifiers.
# NOTE(review): max_length / pad_to_max_length are supplied at load time only;
# confirm they actually influence the later tokenizer(...) calls, which pass
# neither option themselves.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
    max_length=256,          # max length of the text that can go to BERT
    pad_to_max_length=True,  # pads shorter sequences of text up to the max length
)

# Sequence classifiers loaded from local checkpoint directories:
# 6 output labels for multiple-choice questions, 2 for true/false questions.
mc_model = TFBertForSequenceClassification.from_pretrained(
    "saved_model/mc_model",
    num_labels=6,
)
tf_model = TFBertForSequenceClassification.from_pretrained(
    "saved_model/tf_model",
    num_labels=2,
)


def format_output(output, size):
    """Turn raw classifier scores into `size` normalised weights.

    Steps:
      1. Resize the score list to `size` entries. If it is too short, the last
         score is divided evenly between itself and the missing slots; if it is
         too long, the extra trailing scores are dropped.
      2. Shift the scores so the smallest becomes 10, rounding to 6 decimals.
      3. Divide by the total so the result sums to 1.

    Args:
        output: Sequence of numeric scores (e.g. logits). It is not modified.
        size: Number of entries wanted in the result.

    Returns:
        numpy array of `size` weights summing to 1 (empty array when the
        resized score list is empty).

    Raises:
        IndexError: if `output` is empty and `size` is positive.
    """
    # Work on a copy so the caller's list is never mutated, and so any
    # sequence type (tuple, numpy array, ...) is accepted.
    values = list(output)
    n_given = len(values)

    if size > n_given:
        n_missing = size - n_given
        # Spread the last score over itself plus the missing slots.
        shared = values[-1] / (n_missing + 1)
        values[-1] = shared
        values.extend([shared] * n_missing)
    elif size < n_given:
        values = values[:size]

    if not values:
        return np.array(values, dtype=float)

    # Use the true minimum; a fixed sentinel would be wrong whenever every
    # score exceeds it.
    floor = min(values)
    shifted = np.array([round(v - floor + 10, 6) for v in values])

    return shifted / shifted.sum()


# Load the Autocast questions and the competition test set from data/autocast/.
# `with` closes each file handle as soon as the JSON has been parsed.
# NOTE(review): an earlier cell reads the same two files from the working
# directory instead of data/autocast/ — confirm which location is correct.
with open('data/autocast/autocast_questions.json') as questions_file:
    autocast_questions = json.load(questions_file)  # from the Autocast dataset
with open('data/autocast/autocast_competition_test_set.json') as test_set_file:
    test_questions = json.load(test_set_file)
# Ids of the held-out competition questions.
test_ids = [q['id'] for q in test_questions]


def get_answer(question, max_length=256):
    """Forecast a single question with the BERT classifiers.

    Args:
        question: Mapping with 'qtype' ('t/f', 'mc' or 'num'). 't/f' and 'mc'
            questions need a 'question' text; 'mc' also needs 'choices'.
        max_length: Maximum number of tokens passed to the model; longer
            question texts are truncated.

    Returns:
        - 't/f': array of 2 weights from `tf_model`, via `format_output`.
        - 'mc':  array with one weight per choice from `mc_model`, via
          `format_output`.
        - 'num': the constant 0.5.
        - any other qtype: None.
    """
    qtype = question['qtype']
    if qtype == 'num':
        return 0.5

    if qtype == 't/f':
        model, n_outputs = tf_model, 2
    elif qtype == 'mc':
        model, n_outputs = mc_model, len(question["choices"])
    else:
        return None

    # Truncate at call time: the length limit has to be requested here for it
    # to be applied to over-long question texts.
    encoded_input = tokenizer(
        question["question"],
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )
    # Logits of the single example in the batch, as a plain Python list.
    output = np.array(model(encoded_input)["logits"]).tolist()[0]
    return format_output(output, n_outputs)


# Forecast every competition test question, echoing each prediction.
preds = []

for question in test_questions:
    pred = get_answer(question)
    print(pred)
    preds.append(pred)


# Create the output directory if needed; exist_ok avoids the separate
# exists-then-create check.
os.makedirs('submission', exist_ok=True)

# Serialise the predictions with pickle protocol 2.
with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

In [None]:
# Zip the contents of submission/ into submission.zip one level up.
# The `!` command runs in its own subshell, so the trailing `cd ..` does not
# change the notebook's working directory.
!cd submission && zip ../submission.zip ./* && cd ..