In [22]:
import os
import re
import csv
import json
import nltk
import string
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from nltk.metrics.scores import f_measure
from rouge_score import rouge_scorer

In [24]:
def normalize_text(text: str) -> str:
    """Canonicalise an answer string for token-level comparison.

    Applies, in order: lower-casing, removal of every character found in
    ``string.punctuation``, replacement of the standalone words "a", "an" and
    "the" with a space, and collapsing of whitespace runs into single spaces
    (leading and trailing whitespace is dropped).

    Adapted from the [QuAC](http://quac.ai/) evaluation script found at
    https://s3.amazonaws.com/my89public/quac/scorer.py
    """
    # A deletion-only translation table strips exactly the ASCII punctuation set.
    punctuation_free = text.lower().translate(str.maketrans("", "", string.punctuation))
    article_free = re.sub(r"\b(a|an|the)\b", " ", punctuation_free)
    return " ".join(article_free.split())

def f1_score(gold: str, pred: str) -> float:
    """Token-set F-measure between a gold answer and a prediction.

    Both strings are passed through ``normalize_text`` and split on whitespace.
    The resulting token *sets* (duplicate tokens are discarded) are scored with
    ``f_measure``, using ``gold`` as the reference.

    Returns:
        The value produced by ``f_measure``, or 0.0 when it returns ``None``.
    """
    gold_tokens = set(normalize_text(gold).split())
    pred_tokens = set(normalize_text(pred).split())
    score = f_measure(gold_tokens, pred_tokens)
    return 0.0 if score is None else score

In [25]:
# --- Run configuration ---
dataset = "gsm"
folder_path = f"../../data/raw_data/{dataset}"
# Maps a completion string (with or without a leading space) to an option index.
answer_map = {" A": 0, " B": 1, " C": 2, " D": 3, "A": 0, "B": 1, "C": 2, "D": 3}
# Collects the {"s_id", "q_id", "label"} records appended by the scoring loop.
final = []
file_list = os.listdir(folder_path)
json_files = [file for file in file_list if file.endswith(".json")]
# os.listdir returns entries in arbitrary order, and sorting on length alone
# keeps that order among equally long names. Break ties on the name itself so
# the selected files (and therefore the s_id each one receives) are the same
# on every machine and every run.
json_files = sorted(json_files, key=lambda x: (len(x), x))
scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
# Only the first five result files (shortest names first) are processed.
json_files = json_files[:5]


In [26]:
def extract_number_from_text(text):
    """Return the integer that follows the phrase "The answer is" in ``text``.

    An optional leading minus sign and comma-grouped thousands are accepted
    ("The answer is 1,250" -> 1250, "The answer is -5" -> -5). The previous
    pattern matched a bare run of digits only, so a comma-grouped value was cut
    off at the first comma and a negative value was not recognised at all.

    Args:
        text: completion or reference string to search.

    Returns:
        The parsed ``int``, or ``None`` when the phrase (followed by a number)
        is not present.
    """
    # First alternative: well-formed thousands grouping (1,234,567); the
    # lookahead rejects malformed groups so they fall back to the plain run.
    match = re.search(r'The answer is (-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+))', text)
    if match is None:
        return None
    return int(match.group(1).replace(",", ""))
# Score every request of each selected result file and append one
# {"s_id", "q_id", "label"} record per question to `final`.
for file_id, file_name in enumerate(json_files):
    print(file_id)
    file_path = os.path.join(folder_path, file_name)
    # Only the JSON parse needs the handle; close it before scoring starts.
    with open(file_path, "r") as file:
        data = json.load(file)
    method = data["adapter_spec"]["method"]
    if method == 'generation':
        # Correct when the number after "The answer is" equals that of any reference.
        for qid, rs in enumerate(data["request_states"]):
            candidate = rs["result"]["completions"][0]["text"]
            candidate_number = extract_number_from_text(candidate)
            label = 0
            for ref in rs["instance"]["references"]:
                reference = ref["output"]["text"]
                reference_number = extract_number_from_text(reference)
                if candidate_number is not None and reference_number is not None:
                    if candidate_number == reference_number:
                        label = 1
                        break
            insert = {"s_id": file_id, "q_id": qid, "label": label}
            final.append(insert)
    elif method == 'ranking_binary':
        # Request states are read in strides of 60
        # (presumably 60 request states per question; TODO confirm).
        for k in range(0, len(data["request_states"]), 60):
            RR_score = 0
            rank = data["request_states"][k]["instance"]["references"]
            # Reciprocal rank over at most the first ten references. Ranks are
            # 1-based: the previous 1/t raised ZeroDivisionError for a hit at
            # index 0 and overstated every other rank by one position.
            # NOTE(review): a reference with more than two tags is treated as
            # the hit; confirm this tag-count convention against the data.
            for t in range(min(10, len(rank))):
                if len(rank[t]["tags"]) > 2:
                    RR_score = 1 / (t + 1)
                    break
            # Integer division keeps q_id an int, as in the other branches.
            insert = {"s_id": file_id, "q_id": k // 60, "label": RR_score}
            final.append(insert)
    else:
        # Multiple choice: compare the index of the first reference that has
        # any tag with the option index of the completion text.
        for qid, rs in enumerate(data["request_states"]):
            Answer = 0
            for ans in rs["instance"]["references"]:
                if len(ans["tags"]) > 0:
                    break
                Answer = Answer + 1
            result = rs["result"]["completions"][0]["text"]
            # Completions outside answer_map get the sentinel 999, which is not
            # expected to equal any reference index, so they score 0.
            res = answer_map.get(result, 999)
            label = 1 if res == Answer else 0
            insert = {"s_id": file_id, "q_id": qid, "label": label}
            final.append(insert)


def json_to_csv(data, csv_file):
    """Write scored records to ``csv_file`` as model_id/question_id/correct rows.

    Args:
        data: iterable of dicts with the keys "s_id", "q_id" and "label".
        csv_file: destination path; an existing file is overwritten.

    A confirmation message is printed after the file has been written.
    """
    header = ["model_id", "question_id", "correct"]
    # Lazy generator: rows are produced while the file is open, one per record.
    rows = ([record["s_id"], record["q_id"], record["label"]] for record in data)
    with open(csv_file, "w", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(rows)

    print(f"CSV file '{csv_file}' created successfully.")

# Persist the scored records; "triples.csv" is read back with pandas further down.
json_to_csv(final, "triples.csv")

0
1
2
3
4
CSV file 'triples.csv' created successfully.


In [27]:
def stat_unique(data: pd.DataFrame, key):
    """Print a size/uniqueness statistic of ``data`` and return the count.

    Args:
        data: frame to inspect.
        key: ``None`` to report the number of rows, a column name to count its
            distinct values, or a list of column names to count the distinct
            row combinations over those columns.

    Returns:
        The distinct count for a ``str`` or ``list`` key. For ``None`` (and for
        any other key type) nothing is printed beyond the row count / nothing
        at all, and ``None`` is returned.
    """
    if key is None:
        print("Total length: {}".format(len(data)))
        return None
    if isinstance(key, str):
        n_unique = len(data[key].unique())
        print("Number of unique {}: {}".format(key, n_unique))
        return n_unique
    if isinstance(key, list):
        label = ",".join(key)
        n_unique = len(data.drop_duplicates(key, keep="first"))
        print("Number of unique [{}]: {}".format(label, n_unique))
        return n_unique
    return None

In [28]:
# Reload the scored triples written by json_to_csv.
# NOTE(review): that file's header is model_id,question_id,correct, so the
# 'skill_id' dtype entry names a column that is not in the file; it looks like
# a leftover and can probably be dropped.
all_data = pd.read_csv("triples.csv", encoding = 'utf-8', dtype={'skill_id': str})
all_data.head()

Unnamed: 0,model_id,question_id,correct
0,0,0,0
1,0,1,0
2,0,2,0
3,0,3,1
4,0,4,0


In [29]:
# Sanity-check the loaded triples: row count, distinct (model, question) pairs,
# distinct models and distinct questions. The last call's return value is the
# cell's displayed result.
stat_unique(all_data, None)
stat_unique(all_data, ['model_id', 'question_id'])
stat_unique(all_data, 'model_id')
stat_unique(all_data, 'question_id')

Total length: 7000
Number of unique [model_id,question_id]: 7000
Number of unique model_id: 5
Number of unique question_id: 3000


3000

## Filter data

In [30]:
# Work on an independent copy: later cells assign into selected_data's columns,
# and a plain alias would let those writes modify all_data as well.
selected_data = all_data.copy()

In [14]:
# filter questions
# Drops every question that has fewer than 50 rows in selected_data.
# NOTE(review): only five result files are loaded above, so no question can
# reach 50 rows and this cell removes every question (see the recorded output
# "filter 3000 questions"). Its execution count (14) is also out of sequence
# with the neighbouring cells. Confirm whether this cell is meant to run and
# with which threshold; as written, a Restart & Run All leaves no data for the
# later steps.
n_models = selected_data.groupby('question_id')['model_id'].count()
question_filter = n_models[n_models < 50].index.tolist()
print(f'filter {len(question_filter)} questions')
selected_data = selected_data[~selected_data['question_id'].isin(question_filter)]

filter 3000 questions


In [31]:
# filter models
# Count the rows each model has, then drop every model with fewer than 10 rows.
n_questions = selected_data.groupby('model_id')['question_id'].count()
model_filter = n_questions[n_questions < 10].index.tolist()
print(f'filter {len(model_filter)} models')
# Boolean indexing rebinds selected_data to a new, filtered frame.
selected_data = selected_data[~selected_data['model_id'].isin(model_filter)]

filter 0 models


In [32]:
# renumber the models
# Map each original model_id to a dense 0-based id in order of first
# appearance. pd.unique preserves that order, so the mapping equals the one the
# previous row-by-row iterrows() scan produced, without the per-row overhead.
s2n = {old_id: new_id for new_id, old_id in enumerate(pd.unique(selected_data['model_id']))}
cnt = len(s2n)
# Build a new frame instead of assigning through .loc: selected_data may be a
# filtered slice (SettingWithCopyWarning) or share data with another frame.
selected_data = selected_data.assign(model_id=selected_data['model_id'].map(s2n))

In [33]:
# renumber the questions
# Map each original question_id to a dense 0-based id in order of first
# appearance. pd.unique preserves that order, so the mapping equals the one the
# previous row-by-row iterrows() scan produced, without the per-row overhead.
q2n = {old_id: new_id for new_id, old_id in enumerate(pd.unique(selected_data['question_id']))}
cnt = len(q2n)
# Build a new frame instead of assigning through .loc: selected_data may be a
# filtered slice (SettingWithCopyWarning) or share data with another frame.
selected_data = selected_data.assign(question_id=selected_data['question_id'].map(q2n))

In [34]:
# NOTE(review): this overwrites the same "triples.csv" that the load cell reads,
# so re-running the notebook from that cell starts from the filtered and
# renumbered data rather than the raw scores.
selected_data.to_csv("triples.csv", index=False)

## Save data

In [35]:
# Identity concept map: for every integer i in [0, cm), key str(i) maps to [i].
cm = 20000
al = {str(idx): [idx] for idx in range(cm)}
json_file = "./concept_map.json"
with open(json_file, "w") as file:
    json.dump(al, file)

In [36]:
def parse_data(data):
    """Index response records by model and by question.

    Args:
        data: DataFrame with ``model_id``, ``question_id`` and ``correct`` columns.

    Returns:
        A pair ``(model_data, ques_data)`` of nested defaultdicts in which
        ``model_data[model_id][question_id]`` and
        ``ques_data[question_id][model_id]`` both hold the row's ``correct``
        value. A later row for the same (model, question) pair overwrites an
        earlier one.
    """
    by_model = defaultdict(lambda: defaultdict(dict))
    by_question = defaultdict(lambda: defaultdict(dict))
    for _, record in data.iterrows():
        model, question, outcome = record.model_id, record.question_id, record.correct
        by_model[model][question] = outcome
        by_question[question][model] = outcome
    return by_model, by_question


# Flatten the frame into a list of [model_id, question_id, correct] records.
# Note that `data` was previously bound to the parsed JSON of a result file in
# the scoring loop; from here on it is this list.
data = []
for i, row in selected_data.iterrows():
    data.append([row.model_id, row.question_id, row.correct])

# Nested lookups: stu_data[model_id][question_id] and ques_data[question_id][model_id].
stu_data, ques_data = parse_data(selected_data)

In [37]:
# Fraction of models held out for testing (converted to a count below).
test_size = 0.2
# A model needs at least this many records to be eligible for the test split.
least_test_length = 5
# Fixed seed so the train/test partition is identical on every run; the
# previous unseeded random.shuffle produced a different split each time.
split_seed = 0

stat_unique(selected_data, None)
a = stat_unique(selected_data, ["model_id", "question_id"])
b = stat_unique(selected_data, "model_id")
c = stat_unique(selected_data, "question_id")


n_students = len(stu_data)
if isinstance(test_size, float):
    test_size = int(n_students * test_size)
train_size = n_students - test_size
assert train_size > 0 and test_size > 0

# Relies on model ids being the dense range 0..n_students-1 produced by the
# renumbering step above.
students = list(range(n_students))
# A private Random instance keeps the global random state untouched.
random.Random(split_seed).shuffle(students)
if least_test_length is not None:
    # Count records per model and keep only models with enough of them.
    student_lens = defaultdict(int)
    for t in data:
        student_lens[t[0]] += 1
    students = [
        student for student in students if student_lens[student] >= least_test_length
    ]
# The first test_size models of the shuffled (and filtered) order form the test set.
test_students = set(students[:test_size])
print(n_students, len(test_students))
train_data = [record for record in data if record[0] not in test_students]
test_data = [record for record in data if record[0] in test_students]

Total length: 7000
Number of unique [model_id,question_id]: 7000
Number of unique model_id: 5
Number of unique question_id: 3000
5 1


In [38]:
def renumber_student_id(data):
    """Re-index the first field of each triplet to consecutive 0-based ids.

    New ids follow the ascending order of the original ids, so the smallest
    original id becomes 0, the next one 1, and so on.

    Args:
        data: list of triplets (sid, qid, score)

    Returns:
        renumbered datasets: list of tuples (sid, qid, score) in input order
    """
    dense_id = {}
    for original in sorted({record[0] for record in data}):
        dense_id[original] = len(dense_id)
    return [(dense_id[record[0]], record[1], record[2]) for record in data]


# Each split is renumbered on its own, so model ids start at 0 in every split
# and the same id in train and test refers to different models.
train_data = renumber_student_id(train_data)
test_data = renumber_student_id(test_data)
# NOTE(review): all_data was the DataFrame loaded from triples.csv; from here on
# it is a list of tuples. Re-running earlier cells that expect the DataFrame
# requires reloading it first.
all_data = renumber_student_id(data)

print(f"train records length: {len(train_data)}")
print(f"test records length: {len(test_data)}")
print(f"all records length: {len(all_data)}")

train records length: 6000
test records length: 1000
all records length: 7000


In [39]:
def save_to_csv(data, path):
    """Write triplets to ``path`` as a sorted CSV without an index column.

    Args:
        data: list of triplets (sid, qid, scoreRate)
        path: str representing saving path

    Rows are sorted before writing, and the ``model_id`` and ``question_id``
    columns are cast to ``int``.
    """
    columns = ["model_id", "question_id", "correct"]
    frame = pd.DataFrame.from_records(sorted(data), columns=columns)
    frame = frame.astype({"model_id": int, "question_id": int})
    frame.to_csv(path, index=False)

# Write the train split, the test split and the full record list.
save_to_csv(
    train_data, "train_triples.csv"
)
save_to_csv(
    test_data, "test_triples.csv"
)
# This replaces the earlier "triples.csv" with the sorted, renumbered records.
save_to_csv(all_data, "triples.csv")

In [40]:
# Summary of the exported dataset, written next to the CSV files.
metadata = {
    "num_models": n_students,
    # c is the distinct question count returned by stat_unique in the split cell.
    "num_questions": c,
    # Reuse cm, the size of concept_map.json, instead of repeating the literal
    # 20000, so the two cannot drift apart.
    "num_concepts": cm,
    "num_records": len(all_data),
    "num_train_students": n_students - len(test_students),
    "num_test_students": len(test_students),
}

with open("metadata.json", "w") as f:
    json.dump(metadata, f)