In [24]:
import json
import glob
from dataclasses import dataclass
from bloom_taxonomy_model import categorize_question
from pathlib import Path

In [25]:
@dataclass
class ExamSetting:
    """
    One evaluated exam configuration and the glob pattern locating its result file.

    The result file is not stored directly: `path_pattern` may contain glob
    wildcards, and it is resolved against the file system every time
    `find_file_path`, `data_path` or `exists` is used.
    """
    llm: str        # model identifier, e.g. 'llamav2:13B'
    retrieval: str  # retrieval setting identifier, e.g. 'closed_book'
    icl: int        # presumably the number of in-context examples -- TODO confirm
    name: str       # human-readable label for this setting
    path_pattern: str  # glob pattern (wildcards allowed) matching the result file(s)

    def _matching_files(self):
        """
        Return every path matching `path_pattern`, sorted lexicographically.

        glob.glob() yields matches in arbitrary, platform-dependent order, so
        the result is sorted to make "the first match" reproducible.
        """
        return sorted(glob.glob(self.path_pattern))

    def find_file_path(self):
        """
        Resolve `path_pattern` to a single concrete file path.

        Returns:
            The lexicographically smallest path matching the pattern.

        Raises:
            ValueError: if no file matches the pattern.
        """
        matching_files = self._matching_files()

        # glob.glob() returns an empty list (never None) when nothing matches
        if not matching_files:
            raise ValueError(f"Incorrect path pattern {self.path_pattern}")

        return matching_files[0]

    @property
    def exists(self):
        """
        True if at least one file currently matches `path_pattern`.
        """
        return bool(self._matching_files())

    @property
    def data_path(self):
        """
        Property to get the data path.

        Equivalent to calling `find_file_path()`, so it raises ValueError when
        no file matches the pattern.
        """
        return self.find_file_path()

In [26]:
# One row per evaluated LLM:
# (llm identifier, model directory below the exam folder, size label shown in the name)
_STUDENT_LLMS = (
    ('llamav2:13B', 'llamav2/13b', '13B'),
    ('llamav2:70B', 'llamav2/70b', '70B'),
    ('mistral:7b', 'mistral/7b', '7B'),
)

# One row per retrieval setting:
# (retrieval identifier, exam folder suffix, tag embedded in the file name, display label)
_STUDENT_RETRIEVERS = (
    ('closed_book', 'Exam', 'closed_book', 'Closed Book'),
    ('rag_siamese', 'RagExam', 'rag_siamese', 'Rag Siamese'),
    ('rag_dpr', 'RagExam', 'rag_dpr', 'Rag DPR'),
    ('rag_bm25', 'RagExam', 'rag_bm25', 'Rag BM25'),
    ('rag_multi_qa', 'NewRagExam', 'rag_multi_qa', 'Rag MultiQA'),
    # The 'rag_dprv2' setting is stored under the 'rag_dpr_bm25_multi_qa' file tag
    ('rag_dprv2', 'NewRagExam', 'rag_dpr_bm25_multi_qa', 'Rag DPRV2'),
    ('open_book', 'Exam', 'open_book', 'Open Book'),
)

# Values of the `icl` suffix for which result files are looked up (icl0, icl1, icl2)
_STUDENT_ICL_VALUES = range(3)


def get_all_students(model, task):
    """
    Build the ExamSetting of every (LLM, icl, retrieval) combination for a task.

    Result files are looked up under `<parent of cwd>/Data/<task>/EvalResults`.
    Only the glob patterns are built here; nothing is checked against the file
    system, so some of the returned settings may not have a result file.

    Args:
        model: string inserted into every result file name, right before
            `_results_` (used for all LLM directories, including mistral).
        task: task name, used both as a directory component and as a file
            name prefix (e.g. 'Arxiv').

    Returns:
        A flat list of 63 ExamSetting objects (3 LLMs x 3 icl values x 7
        retrieval settings), ordered by LLM, then icl, then retrieval setting.
    """
    # Double-quoted outer f-string: reusing the same quote character inside the
    # replacement field is a SyntaxError before Python 3.12.
    root_path = f"{Path('.').resolve().parent}/Data/{task}/EvalResults"

    return [
        ExamSetting(
            path_pattern=(f'{root_path}/{task}{exam_folder}/{model_dir}/'
                          f'full_sample_{task}Exam_{file_tag}_{model}_results_*_icl{icl}.jsonl'),
            llm=llm,
            retrieval=retrieval,
            icl=icl,
            name=f'{label}@{icl} [{size_label}]',
        )
        for llm, model_dir, size_label in _STUDENT_LLMS
        for icl in _STUDENT_ICL_VALUES
        for retrieval, exam_folder, file_tag, label in _STUDENT_RETRIEVERS
    ]

In [27]:
# Model names and task names declared for this notebook.
MODELS = [
    'llamav2',
]
TASKS = [
    'StackExchange',
    'Arxiv',
    'SecFilings',
]

def load_data(data_path):
    """
    Load a JSON Lines file into a list of parsed records.

    Args:
        data_path: path of a file holding one JSON document per line.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line),
        # which json.loads() would reject.
        data = [json.loads(line) for line in f if line.strip()]

    return data

In [42]:
# Category names used as keys of each exported JSON file.
TAXONOMY_LEVELS = ['Remembering', 'Understanding', 'Applying', 'Analyzing',
                   'Evaluating', 'Creating', 'Uncategorized']

# For every task, group the exam questions by taxonomy category and dump the
# grouping to `<task>.json` in the current working directory.
for task in ['StackExchange', 'Arxiv', 'SecFilings', 'DevOps']:

    students = get_all_students(model='llamav2', task=task)

    # Only the first student's result file is read, and it is read once:
    # resolving `data_path` globs the file system and load_data() re-parses
    # the whole file, so neither is repeated per category.
    exam_questions = [elem['doc']['question']
                      for elem in load_data(students[0].data_path)]

    # categorize_question() comes from bloom_taxonomy_model; its result is only
    # used through `in`, so it is presumably a collection (or string) of
    # category names -- TODO confirm.
    questions_taxonomy = [categorize_question(question)
                          for question in exam_questions]

    # A question is listed under every category found in its categorization.
    my_task_dict = {
        level: [question
                for question, categories in zip(exam_questions, questions_taxonomy)
                if level in categories]
        for level in TAXONOMY_LEVELS
    }

    with open(f"{task}.json", 'w') as file:
        json.dump(my_task_dict, file)