In [None]:
import json
import os
from pathlib import Path
from typing import Dict, List

import pandas as pd
import plotly.express as px
import glob
from dataclasses import dataclass

In [None]:
#from item_response_models import ExamSetting

@dataclass
class ExamSetting:
    """One evaluated exam configuration and the glob pattern locating its results file.

    The pattern is resolved against the file system every time `exists`,
    `data_path` or `find_file_path()` is used; nothing is cached.
    """

    llm: str  # model identifier, e.g. 'llamav2:13B'
    retrieval: str  # retrieval mode tag, e.g. 'closed_book'
    icl: int  # presumably the number of in-context examples -- TODO confirm
    name: str  # display label, e.g. 'ClosedBook@0 [13B]'
    path_pattern: str  # glob pattern expected to match the results file

    def _matching_files(self):
        """Return every path matching `path_pattern`, sorted for determinism."""
        # glob.glob gives no ordering guarantee; sorting makes the choice of
        # "first match" reproducible across runs and file systems.
        return sorted(glob.glob(self.path_pattern))

    def find_file_path(self):
        """
        Resolve `path_pattern` to a single file path.

        Returns:
            The lexicographically smallest matching path.

        Raises:
            ValueError: if no file matches the pattern.
        """
        matching_files = self._matching_files()

        if not matching_files:
            raise ValueError(f"Incorrect path pattern {self.path_pattern}")

        return matching_files[0]

    @property
    def exists(self):
        """True when at least one file matches `path_pattern`."""
        return bool(self._matching_files())

    @property
    def data_path(self):
        """
        Path of the results file; raises ValueError when nothing matches.
        """
        return self.find_file_path()

In [None]:
# One row per retrieval setting, in the order the settings appear within each
# (LLM, icl) group:
# (exam folder suffix, tag embedded in the results file name, retrieval label, display name)
_RETRIEVAL_SETTINGS = (
    ('Exam', 'closed_book', 'closed_book', 'ClosedBook'),
    ('RagExam', 'rag_siamese', 'rag_siamese', 'Rag Siamese'),
    ('RagExam', 'rag_dpr', 'rag_dpr', 'Rag DPR'),
    ('RagExam', 'rag_bm25', 'rag_bm25', 'Rag BM25'),
    ('NewRagExam', 'rag_multi_qa', 'rag_multi_qa', 'Rag MultiQA'),
    # File tag and retrieval label intentionally differ for this setting.
    ('NewRagExam', 'rag_dpr_bm25_multi_qa', 'rag_dprv2', 'Rag DPRV2'),
    ('Exam', 'open_book', 'open_book', 'Oracle'),
)

# One row per evaluated LLM, in output order:
# (results sub-folder, llm identifier, size label used in display names)
_LLM_FAMILIES = (
    ('llamav2/13b', 'llamav2:13B', '13B'),
    ('llamav2/70b', 'llamav2:70B', '70B'),
    ('mistral/7b', 'mistral:7b', '7B'),
)

# icl values generated for every LLM / retrieval pair.
_ICL_VALUES = range(3)


def get_all_students(model, task):
    """Build the ExamSetting for every LLM x icl x retrieval combination.

    Args:
        model: tag interpolated into the results file name
            (`..._{model}_results_*...`). It is used as-is for every LLM
            folder, including the mistral one.
        task: task name; selects `<cwd parent>/Data/<task>/EvalResults` and
            prefixes the exam folder and file names.

    Returns:
        A flat list of ExamSetting objects ordered by LLM (13B, 70B, 7B),
        then icl (0, 1, 2), then retrieval setting. No file-system check is
        made here; use `ExamSetting.exists` to filter.
    """
    root_path = f'{Path(".").resolve().parent}/Data/{task}/EvalResults'

    return [
        ExamSetting(
            path_pattern=(f'{root_path}/{task}{exam_dir}/{llm_dir}/'
                          f'full_sample_{task}Exam_{file_tag}_{model}_results_*_icl{icl}.jsonl'),
            llm=llm,
            retrieval=retrieval,
            icl=icl,
            name=f'{display_name}@{icl} [{size}]')
        for llm_dir, llm, size in _LLM_FAMILIES
        for icl in _ICL_VALUES
        for exam_dir, file_tag, retrieval, display_name in _RETRIEVAL_SETTINGS
    ]


In [None]:
def get_single_file_in_folder(folder_path):
    """Return the path of the only regular file located directly in `folder_path`.

    Entries that are not files (e.g. sub-directories) are ignored.

    Raises:
        ValueError: if the folder contains no file, or more than one file.
    """
    files = []
    for entry_name in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry_name)
        if os.path.isfile(candidate):
            files.append(candidate)

    # Guard clauses for the two invalid situations; exactly one file remains otherwise.
    if not files:
        raise ValueError(f"No files found in the directory {folder_path}")
    if len(files) > 1:
        raise ValueError(f"More than one file found in the directory {folder_path}. Files are: {', '.join(files)}")

    return files[0]

def get_documentation_data(task: str) -> Dict[str, str]:
    """Load the knowledge-corpus JSON file for `task`.

    The corpus is read from the single file found in
    `<cwd parent>/Data/<task>/KnowledgeCorpus/<folder>`, where `<folder>` is
    'main_v1' for the 'StackExchange' task and 'main' for every other task.

    Returns:
        The parsed JSON content.
        NOTE(review): the annotation says Dict[str, str], but
        get_documentation_topic_list iterates this value and indexes each
        element with 'text' -- it looks like a list of records; confirm.

    Raises:
        ValueError: propagated from get_single_file_in_folder when the folder
            does not hold exactly one file.
    """
    corpus_folder = 'main'
    if task == 'StackExchange':
        corpus_folder = 'main_v1'

    corpus_dir = f'{Path(".").resolve().parent}/Data/{task}/KnowledgeCorpus/{corpus_folder}'

    with open(get_single_file_in_folder(corpus_dir), 'r') as corpus_file:
        return json.load(corpus_file)

In [None]:
def get_topic_from_html(text: str):
    """Extract a capitalized topic name from a '/'-separated path or URL.

    The fifth '/'-separated segment is used, unless it contains 'html', in
    which case the fourth segment is used instead. Raises IndexError when
    `text` has fewer than five segments.
    """
    segments = text.split('/')
    topic = segments[4]
    if 'html' in topic:
        # The fifth segment is a page name; fall back to the one before it.
        topic = segments[3]
    return topic.capitalize()


def get_topic_from_stack(source_list: List[str]) -> str:
    """Return the site name of the first source URL.

    The site name is the first dot-separated label of the host, after
    stripping a leading 'https://' (e.g. 'https://math.stackexchange.com/q/1'
    gives 'math').

    Every entry is parsed, but the result always comes from the first one.
    The previous implementation took element 0 of a `set`, which is arbitrary
    when the sources span several sites.

    Raises:
        IndexError: if `source_list` is empty.
    """
    site_names = [url.replace('https://', '').split('/')[0].split('.')[0] for url in source_list]

    # First in input order: deterministic, and identical to the old result
    # whenever all sources share one site.
    return site_names[0]

def get_documentation_topic_list(qna_doc: Dict[str, str], ref_data: Dict[str, str], task: str) -> List[str]:
    """Return the topic labels of the documentation a question was built from.

    The question's documentation text (`qna_doc['doc']['documentation']`) is
    looked up in `ref_data` by exact equality with each record's 'text'
    field; the first matching record is used.

    Args:
        qna_doc: one exam record; only `['doc']['documentation']` is read.
        ref_data: iterable of corpus records, each indexed with 'text' and,
            depending on the task, 'section' or 'source'.
        task: one of 'DevOps', 'StackExchange', 'Arxiv', 'SecFilings'.

    Returns:
        - 'DevOps': the record's 'section' value, returned as-is.
        - 'StackExchange': a one-element list with the site name derived
          from the record's 'source' URLs.
        - 'Arxiv': the part before the first '.' of each 'section' entry.
        - 'SecFilings': every known company name contained in the text that
          precedes the first ':' of the documentation.
        - `[]` when no record matches, the task is unknown, or a lookup
          fails (best effort: the failure is printed, not raised).
    """
    try:
        target_text = qna_doc['doc']['documentation']

        # Stop at the first match instead of materialising every match.
        matching_documentation = next(
            (elem for elem in ref_data if elem['text'] == target_text),
            None)

        if matching_documentation is None:
            return []

        if task == 'DevOps':
            return matching_documentation['section']

        if task == 'StackExchange':
            return [get_topic_from_stack(matching_documentation['source'])]

        if task == 'Arxiv':
            return [i.split('.')[0] for i in matching_documentation['section']]

        if task == 'SecFilings':
            sec_cat = ["ACME UNITED CORP",
                       "ABBOTT LABORATORIES",
                       "AMD (ADVANCED MICRO DEVICES)",
                       "BK Technologies Corporation",
                       "AAR CORP",
                       "Air Products and Chemicals",
                       "CECO Environmental Corp",
                       "Worlds Online",
                       "ADAMS RESOURCES & ENERGY",
                       "Matson"]

            header = target_text.split(':')[0]
            return [comp for comp in sec_cat if comp in header]

        # Unknown task: return an empty list rather than None, so that
        # callers can always iterate over the result.
        return []

    except Exception as exc:
        # Still best effort, but KeyboardInterrupt/SystemExit are no longer
        # swallowed and the cause is reported.
        print(f'Failure: {exc!r}')
        return []

In [None]:
#get_documentation_topic(data[0]['doc']['documentation'], documentation)

def get_acc(exam_setting: ExamSetting,
            task: str,
            categories: List[str],
            documentation=None):
    """Compute per-category accuracy for one exam setting.

    Args:
        exam_setting: setting whose `data_path` points at a JSONL results file.
        task: task name, forwarded to the topic lookup.
        categories: category labels to report; topics outside this list are
            printed and otherwise ignored.
        documentation: already-loaded corpus for `task`. When None (default)
            it is loaded with get_documentation_data(task), as before.

    Returns:
        Dict mapping each category to
        `{'n_pts': <count>, 'correct_classif': <sum of record 'acc'>, 'acc': <percentage>}`;
        'acc' is 0 for a category with no question.
        # presumably each record's 'acc' is a 0/1 correctness flag -- TODO confirm
    """
    if documentation is None:
        documentation = get_documentation_data(task)

    with open(exam_setting.data_path, 'r') as f:
        # Skip blank lines (e.g. a trailing newline) that json.loads would reject.
        exam_doc = [json.loads(line) for line in f if line.strip()]

    acc = {k: {'n_pts': 0, 'correct_classif': 0, 'acc': 0} for k in categories}

    for qna_doc in exam_doc:

        for topic in get_documentation_topic_list(qna_doc, documentation, task):

            if topic in acc:
                acc[topic]['correct_classif'] += qna_doc['acc']
                acc[topic]['n_pts'] += 1
            else:
                # Surface topics that are missing from `categories`.
                print(topic)

    for stats in acc.values():
        stats['acc'] = 100 * stats['correct_classif'] / stats['n_pts'] if stats['n_pts'] else 0

    return acc


def get_acc_dict(task, cat_list):
    """Compute get_acc for every exam setting of `task` whose results file exists.

    The documentation corpus is loaded once and shared across settings instead
    of being re-read from disk for each of them. It is only loaded when at
    least one results file exists.

    Returns:
        Dict mapping `ExamSetting.name` to the per-category accuracy dict.
    """
    available_exams = [exam for exam in get_all_students('llamav2', task) if exam.exists]

    if not available_exams:
        return {}

    documentation = get_documentation_data(task)

    return {exam.name: get_acc(exam, task, cat_list, documentation)
            for exam in available_exams}

## Plots

### DevOps

In [None]:
# Categories for which per-topic accuracy is tracked on the DevOps exam.
# The list order is the category order used downstream.
devops_cat = [
    'Beanstalk', 'SageMaker', 'Lambda', 'RDS',
    'S3', 'Gateway', 'RedShift', 'CloudFront',
    'EC2', 'ECS', 'Load Balancer', 'ALB',
    'DynamoDB', 'SQS', 'ElastiCache', 'ELB',
]

# Accuracy per exam setting name, then per category.
devops_acc_list = get_acc_dict('DevOps', devops_cat)

In [None]:
# A category is plotted only if the reference setting ('ClosedBook@0 [13B]')
# has at least this many questions in it.
MIN_N_PTS = 10

# Settings drawn on the chart: the icl=1 run of each retriever, for every model size.
target_models = [
    f"{retriever}@1 [{size}]"
    for size in ("13B", "70B", "7B")
    for retriever in ("ClosedBook", "Rag MultiQA", "Rag DPRV2", "Oracle")
]

acc_list_to_use = devops_acc_list

# Flatten {model: {category: stats}} into one record per (model, category).
scores_all = []
for model_name, per_category in acc_list_to_use.items():
    for category, stats in per_category.items():
        # Names are either "<Retriever>@<icl> [<size>]" (2 tokens) or
        # "Rag <Retriever>@<icl> [<size>]" (3 tokens).
        tokens = model_name.split()
        if len(tokens) == 3:
            size_token, retriever_token = tokens[2], tokens[1]
        else:
            size_token, retriever_token = tokens[1], tokens[0]
        scores_all.append({"model": model_name,
                           "Size": size_token,
                           "Retriever": retriever_token.split('@')[0],
                           "category": category,
                           "score": stats['acc']})

filtered_cat = [category
                for category, stats in acc_list_to_use['ClosedBook@0 [13B]'].items()
                if stats['n_pts'] >= MIN_N_PTS]

# Keep the target settings only, ordered by their position in target_models (last one first).
scores_target = sorted((row for row in scores_all if row["model"] in target_models),
                       key=lambda row: target_models.index(row["model"]),
                       reverse=True)

df_score = pd.DataFrame(scores_target)
df_score = df_score[df_score["model"].isin(target_models)
                    & df_score["category"].isin(filtered_cat)]

# Polar line chart: one closed line per (Retriever, Size) pair.
fig = px.line_polar(
    df_score,
    r='score',
    theta='category',
    line_close=True,
    category_orders={"category": filtered_cat},
    color='Retriever',
    line_dash='Size',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

# Figure size, title and global font.
fig.update_layout(
    width=1100,
    height=800,
    title='AWS DevOps Exam Evaluation',
    font=dict(size=27, family='Times', color="black"),
)

fig.show()

### Arxiv Exam

In [None]:
# Categories for which per-topic accuracy is tracked on the Arxiv exam.
# The list order is the category order used downstream.
arxiv_cat = [
    'physics', 'astro-ph', 'cond-mat', 'econ', 'cs',
    'nlin', 'q-fin', 'hep-th', 'gr-qc', 'hep-ph',
    'stat', 'quant-ph', 'math', 'q-bio', 'eess',
    'nucl-ex', 'hep-ex', 'nucl-th', 'hep-lat', 'math-ph',
]

# Accuracy per exam setting name, then per category.
arxiv_acc_list = get_acc_dict('Arxiv', arxiv_cat)

In [None]:
# A category is plotted only if the reference setting ('ClosedBook@0 [13B]')
# has at least this many questions in it.
MIN_N_PTS = 10

# Settings drawn on the chart: icl=1 runs for 13B and 7B, icl=0 runs for 70B.
target_models = [
    f"{retriever}@{icl} [{size}]"
    for size, icl in (("13B", 1), ("70B", 0), ("7B", 1))
    for retriever in ("ClosedBook", "Rag MultiQA", "Rag DPRV2", "Oracle")
]

acc_list_to_use = arxiv_acc_list

# Flatten {model: {category: stats}} into one record per (model, category).
scores_all = []
for model_name, per_category in acc_list_to_use.items():
    for category, stats in per_category.items():
        # Names are either "<Retriever>@<icl> [<size>]" (2 tokens) or
        # "Rag <Retriever>@<icl> [<size>]" (3 tokens).
        tokens = model_name.split()
        if len(tokens) == 3:
            size_token, retriever_token = tokens[2], tokens[1]
        else:
            size_token, retriever_token = tokens[1], tokens[0]
        scores_all.append({"model": model_name,
                           "Size": size_token,
                           "Retriever": retriever_token.split('@')[0],
                           "category": category,
                           "score": stats['acc']})

filtered_cat = [category
                for category, stats in acc_list_to_use['ClosedBook@0 [13B]'].items()
                if stats['n_pts'] >= MIN_N_PTS]

# Keep the target settings only, ordered by their position in target_models (last one first).
scores_target = sorted((row for row in scores_all if row["model"] in target_models),
                       key=lambda row: target_models.index(row["model"]),
                       reverse=True)

df_score = pd.DataFrame(scores_target)
df_score = df_score[df_score["model"].isin(target_models)
                    & df_score["category"].isin(filtered_cat)]

# Polar line chart: one closed line per (Retriever, Size) pair.
fig = px.line_polar(
    df_score,
    r='score',
    theta='category',
    line_close=True,
    category_orders={"category": filtered_cat},
    color='Retriever',
    line_dash='Size',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

# Figure size, title and global font.
fig.update_layout(
    width=800,
    height=600,
    title='Arxiv Exam Evaluation',
    font=dict(size=18, family='Times', color="black"),
)

fig.show()

### SEC Filings

In [None]:
# Company names for which per-topic accuracy is tracked on the SecFilings exam.
# The previous list contained "AAR CORP" twice; the categories become dict
# keys in get_acc, so dropping the duplicate changes neither results nor order.
sec_cat = [
    "ACME UNITED CORP",
    "AAR CORP",
    "Matson",
    "BK Technologies Corporation",
    "ABBOTT LABORATORIES",
    "Air Products and Chemicals",
    "CECO Environmental Corp",
    "Worlds Online",
    "ADAMS RESOURCES & ENERGY",
    "AMD (ADVANCED MICRO DEVICES)",
]

# Accuracy per exam setting name, then per category.
sec_acc_list = get_acc_dict('SecFilings', sec_cat)

In [None]:
# A category is plotted only if the reference setting ('ClosedBook@0 [13B]')
# has at least this many questions in it.
MIN_N_PTS = 10

# Settings drawn on the chart: the icl=1 run of each retriever, for every model size.
target_models = [
    f"{retriever}@1 [{size}]"
    for size in ("13B", "70B", "7B")
    for retriever in ("ClosedBook", "Rag MultiQA", "Rag DPRV2", "Oracle")
]

acc_list_to_use = sec_acc_list

# Flatten {model: {category: stats}} into one record per (model, category).
scores_all = []
for model_name, per_category in acc_list_to_use.items():
    for category, stats in per_category.items():
        # Names are either "<Retriever>@<icl> [<size>]" (2 tokens) or
        # "Rag <Retriever>@<icl> [<size>]" (3 tokens).
        tokens = model_name.split()
        if len(tokens) == 3:
            size_token, retriever_token = tokens[2], tokens[1]
        else:
            size_token, retriever_token = tokens[1], tokens[0]
        scores_all.append({"model": model_name,
                           "Size": size_token,
                           "Retriever": retriever_token.split('@')[0],
                           "category": category,
                           "score": stats['acc']})

filtered_cat = [category
                for category, stats in acc_list_to_use['ClosedBook@0 [13B]'].items()
                if stats['n_pts'] >= MIN_N_PTS]

# Keep the target settings only, ordered by their position in target_models (last one first).
scores_target = sorted((row for row in scores_all if row["model"] in target_models),
                       key=lambda row: target_models.index(row["model"]),
                       reverse=True)

df_score = pd.DataFrame(scores_target)
df_score = df_score[df_score["model"].isin(target_models)
                    & df_score["category"].isin(filtered_cat)]

# Polar line chart: one closed line per (Retriever, Size) pair.
fig = px.line_polar(
    df_score,
    r='score',
    theta='category',
    line_close=True,
    category_orders={"category": filtered_cat},
    color='Retriever',
    line_dash='Size',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

# Figure size and title; the default font is kept for this chart.
fig.update_layout(
    width=800,
    height=600,
    title='Sec Filings Exam Evaluation',
)

fig.show()

### StackExchange

In [None]:
# Site names for which per-topic accuracy is tracked on the StackExchange exam.
# The previous list contained 'salesforce' twice; the categories become dict
# keys in get_acc, so dropping the duplicate changes neither results nor order.
# NOTE(review): 'Stackoverflow' is capitalized, while get_topic_from_stack
# returns the host label verbatim (e.g. 'stackoverflow' for
# 'https://stackoverflow.com/...'). If the source URLs are lowercase, this
# category never matches -- confirm against the data before changing it.
stack_topics = [
    'Stackoverflow', 'math', 'superuser',
    'serverfault', 'askubuntu', 'electronics',
    'physics', 'unix', 'tex', 'english',
    'meta', 'apple', 'ell', 'gaming',
    'stats', 'softwareengineering',
    'mathoverflow', 'gis', 'diy', 'magento',
    'salesforce', 'eosio', 'sharepoint', 'raspberrypi',
    'wordpress', 'history', 'ux', 'emacs', 'ai',
]

# Accuracy per exam setting name, then per category.
stack_acc_list = get_acc_dict('StackExchange', stack_topics)

In [None]:
# A category is plotted only if the reference setting ('ClosedBook@0 [13B]')
# has at least this many questions in it.
MIN_N_PTS = 10

# Settings drawn on the chart: the icl=1 run of each retriever, for every model size.
target_models = [
    f"{retriever}@1 [{size}]"
    for size in ("13B", "70B", "7B")
    for retriever in ("ClosedBook", "Rag MultiQA", "Rag DPRV2", "Oracle")
]

acc_list_to_use = stack_acc_list

# Flatten {model: {category: stats}} into one record per (model, category).
scores_all = []
for model_name, per_category in acc_list_to_use.items():
    for category, stats in per_category.items():
        # Names are either "<Retriever>@<icl> [<size>]" (2 tokens) or
        # "Rag <Retriever>@<icl> [<size>]" (3 tokens).
        tokens = model_name.split()
        if len(tokens) == 3:
            size_token, retriever_token = tokens[2], tokens[1]
        else:
            size_token, retriever_token = tokens[1], tokens[0]
        scores_all.append({"model": model_name,
                           "Size": size_token,
                           "Retriever": retriever_token.split('@')[0],
                           "category": category,
                           "score": stats['acc']})

filtered_cat = [category
                for category, stats in acc_list_to_use['ClosedBook@0 [13B]'].items()
                if stats['n_pts'] >= MIN_N_PTS]

# Keep the target settings only, ordered by their position in target_models (last one first).
scores_target = sorted((row for row in scores_all if row["model"] in target_models),
                       key=lambda row: target_models.index(row["model"]),
                       reverse=True)

df_score = pd.DataFrame(scores_target)
df_score = df_score[df_score["model"].isin(target_models)
                    & df_score["category"].isin(filtered_cat)]

# Polar line chart: one closed line per (Retriever, Size) pair.
fig = px.line_polar(
    df_score,
    r='score',
    theta='category',
    line_close=True,
    category_orders={"category": filtered_cat},
    color='Retriever',
    line_dash='Size',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

# Figure size and title. The title previously read 'Sec Filings Exam
# Evaluation', a copy-paste leftover from the Sec Filings chart.
fig.update_layout(
    width=800,
    height=600,
    title='StackExchange Exam Evaluation',
)

fig.show()