In [23]:
import sys
sys.path.append("/home/ybtu/FinNLP")

import pandas as pd
import numpy as np
from utils import retrieve_paragraph_from_docid, get_10K_file_name, parse_trec_file, trec_file_to_csv
from utils import docid_to_cik, cik_to_sector

In [24]:
results = parse_trec_file("retrieval_results_trec/retrieval_results_full.txt")
trec_file_to_csv("retrieval_results_trec/retrieval_results_full.txt", "retrieval_results_full.csv")

target_paras = list(results.keys())
df_trec_results = pd.read_csv("retrieval_results_full.csv")
print(df_trec_results.columns)
print(target_paras)

Index(['target_id', 'Q0', 'reference_id', 'rank', 'score', 'tag'], dtype='object')
['20220318_10-K_1045810_part2_item7_para5', '20220222_10-K_1090727_part2_item7_para5', '20220216_10-K_1585689_part2_item7_para74', '20220217_10-K_200406_part2_item7_para42', '20221028_10-K_320193_part2_item7_para7']


In [25]:
def parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b):
    return f"{format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}"

def calculate_overlap(list1, list2):
    return set(list1).intersection(set(list2))

def calculate_difference(list1, list2):
    return set(list1).difference(set(list2))

def filter_docids_by_cik(docids, cik):
    return [docid for docid in docids if docid_to_cik(docid) == cik]

def get_irrelevant_ciks(cik_lists, target_cik):
    return set([cik for cik in cik_lists if cik != target_cik])

def get_irrelevant_sectors(sector_lists, target_sector):
    return set([sector for sector in sector_lists if sector != target_sector])

def get_format_type_from_system_tag(system_tag):
    # {format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}
    return system_tag.split("-")[0]

def get_prepend_info_from_system_tag(system_tag):
    # {format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}
    return system_tag.split("-")[2]
 
def get_k1_from_system_tag(system_tag):
    # {format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}
    return float(system_tag.split("-")[3].split("_")[1])

def get_b_from_system_tag(system_tag):
    # {format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}
    return float(system_tag.split("-")[4].split("_")[1])

class BM25RetrievalResult:
    def __init__(self, target_docid, retrieval_system_tag, retrieved_docids):
        self.target_docid = target_docid
        self.retrieval_system_tag = retrieval_system_tag
        self.retrieved_docids = retrieved_docids

class BM25ExperimentConfig:
    COLUMNS = [
        "target_cik", 
        "target_id", 
        "system_tag",
        "format_type", 
        "prepend_info",
        "k", 
        "k1", 
        "b", 
        "ciks_cnt",
        "irrelevant_ciks_cnt", 
        "sectors_cnt", 
        "irrelevant_sectors_cnt",
        "relevant_docids_cnt",
        "overlapped_docids_cnt",
        "comparison_tag"
    ]

    def __init__(self, target_docids, filter_name, retrieval_system_tags, df_trec_results, 
                 baseline_format_type="basic", baseline_prepend_info="null", baseline_k1=0.9, baseline_b=0.4):
        self.target_docids = target_docids
        self.target_ciks = [docid_to_cik(docid) for docid in target_docids]
        self.target_sectors = [cik_to_sector(cik) for cik in self.target_ciks]
        self.filter_name = filter_name
        self.retrieval_system_tags = retrieval_system_tags
        self.df_trec_results = df_trec_results # DataFrame holding all the retrieval results
        
        self.baseline = self.init_baseline(baseline_format_type, baseline_prepend_info, baseline_k1, baseline_b)
        self.comparison_df = self.init_comparison_df(BM25ExperimentConfig.COLUMNS)
    
    def init_baseline(self, format_type="basic", prepend_info="null", k1=0.9, b=0.4):
        baseline = {}
        for target_docid in self.target_docids:
            retrieval_system_tag = parse_retrieval_system_tag(format_type, self.filter_name, prepend_info, k1, b)
            retrieved_docids = self.df_trec_results[(self.df_trec_results["target_id"] == target_docid) & (self.df_trec_results["tag"] == retrieval_system_tag)]["reference_id"].tolist()
            baseline[target_docid] = BM25RetrievalResult(target_docid, retrieval_system_tag, retrieved_docids)

        return baseline
    
    def init_comparison_df(self, columns):
        comparison_df = pd.DataFrame(columns=columns)

        for target_docid in self.target_docids:
            target_cik = docid_to_cik(target_docid)
            retrieved_docids = self.baseline[target_docid].retrieved_docids
            retrieval_system_tag = self.baseline[target_docid].retrieval_system_tag
            retrieved_ciks = set([docid_to_cik(docid) for docid in retrieved_docids])
            retrieved_sectors = set([cik_to_sector(cik) for cik in retrieved_ciks])

            irrelevant_ciks = get_irrelevant_ciks(retrieved_ciks, target_cik)
            irrelevant_sectors = get_irrelevant_sectors(retrieved_sectors, cik_to_sector(target_cik))

            # here, relevant docids are the ones that have the same cik as the target cik
            relevant_docids = filter_docids_by_cik(retrieved_docids, target_cik)
            irrelevant_docids = calculate_difference(retrieved_docids, relevant_docids)

            new_row = {
                "target_cik": target_cik,
                "target_id": target_docid,
                "system_tag": retrieval_system_tag,
                "format_type": get_format_type_from_system_tag(retrieval_system_tag),
                "prepend_info": get_prepend_info_from_system_tag(retrieval_system_tag),
                "k": len(retrieved_docids),
                "k1": get_k1_from_system_tag(retrieval_system_tag),
                "b": get_b_from_system_tag(retrieval_system_tag),
                "ciks_cnt": len(retrieved_ciks),
                "irrelevant_ciks_cnt": len(irrelevant_ciks),
                "sectors_cnt": len(retrieved_sectors),
                "irrelevant_sectors_cnt": len(irrelevant_sectors),
                "relevant_docids_cnt": len(relevant_docids),
            }
            comparison_df.loc[len(comparison_df)] = new_row

        return comparison_df

    def get_retrieved_docids(self, target_docid, retrieval_system_tag):
        return self.df_trec_results[(self.df_trec_results["target_id"] == target_docid) & (self.df_trec_results["tag"] == retrieval_system_tag)]["reference_id"].tolist()

    def add_comparison_result(self, new_row):
        self.comparison_df.loc[len(self.comparison_df)] = new_row

    @staticmethod
    def setup_new_row(target_docid, base_system_tag, retrieval_system_tag, retrieved_docids, overlapped_docids_cnt):
        target_cik = docid_to_cik(target_docid)
        retrieved_ciks = set([docid_to_cik(docid) for docid in retrieved_docids])
        retrieved_sectors = set([cik_to_sector(cik) for cik in retrieved_ciks])
        relevant_docids = filter_docids_by_cik(retrieved_docids, target_cik)

        irrelevant_ciks = get_irrelevant_ciks(retrieved_ciks, target_cik)
        irrelevant_sectors = get_irrelevant_sectors(retrieved_sectors, cik_to_sector(target_cik))

        new_row = {
            "target_cik": docid_to_cik(target_docid),
            "target_id": target_docid,
            "system_tag": retrieval_system_tag,
            "format_type": get_format_type_from_system_tag(retrieval_system_tag),
            "prepend_info": get_prepend_info_from_system_tag(retrieval_system_tag),
            "k": len(retrieved_docids),
            "k1": get_k1_from_system_tag(retrieval_system_tag),
            "b": get_b_from_system_tag(retrieval_system_tag),
            "ciks_cnt": len(retrieved_ciks),
            "sectors_cnt": len(retrieved_sectors),
            "irrelevant_ciks_cnt": len(irrelevant_ciks),
            "irrelevant_sectors_cnt": len(irrelevant_sectors),
            "relevant_docids_cnt": len(relevant_docids),
            "overlapped_docids_cnt": overlapped_docids_cnt,
            "comparison_tag": base_system_tag
        }

        return new_row
    
    def compare_with_baseline(self):
        for target_docid in self.target_docids:
            target_cik = docid_to_cik(target_docid)
            target_sector = cik_to_sector(target_cik)
            print(f"Target docid: {target_cik}")
            print(f"Target sector: {target_sector}")

            base_system_tag = self.baseline[target_docid].retrieval_system_tag
            base_retrieved_docids = self.baseline[target_docid].retrieved_docids
            base_ciks = set([docid_to_cik(docid) for docid in base_retrieved_docids])
            base_sectors = set([cik_to_sector(cik) for cik in base_ciks])
            base_relevant_docids = filter_docids_by_cik(base_retrieved_docids, target_cik)

            print(f"    tag: {base_system_tag}")
            print(f"        number of relevant docids: {len(base_relevant_docids)}")
            print(f"        number of ciks: {len(base_ciks)}")
            print(f"        number of sectors: {len(base_sectors)}")

            for retrieval_system_tag in self.retrieval_system_tags:
                if retrieval_system_tag == base_system_tag:
                    continue

                comp_retrieved_docids = self.get_retrieved_docids(target_docid, retrieval_system_tag)
                comp_ciks = set([docid_to_cik(docid) for docid in comp_retrieved_docids])
                comp_sectors = set([cik_to_sector(cik) for cik in comp_ciks])
                comp_relevant_docids = filter_docids_by_cik(comp_retrieved_docids, target_cik)

                overlapped_docids = calculate_overlap(base_retrieved_docids, comp_retrieved_docids)
                docids_unique_to_base = calculate_difference(base_retrieved_docids, comp_retrieved_docids)
                docids_unique_to_comp = calculate_difference(comp_retrieved_docids, base_retrieved_docids)

                new_row = self.setup_new_row(target_docid, base_system_tag, retrieval_system_tag, comp_retrieved_docids, len(overlapped_docids))
                self.add_comparison_result(new_row)

                print(f"    tag: {retrieval_system_tag}")
                print(f"        number of ciks: {len(comp_ciks)}")
                print(f"        number of sectors: {len(comp_sectors)}")
                print(f"        number of relevant docids: {len(comp_relevant_docids)}")
                print(f"        number of overlapped docids: {len(overlapped_docids)}")
                print(f"        docids unique to base:", docids_unique_to_base)
                print(f"        docids unique to comp:", docids_unique_to_comp)
        
    @staticmethod
    def get_comparison_df_columns():
        return BM25ExperimentConfig.COLUMNS

In [26]:
def concat_comparison_dfs(exp_configs):
    full_comparison_df = pd.DataFrame(columns=BM25ExperimentConfig.get_comparison_df_columns())
    for exp_name, exp_config in exp_configs.items():
        print("Experiemnt Name:", exp_name)
        comparison_df = exp_config.comparison_df
        full_comparison_df = pd.concat([full_comparison_df, comparison_df])

    return full_comparison_df.drop_duplicates()

# Test the effect of prepended information

fix `k1` = 0.9 `b` = 0.4

In [27]:
baseline_k1 = 0.9
baseline_b = 0.4

k1 = 0.9
b = 0.4
filter_name = "year2018_2022"

retrieval_sys = {
    "company_name": "target_company",
    "title": "target_title",
}

exp_configs = {}
for format_type, prepend_info in retrieval_sys.items():
    exp_config_name = f"{format_type}-{prepend_info}"
    retrieval_system_tags = []

    retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
    retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results)
    exp_configs[exp_config_name].compare_with_baseline()

# multi_fields
for prepend_info in ["target_company", "target_title"]:
    exp_config_name = f"multi_fields-{prepend_info}"
    retrieval_system_tags = []

    retrieval_system_tag = parse_retrieval_system_tag("multi_fields", filter_name, prepend_info, k1, b)
    retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results)
    exp_configs[exp_config_name].compare_with_baseline() 

Target docid: 1045810
Target sector: Information Technology
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 3
    tag: company_name-year2018_2022-target_company-k1_0.9-b_0.4
        number of ciks: 2
        number of sectors: 2
        number of relevant docids: 7
        number of overlapped docids: 9
        docids unique to base: {'20191028_10-K_1618921_part1_item1_para9'}
        docids unique to comp: {'20210226_10-K_1045810_part2_item7_para71'}
Target docid: 1090727
Target sector: Industrials
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 4
        number of ciks: 5
        number of sectors: 3
    tag: company_name-year2018_2022-target_company-k1_0.9-b_0.4
        number of ciks: 2
        number of sectors: 1
        number of relevant docids: 8
        number of overlapped docids: 6
        docids unique to base: {'20200221_10-K_1101239_part2_item6_para1

In [28]:
full_comparison_df = concat_comparison_dfs(exp_configs)
full_comparison_df.to_csv("prepended_info_exp.csv", index=False)

Experiemnt Name: company_name-target_company
Experiemnt Name: title-target_title
Experiemnt Name: multi_fields-target_company
Experiemnt Name: multi_fields-target_title


  full_comparison_df = pd.concat([full_comparison_df, comparison_df])


In [7]:
baseline_k1 = 0.9
baseline_b = 0.4

k1 = 0.9
b = 0.4
filter_name = "year2018_2022"

retrieval_sys = {
    "multi_fields": "target_title",
}

exp_configs = {}
for format_type, prepend_info in retrieval_sys.items():
    exp_config_name = f"{format_type}-{prepend_info}"
    retrieval_system_tags = []

    retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
    retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, "title", "target_title")
    exp_configs[exp_config_name].compare_with_baseline()


Target docid: 1045810
Target sector: Information Technology
    tag: title-year2018_2022-target_title-k1_0.9-b_0.4
        number of relevant docids: 7
        number of ciks: 2
        number of sectors: 2
    tag: multi_fields-year2018_2022-target_title-k1_0.9-b_0.4
        number of ciks: 3
        number of sectors: 2
        number of relevant docids: 6
        number of overlapped docids: 9
        docids unique to base: {'20210226_10-K_1045810_part2_item7_para71'}
        docids unique to comp: {'20210129_10-K_2488_part1_item1_para62'}
Target docid: 1090727
Target sector: Industrials
    tag: title-year2018_2022-target_title-k1_0.9-b_0.4
        number of relevant docids: 8
        number of ciks: 2
        number of sectors: 1
    tag: multi_fields-year2018_2022-target_title-k1_0.9-b_0.4
        number of ciks: 4
        number of sectors: 3
        number of relevant docids: 3
        number of overlapped docids: 3
        docids unique to base: {'20180221_10-K_1090727_part1_i

# Analyze `k1`

* baseline_k1 = 0.9
* baseline_b = 0.4

* fix `b` = 0.4
* findings: 
    * `basic-null`: 
    * `company_name-target_company`: 
    * `title-target_title`: 
    * `multi_fields-target_company`: 
    * `multi_fields-target_title`: 

In [9]:
baseline_k1 = 0.9
baseline_b = 0.4

k1_list = [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
b = 0.4
filter_name = "year2018_2022"

retrieval_sys = {
    "basic": "null", 
    "company_name": "target_company",
    "title": "target_title",
}

exp_configs = {}
for format_type, prepend_info in retrieval_sys.items():
    exp_config_name = f"{format_type}-{prepend_info}"
    retrieval_system_tags = []

    for k1 in k1_list:
        retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                        baseline_format_type=format_type, baseline_prepend_info=prepend_info, 
                                                        baseline_k1=baseline_k1, baseline_b=baseline_b)
    exp_configs[exp_config_name].compare_with_baseline()

# multi_fields
for prepend_info in ["target_company", "target_title"]:
    exp_config_name = f"multi_fields-{prepend_info}"
    retrieval_system_tags = []

    for k1 in k1_list:
        retrieval_system_tag = parse_retrieval_system_tag("multi_fields", filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                        baseline_format_type="multi_fields", baseline_prepend_info=prepend_info, 
                                                        baseline_k1=baseline_k1, baseline_b=baseline_b)
    exp_configs[exp_config_name].compare_with_baseline()        

Target docid: 1045810
Target sector: Information Technology
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 3
    tag: basic-year2018_2022-null-k1_0.0-b_0.4
        number of ciks: 4
        number of sectors: 4
        number of relevant docids: 5
        number of overlapped docids: 6
        docids unique to base: {'20220125_10-K_936468_part2_item8_para98', '20191028_10-K_1618921_part1_item1_para9', '20220125_10-K_936468_part1_item1_para5', '20210226_10-K_1045810_part1_item1a_para31'}
        docids unique to comp: {'20220224_10-K_352915_part2_item7_para11', '20211119_10-K_1732845_part1_item1_para74', '20210225_10-K_352915_part2_item7_para10', '20201123_10-K_1732845_part1_item1_para67'}
    tag: basic-year2018_2022-null-k1_0.5-b_0.4
        number of ciks: 3
        number of sectors: 3
        number of relevant docids: 6
        number of overlapped docids: 10
        docids unique to base: se

In [10]:
full_comparison_df = concat_comparison_dfs(exp_configs)
full_comparison_df.to_csv("k1_exp.csv", index=False)

Experiemnt Name: basic-null
Experiemnt Name: company_name-target_company
Experiemnt Name: title-target_title
Experiemnt Name: multi_fields-target_company
Experiemnt Name: multi_fields-target_title


  full_comparison_df = pd.concat([full_comparison_df, comparison_df])


In [22]:
baseline_k1 = 1.2
baseline_b = 0.9

k1_list = [1.5]
b = 0.9
filter_name = "year2018_2022"

retrieval_sys = {
    "multi_fields": "target_title",
}

exp_configs = {}
for format_type, prepend_info in retrieval_sys.items():
    exp_config_name = f"{format_type}-{prepend_info}"
    retrieval_system_tags = []

    for k1 in k1_list:
        retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                        baseline_format_type=format_type, baseline_prepend_info=prepend_info, 
                                                        baseline_k1=baseline_k1, baseline_b=baseline_b)
    exp_configs[exp_config_name].compare_with_baseline()     

Target docid: 1045810
Target sector: Information Technology
    tag: multi_fields-year2018_2022-target_title-k1_1.2-b_0.9
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 2
    tag: multi_fields-year2018_2022-target_title-k1_1.5-b_0.9
        number of ciks: 2
        number of sectors: 1
        number of relevant docids: 7
        number of overlapped docids: 9
        docids unique to base: {'20220125_10-K_936468_part2_item7_para16'}
        docids unique to comp: {'20210226_10-K_1045810_part2_item7_para28'}
Target docid: 1090727
Target sector: Industrials
    tag: multi_fields-year2018_2022-target_title-k1_1.2-b_0.9
        number of relevant docids: 4
        number of ciks: 4
        number of sectors: 3
    tag: multi_fields-year2018_2022-target_title-k1_1.5-b_0.9
        number of ciks: 5
        number of sectors: 4
        number of relevant docids: 3
        number of overlapped docids: 9
        docids unique to base: {'20200220_10-K

# Analyze `b`

* baseline_k1 = 0.9
* baseline_b = 0.4

* fix `k1` = 0.9
* `b` = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
* findings: 
    * `basic-null`: not affected
    * `company_name-target_company`: not affected
    * `title-target_title`: not affected
    * `multi_fields-target_company`: not affected
    * `multi_fields-target_title`: not affected

In [13]:
baseline_k1 = 0.9
baseline_b = 0.4

k1 = 0.9
b_list = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
filter_name = "year2018_2022"

retrieval_sys = {
    "basic": "null", 
    "company_name": "target_company",
    "title": "target_title",
}

exp_configs = {}
for format_type, prepend_info in retrieval_sys.items():
    exp_config_name = f"{format_type}-{prepend_info}"
    retrieval_system_tags = []
    
    for b in b_list:
        retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                        baseline_format_type=format_type, baseline_prepend_info=prepend_info, 
                                                        baseline_k1=baseline_k1, baseline_b=baseline_b)
    exp_configs[exp_config_name].compare_with_baseline()

# multi_fields
for prepend_info in ["target_company", "target_title"]:
    exp_config_name = f"multi_fields-{prepend_info}"
    retrieval_system_tags = []

    for b in b_list:
        retrieval_system_tag = parse_retrieval_system_tag("multi_fields", filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                        baseline_format_type="multi_fields", baseline_prepend_info=prepend_info, 
                                                        baseline_k1=baseline_k1, baseline_b=baseline_b)
    exp_configs[exp_config_name].compare_with_baseline()

Target docid: 1045810
Target sector: Information Technology
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 3
    tag: basic-year2018_2022-null-k1_0.9-b_0.0
        number of ciks: 3
        number of sectors: 3
        number of relevant docids: 6
        number of overlapped docids: 6
        docids unique to base: {'20220125_10-K_936468_part2_item7_para16', '20191028_10-K_1618921_part1_item1_para9', '20220125_10-K_936468_part1_item1_para5', '20220125_10-K_936468_part2_item8_para98'}
        docids unique to comp: {'20211119_10-K_1732845_part1_item1_para74', '20210225_10-K_352915_part2_item7_para10', '20201123_10-K_1732845_part1_item1_para67', '20220224_10-K_352915_part2_item7_para11'}
    tag: basic-year2018_2022-null-k1_0.9-b_0.1
        number of ciks: 3
        number of sectors: 3
        number of relevant docids: 6
        number of overlapped docids: 9
        docids unique to base: {'201

In [14]:
full_comparison_df = concat_comparison_dfs(exp_configs)
full_comparison_df.to_csv("b_exp.csv", index=False)

Experiemnt Name: basic-null
Experiemnt Name: company_name-target_company
Experiemnt Name: title-target_title
Experiemnt Name: multi_fields-target_company
Experiemnt Name: multi_fields-target_title


  full_comparison_df = pd.concat([full_comparison_df, comparison_df])


In [14]:
baseline_k1 = 0.9
baseline_b = 0.4

k1 = 0.9
b_list = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
filter_name = "year2018_2022"

retrieval_sys = {
    "multi_fields": "target_title",
}

exp_configs = {}
for format_type, prepend_info in retrieval_sys.items():
    exp_config_name = f"{format_type}-{prepend_info}"
    retrieval_system_tags = []
    
    for b in b_list:
        retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                        baseline_format_type=format_type, baseline_prepend_info=prepend_info, 
                                                        baseline_k1=baseline_k1, baseline_b=baseline_b)
    exp_configs[exp_config_name].compare_with_baseline()


Target docid: 1045810
Target sector: Information Technology
    tag: multi_fields-year2018_2022-target_title-k1_0.9-b_0.4
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 2
    tag: multi_fields-year2018_2022-target_title-k1_0.9-b_0.0
        number of ciks: 4
        number of sectors: 4
        number of relevant docids: 5
        number of overlapped docids: 5
        docids unique to base: {'20220125_10-K_936468_part2_item7_para16', '20210226_10-K_1045810_part1_item1a_para31', '20220125_10-K_936468_part1_item1_para5', '20210129_10-K_2488_part1_item1_para62', '20220125_10-K_936468_part2_item8_para98'}
        docids unique to comp: {'20201123_10-K_1732845_part1_item1_para67', '20220224_10-K_352915_part2_item7_para11', '20210208_10-K_101829_part1_item1_para54', '20210225_10-K_352915_part2_item7_para10', '20211119_10-K_1732845_part1_item1_para74'}
    tag: multi_fields-year2018_2022-target_title-k1_0.9-b_0.1
        number of ciks: 3
        nu