In [1]:
import sys
sys.path.append("/home/ybtu/FinNLP")

import pandas as pd
import numpy as np
from utils import retrieve_paragraph_from_docid, get_10K_file_name, parse_trec_file, trec_file_to_csv
from utils import docid_to_cik, cik_to_sector

In [3]:
results = parse_trec_file("retrieval_results_trec/retrieval_results_full.txt")
trec_file_to_csv("retrieval_results_trec/retrieval_results_full.txt", "retrieval_results_full.csv")

target_paras = list(results.keys())
df_trec_results = pd.read_csv("retrieval_results_full.csv")
print(df_trec_results.columns)
print(target_paras)

Index(['target_id', 'Q0', 'reference_id', 'rank', 'score', 'tag'], dtype='object')
['20220318_10-K_1045810_part2_item7_para5', '20220222_10-K_1090727_part2_item7_para5', '20220216_10-K_1585689_part2_item7_para74', '20220217_10-K_200406_part2_item7_para42', '20221028_10-K_320193_part2_item7_para7']


In [4]:
def parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b):
    return f"{format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}"

def calculate_overlap(list1, list2):
    return set(list1).intersection(set(list2))

def calculate_difference(list1, list2):
    return set(list1).difference(set(list2))

def filter_docids_by_cik(docids, cik):
    return [docid for docid in docids if docid_to_cik(docid) == cik]

def get_irrelevant_ciks(cik_lists, target_cik):
    return set([cik for cik in cik_lists if cik != target_cik])

def get_irrelevant_sectors(sector_lists, target_sector):
    return set([sector for sector in sector_lists if sector != target_sector])

def get_format_type_from_system_tag(system_tag):
    # {format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}
    return system_tag.split("-")[0]

def get_prepend_info_from_system_tag(system_tag):
    # {format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}
    return system_tag.split("-")[2]
 
def get_k1_from_system_tag(system_tag):
    # {format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}
    return float(system_tag.split("-")[3].split("_")[1])

def get_b_from_system_tag(system_tag):
    # {format_type}-{filter_name}-{prepend_info}-k1_{k1}-b_{b}
    return float(system_tag.split("-")[4].split("_")[1])

class BM25RetrievalResult:
    def __init__(self, target_docid, retrieval_system_tag, retrieved_docids):
        self.target_docid = target_docid
        self.retrieval_system_tag = retrieval_system_tag
        self.retrieved_docids = retrieved_docids

class BM25ExperimentConfig:
    COLUMNS = [
        "target_cik", 
        "target_id", 
        "system_tag",
        "format_type", 
        "prepend_info",
        "k", 
        "k1", 
        "b", 
        "ciks_cnt",
        "sectors_cnt", 
        "relevant_docids_cnt",
        "overlapped_docids_cnt",
        "comparison_tag"
    ]

    def __init__(self, target_docids, filter_name, retrieval_system_tags, df_trec_results, 
                 baseline_format_type="basic", baseline_prepend_info="null", baseline_k1=0.9, baseline_b=0.4):
        self.target_docids = target_docids
        self.target_ciks = [docid_to_cik(docid) for docid in target_docids]
        self.target_sectors = [cik_to_sector(cik) for cik in self.target_ciks]
        self.filter_name = filter_name
        self.retrieval_system_tags = retrieval_system_tags
        self.df_trec_results = df_trec_results # DataFrame holding all the retrieval results
        
        self.baseline = self.init_baseline(baseline_format_type, baseline_prepend_info, baseline_k1, baseline_b)
        self.comparison_df = self.init_comparison_df(BM25ExperimentConfig.COLUMNS)
    
    def init_baseline(self, format_type="basic", prepend_info="null", k1=0.9, b=0.4):
        baseline = {}
        for target_docid in self.target_docids:
            retrieval_system_tag = parse_retrieval_system_tag(format_type, self.filter_name, prepend_info, k1, b)
            retrieved_docids = self.df_trec_results[(self.df_trec_results["target_id"] == target_docid) & (self.df_trec_results["tag"] == retrieval_system_tag)]["reference_id"].tolist()
            baseline[target_docid] = BM25RetrievalResult(target_docid, retrieval_system_tag, retrieved_docids)

        return baseline
    
    def init_comparison_df(self, columns):
        comparison_df = pd.DataFrame(columns=columns)

        for target_docid in self.target_docids:
            target_cik = docid_to_cik(target_docid)
            retrieved_docids = self.baseline[target_docid].retrieved_docids
            retrieval_system_tag = self.baseline[target_docid].retrieval_system_tag
            retrieval_ciks = set([docid_to_cik(docid) for docid in retrieved_docids])
            retrieval_sectors = set([cik_to_sector(cik) for cik in retrieval_ciks])

            # here, relevant docids are the ones that have the same cik as the target cik
            relevant_docids = filter_docids_by_cik(retrieved_docids, target_cik)
            irrelevant_docids = calculate_difference(retrieved_docids, relevant_docids)

            new_row = {
                "target_cik": target_cik,
                "target_id": target_docid,
                "system_tag": retrieval_system_tag,
                "format_type": get_format_type_from_system_tag(retrieval_system_tag),
                "prepend_info": get_prepend_info_from_system_tag(retrieval_system_tag),
                "k": len(retrieved_docids),
                "k1": get_k1_from_system_tag(retrieval_system_tag),
                "b": get_b_from_system_tag(retrieval_system_tag),
                "ciks_cnt": len(retrieval_ciks),
                "sectors_cnt": len(retrieval_sectors),
                "relevant_docids_cnt": len(relevant_docids),
            }
            comparison_df.loc[len(comparison_df)] = new_row

        return comparison_df

    def get_retrieved_docids(self, target_docid, retrieval_system_tag):
        return self.df_trec_results[(self.df_trec_results["target_id"] == target_docid) & (self.df_trec_results["tag"] == retrieval_system_tag)]["reference_id"].tolist()

    def add_comparison_result(self, new_row):
        self.comparison_df.loc[len(self.comparison_df)] = new_row

    @staticmethod
    def setup_new_row(target_docid, base_system_tag, retrieval_system_tag, retrieved_docids, overlapped_docids_cnt):
        target_cik = docid_to_cik(target_docid)
        retrieved_ciks = set([docid_to_cik(docid) for docid in retrieved_docids])
        retrieved_sectors = set([cik_to_sector(cik) for cik in retrieved_ciks])
        relevant_docids = filter_docids_by_cik(retrieved_docids, target_cik)

        new_row = {
            "target_cik": docid_to_cik(target_docid),
            "target_id": target_docid,
            "system_tag": retrieval_system_tag,
            "format_type": get_format_type_from_system_tag(retrieval_system_tag),
            "prepend_info": get_prepend_info_from_system_tag(retrieval_system_tag),
            "k": len(retrieved_docids),
            "k1": get_k1_from_system_tag(retrieval_system_tag),
            "b": get_b_from_system_tag(retrieval_system_tag),
            "ciks_cnt": len(retrieved_ciks),
            "sectors_cnt": len(retrieved_sectors),
            "relevant_docids_cnt": len(relevant_docids),
            "overlapped_docids_cnt": overlapped_docids_cnt,
            "comparison_tag": base_system_tag
        }

        return new_row
    
    def compare_with_baseline(self):
        for target_docid in self.target_docids:
            target_cik = docid_to_cik(target_docid)
            target_sector = cik_to_sector(target_cik)
            print(f"Target docid: {target_cik}")
            print(f"Target sector: {target_sector}")

            base_system_tag = self.baseline[target_docid].retrieval_system_tag
            base_retrieved_docids = self.baseline[target_docid].retrieved_docids
            base_ciks = set([docid_to_cik(docid) for docid in base_retrieved_docids])
            base_sectors = set([cik_to_sector(cik) for cik in base_ciks])
            base_relevant_docids = filter_docids_by_cik(base_retrieved_docids, target_cik)

            print(f"    tag: {base_system_tag}")
            print(f"        number of relevant docids: {len(base_relevant_docids)}")
            print(f"        number of ciks: {len(base_ciks)}")
            print(f"        number of sectors: {len(base_sectors)}")

            for retrieval_system_tag in self.retrieval_system_tags:
                if retrieval_system_tag == base_system_tag:
                    continue

                comp_retrieved_docids = self.get_retrieved_docids(target_docid, retrieval_system_tag)
                comp_ciks = set([docid_to_cik(docid) for docid in comp_retrieved_docids])
                comp_sectors = set([cik_to_sector(cik) for cik in comp_ciks])
                comp_relevant_docids = filter_docids_by_cik(comp_retrieved_docids, target_cik)

                overlapped_docids = calculate_overlap(base_retrieved_docids, comp_retrieved_docids)
                docids_unique_to_base = calculate_difference(base_retrieved_docids, comp_retrieved_docids)
                docids_unique_to_comp = calculate_difference(comp_retrieved_docids, base_retrieved_docids)

                new_row = self.setup_new_row(target_docid, base_system_tag, retrieval_system_tag, comp_retrieved_docids, len(overlapped_docids))
                self.add_comparison_result(new_row)

                print(f"    tag: {retrieval_system_tag}")
                print(f"        number of ciks: {len(comp_ciks)}")
                print(f"        number of sectors: {len(comp_sectors)}")
                print(f"        number of relevant docids: {len(comp_relevant_docids)}")
                print(f"        number of overlapped docids: {len(overlapped_docids)}")
                print(f"        docids unique to base:", docids_unique_to_base)
                print(f"        docids unique to comp:", docids_unique_to_comp)
        
    @staticmethod
    def get_comparison_df_columns():
        return BM25ExperimentConfig.COLUMNS

In [5]:
def concat_comparison_dfs(exp_configs):
    full_comparison_df = pd.DataFrame(columns=BM25ExperimentConfig.get_comparison_df_columns())
    for exp_name, exp_config in exp_configs.items():
        print("Experiemnt Name:", exp_name)
        comparison_df = exp_config.comparison_df
        full_comparison_df = pd.concat([full_comparison_df, comparison_df])

    return full_comparison_df.drop_duplicates()

# Test the effect of prepended information

fix `k1` = 0.9 `b` = 0.4

In [6]:
baseline_k1 = 0.9
baseline_b = 0.4

k1 = 0.9
b = 0.4
filter_name = "year2018_2022"

retrieval_sys = {
    "company_name": "target_company",
    "title": "target_title",
}

exp_configs = {}
for format_type, prepend_info in retrieval_sys.items():
    exp_config_name = f"{format_type}-{prepend_info}"
    retrieval_system_tags = []

    retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
    retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results)
    exp_configs[exp_config_name].compare_with_baseline()

# multi_fields
for prepend_info in ["target_company", "target_title"]:
    exp_config_name = f"multi_fields-{prepend_info}"
    retrieval_system_tags = []

    retrieval_system_tag = parse_retrieval_system_tag("multi_fields", filter_name, prepend_info, k1, b)
    retrieval_system_tags.append(retrieval_system_tag)

    exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results)
    exp_configs[exp_config_name].compare_with_baseline() 

Target docid: 1045810
Target sector: Information Technology
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 3
    tag: company_name-year2018_2022-target_company-k1_0.9-b_0.4
        number of ciks: 2
        number of sectors: 2
        number of relevant docids: 7
        number of overlapped docids: 9
        docids unique to base: {'20191028_10-K_1618921_part1_item1_para9'}
        docids unique to comp: {'20210226_10-K_1045810_part2_item7_para71'}
Target docid: 1090727
Target sector: Industrials
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 4
        number of ciks: 5
        number of sectors: 3
    tag: company_name-year2018_2022-target_company-k1_0.9-b_0.4
        number of ciks: 2
        number of sectors: 1
        number of relevant docids: 8
        number of overlapped docids: 6
        docids unique to base: {'20200221_10-K_1101239_part2_item6_para1

In [7]:
full_comparison_df = concat_comparison_dfs(exp_configs)
full_comparison_df.to_csv("prepended_info_exp.csv", index=False)

Experiemnt Name: company_name-target_company
Experiemnt Name: title-target_title
Experiemnt Name: multi_fields-target_company
Experiemnt Name: multi_fields-target_title


  full_comparison_df = pd.concat([full_comparison_df, comparison_df])


# Analyze `k1`

* baseline_k1 = 0.9
* baseline_b = 0.4

* fix `b` = 0.4
* findings: 
    * `basic-null`: not affected
    * `company_name-target_company`: not affected
    * `title-target_title`: not affected
    * `multi_fields-target_company`: not affected
    * `multi_fields-target_title`: not affected

In [8]:
baseline_k1 = 0.9
baseline_b = 0.4

k1_list = [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
b = 0.4
filter_name = "year2018_2022"

retrieval_sys = {
    "basic": "null", 
    "company_name": "target_company",
    "title": "target_title",
}

exp_configs = {}
for format_type, prepend_info in retrieval_sys.items():
    exp_config_name = f"{format_type}-{prepend_info}"
    retrieval_system_tags = []

    for k1 in k1_list:
        retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

        exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                            baseline_format_type=format_type, baseline_prepend_info=prepend_info, 
                                                            baseline_k1=baseline_k1, baseline_b=baseline_b)
        exp_configs[exp_config_name].compare_with_baseline()

# multi_fields
for prepend_info in ["target_company", "target_title"]:
    exp_config_name = f"multi_fields-{prepend_info}"
    retrieval_system_tags = []

    for k1 in k1_list:
        retrieval_system_tag = parse_retrieval_system_tag("multi_fields", filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

        exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                            baseline_format_type="multi_fields", baseline_prepend_info=prepend_info, 
                                                            baseline_k1=baseline_k1, baseline_b=baseline_b)
        exp_configs[exp_config_name].compare_with_baseline()        

Target docid: 1045810
Target sector: Information Technology
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 3
    tag: basic-year2018_2022-null-k1_0.0-b_0.4
        number of ciks: 3
        number of sectors: 3
        number of relevant docids: 6
        number of overlapped docids: 10
        docids unique to base: set()
        docids unique to comp: set()
Target docid: 1090727
Target sector: Industrials
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 4
        number of ciks: 5
        number of sectors: 3
    tag: basic-year2018_2022-null-k1_0.0-b_0.4
        number of ciks: 5
        number of sectors: 3
        number of relevant docids: 4
        number of overlapped docids: 10
        docids unique to base: set()
        docids unique to comp: set()
Target docid: 1585689
Target sector: Consumer Discretionary
    tag: basic-year2018_2022-null-k1_0.9-b_0.4


In [8]:
full_comparison_df = concat_comparison_dfs(exp_configs)
full_comparison_df.to_csv("k1_exp.csv", index=False)

Experiemnt Name: basic-null
Experiemnt Name: company_name-target_company
Experiemnt Name: title-target_title
Experiemnt Name: multi_fields-target_company
Experiemnt Name: multi_fields-target_title


  full_comparison_df = pd.concat([full_comparison_df, comparison_df])


# Analyze `b`

* baseline_k1 = 0.9
* baseline_b = 0.4

* fix `k1` = 0.9
* `b` = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
* findings: 
    * `basic-null`: not affected
    * `company_name-target_company`: not affected
    * `title-target_title`: not affected
    * `multi_fields-target_company`: not affected
    * `multi_fields-target_title`: not affected

In [9]:
baseline_k1 = 0.9
baseline_b = 0.4

k1 = 0.9
b_list = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
filter_name = "year2018_2022"

retrieval_sys = {
    "basic": "null", 
    "company_name": "target_company",
    "title": "target_title",
}

exp_configs = {}
for format_type, prepend_info in retrieval_sys.items():
    exp_config_name = f"{format_type}-{prepend_info}"
    retrieval_system_tags = []
    
    for b in b_list:
        retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

        exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                            baseline_format_type=format_type, baseline_prepend_info=prepend_info, 
                                                            baseline_k1=baseline_k1, baseline_b=baseline_b)
        exp_configs[exp_config_name].compare_with_baseline()

# multi_fields
for prepend_info in ["target_company", "target_title"]:
    exp_config_name = f"multi_fields-{prepend_info}"
    retrieval_system_tags = []

    for b in b_list:
        retrieval_system_tag = parse_retrieval_system_tag("multi_fields", filter_name, prepend_info, k1, b)
        retrieval_system_tags.append(retrieval_system_tag)

        exp_configs[exp_config_name] = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results, 
                                                            baseline_format_type="multi_fields", baseline_prepend_info=prepend_info, 
                                                            baseline_k1=baseline_k1, baseline_b=baseline_b)
        exp_configs[exp_config_name].compare_with_baseline()

Target docid: 1045810
Target sector: Information Technology
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 3
    tag: basic-year2018_2022-null-k1_0.9-b_0.0
        number of ciks: 3
        number of sectors: 3
        number of relevant docids: 6
        number of overlapped docids: 10
        docids unique to base: set()
        docids unique to comp: set()
Target docid: 1090727
Target sector: Industrials
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 4
        number of ciks: 5
        number of sectors: 3
    tag: basic-year2018_2022-null-k1_0.9-b_0.0
        number of ciks: 5
        number of sectors: 3
        number of relevant docids: 4
        number of overlapped docids: 10
        docids unique to base: set()
        docids unique to comp: set()
Target docid: 1585689
Target sector: Consumer Discretionary
    tag: basic-year2018_2022-null-k1_0.9-b_0.4


In [10]:
full_comparison_df = concat_comparison_dfs(exp_configs)
full_comparison_df.to_csv("b_exp.csv", index=False)

Experiemnt Name: basic-null
Experiemnt Name: company_name-target_company
Experiemnt Name: title-target_title
Experiemnt Name: multi_fields-target_company
Experiemnt Name: multi_fields-target_title


  full_comparison_df = pd.concat([full_comparison_df, comparison_df])


# Archive

In [None]:
k1 = 0.9
b = 0.4
filter_name = "year2018_2022"
retrieval_system_tags = []

retrieval_sys = {
    "company_name": "target_company",
    "title": "target_title",
}

for format_type, prepend_info in retrieval_sys.items():
    retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
    retrieval_system_tags.append(retrieval_system_tag)

for prepend_info in ["target_company", "target_title"]:
    retrieval_system_tag = parse_retrieval_system_tag("multi_fields", filter_name, prepend_info, k1, b)
    retrieval_system_tags.append(retrieval_system_tag)

prepended_info_exp = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results)

In [None]:

prepended_info_exp.compare_with_baseline()
prepended_info_exp.comparison_df.to_csv("prepended_info_exp.csv", index=False)

Target docid: 1045810
Target sector: Information Technology
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 3
    tag: company_name-year2018_2022-target_company-k1_0.9-b_0.4
        number of ciks: 2
        number of sectors: 2
        number of relevant docids: 7
        number of overlapped docids: 9
        docids unique to base: {'20191028_10-K_1618921_part1_item1_para9'}
        docids unique to comp: {'20210226_10-K_1045810_part2_item7_para71'}
    tag: title-year2018_2022-target_title-k1_0.9-b_0.4
        number of ciks: 2
        number of sectors: 2
        number of relevant docids: 7
        number of overlapped docids: 9
        docids unique to base: {'20191028_10-K_1618921_part1_item1_para9'}
        docids unique to comp: {'20210226_10-K_1045810_part2_item7_para71'}
    tag: multi_fields-year2018_2022-target_company-k1_0.9-b_0.4
        number of ciks: 3
        number of sectors: 2

In [None]:
k1 = 0.9
b = 0.4
filter_name = "year2018_2022"
retrieval_system_tags = []

retrieval_sys = {
    "company_name": "target_company",
    "title": "target_title",
}

for format_type, prepend_info in retrieval_sys.items():
    retrieval_system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
    retrieval_system_tags.append(retrieval_system_tag)

for prepend_info in ["target_company", "target_title"]:
    retrieval_system_tag = parse_retrieval_system_tag("multi_fields", filter_name, prepend_info, k1, b)
    retrieval_system_tags.append(retrieval_system_tag)

prepended_info_exp = BM25ExperimentConfig(target_paras, filter_name, retrieval_system_tags, df_trec_results)

In [None]:
for target_docid in prepended_info_exp.target_docids:
    target_cik = docid_to_cik(target_docid)
    target_sector = cik_to_sector(target_cik)
    print(f"Target docid: {target_cik}")
    print(f"Target sector: {target_sector}")

    base_system_tag = prepended_info_exp.baseline[target_docid].retrieval_system_tag
    base_retrieved_docids = prepended_info_exp.baseline[target_docid].retrieved_docids
    base_ciks = set([docid_to_cik(docid) for docid in base_retrieved_docids])
    base_sectors = set([cik_to_sector(cik) for cik in base_ciks])
    base_relevant_docids = filter_docids_by_cik(base_retrieved_docids, target_cik)

    print(f"    tag: {base_system_tag}")
    print(f"        number of relevant docids: {len(base_relevant_docids)}")
    print(f"        number of ciks: {len(base_ciks)}")
    print(f"        number of sectors: {len(base_sectors)}")

    for retrieval_system_tag in prepended_info_exp.retrieval_system_tags:
        comp_retrieved_docids = prepended_info_exp.get_retrieved_docids(target_docid, retrieval_system_tag)
        comp_ciks = set([docid_to_cik(docid) for docid in comp_retrieved_docids])
        comp_sectors = set([cik_to_sector(cik) for cik in comp_ciks])
        comp_relevant_docids = filter_docids_by_cik(comp_retrieved_docids, target_cik)

        overlapped_docids = calculate_overlap(base_retrieved_docids, comp_retrieved_docids)
        docids_unique_to_base = calculate_difference(base_retrieved_docids, comp_retrieved_docids)
        docids_unique_to_comp = calculate_difference(comp_retrieved_docids, base_retrieved_docids)

        new_row = prepended_info_exp.setup_new_row(target_docid, retrieval_system_tag, comp_retrieved_docids, len(overlapped_docids))
        prepended_info_exp.add_comparison_result(new_row)

        print(f"    tag: {retrieval_system_tag}")
        print(f"        number of ciks: {len(comp_ciks)}")
        print(f"        number of sectors: {len(comp_sectors)}")
        print(f"        number of relevant docids: {len(comp_relevant_docids)}")
        print(f"        number of overlapped docids: {len(overlapped_docids)}")
        print(f"        docids unique to base:", docids_unique_to_base)
        print(f"        docids unique to comp:", docids_unique_to_comp)

prepended_info_exp.comparison_df.to_csv("prepended_info_exp.csv", index=False)

Target docid: 1045810
Target sector: Information Technology
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 3
    tag: company_name-year2018_2022-target_company-k1_0.9-b_0.4
        number of ciks: 2
        number of sectors: 2
        number of relevant docids: 7
        number of overlapped docids: 9
        docids unique to base: {'20191028_10-K_1618921_part1_item1_para9'}
        docids unique to comp: {'20210226_10-K_1045810_part2_item7_para71'}
    tag: title-year2018_2022-target_title-k1_0.9-b_0.4
        number of ciks: 2
        number of sectors: 2
        number of relevant docids: 7
        number of overlapped docids: 9
        docids unique to base: {'20191028_10-K_1618921_part1_item1_para9'}
        docids unique to comp: {'20210226_10-K_1045810_part2_item7_para71'}
    tag: multi_fields-year2018_2022-target_company-k1_0.9-b_0.4
        number of ciks: 3
        number of sectors: 2

## Test the effect of prepended information

fix `k1` = 0.9 `b` = 0.4

In [None]:
k1 = 0.9
b = 0.4
filter_name = "year2018_2022"
retrieval_sys = {
    "company_name": "target_company",
    "title": "target_title",
}

comparison_df = pd.DataFrame(columns=[
    "target_cik", 
    "target_id", 
    "system_tag",
    "k", 
    "k1", 
    "b",  
    "number_of_overlap",
    "number_of_docids_from_irrelevant_cik",
    "number_of_unique_irrelevant_ciks",
    "number_of_unique_irrelevant_sectors",
])

for target_docid in target_paras:
    target_cik = docid_to_cik(target_docid)
    target_sector = cik_to_sector(target_cik)
    print(f"Para ID: {target_docid}")
    print(f"Target Sector: {target_sector}")
    basic_system_tag = parse_retrieval_system_tag("basic", filter_name, "null", k1, b)
    basic_results = df_results[(df_results["target_id"] == target_docid) & (df_results["tag"] == basic_system_tag)]["reference_id"].tolist()
    basic_ciks = [docid_to_cik(docid) for docid in basic_results]
    basic_sectors = [cik_to_sector(cik) for cik in basic_ciks]
    basic_docids_from_irrelevant_cik = paras_from_irrelevant_cik(basic_results, target_cik)
    print(f"    tag: {basic_system_tag}")
    print(f"        number of docids from irrelevant CIKs: {len(basic_docids_from_irrelevant_cik)}")   
    print(f"        number of unique irrelevant CIKs: {len(get_irrelevant_ciks(basic_ciks, target_cik))}")
    print(f"        number of unique irrelevant sectors: {len(get_irrelevant_sectors(basic_sectors, target_sector))}")

    new_row = {
        "target_cik": target_cik,
        "target_id": target_docid,
        "system_tag": basic_system_tag,
        "k": len(basic_results),
        "k1": k1,
        "b": b,
        "number_of_docids_from_irrelevant_cik": len(basic_docids_from_irrelevant_cik),
        "number_of_unique_irrelevant_ciks": len(get_irrelevant_ciks(basic_ciks, target_cik)),
        "number_of_unique_irrelevant_sectors": len(get_irrelevant_sectors(basic_sectors, target_sector))
    }
    comparison_df.loc[len(comparison_df)] = new_row

    for format_type, prepend_info in retrieval_sys.items():
        system_tag = parse_retrieval_system_tag(format_type, filter_name, prepend_info, k1, b)
        comparison_results = df_results[(df_results["target_id"] == target_docid) & (df_results["tag"] == system_tag)]["reference_id"].tolist()
        comparison_ciks = [docid_to_cik(docid) for docid in comparison_results]
        comparison_sectors = [cik_to_sector(cik) for cik in comparison_ciks]


        overlap_docids = calculate_overlap(basic_results, comparison_results)
        docids_unique_to_basic = calculate_difference(basic_results, comparison_results)
        docids_unique_to_comparison = calculate_difference(comparison_results, basic_results)
        
        comparison_docids_from_irrelevant_cik = paras_from_irrelevant_cik(comparison_results, target_cik)
        irrelevant_ciks = get_irrelevant_ciks(comparison_ciks, target_cik)
        irrelevant_sectors = get_irrelevant_sectors(comparison_sectors, target_sector)

        print(f"    tag: {system_tag}")
        print(f"        number of overlap: {len(overlap_docids)}")
        print(f"        basic unique docids:", docids_unique_to_basic)
        print(f"        comparison unique docids:", docids_unique_to_comparison)
        print(f"        number of docids from irrelevant CIKs: {len(comparison_docids_from_irrelevant_cik)}")
        print(f"        number of unique irrelevant CIKs: {len(irrelevant_ciks)}")
        print(f"        number of unique irrelevant sectors: {len(irrelevant_sectors)}")

        new_row = {
            "target_cik": target_cik,
            "target_id": target_docid,
            "system_tag": system_tag,
            "k": len(comparison_results),
            "k1": k1,
            "b": b,
            "number_of_overlap": len(overlap_docids),
            "number_of_docids_from_irrelevant_cik": len(comparison_docids_from_irrelevant_cik),
            "number_of_unique_irrelevant_ciks": len(irrelevant_ciks),
            "number_of_unique_irrelevant_sectors": len(irrelevant_sectors)
        }
        comparison_df.loc[len(comparison_df)] = new_row

    # analysis for mult-fields format type
    for prepend_info in ["null", "target_company", "target_title"]:
        system_tag = parse_retrieval_system_tag("multi_fields", filter_name, prepend_info, k1, b)
        comparison_results = df_results[(df_results["target_id"] == target_docid) & (df_results["tag"] == system_tag)]["reference_id"].tolist()
        comparison_ciks = [docid_to_cik(docid) for docid in comparison_results]
        comparison_sectors = [cik_to_sector(cik) for cik in comparison_ciks]

        overlap_docids = calculate_overlap(basic_results, comparison_results)
        docids_unique_to_basic = calculate_difference(basic_results, comparison_results)
        docids_unique_to_comparison = calculate_difference(comparison_results, basic_results)

        comparison_docids_from_irrelevant_cik = paras_from_irrelevant_cik(comparison_results, target_cik)
        irrelevant_ciks = get_irrelevant_ciks(comparison_ciks, target_cik)
        irrelevant_sectors = get_irrelevant_sectors(comparison_sectors, target_sector)
        
        print(f"    tag: {system_tag}")
        print(f"        number of overlap: {len(overlap_docids)}")
        print(f"        basic unique docids:", docids_unique_to_basic)
        print(f"        comparison unique docids:", docids_unique_to_comparison)
        print(f"        number of docids from irrelevant CIKs: {len(comparison_docids_from_irrelevant_cik)}")
        print(f"        number of unique irrelevant CIKs: {len(irrelevant_ciks)}")
        print(f"        number of unique irrelevant sectors: {len(irrelevant_sectors)}")

        new_row = {
            "target_cik": target_cik,
            "target_id": target_docid,
            "system_tag": system_tag,
            "k": len(comparison_results),
            "k1": k1,
            "b": b,
            "number_of_overlap": len(overlap_docids),
            "number_of_docids_from_irrelevant_cik": len(comparison_docids_from_irrelevant_cik),
            "number_of_unique_irrelevant_ciks": len(irrelevant_ciks),
            "number_of_unique_irrelevant_sectors": len(irrelevant_sectors)
        }
        comparison_df.loc[len(comparison_df)] = new_row

Para ID: 20220318_10-K_1045810_part2_item7_para5
Target Sector: Information Technology
    tag: basic-year2018_2022-null-k1_0.9-b_0.4
        number of docids from irrelevant CIKs: 4
        number of unique irrelevant CIKs: 2
        number of unique irrelevant sectors: 2
    tag: company_name-year2018_2022-target_company-k1_0.9-b_0.4
        number of overlap: 9
        basic unique docids: {'20191028_10-K_1618921_part1_item1_para9'}
        comparison unique docids: {'20210226_10-K_1045810_part2_item7_para71'}
        number of docids from irrelevant CIKs: 3
        number of unique irrelevant CIKs: 1
        number of unique irrelevant sectors: 1
    tag: title-year2018_2022-target_title-k1_0.9-b_0.4
        number of overlap: 9
        basic unique docids: {'20191028_10-K_1618921_part1_item1_para9'}
        comparison unique docids: {'20210226_10-K_1045810_part2_item7_para71'}
        number of docids from irrelevant CIKs: 3
        number of unique irrelevant CIKs: 1
        numb

In [None]:
comparison_df.to_csv("comparison_results.csv", index=False)