In [1]:
import sys
sys.path.append("/home/ybtu/FinNLP")

import pandas as pd
import numpy as np
from utils import retrieve_paragraph_from_docid, get_10K_file_name, parse_trec_file, trec_file_to_csv
from utils import docid_to_cik, cik_to_sector

In [2]:
results = parse_trec_file("retrieval_results_trec/retrieval_results_full.txt")
trec_file_to_csv("retrieval_results_trec/retrieval_results_full.txt", "retrieval_results_full.csv")

target_paras = list(results.keys())
df_trec_results = pd.read_csv("retrieval_results_full.csv")
print(df_trec_results.columns)
print(target_paras)

Index(['target_id', 'Q0', 'reference_id', 'rank', 'score', 'tag'], dtype='object')
['20220318_10-K_1045810_part2_item7_para5', '20220222_10-K_1090727_part2_item7_para5', '20220216_10-K_1585689_part2_item7_para74', '20220217_10-K_200406_part2_item7_para42', '20221028_10-K_320193_part2_item7_para7']


In [3]:
def parse_retrieval_system_tag(d_encoder, format_type, filter_name, prepend_info):
    # dpr-ctx_encoder-multiset-base-basic-year2018_2022-null
    d_encoder_name = d_encoder.split("/")[-1]
    return f"{d_encoder_name}-{format_type}-{filter_name}-{prepend_info}"

def calculate_overlap(list1, list2):
    return set(list1).intersection(set(list2))

def calculate_difference(list1, list2):
    return set(list1).difference(set(list2))

def filter_docids_by_cik(docids, cik):
    return [docid for docid in docids if docid_to_cik(docid) == cik]

def get_irrelevant_ciks(cik_lists, target_cik):
    return set([cik for cik in cik_lists if cik != target_cik])

def get_irrelevant_sectors(sector_lists, target_sector):
    return set([sector for sector in sector_lists if sector != target_sector])

def get_format_type_from_system_tag(system_tag):
    # {d_encoder}-{format_type}-{filter_name}-{prepend_info}
    return system_tag.split("-")[-3]

def get_prepend_info_from_system_tag(system_tag):
    # {d_encoder}-{format_type}-{filter_name}-{prepend_info}
    return system_tag.split("-")[-1]

def get_filter_name_from_system_tag(system_tag):
    # {d_encoder}-{format_type}-{filter_name}-{prepend_info}
    return system_tag.split("-")[-2]

def get_d_encoder_from_system_tag(system_tag):
    # {d_encoder}-{format_type}-{filter_name}-{prepend_info}
    return "-".join(system_tag.split("-")[:-3])

class RetrievalResult:
    def __init__(self, target_docid, retrieval_system_tag, retrieved_docids):
        self.target_docid = target_docid
        self.retrieval_system_tag = retrieval_system_tag
        self.retrieved_docids = retrieved_docids

class DenseExperimentConfig:
    COLUMNS = [
        "target_cik", 
        "target_id", 
        "system_tag",
        "d_encoder", 
        "format_type", 
        "prepend_info",
        "k", 
        "ciks_cnt",
        "irrelevant_ciks_cnt", 
        "sectors_cnt", 
        "irrelevant_sectors_cnt",
        "relevant_docids_cnt",
        "overlapped_docids_cnt",
        "comparison_tag"
    ]

    def __init__(self, target_docids, filter_name, retrieval_system_tags, df_trec_results, 
                 baseline_d_encoder, baseline_format_type, baseline_prepend_info):
        self.target_docids = target_docids
        self.target_ciks = set([docid_to_cik(docid) for docid in target_docids])
        self.target_sectors = set([cik_to_sector(cik) for cik in self.target_ciks])
        self.filter_name = filter_name
        self.retrieval_system_tags = retrieval_system_tags
        self.df_trec_results = df_trec_results

        self.baseline = self.init_baseline(baseline_d_encoder, baseline_format_type, baseline_prepend_info)
        self.comparison_df = self.init_comparison_df(DenseExperimentConfig.COLUMNS)

    def init_baseline(self, baseline_d_encoder, baseline_format_type, baseline_prepend_info):
        baseline = {}
        retrieval_system_tag = parse_retrieval_system_tag(baseline_d_encoder, baseline_format_type, self.filter_name, baseline_prepend_info)
        for target_docid in self.target_docids:
            retrieved_docids = self.df_trec_results[(self.df_trec_results["target_id"] == target_docid) & (self.df_trec_results["tag"] == retrieval_system_tag)]["reference_id"].tolist()
            baseline[target_docid] = RetrievalResult(target_docid, retrieval_system_tag, retrieved_docids)

        return baseline
    
    def init_comparison_df(self, columns):
        comparison_df = pd.DataFrame(columns=columns)

        for target_docid in self.target_docids:
            target_cik = docid_to_cik(target_docid)
            retrieved_docids = self.baseline[target_docid].retrieved_docids
            retrieval_system_tag = self.baseline[target_docid].retrieval_system_tag
            retrieved_ciks = set([docid_to_cik(docid) for docid in retrieved_docids])
            retrieved_sectors = set([cik_to_sector(cik) for cik in retrieved_ciks])

            irrelevant_ciks = get_irrelevant_ciks(retrieved_ciks, target_cik)
            irrelevant_sectors = get_irrelevant_sectors(retrieved_sectors, cik_to_sector(target_cik))

            # here, relevant docids are the ones that have the same cik as the target cik
            relevant_docids = filter_docids_by_cik(retrieved_docids, target_cik)
            irrelevant_docids = calculate_difference(retrieved_docids, relevant_docids)

            new_row = {
                "target_cik": target_cik,
                "target_id": target_docid,
                "system_tag": retrieval_system_tag,
                "d_encoder": get_d_encoder_from_system_tag(retrieval_system_tag),
                "format_type": get_format_type_from_system_tag(retrieval_system_tag),
                "prepend_info": get_prepend_info_from_system_tag(retrieval_system_tag),
                "k": len(retrieved_docids),
                "ciks_cnt": len(retrieved_ciks),
                "irrelevant_ciks_cnt": len(irrelevant_ciks),
                "sectors_cnt": len(retrieved_sectors),
                "irrelevant_sectors_cnt": len(irrelevant_sectors),
                "relevant_docids_cnt": len(relevant_docids),
            }
            comparison_df.loc[len(comparison_df)] = new_row

        return comparison_df
    
    def get_retrieved_docids(self, target_docid, retrieval_system_tag):
        return self.df_trec_results[(self.df_trec_results["target_id"] == target_docid) & (self.df_trec_results["tag"] == retrieval_system_tag)]["reference_id"].tolist()

    def add_comparison_result(self, new_row):
        self.comparison_df.loc[len(self.comparison_df)] = new_row

    @staticmethod
    def setup_new_row(target_docid, base_system_tag, retrieval_system_tag, retrieved_docids, overlapped_docids_cnt):
        target_cik = docid_to_cik(target_docid)
        retrieved_ciks = set([docid_to_cik(docid) for docid in retrieved_docids])
        retrieved_sectors = set([cik_to_sector(cik) for cik in retrieved_ciks])
        relevant_docids = filter_docids_by_cik(retrieved_docids, target_cik)

        irrelevant_ciks = get_irrelevant_ciks(retrieved_ciks, target_cik)
        irrelevant_sectors = get_irrelevant_sectors(retrieved_sectors, cik_to_sector(target_cik))

        new_row = {
            "target_cik": docid_to_cik(target_docid),
            "target_id": target_docid,
            "system_tag": retrieval_system_tag,
            "d_encoder": get_d_encoder_from_system_tag(retrieval_system_tag),
            "format_type": get_format_type_from_system_tag(retrieval_system_tag),
            "prepend_info": get_prepend_info_from_system_tag(retrieval_system_tag),
            "k": len(retrieved_docids),
            "ciks_cnt": len(retrieved_ciks),
            "irrelevant_ciks_cnt": len(irrelevant_ciks),
            "sectors_cnt": len(retrieved_sectors),
            "irrelevant_sectors_cnt": len(irrelevant_sectors),
            "relevant_docids_cnt": len(relevant_docids),
            "overlapped_docids_cnt": overlapped_docids_cnt,
            "comparison_tag": base_system_tag
        }
        
        return new_row

    def compare_with_baseline(self):
        for target_docid in self.target_docids:
            target_cik = docid_to_cik(target_docid)
            target_sector = cik_to_sector(target_cik)
            print(f"Target docid: {target_cik}")
            print(f"Target sector: {target_sector}")

            base_system_tag = self.baseline[target_docid].retrieval_system_tag
            base_retrieved_docids = self.baseline[target_docid].retrieved_docids
            base_ciks = set([docid_to_cik(docid) for docid in base_retrieved_docids])
            base_sectors = set([cik_to_sector(cik) for cik in base_ciks])
            base_relevant_docids = filter_docids_by_cik(base_retrieved_docids, target_cik)

            print(f"    tag: {base_system_tag}")
            print(f"        number of relevant docids: {len(base_relevant_docids)}")
            print(f"        number of ciks: {len(base_ciks)}")
            print(f"        number of sectors: {len(base_sectors)}")

            for retrieval_system_tag in self.retrieval_system_tags:
                if retrieval_system_tag == base_system_tag:
                    continue

                comp_retrieved_docids = self.get_retrieved_docids(target_docid, retrieval_system_tag)
                comp_ciks = set([docid_to_cik(docid) for docid in comp_retrieved_docids])
                comp_sectors = set([cik_to_sector(cik) for cik in comp_ciks])
                comp_relevant_docids = filter_docids_by_cik(comp_retrieved_docids, target_cik)

                overlapped_docids = calculate_overlap(base_retrieved_docids, comp_retrieved_docids)
                docids_unique_to_base = calculate_difference(base_retrieved_docids, comp_retrieved_docids)
                docids_unique_to_comp = calculate_difference(comp_retrieved_docids, base_retrieved_docids)

                new_row = self.setup_new_row(target_docid, base_system_tag, retrieval_system_tag, comp_retrieved_docids, len(overlapped_docids))
                self.add_comparison_result(new_row)

                print(f"    tag: {retrieval_system_tag}")
                print(f"        number of ciks: {len(comp_ciks)}")
                print(f"        number of sectors: {len(comp_sectors)}")
                print(f"        number of relevant docids: {len(comp_relevant_docids)}")
                print(f"        number of overlapped docids: {len(overlapped_docids)}")
                print(f"        docids unique to base:", docids_unique_to_base)
                print(f"        docids unique to comp:", docids_unique_to_comp)
    
    @staticmethod
    def get_comparison_df_columns():
        return DenseExperimentConfig.COLUMNS
        

In [4]:
def concat_comparison_dfs(exp_configs):
    full_comparison_df = pd.DataFrame(columns=DenseExperimentConfig.get_comparison_df_columns())
    for exp_name, exp_config in exp_configs.items():
        print("Experiemnt Name:", exp_name)
        comparison_df = exp_config.comparison_df
        full_comparison_df = pd.concat([full_comparison_df, comparison_df])

    return full_comparison_df.drop_duplicates()

# Analyze prepend info

* fixed encoder = dpr

In [5]:
filter_name = "year2018_2022"
d_encoders = [
    "facebook/dpr-ctx_encoder-multiset-base", 
    "sentence-transformers/all-mpnet-base-v2", 
    "/home/ybtu/FinNLP/training/model/sentenceBert"
]
retrieval_sys = {
    "company_name": "target_company", 
    "title": "target_title", 
    "ner_concat_with_company_name": "target_company", 
    "ner_concat_with_title": "target_title",
}

exp_configs = {}
for d_encoder in d_encoders:
    retrieval_system_tags = []

    for format_type, prepend_info in retrieval_sys.items():
        retrieval_system_tags.append(parse_retrieval_system_tag(d_encoder, format_type, filter_name, prepend_info))

    exp_configs[d_encoder] = DenseExperimentConfig(
        target_docids=target_paras,
        filter_name=filter_name,
        retrieval_system_tags=retrieval_system_tags,
        df_trec_results=df_trec_results,
        baseline_d_encoder=d_encoder,
        baseline_format_type="basic",
        baseline_prepend_info="null"
    )

    exp_configs[d_encoder].compare_with_baseline()

Target docid: 1045810
Target sector: Information Technology
    tag: dpr-ctx_encoder-multiset-base-basic-year2018_2022-null
        number of relevant docids: 3
        number of ciks: 4
        number of sectors: 3
    tag: dpr-ctx_encoder-multiset-base-company_name-year2018_2022-target_company
        number of ciks: 3
        number of sectors: 2
        number of relevant docids: 6
        number of overlapped docids: 2
        docids unique to base: {'20190214_10-K_1555280_part2_item8_para129', '20220126_10-K_87347_part2_item8_para70', '20210226_10-K_1045810_part1_item1_para11', '20200122_10-K_87347_part2_item8_para40', '20210127_10-K_87347_part2_item8_para84', '20220520_10-K_849399_part1_item1a_para31', '20210216_10-K_1555280_part2_item8_para120', '20200213_10-K_1555280_part2_item8_para124'}
        docids unique to comp: {'20210226_10-K_1045810_part2_item7_para21', '20220228_10-K_874761_part2_item8_para181', '20210226_10-K_1045810_part1_item1a_para31', '20210222_10-K_1002910_par

In [6]:
full_comparison_df = concat_comparison_dfs(exp_configs)
full_comparison_df.to_csv("dense_exp.csv", index=False)

Experiemnt Name: facebook/dpr-ctx_encoder-multiset-base
Experiemnt Name: sentence-transformers/all-mpnet-base-v2
Experiemnt Name: /home/ybtu/FinNLP/training/model/sentenceBert


  full_comparison_df = pd.concat([full_comparison_df, comparison_df])


In [5]:
filter_name = "year2018_2022"
d_encoders = [
    "facebook/dpr-ctx_encoder-multiset-base", 
    "sentence-transformers/all-mpnet-base-v2"
]
retrieval_sys = {
    "company_name": "target_company", 
    "ner_concat_with_company_name": "target_company", 
}

exp_configs = {}
for d_encoder in d_encoders:
    retrieval_system_tags = []

    for format_type, prepend_info in retrieval_sys.items():
        retrieval_system_tags.append(parse_retrieval_system_tag(d_encoder, format_type, filter_name, prepend_info))

    exp_configs[d_encoder] = DenseExperimentConfig(
        target_docids=target_paras,
        filter_name=filter_name,
        retrieval_system_tags=retrieval_system_tags,
        df_trec_results=df_trec_results,
        baseline_d_encoder=d_encoder,
        baseline_format_type="company_name",
        baseline_prepend_info="target_company"
    )

    exp_configs[d_encoder].compare_with_baseline()

Target docid: 1045810
Target sector: Information Technology
    tag: dpr-ctx_encoder-multiset-base-company_name-year2018_2022-target_company
        number of relevant docids: 6
        number of ciks: 3
        number of sectors: 2
    tag: dpr-ctx_encoder-multiset-base-ner_concat_with_company_name-year2018_2022-target_company
        number of ciks: 4
        number of sectors: 3
        number of relevant docids: 7
        number of overlapped docids: 4
        docids unique to base: {'20200228_10-K_1002910_part2_item8_para356', '20220223_10-K_1002910_part2_item8_para381', '20210222_10-K_1002910_part2_item8_para375', '20220318_10-K_1045810_part1_item1_para51', '20210226_10-K_1045810_part2_item7_para21', '20220228_10-K_874761_part2_item8_para181'}
        docids unique to comp: {'20210226_10-K_1045810_part2_item7_para8', '20181221_10-K_1730168_part2_item7_para110', '20220223_10-K_46080_part2_item5_para4', '20200122_10-K_87347_part2_item8_para40', '20210226_10-K_1045810_part2_item7_pa

In [8]:
filter_name = "year2018_2022"
d_encoders = [
    "sentence-transformers/all-mpnet-base-v2", 
]
retrieval_sys = {
    "ner_concat_with_title": "target_title", 
}

exp_configs = {}
for d_encoder in d_encoders:
    retrieval_system_tags = []

    for format_type, prepend_info in retrieval_sys.items():
        retrieval_system_tags.append(parse_retrieval_system_tag(d_encoder, format_type, filter_name, prepend_info))

    exp_configs[d_encoder] = DenseExperimentConfig(
        target_docids=target_paras,
        filter_name=filter_name,
        retrieval_system_tags=retrieval_system_tags,
        df_trec_results=df_trec_results,
        baseline_d_encoder=d_encoder,
        baseline_format_type="title",
        baseline_prepend_info="target_title"
    )

    exp_configs[d_encoder].compare_with_baseline()

Target docid: 1045810
Target sector: Information Technology
    tag: all-mpnet-base-v2-title-year2018_2022-target_title
        number of relevant docids: 10
        number of ciks: 1
        number of sectors: 1
    tag: all-mpnet-base-v2-ner_concat_with_title-year2018_2022-target_title
        number of ciks: 1
        number of sectors: 1
        number of relevant docids: 10
        number of overlapped docids: 9
        docids unique to base: {'20220318_10-K_1045810_part1_item1a_para85'}
        docids unique to comp: {'20220318_10-K_1045810_part1_item1a_para57'}
Target docid: 1090727
Target sector: Industrials
    tag: all-mpnet-base-v2-title-year2018_2022-target_title
        number of relevant docids: 10
        number of ciks: 1
        number of sectors: 1
    tag: all-mpnet-base-v2-ner_concat_with_title-year2018_2022-target_title
        number of ciks: 1
        number of sectors: 1
        number of relevant docids: 10
        number of overlapped docids: 6
        docids 