In [184]:
# Load the saved records file
with open('./reports/retrieval_analysis.json', 'r') as f:
    records = json.load(f)

In [7]:
import json
from collections import Counter
# ----------------------------------
# config
# ----------------------------------
discourse_styles  = ["unispeaker", "multispeaker"]
reasoning_types   = ["arithmetic", "temporal", "wknow"]
retrieval_types   = ['bm25']          # keep if you also want BM25 vs ReasonIR                       # how many "high" and "low"
dataset_repo      = "zeinabTaghavi/ImpliRet"      # HF dataset
results_dir       = "./results"                   # retrieval JSONL folder

def overlap_count(query, pos_doc): 
    count = 0
    for word in query.split():
        for word_pd in pos_doc.split():
            if word in word_pd:
                count += 1
    return count


from datasets import load_dataset


# ----------------------------------
# main loop
# ----------------------------------
import json, pandas as pd
from datasets import load_dataset
from pathlib  import Path

# records = {}
window = 50
n = 20
for discourse in discourse_styles:
    for reasoning in reasoning_types:
        ds = load_dataset(dataset_repo, name=discourse, split=reasoning)
        

        for retrieval in retrieval_types:
            print(f"Processing {discourse} {reasoning} {retrieval}")
            records[f"{discourse}_{reasoning}_{retrieval}"] = {}
            # ---------- locate the retrieval file ----------
            # pattern used in your earlier example:
            #   arithmetic_multispeaker_bm25_index.jsonl
            run_file = Path(results_dir) / f"{reasoning}_{discourse}_{retrieval}_index.jsonl"
            print(run_file)
            if not run_file.exists():
                print(f"⚠️  Missing run file → {run_file}")
                continue

            # ---------- read the retrieval scores ----------
            top_w = []
            top_w_score = []
            low_w = []
            low_w_score = []
            i_average_score = {}
            with run_file.open() as fh:
                run_lines = []
                for line in fh:
                    new_line = json.loads(line)['index_score_tuple_list']
                    run_lines.append(new_line)
                    new_line.sort(key=lambda x: x[1], reverse=True)
                    top_w.extend([new_line[num][0] for num in range(window)])
                    top_w_score.extend([new_line[num][1] for num in range(window)])
                    low_w.extend([new_line[-(num+1)][0] for num in range(window)])
                    low_w_score.extend([new_line[-(num+1)][1] for num in range(window)])
                    
                    for idx, i in enumerate(new_line):
                        if i[0] in i_average_score.keys():
                            if idx> i_average_score[i[0]]['max']:
                                i_average_score[i[0]]['max'] =  idx
                            if  idx < i_average_score[i[0]]['min']:
                                i_average_score[i[0]]['min'] =  idx
                            i_average_score[i[0]]['sum'] +=  idx
                            i_average_score[i[0]]['count'] += 1
                            i_average_score[i[0]]['dist'] = i_average_score[i[0]]['max'] - i_average_score[i[0]]['min']
                        else:
                            i_average_score[i[0]] = {'max': idx, 'min': idx, 'sum': idx, 'count': 1, 'dist': 0}

            i_average_dist = sum([i_average_score[i]['dist'] for i in i_average_score.keys()])/len(i_average_score.keys())
            records[f"{discourse}_{reasoning}_{retrieval}"]['i_average_dist'] = i_average_dist
            # Get frequency count of numbers in top_10
            frequency = Counter(top_w)
            # Get most and least common numbers and their counts
            most_common_high = frequency.most_common()[:n]
            agv_repeat_high = sum([i[1] for i in most_common_high])/len(most_common_high)
            records[f"{discourse}_{reasoning}_{retrieval}"]['agv_repeat_high'] = agv_repeat_high
            frequency = Counter(low_w)  
            most_common_low = frequency.most_common()[:n]
            agv_repeat_low = sum([i[1] for i in most_common_low])/len(most_common_low)
            records[f"{discourse}_{reasoning}_{retrieval}"]['agv_repeat_low'] = agv_repeat_low


            # word overlap between query and top_n
            
            high_score_overlap = 0
            low_score_overlap = 0
            for i in range(n):
                for j in range(len(ds)):
                    high_score_overlap += overlap_count(ds[j]['question'], ds[most_common_high[i][0]]['pos_document'])
                    low_score_overlap += overlap_count(ds[j]['question'], ds[most_common_low[i][0]]['pos_document'])
            high_score_overlap = high_score_overlap/(n*len(ds))
            low_score_overlap = low_score_overlap/(n*len(ds))

            records[f"{discourse}_{reasoning}_{retrieval}"]['high_score_overlap'] = high_score_overlap
            records[f"{discourse}_{reasoning}_{retrieval}"]['low_score_overlap'] = low_score_overlap
         

        # Save records to reports directory
        import json
        import os

        # Create reports directory if it doesn't exist
        os.makedirs('./reports', exist_ok=True)

        # Save records as JSON file
        output_path = os.path.join('./reports', 'retrieval_analysis.json')
        with open(output_path, 'w') as f:
            json.dump(records, f, indent=4)

        print(f'Saved analysis results to {output_path}')

Using the latest cached version of the dataset since zeinabTaghavi/ImpliRet couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'unispeaker' at /mounts/Users/cisintern/zeinabtaghavi/.cache/huggingface/datasets/zeinabTaghavi___impli_ret/unispeaker/0.0.0/4396c7e7cc73d91ce1eaa0cc2f6001f847c4c8ad (last modified on Tue Jun 24 18:26:19 2025).


Processing unispeaker arithmetic bm25
results/arithmetic_unispeaker_bm25_index.jsonl
Saved analysis results to ./reports/retrieval_analysis.json


Using the latest cached version of the dataset since zeinabTaghavi/ImpliRet couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'unispeaker' at /mounts/Users/cisintern/zeinabtaghavi/.cache/huggingface/datasets/zeinabTaghavi___impli_ret/unispeaker/0.0.0/4396c7e7cc73d91ce1eaa0cc2f6001f847c4c8ad (last modified on Tue Jun 24 18:26:19 2025).


Processing unispeaker temporal bm25
results/temporal_unispeaker_bm25_index.jsonl
Saved analysis results to ./reports/retrieval_analysis.json


Using the latest cached version of the dataset since zeinabTaghavi/ImpliRet couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'unispeaker' at /mounts/Users/cisintern/zeinabtaghavi/.cache/huggingface/datasets/zeinabTaghavi___impli_ret/unispeaker/0.0.0/4396c7e7cc73d91ce1eaa0cc2f6001f847c4c8ad (last modified on Tue Jun 24 18:26:19 2025).


Processing unispeaker wknow bm25
results/wknow_unispeaker_bm25_index.jsonl
Saved analysis results to ./reports/retrieval_analysis.json


Using the latest cached version of the dataset since zeinabTaghavi/ImpliRet couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'multispeaker' at /mounts/Users/cisintern/zeinabtaghavi/.cache/huggingface/datasets/zeinabTaghavi___impli_ret/multispeaker/0.0.0/84345357f3de42273eb43cd28fa83b143f8f9c0d (last modified on Wed Jun 25 11:29:34 2025).


Processing multispeaker arithmetic bm25
results/arithmetic_multispeaker_bm25_index.jsonl
Saved analysis results to ./reports/retrieval_analysis.json


Using the latest cached version of the dataset since zeinabTaghavi/ImpliRet couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'multispeaker' at /mounts/Users/cisintern/zeinabtaghavi/.cache/huggingface/datasets/zeinabTaghavi___impli_ret/multispeaker/0.0.0/84345357f3de42273eb43cd28fa83b143f8f9c0d (last modified on Wed Jun 25 11:29:34 2025).


Processing multispeaker temporal bm25
results/temporal_multispeaker_bm25_index.jsonl
Saved analysis results to ./reports/retrieval_analysis.json


Using the latest cached version of the dataset since zeinabTaghavi/ImpliRet couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'multispeaker' at /mounts/Users/cisintern/zeinabtaghavi/.cache/huggingface/datasets/zeinabTaghavi___impli_ret/multispeaker/0.0.0/84345357f3de42273eb43cd28fa83b143f8f9c0d (last modified on Wed Jun 25 11:29:34 2025).


Processing multispeaker wknow bm25
results/wknow_multispeaker_bm25_index.jsonl
Saved analysis results to ./reports/retrieval_analysis.json


In [178]:
# Save records to reports directory
import json
import os

# Create reports directory if it doesn't exist
os.makedirs('./reports', exist_ok=True)

# Save records as JSON file
output_path = os.path.join('./reports', 'retrieval_analysis.json')
with open(output_path, 'w') as f:
    json.dump(records, f, indent=4)

print(f'Saved analysis results to {output_path}')

Saved analysis results to ./reports/retrieval_analysis.json


In [183]:
# Load and examine temporal_multispeaker_hipporag results
with open('Retrieval/results/temporal_multispeaker_hipporag_index.jsonl', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    print(len(lines))


FileNotFoundError: [Errno 2] No such file or directory: 'Retrieval/results/temporal_multispeaker_hipporag_index.jsonl'

In [None]:

    # "unispeaker_arithmetic_hipporag": {
    #     "i_average_dist": 0.8630152666666666,
    #     "agv_repeat_high": 383.35,
    #     "agv_repeat_low": 726.1,
    #     "high_score_overlap": 8.054,
    #     "low_score_overlap": 7.031
    # }

    # "multispeaker_arithmetic_hipporag": {
    #     "i_average_dist": 0.8242842666666667,
    #     "agv_repeat_high": 450.65,
    #     "agv_repeat_low": 861.0,
    #     "high_score_overlap": 39.855,
    #     "low_score_overlap": 36.705
    # },