In [1]:
import sys
# NOTE(review): hardcoded absolute path to one developer's machine; presumably
# needed so the `financerag` imports in the next cell resolve. Consider a
# relative or configurable path so the notebook runs elsewhere.
sys.path.insert(0, '/Users/sajayudhay/Studies/Sphurti/Information Retrieval/Project/FinanceRAG')


In [2]:
import logging
from financerag.retrieval import BM25Retriever, BM25Processor
import financerag.tasks as tasks_module

import importlib
import inspect
import os
import json
import pandas as pd
from datasets import load_dataset

from nltk.tokenize import word_tokenize, TweetTokenizer
from rank_bm25 import BM25Okapi
import nltk

# Module-level NLTK TweetTokenizer instance built with default arguments.
tweet_tokenizer = TweetTokenizer()
# Configure the root logger at INFO level (this is why httpx request logs
# appear in the cell outputs further down).
logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [43]:
import os
import pdb
import re
import shutil
import json
from pathlib import Path
# from dotenv import load_dotenv
from langchain_openai import ChatOpenAI


def clean_text(text):
    r"""
    Collapse every run of literal ``\uXXXX`` escape sequences into one space.

    The match is on the six-character textual form (backslash, ``u``, four hex
    digits), not on decoded characters. Adjacent escapes are treated as a
    single run and therefore yield a single space.
    """
    escape_run = re.compile(r"(\\u[0-9A-Fa-f]{4})+")
    return escape_run.sub(" ", text)


def load_jsonl(file_path):
    """
    Load a JSONL file and return its content as a list of parsed records.

    Each non-blank line is stripped, passed through ``clean_text`` (which
    replaces literal ``\\uXXXX`` escape sequences with a space) and then parsed
    with ``json.loads``.

    Args:
        file_path (str | pathlib.Path): Location of the JSONL file.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Accept plain strings as well as Path objects.
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found at {file_path}")

    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            # Blank lines (e.g. padding at the end of the file) carry no
            # record; json.loads("") would raise, so skip them.
            if not line:
                continue
            records.append(json.loads(clean_text(line)))
    return records


def save_jsonl(file_path, data, ensure_ascii=True):
    """
    Write ``data`` to ``file_path`` in JSONL form: one JSON document per line.

    Missing parent directories are created first. Any existing file at
    ``file_path`` is overwritten. ``ensure_ascii`` is forwarded to
    ``json.dumps`` for every record.
    """
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as out:
        # Lazily serialise so records are written one by one, in order.
        out.writelines(
            json.dumps(record, ensure_ascii=ensure_ascii) + "\n" for record in data
        )


def load_prompt(subset, key="queries"):
    """
    Return the prompt template stored for ``subset`` in ``./prompt.json``.

    The file is read relative to the current working directory and the
    template is looked up under ``["pre_retrieval"][key][subset]``.

    Raises:
        FileNotFoundError: If ``./prompt.json`` does not exist.
        ValueError: If ``subset`` has no entry under the selected ``key``.
    """
    prompt_path = Path("./prompt.json")
    if not prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found at {prompt_path}")

    with open(prompt_path, "r", encoding="utf-8") as handle:
        templates = json.load(handle)["pre_retrieval"][key]

    if subset in templates:
        return templates[subset]
    raise ValueError(f"Prompt not found for subset '{subset}'")


def _extract_table_from_corpus(c_text, subset):
    """
    Extracts corpus tables and a sentence before each table.
    """
    lines = c_text.split("\n")
    results = []
    table_start = None

    # Iterate through lines to find all tables
    for i, line in enumerate(lines):
        if line.startswith("| "):
            if table_start is None:
                table_start = i  # Start of a new table

            # Check if this is the end of the table block
            if i + 1 == len(lines) or not lines[i + 1].startswith("| "):
                # Extract the sentence immediately before the table
                before_sentence = lines[table_start - 1] if table_start > 0 else ""
                
                # Extract the table content
                table_content = "\n".join(lines[table_start:i + 1])
                after_sentence = ""
                if i + 1 < len(lines) and not lines[i + 1].startswith("| "):
                    after_sentence = lines[i + 1]

                # Extract Table Content only
                result = f"{table_content}".strip()
                results.append(result)

                # Reset for the next table
                table_start = None

    # Return combined results if tables are found, otherwise handle based on subset
    if results:
        return "\n\n".join(results)
    else:
        # Handle cases where no table is found, based on the subset
        parts = c_text.split("\n\n")
        if subset in {"TATQA", "FinQA", "ConvFinQA"}:
            return parts[-1] if subset == "TATQA" else parts[1] if len(parts) > 1 else parts[0]
        elif subset == "MultiHiertt":
            return c_text
        else:
            raise ValueError("Error: subset does not exist.")


def expand_queries_keyword(subset, dataset_dir, llm, overwrite=False):
    """
    Expand each query of ``subset`` with LLM output generated from the
    ``"queries"`` prompt template.

    Reads ``{dataset_dir}/{subset.lower()}_queries/queries.jsonl`` and writes
    ``queries_keyword.jsonl`` next to it. Each output record keeps ``_id`` and
    ``title`` and stores the original text followed by the LLM response.

    Args:
        subset (str): Subset name; also the lookup key for the prompt template.
        dataset_dir (str): Directory containing the dataset folders.
        llm: Object whose ``invoke(prompt)`` returns a value with ``.content``.
        overwrite (bool): Regenerate even if the output file already exists.

    Raises:
        FileNotFoundError: Propagated from ``load_jsonl`` / ``load_prompt``.
        ValueError: Propagated from ``load_prompt`` for an unknown subset.
    """
    data = load_jsonl(Path(f"{dataset_dir}/{subset.lower()}_queries/queries.jsonl"))
    prompt_template = load_prompt(subset, "queries")

    save_path = Path(f"{dataset_dir}/{subset.lower()}_queries/queries_keyword.jsonl")
    # Decide before calling the LLM: otherwise every (paid) request is made
    # and the results are then discarded when the file already exists.
    if save_path.is_file() and not overwrite:
        print(f"File already exists at {save_path}, and overwrite is set to False.")
        return

    expanded_queries = []
    for item in data:
        item_text = item["text"]
        prompt = f"{prompt_template}\n\nQuery: {item_text}"
        new_text = llm.invoke(prompt).content
        expanded_queries.append(
            {
                "_id": item["_id"],
                "title": item["title"],
                "text": f"{item_text}\n\n{new_text}",
            }
        )

    save_jsonl(save_path, expanded_queries)

def expand_queries_paraphrasing(subset, dataset_dir, llm, overwrite=False):
    """
    Expand each query of ``subset`` with LLM output generated from the
    ``"queries_paraphrasing"`` prompt template.

    Reads ``{dataset_dir}/{subset.lower()}_queries/queries.jsonl`` and writes
    ``queries_para.jsonl`` next to it. Each output record keeps ``_id`` and
    ``title`` and stores the original text followed by the LLM response.

    Args:
        subset (str): Subset name; also the lookup key for the prompt template.
        dataset_dir (str): Directory containing the dataset folders.
        llm: Object whose ``invoke(prompt)`` returns a value with ``.content``.
        overwrite (bool): Regenerate even if the output file already exists.

    Raises:
        FileNotFoundError: Propagated from ``load_jsonl`` / ``load_prompt``.
        ValueError: Propagated from ``load_prompt`` for an unknown subset.
    """
    data = load_jsonl(Path(f"{dataset_dir}/{subset.lower()}_queries/queries.jsonl"))
    prompt_template = load_prompt(subset, "queries_paraphrasing")

    save_path = Path(f"{dataset_dir}/{subset.lower()}_queries/queries_para.jsonl")
    # Decide before calling the LLM: otherwise every (paid) request is made
    # and the results are then discarded when the file already exists.
    if save_path.is_file() and not overwrite:
        print(f"File already exists at {save_path}, and overwrite is set to False.")
        return

    expanded_queries = []
    for item in data:
        item_text = item["text"]
        prompt = f"{prompt_template}\n\nQuery: {item_text}"
        new_text = llm.invoke(prompt).content
        expanded_queries.append(
            {
                "_id": item["_id"],
                "title": item["title"],
                "text": f"{item_text}\n\n{new_text}",
            }
        )

    save_jsonl(save_path, expanded_queries)
    

def corpus_summarising(subset, dataset_dir, llm, overwrite=False):
    """
    Replace each corpus document of ``subset`` with LLM output generated from
    the ``"corpus_summary"`` prompt template (sequential version).

    NOTE(review): a second ``corpus_summarising`` with an extra
    ``max_workers`` parameter is defined later in this cell and shadows this
    one once the cell has run.

    Reads ``{dataset_dir}/{subset.lower()}_corpus/corpus.jsonl`` and writes
    ``corpus_summary_only.jsonl`` next to it. Each output record keeps ``_id``
    and ``title``; ``text`` holds only the LLM response.

    Args:
        subset (str): Subset name; also the lookup key for the prompt template.
        dataset_dir (str): Directory containing the dataset folders.
        llm: Object whose ``invoke(prompt)`` returns a value with ``.content``.
        overwrite (bool): Regenerate even if the output file already exists.
    """
    data = load_jsonl(Path(f"{dataset_dir}/{subset.lower()}_corpus/corpus.jsonl"))
    prompt_template = load_prompt(subset, "corpus_summary")

    save_path = Path(f"{dataset_dir}/{subset.lower()}_corpus/corpus_summary_only.jsonl")
    # Decide before calling the LLM so no requests are made for output that
    # would be thrown away.
    if save_path.is_file() and not overwrite:
        print(f"File already exists at {save_path}, and overwrite is set to False.")
        return

    expanded_corpus = []
    for item in data:
        item_text = item["text"]
        prompt = f"{prompt_template}\n\nQuery: {item_text}"
        new_text = llm.invoke(prompt).content
        expanded_corpus.append(
            {
                "_id": item["_id"],
                "title": item["title"],
                "text": f"{new_text}",
            }
        )

    save_jsonl(save_path, expanded_corpus)

from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import json

def corpus_summarising(subset, dataset_dir, llm, overwrite=False, max_workers=4):
    """
    Replace each corpus document of ``subset`` with LLM output generated from
    the ``"corpus_summary"`` prompt template, issuing the LLM calls in parallel.

    Reads ``{dataset_dir}/{subset.lower()}_corpus/corpus.jsonl`` and writes
    ``corpus_summary_only.jsonl`` next to it. Output records are written in
    the same order as the input corpus, regardless of which request finishes
    first. Items whose LLM call raises are reported on stdout and left out of
    the output file.

    Args:
        subset (str): Subset of the dataset to process.
        dataset_dir (str): Directory containing the dataset.
        llm (object): LLM instance with an `invoke` method.
        overwrite (bool): Whether to overwrite existing results.
        max_workers (int): Maximum number of parallel threads to use.
    """
    # Load data and prompt template
    data = load_jsonl(Path(f"{dataset_dir}/{subset.lower()}_corpus/corpus.jsonl"))
    prompt_template = load_prompt(subset, "corpus_summary")

    save_path = Path(f"{dataset_dir}/{subset.lower()}_corpus/corpus_summary_only.jsonl")
    if save_path.is_file() and not overwrite:
        print(f"File already exists at {save_path}, and overwrite is set to False.")
        return

    # Helper function for processing individual items
    def process_item(item):
        item_text = item["text"]
        prompt = f"{prompt_template}\n\nQuery: {item_text}"
        new_text = llm.invoke(prompt).content
        return {
            "_id": item["_id"],
            "title": item["title"],
            "text": new_text,
        }

    # Run LLM calls in parallel
    expanded_corpus = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything up front, remembering the submission order.
        submitted = [(item, executor.submit(process_item, item)) for item in data]

        # Collect in submission order (not completion order) so the saved
        # file is deterministic and aligned with corpus.jsonl.
        for item, future in submitted:
            try:
                expanded_corpus.append(future.result())
            except Exception as e:
                # Best effort: report the failed document and keep going.
                print(f"Error processing item: {item['_id']}, {e}")

    # Save the results
    save_jsonl(save_path, expanded_corpus)
    print(f"Corpus summary saved at {save_path}")

def expand_corpus_keyword(subset, dataset_dir, llm, overwrite=False):
    """
    Expand each corpus document of ``subset`` with LLM output generated from
    the ``"corpus_keywords"`` prompt template.

    Reads ``{dataset_dir}/{subset.lower()}_corpus/corpus.jsonl`` and writes
    ``corpus_keyword.jsonl`` next to it. Each output record keeps ``_id`` and
    ``title`` and stores the original text followed by the LLM response.

    Args:
        subset (str): Subset name; also the lookup key for the prompt template.
        dataset_dir (str): Directory containing the dataset folders.
        llm: Object whose ``invoke(prompt)`` returns a value with ``.content``.
        overwrite (bool): Regenerate even if the output file already exists.

    Raises:
        FileNotFoundError: Propagated from ``load_jsonl`` / ``load_prompt``.
        ValueError: Propagated from ``load_prompt`` for an unknown subset.
    """
    data = load_jsonl(Path(f"{dataset_dir}/{subset.lower()}_corpus/corpus.jsonl"))
    prompt_template = load_prompt(subset, "corpus_keywords")

    save_path = Path(f"{dataset_dir}/{subset.lower()}_corpus/corpus_keyword.jsonl")
    # Decide before calling the LLM: otherwise every (paid) request is made
    # and the results are then discarded when the file already exists.
    if save_path.is_file() and not overwrite:
        print(f"File already exists at {save_path}, and overwrite is set to False.")
        return

    expanded_corpus = []
    for item in data:
        item_text = item["text"]
        prompt = f"{prompt_template}\n\nQuery: {item_text}"
        new_text = llm.invoke(prompt).content
        expanded_corpus.append(
            {
                "_id": item["_id"],
                "title": item["title"],
                "text": f"{item_text}\n\n{new_text}",
            }
        )

    save_jsonl(save_path, expanded_corpus)


def compress_corpus(subset, dataset_dir):
    """
    Rewrite every corpus document's ``text`` with the output of
    ``_extract_table_from_corpus`` and save the result.

    Input is ``{dataset_dir}/{subset}/corpus.jsonl``; output is
    ``corpus_prep.jsonl`` in the same directory. All other record fields are
    carried over unchanged.
    """
    source_path = Path(f"{dataset_dir}/{subset}/corpus.jsonl")
    target_path = Path(f"{dataset_dir}/{subset}/corpus_prep.jsonl")

    compressed = [
        {**doc, "text": _extract_table_from_corpus(doc["text"], subset)}
        for doc in load_jsonl(source_path)
    ]
    save_jsonl(target_path, compressed)


def copy_corpus(subset, dataset_dir):
    """
    Copy ``corpus.jsonl`` to ``corpus_prep.jsonl`` inside
    ``<dataset_dir>/<subset>/`` without modifying its content.
    """
    subset_dir = os.path.join(dataset_dir, subset)
    shutil.copy(
        os.path.join(subset_dir, 'corpus.jsonl'),
        os.path.join(subset_dir, 'corpus_prep.jsonl'),
    )

def pre_retrieval(dataset_dir):
    """
    Run the full pre-retrieval pipeline for every subset.

    For each subset the queries are expanded with ``expand_queries_keyword``.
    The corpus is then compressed to its tables for ``"MultiHiertt"`` and
    copied unchanged to ``corpus_prep.jsonl`` for every other subset.
    ``FileNotFoundError`` and ``ValueError`` are reported per subset and do
    not stop the loop.

    Args:
        dataset_dir (str): Directory containing the dataset folders.
    """
    # load_dotenv()
    llm = ChatOpenAI(model="gpt-4o-mini")

    subsets = [
        "FinanceBench",
        "FinDER",
        "FinQABench",
        "MultiHiertt",
        "ConvFinQA",
        "TATQA",
        "FinQA",
    ]

    for subset in subsets:
        try:
            print(f"Pre-retrieval for '{subset}' initiating...")
            # The previous call target `expand_queries` is not defined in this
            # notebook and raised an uncaught NameError; the keyword expansion
            # is the query-expansion helper that uses the "queries" prompt.
            expand_queries_keyword(subset, dataset_dir, llm)
            if subset == "MultiHiertt":
                compress_corpus(subset, dataset_dir)
            else:
                copy_corpus(subset, dataset_dir)
            print(f"Pre-retrieval for '{subset}' completed.")

        except (FileNotFoundError, ValueError) as e:
            print(f"Pre-retrieval for '{subset}' Error : {e}")



In [44]:

def pre_retrieval_query(dataset_dir):
    """
    Run keyword query expansion (``expand_queries_keyword``) for each subset.

    ``FileNotFoundError`` and ``ValueError`` raised for a subset are printed
    and the loop moves on to the next subset.
    """
    # load_dotenv()
    llm = ChatOpenAI(model="gpt-4o-mini")

    subset_names = (
        'FinQABench',
        "FinanceBench",
        "FinDER",
        "MultiHiertt",
        "ConvFinQA",
        "TATQA",
        "FinQA",
    )

    for subset in subset_names:
        try:
            print(f"Pre-retrieval for '{subset}' initiating...")
            expand_queries_keyword(subset, dataset_dir, llm)
            # expand_queries_paraphrasing(subset, dataset_dir, llm)
        except (FileNotFoundError, ValueError) as err:
            print(f"Pre-retrieval for '{subset}' Error : {err}")


In [37]:
import getpass
import os
# SECURITY: never commit API keys to a notebook. The key that used to be
# hardcoded here must be treated as compromised and revoked.
# Provide the key through the OPENAI_API_KEY environment variable, or enter it
# at the prompt below (getpass does not echo the input).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

pre_retrieval_query('../data')

Pre-retrieval for 'MultiHiertt' initiating...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

In [45]:

def pre_retrieval_corpus(dataset_dir):
    """
    Run corpus summarisation (``corpus_summarising``) for the enabled subsets.

    ``FileNotFoundError`` and ``ValueError`` raised for a subset are printed
    and the loop moves on to the next subset.
    """
    # load_dotenv()
    llm = ChatOpenAI(model="gpt-4o-mini")

    subset_names = (
        # 'FinQABench',
        # "FinanceBench",
        # "FinDER",
        "MultiHiertt",
        "ConvFinQA",
        "TATQA",
        "FinQA",
    )

    for subset in subset_names:
        try:
            print(f"Pre-retrieval for '{subset}' initiating...")
            corpus_summarising(subset, dataset_dir, llm)
            # expand_queries_paraphrasing(subset, dataset_dir, llm)
        except (FileNotFoundError, ValueError) as err:
            print(f"Pre-retrieval for '{subset}' Error : {err}")


In [46]:
import getpass
import os
# SECURITY: never commit API keys to a notebook. The key that used to be
# hardcoded here must be treated as compromised and revoked.
# Provide the key through the OPENAI_API_KEY environment variable, or enter it
# at the prompt below (getpass does not echo the input).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

pre_retrieval_corpus('../data')

Pre-retrieval for 'MultiHiertt' initiating...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

In [None]:

def pre_retrieval_corpus(dataset_dir):
    """
    Run corpus keyword expansion (``expand_corpus_keyword``) for each subset.

    ``FileNotFoundError`` and ``ValueError`` raised for a subset are printed
    and the loop moves on to the next subset.
    """
    # load_dotenv()
    llm = ChatOpenAI(model="gpt-4o-mini")

    subset_names = (
        'FinQABench',
        "FinanceBench",
        "FinDER",
        "MultiHiertt",
        "ConvFinQA",
        "TATQA",
        "FinQA",
    )

    for subset in subset_names:
        try:
            print(f"Pre-retrieval for '{subset}' initiating...")
            expand_corpus_keyword(subset, dataset_dir, llm)
            # expand_queries_paraphrasing(subset, dataset_dir, llm)
        except (FileNotFoundError, ValueError) as err:
            print(f"Pre-retrieval for '{subset}' Error : {err}")
import getpass
import os
# SECURITY: never commit API keys to a notebook. The key that used to be
# hardcoded here must be treated as compromised and revoked.
# Provide the key through the OPENAI_API_KEY environment variable, or enter it
# at the prompt below (getpass does not echo the input).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

pre_retrieval_corpus('../data')