In [5]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
from dataset import *
from data_handler import *
from embeddings import *
from vector_store import *
from RAG_pipeline import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Point the dataset manager at the shared data folder and list what it exposes.
dataset_manager = FinanceRAGDataset("../data")
available_datasets = dataset_manager.list_datasets()
print("Available datasets:", available_datasets)

Available datasets: ['ConvFinQA', 'FinQA', 'MultiHeritt', 'TATQA']


In [7]:
# Sanity check on the ConvFinQA relevance judgements: compare the number of
# distinct query IDs with the total number of qrels rows.
qrels = dataset_manager.load_qrels("ConvFinQA")
unique_query_ids = qrels['query_id'].nunique()
print(f"Number of unique query IDs: {unique_query_ids} out of a total of {qrels.shape[0]}")

Number of unique query IDs: 126 of out a total of 126


In [9]:
def get_random_sample(corpus_df, random_state=None):
    """Return the ``text`` value of one randomly drawn row of ``corpus_df``.

    Args:
        corpus_df: DataFrame with a ``text`` column.
        random_state: Optional seed (or generator) forwarded to
            ``DataFrame.sample`` so the draw can be reproduced. The default
            ``None`` keeps the original unseeded behaviour.

    Returns:
        The ``text`` entry of the sampled row.

    Raises:
        ValueError: If ``corpus_df`` has no rows (raised by ``DataFrame.sample``).

    Side effects:
        Sets the global pandas option ``display.max_colwidth`` to ``None`` so
        long texts are not truncated when displayed afterwards.
    """
    sampled_row = corpus_df.sample(n=1, random_state=random_state)
    # Global display option, kept for backward compatibility with cells that
    # rely on untruncated column rendering after calling this helper.
    pd.set_option('display.max_colwidth', None)
    return sampled_row['text'].values[0]

## Check how many tables each document contains, per dataset

In [10]:
# For every dataset, count how many corpus documents contain 1, 2, 3, 4 or
# more than 4 tables (documents without any table are not counted).
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} has {len(corpus)} documents in corpus")

    # One counter per bucket; '+' collects everything above 4 tables.
    multi_table_docs = dict.fromkeys(('1', '2', '3', '4', '+'), 0)
    for doc_id, doc_fields in corpus.items():
        tables = extract_tables(doc_fields[0])
        if not tables:
            continue
        n_tables = len(tables)
        bucket = str(n_tables) if n_tables <= 4 else '+'
        multi_table_docs[bucket] += 1

    # Single print with the whole breakdown
    print(f"""\t{multi_table_docs['1']} documents with 1 table
          {multi_table_docs['2']} documents with 2 tables
          {multi_table_docs['3']} documents with 3 tables
          {multi_table_docs['4']} documents with 4 tables 
          {multi_table_docs['+']} documents with more than 4 tables\n""")


Dataset: ConvFinQA has 101 documents in corpus
	101 documents with 1 table
          0 documents with 2 tables
          0 documents with 3 tables
          0 documents with 4 tables 
          0 documents with more than 4 tables

Dataset: FinQA has 247 documents in corpus
	247 documents with 1 table
          0 documents with 2 tables
          0 documents with 3 tables
          0 documents with 4 tables 
          0 documents with more than 4 tables

Dataset: MultiHeritt has 876 documents in corpus
	491 documents with 1 table
          46 documents with 2 tables
          7 documents with 3 tables
          6 documents with 4 tables 
          1 documents with more than 4 tables

Dataset: TATQA has 248 documents in corpus
	248 documents with 1 table
          0 documents with 2 tables
          0 documents with 3 tables
          0 documents with 4 tables 
          0 documents with more than 4 tables



## Check how long these tables are

In [11]:
# For every dataset, report the average character length of the extracted
# tables and the average character length of the documents once their tables
# are subtracted.
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} has {len(corpus)} documents in corpus")

    tables_len = []               # length of every individual table
    text_without_tables_len = []  # per document: len(text) minus the length of its tables
    for k, v in corpus.items():
        text = v[0]
        tables = extract_tables(text)
        sum_len_table = 0
        # `tables` may be empty/falsy when the document has no table.
        for table in tables or []:
            tables_len.append(len(table))
            sum_len_table += len(table)
        text_without_tables_len.append(len(text) - sum_len_table)

    # np.mean([]) is NaN and int(NaN) raises ValueError, so report 'n/a'
    # instead of crashing when a dataset has no tables or no documents.
    avg_table_len = int(np.mean(tables_len)) if tables_len else 'n/a'
    avg_text_len = int(np.mean(text_without_tables_len)) if text_without_tables_len else 'n/a'
    # average length of tables
    print(f"Average length of tables: {avg_table_len}")
    # average length of text without tables
    print(f"Average length of text without tables: {avg_text_len}")
    print()

Dataset: ConvFinQA has 101 documents in corpus
Average length of tables: 471
Average length of text without tables: 3821

Dataset: FinQA has 247 documents in corpus
Average length of tables: 462
Average length of text without tables: 3823

Dataset: MultiHeritt has 876 documents in corpus
Average length of tables: 722
Average length of text without tables: 2350

Dataset: TATQA has 248 documents in corpus
Average length of tables: 491
Average length of text without tables: 1766



## Check how many chunks each corpus document produces, per dataset

In [12]:
# Tokenizer and embedding model both come from the same MiniLM checkpoint.
minilm_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
minilm_tokenizer = Tokenizer(AutoTokenizer.from_pretrained(minilm_checkpoint))
minilm_embedder = Embedder(AutoModel.from_pretrained(minilm_checkpoint))
text_processor = DataHandler(minilm_tokenizer, minilm_embedder)

# Constants describing the embedding model.
EMBEDDING_DIM = 384
MODEL_INPUT_SIZE = 256

In [13]:
# For every dataset, tokenize each corpus document, split it into chunks and
# report the average number of chunks per document.
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)}\t Number of documents: {len(corpus)}")

    # NOTE(review): `pipeline` is not referenced again in this cell; it is kept
    # because constructing it may have side effects - confirm and drop if not.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)

    num_chunks = []
    for document in corpus.values():
        # load_data() and tokenize() are called back to back on the shared
        # text_processor; only the result of tokenize() is used.
        text_processor.load_data(document)
        tokenized = text_processor.tokenize()
        # NOTE(review): max_length is hard-coded to 512 while MODEL_INPUT_SIZE
        # is 256 - confirm which limit is intended.
        pieces = chunker(tokenized.data, max_length=512, padding_value=0, overlap_percent=15)
        num_chunks.append(len(pieces))

    print(f"Average number of chunks: {round(np.mean(num_chunks),4)}\n")

Token indices sequence length is longer than the specified maximum sequence length for this model (1801 > 512). Running this sequence through the model will result in indexing errors


Dataset: ConvFinQA 	Number of queries: 126	 Number of documents: 101
Average number of chunks: 2.3564

Dataset: FinQA 	Number of queries: 344	 Number of documents: 247
Average number of chunks: 2.3684

Dataset: MultiHeritt 	Number of queries: 292	 Number of documents: 876
Average number of chunks: 1.8265

Dataset: TATQA 	Number of queries: 498	 Number of documents: 248
Average number of chunks: 1.4637



In [21]:
import plotly.graph_objects as go
import plotly.subplots as sp

def plot_count_words(corpus_df, queries_df):
    """Print word-count statistics and plot word-count histograms.

    The word count of a text is the number of whitespace-separated tokens
    (``len(text.split())``). The maximum and the mean (rounded to 2 decimals)
    are printed for the corpus and for the queries, then a figure with two
    side-by-side histograms is shown.

    The input DataFrames are not modified: word counts are computed as local
    Series instead of being written back as a ``word_count`` column.

    Args:
        corpus_df: DataFrame with a ``text`` column of document strings.
        queries_df: DataFrame with a ``text`` column of query strings.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Calculate text lengths based on word count (without touching the inputs)
    corpus_word_counts = corpus_df['text'].apply(lambda x: len(x.split()))
    queries_word_counts = queries_df['text'].apply(lambda x: len(x.split()))

    # Calculate max and average word counts
    max_word_count_corpus = corpus_word_counts.max()
    average_word_count_corpus = round(corpus_word_counts.mean(), 2)
    max_word_count_queries = queries_word_counts.max()
    average_word_count_queries = round(queries_word_counts.mean(), 2)

    print(f"Max word count: {max_word_count_corpus}")
    print(f"Average word count: {average_word_count_corpus}")
    print(f"Max word count in queries: {max_word_count_queries}")
    print(f"Average word count in queries: {average_word_count_queries}")

    # Create subplots
    fig = sp.make_subplots(rows=1, cols=2, subplot_titles=("Corpus Word Count Distribution", "Queries Word Count Distribution"))

    # One histogram per subplot: corpus on the left, queries on the right
    histogram_specs = (
        (corpus_word_counts, 'blue', "Corpus Word Count", 1),
        (queries_word_counts, 'green', "Queries Word Count", 2),
    )
    for word_counts, color, trace_name, col in histogram_specs:
        fig.add_trace(go.Histogram(
            x=word_counts,
            nbinsx=50,
            marker_color=color,
            name=trace_name
        ), row=1, col=col)

    # Update layout
    fig.update_layout(
        title_text="Word Count Distribution in Corpus and Queries",
        showlegend=False,
        height=400,
        width=800
    )

    fig.update_xaxes(title_text="Word Count")
    fig.update_yaxes(title_text="Frequency")

    fig.show()

## Word count in the datasets

In [22]:
# For every dataset: load it, shrink it, and plot the word-count distributions
# of its documents and queries.
for DATASET_NAME in dataset_manager.list_datasets():
    print("========= DATASET:", DATASET_NAME, " =========\n")
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    print(f"Starting number of documents: {len(corpus)} and number of queries: {len(queries)}")

    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"After reducing: number of documents: {len(corpus)} and number of queries: {len(queries)}\n")

    # Keep only the first element of each corpus entry when it is a non-empty
    # list; anything else becomes an empty string.
    corpus_records = [
        (doc_id, entry[0] if isinstance(entry, list) and len(entry) > 0 else "")
        for doc_id, entry in corpus.items()
    ]
    corpus_df = pd.DataFrame(corpus_records, columns=["id", "text"])
    queries_df = pd.DataFrame(list(queries.items()), columns=["id", "text"])
    plot_count_words(corpus_df, queries_df)


Starting number of documents: 2066 and number of queries: 421
After reducing: number of documents: 101 and number of queries: 126

Max word count: 1586
Average word count: 685.39
Max word count in queries: 35
Average word count in queries: 13.89




Starting number of documents: 2789 and number of queries: 1147
After reducing: number of documents: 247 and number of queries: 344

Max word count: 1661
Average word count: 685.09
Max word count in queries: 43
Average word count in queries: 16.71




Starting number of documents: 10475 and number of queries: 974
After reducing: number of documents: 876 and number of queries: 292

Max word count: 2654
Average word count: 474.03
Max word count in queries: 46
Average word count in queries: 17.97




Starting number of documents: 2756 and number of queries: 1663
After reducing: number of documents: 248 and number of queries: 498

Max word count: 1100
Average word count: 287.44
Max word count in queries: 32
Average word count in queries: 12.35





## Check the summaries of the tables

In [41]:
import re
import pandas as pd

def get_table(text):
    """Extract pipe-delimited tables from ``text`` and render them as Markdown.

    A table is a run of consecutive lines that each contain a ``|``. The first
    line of the run is the header and the remaining lines are the rows. A
    dash-only separator line directly under the header (``---|---``,
    ``--- | --- | ---``, ``:---: | ---``) is discarded rather than kept as data.

    Args:
        text: Document text that may contain one or more pipe-delimited tables.

    Returns:
        One string with every table rendered by
        ``DataFrame.to_markdown(index=False)``, tables separated by a blank
        line. An empty string when no table is found.
    """
    table_pattern = r'(?:(?:\n|\A)([^\n]+\|[^\n]+\n)((?:[-]+\|[-]+\n)?)((?:[^\n]+\|[^\n]+\n)+))'
    separator_cell = re.compile(r':?-+:?')

    # The pattern needs every table line to end with a newline; without this a
    # table at the very end of the text would lose its last row (or be missed).
    if text and not text.endswith("\n"):
        text += "\n"

    formatted_tables = []
    for header, _separator, body in re.findall(table_pattern, text):
        # Split header and rows into stripped cells
        columns = [col.strip() for col in header.strip().split("|")]
        rows = [[cell.strip() for cell in line.split("|")]
                for line in body.strip().split("\n")]

        # The pattern's separator group only recognises `---|---`; separators
        # written with spaces or more columns land in the rows, so drop a
        # leading dash-only row here.
        first_cells = [cell for cell in rows[0] if cell]
        if first_cells and all(separator_cell.fullmatch(cell) for cell in first_cells):
            rows = rows[1:]

        # Pad header and rows to a common width: DataFrame raises when a row
        # has more cells than there are column names.
        width = max([len(columns)] + [len(row) for row in rows])
        columns = columns + [""] * (width - len(columns))
        rows = [row + [""] * (width - len(row)) for row in rows]

        # Build the DataFrame and store its Markdown rendering
        df = pd.DataFrame(rows, columns=columns)
        formatted_tables.append(df.to_markdown(index=False))

    return "\n\n".join(formatted_tables)  # tables separated by a blank line


In [49]:
import random

# Show, for one randomly chosen document of the first dataset, the table found
# in its text, the stored extracted table, and the short and long summaries
# together with their recorded timings.
for DATASET_NAME in dataset_manager.list_datasets():
    print("========= DATASET:", DATASET_NAME, " =========")
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)

    # NOTE(review): allow_pickle=True unpickles the file contents, which can
    # execute arbitrary code - only load .npy files from a trusted source.
    extracted_tables = np.load(f"../data/{DATASET_NAME}/extracted_tables_{DATASET_NAME}.npy", allow_pickle=True).item()
    summaries_short = np.load(f"../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_short.npy", allow_pickle=True).item()
    timing_short = np.load(f"../data/{DATASET_NAME}/Timing_{DATASET_NAME}_short.npy", allow_pickle=True).item()
    summaries_long = np.load(f"../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_long.npy", allow_pickle=True).item()
    timing_long = np.load(f"../data/{DATASET_NAME}/Timing_{DATASET_NAME}_long.npy", allow_pickle=True).item()

    # Pick a random document ID among those that have an extracted table
    random_key = random.choice(list(extracted_tables.keys()))
    print(f"Selected Document ID: {random_key}")

    # Look up the document in the corpus by that ID
    text = corpus[random_key][0]
    table = get_table(text)

    # get_table returns a single string, so print it whole; indexing it with
    # [0] would show only its first character.
    print(f"Table in text:\n{table if table else 'No table found'}\n")
    print(f"Table extracted:\n{extracted_tables[random_key]}\n")

    print(f"Short summary in {timing_short.get(random_key, 'N/A')} seconds:\n\t{summaries_short.get(random_key, 'N/A')}\n")
    print(f"Long summary in {timing_long.get(random_key, 'N/A')} seconds: \n\t{summaries_long.get(random_key, 'N/A')}")

    break  # Stop after printing a single document


Selected Document ID: dd497774e
Table in text:
( in millions )                                                                        | for the years ended december 31 , 2017 | for the years ended december 31 , 2016 | for the years ended december 31 , 2015
-------------------------------------------------------------------------------------- | -------------------------------------- | -------------------------------------- | --------------------------------------
net earnings attributable to pmi                                                       | $ 6035                                 | $ 6967                                 | $ 6873                                
less distributed and undistributed earnings attributable to share-based payment awards | 14                                     | 19                                     | 24                                    
net earnings for basic and diluted eps                                                 | $ 6021                  