Replicating: https://github.com/superlinked/chunking-research

In [24]:
import datasets
from tqdm import tqdm
import pandas as pd

# Fetch SQuAD from the Hugging Face Hub; the result is indexed by split name below.
dataset = datasets.load_dataset(path="rajpurkar/squad")

In [25]:
# Stack the train and validation splits into a single frame with a fresh
# 0..n-1 index (the per-split indices are discarded).
split_frames = [pd.DataFrame(dataset[split]) for split in ("train", "validation")]
df = pd.concat(split_frames, axis=0, ignore_index=True)
df.head()

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [26]:
# Keep only the columns used downstream; `id` and `title` are dropped.
kept_columns = ['context', 'question', 'answers']
df = df[kept_columns].copy()
df.head()

Unnamed: 0,context,question,answers
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [27]:
# Each `answers` cell is a dict whose 'text' entry is a list of reference
# answers. Train rows carry a single reference answer, but validation rows can
# list several, so we deliberately keep only the first one per question.
# The isinstance guard leaves already-flattened (string) values untouched, so
# re-running this cell does not raise.
df['answers'] = df.answers.apply(
    lambda ans: ans['text'][0] if isinstance(ans, dict) else ans
)
df.head()

Unnamed: 0,context,question,answers
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary


In [28]:
# Pluralise `question` so the column name matches `answers`.
# The answers column already has its final name, so it needs no mapping entry
# (a key that matches no column is silently ignored by `rename`).
# Reassigning instead of using inplace=True avoids mutating the frame that
# earlier cells displayed, and the cell stays safe to re-run.
df = df.rename(columns={'question': 'questions'})
df.head()

Unnamed: 0,context,questions,answers
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary


In [29]:
# This selection does not change the data: after the rename the frame already
# consists of exactly these three columns in this order. It only rebinds `df`
# to a fresh copy (and would raise if one of the columns were missing).
final_columns = ['context', 'questions', 'answers']
df = df[final_columns].copy()
df.head()

Unnamed: 0,context,questions,answers
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary


In [30]:
# Collapse to one row per unique context, gathering that context's questions
# and answers into parallel lists. groupby sorts by its key, so the rows come
# out ordered by context text.
# Guard: once aggregated, `questions` holds lists; aggregating a second time
# would nest them one level deeper, so the step is skipped on a re-run.
already_grouped = df["questions"].map(lambda q: isinstance(q, list)).any()
if not already_grouped:
    df = df.groupby("context").agg({"questions": list, "answers": list}).reset_index()
df.head()

Unnamed: 0,context,questions,answers
0,"\n Australia: The event was held in Canberra, ...","[When did the torch arrive in Canberra?, Who r...","[April 24, Agnes Shea, a message stick, Austra..."
1,"\n China: In China, the torch was first welcom...","[Action was taken to boycott which company?, W...","[Carrefour, the LVMH Group, the French flag, K..."
2,"\n France: The torch relay leg in Paris, held ...","[When did the torch relay in Paris occur?, Whe...","[April 7, the Eiffel Tower, by bus, Teddy Rine..."
3,\n Great Britain: The torch relay leg held in ...,"[Which city hosted the 2012 Summer Olympics?, ...","[London, April 6, £750,000, London, Wembley St..."
4,\n India: Due to concerns about pro-Tibet prot...,"[When did the torch visit New Delhi?, How many...","[April 17, 70, Baichung Bhutia, five, New Delh..."


In [34]:
import numpy as np

# One entry per row of `df`: the context text, and the list of questions asked
# about it. Both lists share the row order of `df`.
contexts = df["context"].tolist()
question_batches = df["questions"].tolist()

# Peek at the first two entries of each.
contexts[:2], question_batches[:2]


(["\n Australia: The event was held in Canberra, Australian Capital Territory on April 24, and covered around 16 km of Canberra's central areas, from Reconciliation Place to Commonwealth Park. Upon its arrival in Canberra, the Olympic flame was presented by Chinese officials to local Aboriginal elder Agnes Shea, of the Ngunnawal people. She, in turn, offered them a message stick, as a gift of peace and welcome. Hundreds of pro-Tibet protesters and thousands of Chinese students reportedly attended. Demonstrators and counter-demonstrators were kept apart by the Australian Federal Police. Preparations for the event were marred by a disagreement over the role of the Chinese flame attendants, with Australian and Chinese officials arguing publicly over their function and prerogatives during a press conference.",
  '\n China: In China, the torch was first welcomed by Politburo Standing Committee member Zhou Yongkang and State Councilor Liu Yandong. It was subsequently passed onto CPC General 

In [43]:
# For every question (flattened in batch order), record the index of the
# context it belongs to: context i is repeated once per question in batch i.
batch_sizes = [len(batch) for batch in question_batches]
labels = np.array(
    [
        context_idx
        for context_idx, size in enumerate(batch_sizes)
        for _ in range(size)
    ]
)
# Sanity check: both numbers should match (one label per question).
len(labels), sum(batch_sizes)

(98169, 98169)

In [45]:
# vector store index
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser.text import SentenceSplitter
from llama_index.core import Document, VectorStoreIndex

# Embedding model used to vectorise every chunk.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

# Split each context into small, slightly overlapping chunks before embedding.
chunker = SentenceSplitter(chunk_size=128, chunk_overlap=16)

# One Document per unique context. `context_id` links every chunk back to its
# position in `contexts`; it is excluded from the text that gets embedded so it
# cannot influence the vectors.
documents = [
    Document(
        text=context,
        # Document ids are strings; pass one explicitly instead of relying on
        # int -> str coercion, which stricter validation rejects.
        doc_id=str(idx),
        metadata={
            'context_id': idx,
        },
        excluded_embed_metadata_keys=['context_id'],
    )
    for idx, context in enumerate(contexts)
]

# Build the index from Documents via `from_documents`, which applies the
# transformations (chunking) and then embeds the resulting nodes. The plain
# constructor expects already-parsed nodes and has no `documents` parameter, so
# passing `documents=` there leaves the index with nothing to build from.
# NOTE: despite its name, `vector_store` is a VectorStoreIndex; the name is kept
# in case later cells refer to it.
vector_store = VectorStoreIndex.from_documents(
    documents,
    transformations=[chunker],
    embed_model=embed_model,
    show_progress=True,
)

vector_store