In [98]:
import datasets

# Load all splits of the NarrativeQA dataset through `datasets.load_dataset`.
dataset = datasets.load_dataset(path="deepmind/narrativeqa")

# Bare expression: the notebook renders the DatasetDict summary (splits, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 32747
    })
    test: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 10557
    })
    validation: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 3461
    })
})

In [99]:
from IPython.display import display

# Only the validation split is used from here on; convert it to a pandas
# DataFrame so the nested records can be inspected and reshaped.
validation_split = dataset["validation"]
df = validation_split.to_pandas()
df.head()

Unnamed: 0,document,question,answers
0,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'WHO NORMALLY DELIVERS THE OPENING PR...,"[{'text': 'THE ACTOR WEARING THE BLACK CLOAK',..."
1,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'WHAT NAME WAS CYNTHIA MORE FAMOUSLY ...,"[{'text': 'THE GODDESS DIANA', 'tokens': ['THE..."
2,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,"{'text': 'WHO DOES ECHO WEEP FOR?', 'tokens': ...","[{'text': 'NARCISSUS', 'tokens': ['NARCISSUS']..."
3,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'WHAT DOES A DRINK FROM NARCISSUS'S S...,"[{'text': 'FALL IN LOVE WITH THEMSELVES', 'tok..."
4,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'IN WHAT VALLEY DID THE SOLEMN REVELS...,"[{'text': 'GARGAPHIE IN GREECE', 'tokens': ['G..."


In [100]:
# Flatten the nested records into plain string columns:
#   context  <- document["summary"]["text"]
#   question <- question["text"]
#
# `assign` returns a new frame instead of mutating `df` in place, and the
# `isinstance` guard leaves already-flattened questions untouched. Without the
# guard a second run of this cell raised TypeError, because indexing a plain
# string with 'text' is invalid.
df = df.assign(
    context=df["document"].map(lambda doc: doc["summary"]["text"]),
    question=df["question"].map(
        lambda q: q if isinstance(q, str) else q["text"]
    ),
)
df.head()

Unnamed: 0,document,question,answers,context
0,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHO NORMALLY DELIVERS THE OPENING PROLOGUE IN ...,"[{'text': 'THE ACTOR WEARING THE BLACK CLOAK',...",The play begins with three pages disputing ov...
1,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHAT NAME WAS CYNTHIA MORE FAMOUSLY KNOWN BY?,"[{'text': 'THE GODDESS DIANA', 'tokens': ['THE...",The play begins with three pages disputing ov...
2,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHO DOES ECHO WEEP FOR?,"[{'text': 'NARCISSUS', 'tokens': ['NARCISSUS']...",The play begins with three pages disputing ov...
3,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHAT DOES A DRINK FROM NARCISSUS'S SPRING CAUS...,"[{'text': 'FALL IN LOVE WITH THEMSELVES', 'tok...",The play begins with three pages disputing ov...
4,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,IN WHAT VALLEY DID THE SOLEMN REVELS OF CYNTHI...,"[{'text': 'GARGAPHIE IN GREECE', 'tokens': ['G...",The play begins with three pages disputing ov...


In [90]:
# Collapse the frame to one row per distinct context, gathering every question
# asked about that context into a list under the `questions` column.
# NOTE: this rebinds `df` to the grouped frame, so the per-question columns
# (`document`, `answers`) are no longer available afterwards.
df = (
    df.rename(columns={'question': 'questions'})
    .groupby('context')['questions']
    .agg(list)
    .reset_index()
)
df.head()

Unnamed: 0,context,questions
0,"""Goblin Market"" is about two close sisters, L...","[What are the sister's names?, Who lingers at ..."
1,"(From Conan The Warrior, ISBN 0-441-11465-2)\...","[How old is Conan?, Why does Conan head off to..."
2,(Note: The following synopsis was that of Emm...,"[Who is the senior clerk at James How & Sons?,..."
3,"A British woman, Elise Clifton-Ward (Angelina...","[Why is Elise being followed by the Police?, W..."
4,A group of grifters rip off their latest mark...,"[Why was one of the four grifters shot?, How d..."


In [91]:
# Sanity check: count rows whose context repeats an earlier row's context,
# then show the frame's dimensions.
duplicate_contexts = df.duplicated(subset='context').sum()
print(f"Number of duplicate contexts: {duplicate_contexts}")
df.shape

Number of duplicate contexts: 0


(115, 2)

In [92]:
# df = df.sample(n=250, random_state=42)
# df.head()

In [93]:
import pandas as pd

# Persist the grouped DataFrame as parquet, then reload it so the rest of the
# notebook works from exactly what is stored on disk.
from pathlib import Path

# `to_parquet` does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
Path('data').mkdir(parents=True, exist_ok=True)

df.to_parquet('data/narrativeqa.parquet')
df = pd.read_parquet('data/narrativeqa.parquet')
df.head()

Unnamed: 0,context,questions
0,"""Goblin Market"" is about two close sisters, L...","[What are the sister's names?, Who lingers at ..."
1,"(From Conan The Warrior, ISBN 0-441-11465-2)\...","[How old is Conan?, Why does Conan head off to..."
2,(Note: The following synopsis was that of Emm...,"[Who is the senior clerk at James How & Sons?,..."
3,"A British woman, Elise Clifton-Ward (Angelina...","[Why is Elise being followed by the Police?, W..."
4,A group of grifters rip off their latest mark...,"[Why was one of the four grifters shot?, How d..."


In [94]:
# Record the length of every context (via the built-in `len`) in a new column,
# then summarise the frame's numeric columns.
df['context_length'] = df['context'].map(len)
df.describe()

Unnamed: 0,context_length
count,115.0
mean,3271.817391
std,1225.678177
min,1196.0
25%,2430.5
50%,3278.0
75%,4148.0
max,6033.0


In [95]:
from counter import get_and_increment_counter
from llama_stack_client import LlamaStackClient
from llama_stack_client.types.memory_insert_params import Document

# Connect to a Llama Stack server and register a fresh memory bank that the
# following cells fill with documents.

# The server address is hard-coded to a local instance on port 5001.
client = LlamaStackClient(
    base_url="http://localhost:5001",
)

# Query the server for its providers and its currently registered memory banks.
# `memory_banks_response` is not read again in the cells visible here.
providers = client.providers.list()
memory_banks_response = client.memory_banks.list()

# Build the bank id from the local `counter` helper.
# NOTE(review): presumably the helper returns a new number on every call so each
# run registers a distinct bank -- confirm against counter.py.
bank_id = f"bank_{get_and_increment_counter()}"
# Use the first provider listed under the "memory" key.
provider = providers["memory"][0]
client.memory_banks.register(
    memory_bank_id=bank_id,
    params={
        "embedding_model": "all-MiniLM-L6-v2",
        # Is the default for agent config: https://github.com/meta-llama/llama-stack/blob/66d8f4ffd126bff668434b314892a99fe854a034/llama_stack/providers/inline/agents/meta_reference/agent_instance.py#L668
        "chunk_size_in_tokens": 512,
    },
    provider_id=provider.provider_id,
)
# Bare expression: display the id of the bank that was just registered.
bank_id

'bank_32'

In [96]:
# Wrap every context in a Document: the DataFrame index label (as a string) is
# the document id, the context text is the content, and metadata is left empty.
documents = [
    Document(
        document_id=str(row_label),
        content=text,
        mime_type="text/plain",
        metadata={},
    )
    for row_label, text in df["context"].items()
]

# Preview the first few documents.
documents[:5]

[{'document_id': '0',
  'content': ' "Goblin Market" is about two close sisters, Laura and Lizzie, as well as the goblins to whom the title refers.\nAlthough the sisters seem to be quite young, they live by themselves in a house, and are accustomed to draw water every evening from a stream. As the poem begins, twilight is falling, and as usual, the sisters hear the calls from the goblin merchants, who sell fruits in fantastic abundance, variety and savour. On this evening, Laura lingers at the stream after her sister has left for home, intrigued by the goblins\' strange manner and appearance. (Rossetti hints that the "goblin men" resemble animalsâ\x80\x94for example, having faces like wombats or cats, and possessing tails.) Longing for the goblin fruits but having no money, the impulsive Laura offers a lock of her hair and "a tear more rare than pearl."\nLaura gorges on the delicious fruit in a sort of bacchic frenzy, then once she is finished, after picking up one of the seeds, return

In [97]:
from tqdm import tqdm

# Send the documents to the memory bank one insert request at a time, with
# tqdm reporting progress over the list.
for document in tqdm(documents):
    client.memory.insert(
        bank_id=bank_id,
        documents=[document],
    )

  2%|█▍                                                                              | 2/115 [00:00<00:06, 16.58it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 115/115 [12:56<00:00,  6.75s/it]
