In [1]:
import datasets

dataset = datasets.load_dataset("deepmind/narrativeqa")

dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 32747
    })
    test: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 10557
    })
    validation: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 3461
    })
})

In [2]:
from IPython.display import display

df = dataset["validation"].to_pandas()
df.head()

Unnamed: 0,document,question,answers
0,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'WHO NORMALLY DELIVERS THE OPENING PR...,"[{'text': 'THE ACTOR WEARING THE BLACK CLOAK',..."
1,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'WHAT NAME WAS CYNTHIA MORE FAMOUSLY ...,"[{'text': 'THE GODDESS DIANA', 'tokens': ['THE..."
2,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,"{'text': 'WHO DOES ECHO WEEP FOR?', 'tokens': ...","[{'text': 'NARCISSUS', 'tokens': ['NARCISSUS']..."
3,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'WHAT DOES A DRINK FROM NARCISSUS'S S...,"[{'text': 'FALL IN LOVE WITH THEMSELVES', 'tok..."
4,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,{'text': 'IN WHAT VALLEY DID THE SOLEMN REVELS...,"[{'text': 'GARGAPHIE IN GREECE', 'tokens': ['G..."


In [3]:
df['context'] = df['document'].apply(lambda x: x['text'])
df['question'] = df['question'].apply(lambda x: x['text'])
df.head()

Unnamed: 0,document,question,answers,context
0,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHO NORMALLY DELIVERS THE OPENING PROLOGUE IN ...,"[{'text': 'THE ACTOR WEARING THE BLACK CLOAK',...",ï»¿The Project Gutenberg EBook of Cynthia's Re...
1,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHAT NAME WAS CYNTHIA MORE FAMOUSLY KNOWN BY?,"[{'text': 'THE GODDESS DIANA', 'tokens': ['THE...",ï»¿The Project Gutenberg EBook of Cynthia's Re...
2,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHO DOES ECHO WEEP FOR?,"[{'text': 'NARCISSUS', 'tokens': ['NARCISSUS']...",ï»¿The Project Gutenberg EBook of Cynthia's Re...
3,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,WHAT DOES A DRINK FROM NARCISSUS'S SPRING CAUS...,"[{'text': 'FALL IN LOVE WITH THEMSELVES', 'tok...",ï»¿The Project Gutenberg EBook of Cynthia's Re...
4,{'id': '00fb61fa7bee266ad995e52190ebb73606b60b...,IN WHAT VALLEY DID THE SOLEMN REVELS OF CYNTHI...,"[{'text': 'GARGAPHIE IN GREECE', 'tokens': ['G...",ï»¿The Project Gutenberg EBook of Cynthia's Re...


In [4]:
df = df.rename(columns={'question': 'questions'})
df = df.groupby('context').agg({'questions': list}).reset_index()
df.head()

Unnamed: 0,context,questions
0,"<html>\n\n<head>\n<title>""Domino,"" by Richard ...",[Who planned the robbery that was being invest...
1,<html>\n<head><title>Airplane Script at IMSDb....,"[What is Ted Striker afraid of?, Why is Ted af..."
2,<html>\n<head><title>All About Steve Script at...,"[What is Mary Horowitz's job?, Who is Mary's b..."
3,<html>\n<head><title>American Psycho Script at...,"[Who is the first man Bateman muders?, Who is ..."
4,"<html>\n<head><title>American, The Script at I...","[What does Jack do for a living?, What is the ..."


In [5]:
# Check for duplicates in context column
duplicate_contexts = df['context'].duplicated().sum()
print(f"Number of duplicate contexts: {duplicate_contexts}")
df.shape

Number of duplicate contexts: 0


(115, 2)

In [6]:
# Filter out html documents - we are only interested in text documents
df = df.assign(html=df['context'].str.contains('<html>'))
df['html'].value_counts()
df = df[~df['html']].reset_index(drop=True)
df = df.drop('html', axis=1)
df.head()

Unnamed: 0,context,questions
0,ï»¿\n Project Gutenberg's Justice (Second Seri...,"[Who is the senior clerk at James How & Sons?,..."
1,ï»¿\nThe Project Gutenberg EBook of The Myster...,"[When does the story begin?, Where does the st..."
2,ï»¿*********The Project Gutenberg Etext of Cri...,"[Where does this story take place?, Who Socrat..."
3,"ï»¿Project Gutenberg etext, The Deliverance; A...",[Which U.S. state is the setting for the story...
4,"ï»¿Project Gutenberg's Armageddon--2419 A.D., ...",[How did Anthony (Buck Rogers) remain asleep f...


In [7]:
# df = df.sample(n=50, random_state=42)
# df.head()
# Get first 50 rows
df = df.iloc[:50]
df.shape

(50, 2)

In [8]:
import pandas as pd

# Save DataFrame to parquet file
df.to_parquet('data/narrativeqa.parquet')
df = pd.read_parquet('data/narrativeqa.parquet')
df.head()

Unnamed: 0,context,questions
0,ï»¿\n Project Gutenberg's Justice (Second Seri...,"[Who is the senior clerk at James How & Sons?,..."
1,ï»¿\nThe Project Gutenberg EBook of The Myster...,"[When does the story begin?, Where does the st..."
2,ï»¿*********The Project Gutenberg Etext of Cri...,"[Where does this story take place?, Who Socrat..."
3,"ï»¿Project Gutenberg etext, The Deliverance; A...",[Which U.S. state is the setting for the story...
4,"ï»¿Project Gutenberg's Armageddon--2419 A.D., ...",[How did Anthony (Buck Rogers) remain asleep f...


In [9]:
df['context_length'] = df['context'].apply(lambda x: len(x))
df.describe()

Unnamed: 0,context_length
count,50.0
mean,421282.9
std,393284.8
min,45910.0
25%,115514.2
50%,310338.0
75%,564249.0
max,1811502.0


In [10]:
from counter import get_and_increment_counter
from llama_stack_client import LlamaStackClient
from llama_stack_client.types.memory_insert_params import Document

client = LlamaStackClient(
    base_url="http://localhost:5001",
)

providers = client.providers.list()
memory_banks_response = client.memory_banks.list()

bank_id = f"bank_{get_and_increment_counter()}"
provider = providers["memory"][0]
client.memory_banks.register(
    memory_bank_id=bank_id,
    params={
        "embedding_model": "all-MiniLM-L6-v2",
        # Is the default for agent config: https://github.com/meta-llama/llama-stack/blob/66d8f4ffd126bff668434b314892a99fe854a034/llama_stack/providers/inline/agents/meta_reference/agent_instance.py#L668
        "chunk_size_in_tokens": 512,
    },
    provider_id=provider.provider_id,
)
bank_id

'bank_43'

In [11]:
documents = [
    Document(
        document_id=str(idx),
        content=context,
        mime_type="text/plain",
        metadata={},
    )
    for idx, context in zip(df.index, df["context"])
]
documents[:5]

[{'document_id': '0',
  'mime_type': 'text/plain',
  'metadata': {}},
 {'document_id': '1',
  'mime_type': 'text/plain',
  'metadata': {}},
 {'document_id': '2',
  'content': 'ï»¿*********The Project Gutenberg Etext of Crito, by Plato*********\n#16 in our series by Plato\n\nCopyright laws are changing all over the world, be sure to check\nthe copyright laws for your country before posting these files!!\n\nPlease take a look at the important information in this header.\nWe encourage you to keep this file on your own disk, keeping an\nelectronic path open for the next readers.  Do not remove this.\n\n\n**Welcome To The World of Free Plain Vanilla Electronic Texts**\n\n**Etexts Readable By Both Humans and By Computers, Since 1971**\n\n*These Etexts Prepared By Hundreds of Volunteers and Donations*\n\nInformation on contacting Project Gutenberg to get Etexts, and\nfurther information is included below.  We need your donations.\n\n\nCrito\n\nby Plato\n\nTranslated by Benjamin Jowett\n\nMarc

In [12]:
from tqdm import tqdm

for i in tqdm(range(len(documents))):
    client.memory.insert(
        bank_id=bank_id,
        documents=[documents[i]],
    )

  0%|          | 0/50 [00:05<?, ?it/s]


InternalServerError: Error code: 500 - {'detail': 'Internal server error: An unexpected error occurred.'}