# Creating the Test Dataset

In [24]:
import pandas as pd

# Define the column names
column_names = 'IndexName NumArticles Splitter ChunkSize EmbeddingModel Query QueryType NumQueriesGenerated NumDocsPerQuery RerankCritique OrigQuery GenQueries DocsPerQuery'.split()

# Create an empty DataFrame with specified column names
test = pd.DataFrame(columns=column_names)

# Optionally, specify the types for each column if necessary
# Example types could be:
column_types = {
    'IndexName': 'string',  # String for textual data
    'NumArticles': 'int',   # Integer for numerical counts
    'Splitter': 'string',   # String for categorical text
    'ChunkSize': 'int',     # Integer for numerical counts
    'EmbeddingModel': 'string',  # String for textual data
    'Query': 'string',
    'QueryType': 'string',  # 'S' for Simple, 'C' for Complex, hence string
    'NumQueriesGenerated': 'int',    # Integer for numerical counts
    'NumDocsPerQuery': 'int',  # Integer for numerical counts
    'RerankCritique': 'string',  # 'R' for Rerank, 'C' for Critique, 'N' for Neither, hence string
    'OrigQuery': 'string',  # String for textual data
    'GenQueries': 'object', # List, hence object
    'DocsPerQuery': 'object' # List of lists, hence object
}

# Assign types to the DataFrame
test = test.astype(column_types)

# Example to add data which matches the types
test.loc[0] = {
    'IndexName': 'example-index',
    'NumArticles': 100,
    'Splitter': 'RecursiveCharacterTextSplitter',
    'ChunkSize': 500,
    'EmbeddingModel': 'text-embedding-3-small',
    'Query': 'What does Socrates think about death?',
    'QueryType': 'Simple',
    'NumQueriesGenerated': 5,
    'NumDocsPerQuery': 10,
    'RerankCritique': 'R',
    'OrigQuery': 'What does Socrates think about death?',
    'GenQueries': ['Query 1', 'Query 2', 'Query 3'],
    'DocsPerQuery': [[{'doc1': 'content1'}, {'doc2': 'content2'}], [{'doc3': 'content3'}], []]
}


In [25]:
test

Unnamed: 0,IndexName,NumArticles,Splitter,ChunkSize,EmbeddingModel,Query,QueryType,NumQueriesGenerated,NumDocsPerQuery,RerankCritique,OrigQuery,GenQueries,DocsPerQuery
0,example-index,100,RecursiveCharacterTextSplitter,500,text-embedding-3-small,What does Socrates think about death?,Simple,5,10,R,What does Socrates think about death?,"[Query 1, Query 2, Query 3]","[[{'doc1': 'content1'}, {'doc2': 'content2'}],..."


In [27]:
#test.to_csv('test_records.csv',index=False)

# Importing Data

In [28]:
import pandas as pd

In [29]:
test = pd.read_csv('test_records.csv')

In [30]:
test.head()

Unnamed: 0,IndexName,NumArticles,Splitter,ChunkSize,EmbeddingModel,Query,QueryType,NumQueriesGenerated,NumDocsPerQuery,RerankCritique,OrigQuery,GenQueries,DocsPerQuery
0,example-index,100,RecursiveCharacterTextSplitter,500,text-embedding-3-small,What does Socrates think about death?,Simple,5,10,R,What does Socrates think about death?,"['Query 1', 'Query 2', 'Query 3']","[[{'doc1': 'content1'}, {'doc2': 'content2'}],..."


In [15]:
df = pd.read_parquet('SEP.parquet')

In [16]:
df.head()

Unnamed: 0,Url,Title,Preamble,TOC,Text,Bib,Other Resources,Related,Copyright,BibTeX,Date,Authors,BibURL,Bib_Refined
0,https://plato.stanford.edu/archives/spr2024/en...,18th Century German Philosophy Prior to Kant,\n\nKant undoubtedly casts a long shadow in th...,\n\n\n1. Christian Thomasius\n\n1.1 Life and W...,\n1. Christian Thomasius\n1.1 Life and Works\n...,\nBibliography\nPrimary Literature\nBy Author\...,\nOther Internet Resources\n\nChristian-Wolff-...,"\nRelated Entries\n\naesthetics: German, in th...",\n\nCopyright © 2021 by\n\n\nCorey Dyck\n<cdyc...,"InCollection{sep-18thGerman-preKant,\n\tauthor...",2021,"[{'email': 'cdyck5@uwo.ca', 'name': 'Corey Dyc...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[Press, 1738, Tractatus de arte sobrie et\nacc..."
1,https://plato.stanford.edu/archives/spr2024/en...,Abduction,"\n\nIn the philosophical literature, the term ...",\n\n1. Abduction: The General Idea\n\n1.1 Dedu...,\n1. Abduction: The General Idea\n\nYou happen...,"\nBibliography\n\nAchinstein, P., 2001. The Bo...",\nOther Internet Resources\n[Please contact th...,\nRelated Entries\n\nepistemology: Bayesian |\...,\n\nCopyright © 2021 by\n\n\nIgor Douven\n<igo...,"InCollection{sep-abduction,\n\tauthor =\...",2021,"[{'email': 'igor.douven@paris-sorbonne.fr', 'n...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[\nBibliography\n\nAchinstein, P., 2001. The B..."
2,https://plato.stanford.edu/archives/spr2024/en...,Peter Abelard,\n\nPeter Abelard (1079–21 April 1142) [‘Abail...,\n\n\n1. Life and Works\n\n1.1 Life\n1.2 Works...,\n1. Life and Works\n1.1 Life\n\nAbelard’s lif...,\nBibliography\nPrimary texts in Latin\n\nCarm...,\nOther Internet Resources\n\nPierre Abelard o...,"\nRelated Entries\n\nAristotle, General Topics...",\n\nCopyright © 2022 by\n\n\nPeter King\n\nAnd...,"InCollection{sep-abelard,\n\tauthor =\t{...",2022,"[{'email': None, 'name': 'Peter King'}, {'emai...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[Fairweather, E. R., 1995, A Scholastic Miscel..."
3,https://plato.stanford.edu/archives/spr2024/en...,Abhidharma,\n\nThe first centuries after Śākyamuni Buddha...,\n\n1. Abhidharma: its origins and texts\n\n1....,\n1. Abhidharma: its origins and texts\n\nThe ...,\nBibliography\nPrimary Sources\n\nThe texts a...,\nOther Internet Resources\n\nAbhidharma trans...,\nRelated Entries\n\natomism: 17th to 20th cen...,\n\nCopyright © 2022 by\n\n\nNoa Ronkin\n<noa....,"InCollection{sep-abhidharma,\n\tauthor =...",2022,"[{'email': 'noa.ronkin@wolfson.oxon.org', 'nam...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[Bronkhorst, J., 2016, “Abhidharma and Indian\..."
4,https://plato.stanford.edu/archives/spr2024/en...,Abilities,"\n\nIn the accounts we give of one another, cl...",\n\n\n1. A taxonomy\n\n1.1 Dispositions and ot...,\n1. A taxonomy\n\nWhat is an ability? On one ...,"\nBibliography\n\nAlbritton, Rogers, 1985. “Fr...","\nOther Internet Resources\n\nHackl, Martin, 1...",\nRelated Entries\n\naction |\n compatibilism ...,\n\nCopyright © 2020 by\n\n\nJohn Maier\n<john...,"InCollection{sep-abilities,\n\tauthor =\...",2020,"[{'email': 'john@jmaier.net', 'name': 'John Ma...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[Oxford University Press, 1986, 67–80.\nOxford..."


In [17]:
df = df[:100]

In [19]:
len(df)

100

# Queries to Ask
We're going to ask 5 simple (though technical) questions. We also create 5 complex (and technical) queries -- hinging on various parts of an article, or various articles -- where the questions can be broken down to seversal subquestions.

# Creating the Indexes
Using the Indexing Pipeline. Parameters to change are mentioned in my physical notebook

# Changing RAG Parameters
We're using the advanced RAG pipepline from Notebook 7. Parameters to change are mentioned in the physical notebook.