# Creating the Test Dataset

In [70]:
import pandas as pd

# Column order of the experiment-tracking table (one row per index / RAG configuration).
column_names = [
    'IndexName', 'NumArticles', 'Splitter', 'ChunkSize', 'EmbeddingModel',
    'Query', 'QueryType', 'NumQueriesGenerated', 'NumDocsPerQuery',
    'RerankCritique', 'OrigQuery', 'GenQueries', 'DocsPerQuery',
]

# Target dtype of every column. The nullable extension dtypes ('string',
# 'Int64') are used so that cells which are not filled in yet can hold <NA>
# without degrading the integer columns to float/object.
column_types = {
    'IndexName': 'string',            # Name of the vector index
    'NumArticles': 'Int64',           # Number of articles that were indexed
    'Splitter': 'string',             # Name of the text splitter class
    'ChunkSize': 'Int64',             # Chunk size passed to the splitter
    'EmbeddingModel': 'string',       # Name of the embedding model
    'Query': 'string',
    'QueryType': 'string',            # Query category, e.g. 'Simple' (see the example row) or 'Complex'
    'NumQueriesGenerated': 'Int64',   # Number of generated queries
    'NumDocsPerQuery': 'Int64',       # Number of documents retrieved per query
    'RerankCritique': 'string',       # 'R' for Rerank, 'C' for Critique, 'N' for Neither
    'OrigQuery': 'string',            # The original user query
    'GenQueries': 'object',           # List of generated queries, hence object
    'DocsPerQuery': 'object',         # List of lists of documents, hence object
}

# Example record whose values match the declared types.
example_record = {
    'IndexName': 'example-index',
    'NumArticles': 100,
    'Splitter': 'RecursiveCharacterTextSplitter',
    'ChunkSize': 500,
    'EmbeddingModel': 'text-embedding-3-small',
    'Query': 'What does Socrates think about death?',
    'QueryType': 'Simple',
    'NumQueriesGenerated': 5,
    'NumDocsPerQuery': 10,
    'RerankCritique': 'R',
    'OrigQuery': 'What does Socrates think about death?',
    'GenQueries': ['Query 1', 'Query 2', 'Query 3'],
    'DocsPerQuery': [[{'doc1': 'content1'}, {'doc2': 'content2'}], [{'doc3': 'content3'}], []]
}

# Build the frame from the record(s) first and cast afterwards. Casting an
# empty frame and then adding a row with `.loc` makes pandas re-infer the
# dtypes from the new row, so the declared types would be silently lost.
test = pd.DataFrame([example_record], columns=column_names).astype(column_types)


In [71]:
test

Unnamed: 0,IndexName,NumArticles,Splitter,ChunkSize,EmbeddingModel,Query,QueryType,NumQueriesGenerated,NumDocsPerQuery,RerankCritique,OrigQuery,GenQueries,DocsPerQuery
0,example-index,100,RecursiveCharacterTextSplitter,500,text-embedding-3-small,What does Socrates think about death?,Simple,5,10,R,What does Socrates think about death?,"[Query 1, Query 2, Query 3]","[[{'doc1': 'content1'}, {'doc2': 'content2'}],..."


In [72]:
#test.to_csv('test_records.csv',index=False)

# Importing Data

In [153]:
import pandas as pd

In [154]:
test = pd.read_csv('test_records.csv')

In [155]:
test.head()

Unnamed: 0,IndexName,NumArticles,Splitter,ChunkSize,EmbeddingModel,Query,QueryType,NumQueriesGenerated,NumDocsPerQuery,RerankCritique,OrigQuery,GenQueries,DocsPerQuery,Dimensions
0,chunk200-text-embedding-3-small,100,RecursiveCharacterTextSplitter,200,text-embedding-3-small,Query,,,,,,"['Query 1', 'Query 2', 'Query 3']","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",1536
1,naive-rag-chunk400-text-embedding-3-small-cos,100,RecursiveCharacterTextSplitter,400,text-embedding-3-small,Query,,,,,,"['Query 1', 'Query 2', 'Query 3']","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",1536
2,rag-test-3,100,RecursiveCharacterTextSplitter,800,text-embedding-3-small,Query,,,,,,"['Query 1', 'Query 2', 'Query 3']","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",1536
3,chunk-1000-text-embedding-3-small,100,RecursiveCharacterTextSplitter,1000,text-embedding-3-small,Query,,,,,,"['Query 1', 'Query 2', 'Query 3']","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",1536


In [76]:
# Load the articles stored in SEP.parquet and attach a 1-based ID computed
# from each row's index label.
df = pd.read_parquet('SEP.parquet')
df = df.assign(ID=df.index + 1)

In [77]:
df.head()

Unnamed: 0,Url,Title,Preamble,TOC,Text,Bib,Other Resources,Related,Copyright,BibTeX,Date,Authors,BibURL,Bib_Refined,ID
0,https://plato.stanford.edu/archives/spr2024/en...,18th Century German Philosophy Prior to Kant,\n\nKant undoubtedly casts a long shadow in th...,\n\n\n1. Christian Thomasius\n\n1.1 Life and W...,\n1. Christian Thomasius\n1.1 Life and Works\n...,\nBibliography\nPrimary Literature\nBy Author\...,\nOther Internet Resources\n\nChristian-Wolff-...,"\nRelated Entries\n\naesthetics: German, in th...",\n\nCopyright © 2021 by\n\n\nCorey Dyck\n<cdyc...,"InCollection{sep-18thGerman-preKant,\n\tauthor...",2021,"[{'email': 'cdyck5@uwo.ca', 'name': 'Corey Dyc...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[Press, 1738, Tractatus de arte sobrie et\nacc...",1
1,https://plato.stanford.edu/archives/spr2024/en...,Abduction,"\n\nIn the philosophical literature, the term ...",\n\n1. Abduction: The General Idea\n\n1.1 Dedu...,\n1. Abduction: The General Idea\n\nYou happen...,"\nBibliography\n\nAchinstein, P., 2001. The Bo...",\nOther Internet Resources\n[Please contact th...,\nRelated Entries\n\nepistemology: Bayesian |\...,\n\nCopyright © 2021 by\n\n\nIgor Douven\n<igo...,"InCollection{sep-abduction,\n\tauthor =\...",2021,"[{'email': 'igor.douven@paris-sorbonne.fr', 'n...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[\nBibliography\n\nAchinstein, P., 2001. The B...",2
2,https://plato.stanford.edu/archives/spr2024/en...,Peter Abelard,\n\nPeter Abelard (1079–21 April 1142) [‘Abail...,\n\n\n1. Life and Works\n\n1.1 Life\n1.2 Works...,\n1. Life and Works\n1.1 Life\n\nAbelard’s lif...,\nBibliography\nPrimary texts in Latin\n\nCarm...,\nOther Internet Resources\n\nPierre Abelard o...,"\nRelated Entries\n\nAristotle, General Topics...",\n\nCopyright © 2022 by\n\n\nPeter King\n\nAnd...,"InCollection{sep-abelard,\n\tauthor =\t{...",2022,"[{'email': None, 'name': 'Peter King'}, {'emai...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[Fairweather, E. R., 1995, A Scholastic Miscel...",3
3,https://plato.stanford.edu/archives/spr2024/en...,Abhidharma,\n\nThe first centuries after Śākyamuni Buddha...,\n\n1. Abhidharma: its origins and texts\n\n1....,\n1. Abhidharma: its origins and texts\n\nThe ...,\nBibliography\nPrimary Sources\n\nThe texts a...,\nOther Internet Resources\n\nAbhidharma trans...,\nRelated Entries\n\natomism: 17th to 20th cen...,\n\nCopyright © 2022 by\n\n\nNoa Ronkin\n<noa....,"InCollection{sep-abhidharma,\n\tauthor =...",2022,"[{'email': 'noa.ronkin@wolfson.oxon.org', 'nam...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[Bronkhorst, J., 2016, “Abhidharma and Indian\...",4
4,https://plato.stanford.edu/archives/spr2024/en...,Abilities,"\n\nIn the accounts we give of one another, cl...",\n\n\n1. A taxonomy\n\n1.1 Dispositions and ot...,\n1. A taxonomy\n\nWhat is an ability? On one ...,"\nBibliography\n\nAlbritton, Rogers, 1985. “Fr...","\nOther Internet Resources\n\nHackl, Martin, 1...",\nRelated Entries\n\naction |\n compatibilism ...,\n\nCopyright © 2020 by\n\n\nJohn Maier\n<john...,"InCollection{sep-abilities,\n\tauthor =\...",2020,"[{'email': 'john@jmaier.net', 'name': 'John Ma...",https://plato.stanford.edu/cgi-bin/encyclopedi...,"[Oxford University Press, 1986, 67–80.\nOxford...",5


In [78]:
# Keep only the first 100 rows. `.copy()` turns the subset into an independent
# frame, so later column assignments on `df` do not write into a slice of the
# full table (which is what triggers pandas' SettingWithCopyWarning).
df = df.iloc[:100].copy()

In [79]:
len(df)

100

In [28]:
# Recompute the 1-based article ID from the index labels of the current `df`.
# NOTE(review): the same assignment already runs right after the parquet load,
# and this cell carries an older execution count; it looks redundant unless the
# index was changed in between -- confirm before relying on it.
df['ID'] = df.index + 1

# Queries to Ask
We're going to ask 5 simple (though technical) questions. We also create 5 complex (and technical) queries -- each hinging on several parts of one article, or on several articles -- that can be broken down into several subquestions.

# Creating the Indexes
Using the Indexing Pipeline. Parameters to change are mentioned in my physical notebook

In [80]:
import tiktoken
import re

# create the length function
def tiktoken_len(text):
    """Return the number of tokens in ``text``.

    The tokenizer is the encoding tiktoken associates with the
    'gpt-3.5-turbo' model. It is resolved once and cached on the function
    object, because this function is used as a splitter length function and
    is therefore called many times.

    Args:
        text: The string to measure.

    Returns:
        int: Number of tokens produced by encoding ``text``.
    """
    tokenizer = getattr(tiktoken_len, '_tokenizer', None)
    if tokenizer is None:
        # encoding_for_model already returns the encoding object, so there is
        # no need to parse its name out of the repr and look it up again.
        tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
        tiktoken_len._tokenizer = tokenizer

    # disallowed_special=() lets text that happens to contain special-token
    # strings be encoded instead of raising (see tiktoken's encode docs).
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [161]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

class TextSplitter:
    """Interactively configured wrapper around a text splitter.

    The constructor asks on the console for the splitter type (only
    RecursiveCharacterTextSplitter is supported), the chunk size, the chunk
    overlap and, optionally, custom separators. Lengths are measured with
    ``tiktoken_len``.

    Attributes:
        text_splitter: The configured splitter instance.

    Raises:
        ValueError: If an unsupported splitter name is entered, or if the
            chunk size / overlap input is not an integer.
    """

    # Escape sequences a user may type at the prompt, mapped to the real characters.
    _TYPED_ESCAPES = (('\\n', '\n'), ('\\t', '\t'), ('\\r', '\r'))

    def __init__(self):
        # Set default splitter and prompt the user for a change
        print("The default splitter is RecursiveCharacterTextSplitter.")
        change_splitter = input("Do you want to use a different splitter? (type 'yes', otherwise hit enter): ")
        if change_splitter.lower().strip() == 'yes':
            splitter_input = input("Available splitter: RecursiveCharacterTextSplitter. Please enter the splitter you want to use: ")
            if splitter_input.strip() != 'RecursiveCharacterTextSplitter':
                raise ValueError("Unsupported splitter type")
        # Both the default and the only accepted answer resolve to the same class.
        splitter = RecursiveCharacterTextSplitter

        # Prompt user for chunk size
        chunk_size = int(input("Enter chunk size (e.g., 400): "))

        # Prompt user for chunk overlap
        chunk_overlap = int(input("Enter chunk overlap size (e.g., 20): "))

        # Set default separators and offer to change them
        default_separators = ["\n\n", "\n", " ", ""]
        print("Default separators are: ['\\n\\n', '\\n', ' ', '']")
        change_separators = input("Do you want to change the default separators? (type 'yes', otherwise hit enter): ")
        if change_separators.lower().strip() == 'yes':
            typed = input("Enter separators (seprate them by space): ").split()
            # Text typed at a prompt contains a literal backslash followed by
            # 'n', not a newline; decode it so "\n\n" really splits on blank lines.
            separators = [self._decode_escapes(token) for token in typed]
        else:
            separators = default_separators

        self.text_splitter = splitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=tiktoken_len,  # token-based length
            separators=separators
        )

    @classmethod
    def _decode_escapes(cls, token):
        """Replace typed escape sequences (``\\n``, ``\\t``, ``\\r``) with the real characters."""
        for typed, actual in cls._TYPED_ESCAPES:
            token = token.replace(typed, actual)
        return token

    def split_text(self, text):
        """Split ``text`` with the configured splitter and return its result."""
        return self.text_splitter.split_text(text)

#if __name__ == '__main__':
    #text_splitter = TextSplitter()
    #text = text_splitter.split_text(df['Text'].iloc[0])

In [117]:
text[0]

'1. Christian Thomasius\n1.1 Life and Works\n\nChristian Thomasius was born on 1 January 1655 in Leipzig. He was the\nson of Jakob Thomasius (1622–84), a well-known jurist and\nphilosopher at the University of Leipzig who counted Leibniz among his\nstudents. Christian (hereafter simply ‘Thomasius’)\nmatriculated in the philosophy faculty at Leipzig in 1669, and was\npromoted to Magister artium in 1672. As a result of his\nfather’s lectures, particularly on Hugo Grotius’ De\njure belli ac pacis, and his interest in Samuel Pufendorf’s\nDe jure naturae et gentium, Thomasius took up the study of\nlaw in Frankfurt an der Oder in 1675 and was awarded a doctorate in\n1679. After a brief journey to Holland, Thomasius returned to Leipzig\nwhere he worked (unhappily) as a lawyer while also holding private\nlectures on natural jurisprudence. Thomasius attests to the\nfundamental reorientation of his thinking effected by his reading of\nPufendorf, and the Apologia pro se et suo libro (1674) in\npa

In [111]:
len(text)

11

In [162]:
from langchain_openai import OpenAIEmbeddings
from langchain_cohere import CohereEmbeddings
import getpass

class EmbedCreator:
    """Interactively configured embedding client (OpenAI or Cohere).

    The constructor asks on the console which provider and model to use and
    builds the matching LangChain embeddings object.

    API keys are never stored in the notebook: the "default" key is read from
    the ``OPENAI_API_KEY`` / ``COHERE_API_KEY`` environment variable, and the
    user is prompted (without echo) when they ask to enter a key or when the
    variable is not set.

    Attributes:
        embed: The configured ``OpenAIEmbeddings`` or ``CohereEmbeddings`` object.
    """

    # Menu number -> OpenAI embedding model name.
    _OPENAI_MODELS = {1: 'text-embedding-3-small', 2: 'text-embedding-3-large'}

    def __init__(self):
        provider = input("Are you using OpenAI or Cohere embeddings? (Hit enter for opeanai and type 'cohere' otherwise.)")

        if provider.lower().strip() == 'cohere':
            print('Available: [embed-english-light-v2.0, embed-english-light-v3.0]')
            model_name = input('Which model? ')
            use_default_key = input("Use default Cohere API key? (yes/no): ").lower().strip()
            api_key = self._resolve_api_key('COHERE_API_KEY', use_default=(use_default_key == 'yes'))

            self.embed = CohereEmbeddings(
                model=model_name,
                cohere_api_key=api_key)
        else:
            # Anything other than 'cohere' (including just hitting enter) means OpenAI.
            print('1. text-embedding-3-small\n2. text-embedding-3-large')
            model_name = self._ask_openai_model()

            change_key = input("Change default OpenAI API key? (hit enter of no, and type 'yes' otherwise): ").lower().strip()
            dimension = int(input("Enter the dimension of the embedding model (e.g., 256, 512, 1024, 3072): "))
            api_key = self._resolve_api_key('OPENAI_API_KEY', use_default=(change_key != 'yes'))

            self.embed = OpenAIEmbeddings(
                model=model_name,
                openai_api_key=api_key,
                dimensions=dimension)

    @classmethod
    def _ask_openai_model(cls):
        """Prompt until the user picks a valid entry of the OpenAI model menu."""
        while True:
            answer = input('Which model number (type the number)? ').strip()
            try:
                choice = int(answer)
            except ValueError:
                choice = None
            if choice in cls._OPENAI_MODELS:
                return cls._OPENAI_MODELS[choice]
            # Re-prompt instead of continuing with an undefined model name.
            print("Enter only the number.")

    @staticmethod
    def _resolve_api_key(env_var, use_default):
        """Return the key from ``env_var`` when requested and set, else prompt for it."""
        import os  # local import: only needed for the environment lookup

        if use_default:
            key = os.environ.get(env_var)
            if key:
                return key
            print(f"Environment variable {env_var} is not set; please enter the key.")
        return getpass.getpass()

    def embed_documents(self, texts):
        """Embed ``texts`` with the configured client and return its result."""
        return self.embed.embed_documents(texts)

In [163]:
from pinecone import Pinecone, ServerlessSpec
from tqdm.autonotebook import tqdm
import time
import getpass

class VectorDB:
    """Interactive helper around a Pinecone client for managing one index.

    The constructor asks on the console for the API key source and the
    serverless cloud/region. The key is never stored in the notebook: the
    default comes from the ``PINECONE_API_KEY`` environment variable and the
    user is prompted (without echo) when they ask to enter one or when the
    variable is not set.

    Attributes:
        pc: The Pinecone client.
        spec: ``ServerlessSpec`` used when creating indexes.
        index_name: Name entered in ``create_index`` (``None`` until then).
        index: Handle returned by ``connect_to_index`` (``None`` until then).
    """

    def __init__(self):
        import os  # local import: only needed for the environment lookup

        # Defined up front so the other methods can test them safely.
        self.index_name = None
        self.index = None

        # Setup API key
        api = input("Do you want to change your Pinecone API key? (hit enter of no, and type 'yes' otherwise) ")
        api_key = None
        if api.lower().strip() != 'yes':
            api_key = os.environ.get('PINECONE_API_KEY')
            if not api_key:
                print("Environment variable PINECONE_API_KEY is not set; please enter the key.")
        if not api_key:
            api_key = getpass.getpass()

        # Initialize Pinecone client
        self.pc = Pinecone(api_key=api_key)

        # Default cloud provider and region
        print("Cloud provider default: AWS")
        print("Cloud region default: us-west-2")

        # Optionally change cloud provider or region
        cloud_specs = input("Do you want to change your Pinecone cloud provider or region? (hit enter of no, and type 'yes' otherwise) ")
        if cloud_specs.lower().strip() == 'yes':
            cloud = input("What provider? ")
            region = input("What region? ")
            self.spec = ServerlessSpec(cloud=cloud, region=region)
        else:
            self.spec = ServerlessSpec(cloud="aws", region="us-west-2")

    def _index_exists(self, index_name):
        """Return True if ``index_name`` is non-empty and listed by the client."""
        if not index_name:
            return False
        return index_name in [index['name'] for index in self.list_indexes()]

    def list_indexes(self):
        """Return the client's listing of existing indexes."""
        return self.pc.list_indexes()

    def list_cloud(self, index_name=None):
        """Print the cloud provider and region for an existing index.

        The values printed are the ones held in ``self.spec`` on this
        instance; they are not queried from the service.

        Args:
            index_name: Index to report on; defaults to ``self.index_name``.
        """
        index_name = index_name or self.index_name
        if self._index_exists(index_name):
            print(f"Index '{index_name}' is configured on:")
            print(f"Cloud Provider: {self.spec.cloud}")
            print(f"Cloud Region: {self.spec.region}")
        else:
            print(f"Index '{index_name}' does not exist.")

    def create_index(self):
        """Prompt for name, dimension and metric, then create the index if it is missing.

        Blocks (polling once per second) until the new index reports ready.

        Raises:
            ValueError: If the dimension input is not an integer.
        """
        self.index_name = input("Enter the name of the index (e.g.: naive-rag-chunk400-text-embedding-3-small): ")
        dimension = int(input("Enter the dimension of the index matching the embeddeing model used (e.g., 256, 512, 1024, 3072): "))
        # An empty answer selects the default metric.
        metric = input("Enter the metric ('euclidean', 'cosine', 'dotproduct') -- hit enter for the default of cosine: ").strip() or 'cosine'

        if self._index_exists(self.index_name):
            print(f"Index '{self.index_name}' already exists. No action taken.")
            return

        print(f"Creating index '{self.index_name}'...")
        self.pc.create_index(
            self.index_name,
            dimension=dimension,
            metric=metric,
            spec=self.spec
        )
        while not self.pc.describe_index(self.index_name).status['ready']:
            # `time` is the module imported by this cell, not an attribute of the instance.
            time.sleep(1)
        print(f"Index '{self.index_name}' created and is now ready.")

    def connect_to_index(self):
        """Connect to ``self.index_name`` and return the index handle.

        Raises:
            ValueError: If no index name has been set or the index does not exist.
        """
        if not self._index_exists(self.index_name):
            raise ValueError(f"Index '{self.index_name}' does not exist.")
        self.index = self.pc.Index(self.index_name)
        print(f"Connected to index '{self.index_name}'.")
        return self.index

    def delete_index(self, index_name=None):
        """Delete an index if it exists, otherwise print a notice.

        Args:
            index_name: Index to delete; defaults to ``self.index_name``.
        """
        index_name = index_name or self.index_name
        if self._index_exists(index_name):
            self.pc.delete_index(index_name)
            print(f"Index '{index_name}' has been deleted.")
        else:
            print(f"Index '{index_name}' does not exist.")

In [189]:
from tqdm import tqdm
from uuid import uuid4
import pandas as pd  # Assuming you're using a DataFrame

def divide_into_three(lst):
    """Cut ``lst`` into three consecutive slices.

    The first two slices each hold ``len(lst) // 3`` items; the third holds
    everything that is left, so it also absorbs the remainder. For inputs
    shorter than three items the first two slices are empty.

    Returns:
        list: ``[first, second, rest]``.
    """
    third = len(lst) // 3
    boundaries = (0, third, 2 * third, None)
    return [lst[start:stop] for start, stop in zip(boundaries, boundaries[1:])]

# Define a class to encapsulate the pipeline
class TextProcessingPipeline:
    """Split, embed and upsert the articles of a DataFrame into a vector index.

    Constructing the pipeline immediately runs the interactive setup of the
    text splitter, the embedding client and the vector database.

    Args:
        dataframe: Frame providing the columns read in ``process_texts``:
            'ID', 'Url', 'Title', 'Authors', 'BibURL', 'Date' and 'Text'.
    """

    def __init__(self, dataframe):
        self.df = dataframe
        self.setup_components()

    def setup_components(self):
        """Interactively create the splitter, embedder and index connection."""
        print('Setting up the text splitter..')
        print('------------------')
        self.text_splitter = TextSplitter()

        print('\nSetting up the embedding model..')
        print('------------------')
        self.embed = EmbedCreator()

        print('\nSetting up the vector database..')
        print('------------------')
        self.vector_db = VectorDB()
        self.vector_db.create_index()
        self.index = self.vector_db.connect_to_index()

    def process_texts(self, batch_limit=100):
        """Chunk every article and upsert the chunks in batches.

        Chunks are accumulated across articles; once at least ``batch_limit``
        chunks are pending they are sent, and whatever is left after the last
        article is sent at the end.

        Args:
            batch_limit: Minimum number of pending chunks that triggers a flush.
        """
        all_texts = []
        all_metadatas = []

        # Process each row in the dataframe
        for i in tqdm(range(len(self.df))):
            metadata = {
                'article_id': str(self.df['ID'].iloc[i]),
                'source': self.df['Url'].iloc[i],
                'title': self.df['Title'].iloc[i],
                'authors': (self.df['Authors'].iloc[i]),
                'citation': self.df['BibURL'].iloc[i],
                'date': self.df['Date'].iloc[i]
            }

            # Split text into chunks and create metadata for each chunk
            record_texts = self.text_splitter.split_text(self.df['Text'].iloc[i])
            record_metadatas = [{'chunk': j, 'text': text, **metadata} for j, text in enumerate(record_texts)]

            all_texts.extend(record_texts)
            all_metadatas.extend(record_metadatas)

            if len(all_texts) >= batch_limit:
                self._flush(all_texts, all_metadatas)
                all_texts = []
                all_metadatas = []

        if all_texts:
            self._flush(all_texts, all_metadatas)

    def _flush(self, texts, metadatas):
        """Send one accumulated batch as up to three smaller embed/upsert calls."""
        pairs = list(zip(texts, metadatas))  # Pair texts with metadatas
        for sub_pairs in divide_into_three(pairs):
            if not sub_pairs:
                # A batch of fewer than three chunks leaves some thirds empty;
                # unpacking zip(*[]) would raise, and there is nothing to send.
                continue
            sub_texts, sub_metadatas = zip(*sub_pairs)  # Unzip pairs back into texts and metadatas
            self.embed_and_upsert(sub_texts, sub_metadatas)

    def embed_and_upsert(self, texts, metadatas):
        """Embed ``texts`` and upsert them with fresh UUIDs and their metadata.

        Args:
            texts: Sequence of chunk strings.
            metadatas: Sequence of metadata dicts, aligned with ``texts``.
        """
        texts = list(texts)
        if not texts:
            return
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = self.embed.embed_documents(texts)
        # Materialise the triples so the client receives a real list, not a one-shot iterator.
        self.index.upsert(vectors=list(zip(ids, embeds, metadatas)))

# Example of how to use the pipeline
if __name__ == '__main__':
    #df = pd.read_csv('path_to_your_data.csv')  # Load your data into a DataFrame
    # Constructing the pipeline calls setup_components() right away, i.e. it
    # starts the interactive prompts for splitter, embedding model and index.
    pipeline = TextProcessingPipeline(df)

Setting up the text splitter..
------------------
The default splitter is RecursiveCharacterTextSplitter.
Do you want to use a different splitter? (type 'yes', otherwise hit enter): 
Enter chunk size (e.g., 400): 1500
Enter chunk overlap size (e.g., 20): 20
Default separators are: ['\n\n', '\n', ' ', '']
Do you want to change the default separators? (type 'yes', otherwise hit enter): 

Setting up the embedding model..
------------------
Are you using OpenAI or Cohere embeddings? (Hit enter for opeanai and type 'cohere' otherwise.)
1. text-embedding-3-small
2. text-embedding-3-large
Which model number (type the number)? 2
Change default OpenAI API key? (hit enter of no, and type 'yes' otherwise): 
Enter the dimension of the embedding model (e.g., 256, 512, 1024, 3072): 3072

Setting up the vector database..
------------------
Do you want to change your Pinecone API key? (hit enter of no, and type 'yes' otherwise) 
Cloud provider default: AWS
Cloud region default: us-west-2
Do you want t

In [190]:
# For smaller text splits (e.g., 200 vs 400 or 800), use smaller batch_limit. Default is 100, I'm using 99 for splits of 200

pipeline.process_texts(batch_limit=99)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:17<00:00,  1.37s/it]


In [191]:
import os
from pinecone import Pinecone

# Read the key from the environment instead of embedding it in the notebook;
# a missing PINECONE_API_KEY fails loudly with a KeyError.
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'chunk-1000-text-embedding-3-small-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'cosine',
              'name': 'chunk-1000-text-embedding-3-small',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1024,
              'host': 'chunk1000-text-embedding-3-large-1024-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'cosine',
              'name': 'chunk1000-text-embedding-3-large-1024',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 3072,
              'host': 'chunk1500-text-embedding-3-large-3072-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'cosine',
              'name': 'chunk1500-text-embedding-3-large-3072',
              'spec': {'serverless

In [192]:
#pc.delete_index('chunk400-text-embedding-3-large')

### Adding the index info to the test dataframe

In [None]:
import pandas as pd

test = pd.read_csv('test_records.csv')

In [212]:
test

Unnamed: 0,IndexName,NumArticles,Splitter,ChunkSize,EmbeddingModel,Query,QueryType,NumQueriesGenerated,NumDocsPerQuery,RerankCritique,OrigQuery,GenQueries,DocsPerQuery,Dimensions
0,chunk200-text-embedding-3-small,100,RecursiveCharacterTextSplitter,200,text-embedding-3-small,Query,,,,,,"['Query 1', 'Query 2', 'Query 3']","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",1536
1,naive-rag-chunk400-text-embedding-3-small-cos,100,RecursiveCharacterTextSplitter,400,text-embedding-3-small,Query,,,,,,"['Query 1', 'Query 2', 'Query 3']","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",1536
2,rag-test-3,100,RecursiveCharacterTextSplitter,800,text-embedding-3-small,Query,,,,,,"['Query 1', 'Query 2', 'Query 3']","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",1536
3,chunk-1000-text-embedding-3-small,100,RecursiveCharacterTextSplitter,1000,text-embedding-3-small,Query,,,,,,"['Query 1', 'Query 2', 'Query 3']","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",1536
4,chunk400-text-embedding-3-large-512,100,RecursiveCharacterTextSplitter,400,text-embedding-3-large,Query,,,,,,"[Query 1, Query 2, Query 3]","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",512
5,chunk800-text-embedding-3-large-512,100,RecursiveCharacterTextSplitter,800,text-embedding-3-large,Query,,,,,,"[Query 1, Query 2, Query 3]","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",512
6,chunk1000-text-embedding-3-large-1024,100,RecursiveCharacterTextSplitter,1000,text-embedding-3-large,Query,,,,,,"[Query 1, Query 2, Query 3]","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",1024
7,chunk1500-text-embedding-3-large-3072,100,RecursiveCharacterTextSplitter,1500,text-embedding-3-large,Query,,,,,,"[Query 1, Query 2, Query 3]","[[{'doc1': 'content1'}, {'doc2': 'content2'}],...",3072


In [193]:
# Write the record describing the newly built index into row 7 of the
# tracking table. The fields marked CHANGE are edited by hand for every index.
test.loc[7] = dict(
    IndexName='chunk1500-text-embedding-3-large-3072',  #------------------------ CHANGE
    NumArticles=100,
    Splitter='RecursiveCharacterTextSplitter',
    ChunkSize=1500,  #------------------------ CHANGE
    EmbeddingModel='text-embedding-3-large',  #------------------------ CHANGE
    Query='Query',
    QueryType=None,
    NumQueriesGenerated=None,
    NumDocsPerQuery=None,
    RerankCritique=None,
    OrigQuery=None,
    GenQueries=['Query 1', 'Query 2', 'Query 3'],
    DocsPerQuery=[[{'doc1': 'content1'}, {'doc2': 'content2'}], [{'doc3': 'content3'}], []],
    Dimensions=3072,  #------------------------ CHANGE
)

In [213]:
#test['EmbeddingModel'][7]

In [195]:
#test.drop(['Unnamed: 0.1', 'Unnamed: 0'],axis=1,inplace=True)

In [214]:
test.to_csv('test_records.csv',index=False)

# Changing RAG Parameters
We're using the advanced RAG pipeline from Notebook 7. Parameters to change are mentioned in the physical notebook.