In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_parquet('SEP.parquet')

In [None]:
df = df[:100] # Change if necessary; this small fraction is for testing purposes

In [None]:
df.head(2)

In [None]:
df['ID'] = df.index + 1
df['Title'] = df['Title'].astype(str)
df['Text'] = df['Text'].astype(str)
df['Bib_Refined'] = df['Bib_Refined'].astype(str)
df['Other Resources'] = df['Other Resources'].astype(str)
df['Related'] = df['Related'].astype(str)
df['Authors'] = df['Authors'].apply(lambda my_list: {item['name']: item['email'] for item in my_list})
df['Authors'] = df['Authors'].apply(lambda my_list: [name + ' --- ' + (email if email is not None else 'No email provided') for name, email in my_list.items()])
df['FINAL_TEXT'] = "Table of Content: " + df['TOC'] + "\n\n" + "Text: " + df['Text'] + "\n\n" + "Bibliography: " + df['Bib_Refined'] + "\n\n" + "Other Resources: " + df['Other Resources'] 

# Splitting the Text

In [129]:
import re

input_string = str(tiktoken.encoding_for_model('gpt-3.5-turbo'))
match = re.search(r"'(.*?)'", input_string)
result = match.group(1)
result

'cl100k_base'

In [130]:
import tiktoken
import re

# create the length function
def tiktoken_len(text):
    input_string = str(tiktoken.encoding_for_model('gpt-3.5-turbo'))
    match = re.search(r"'(.*?)'", input_string)
    result = match.group(1)

    tokenizer = tiktoken.get_encoding(result)
    
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [131]:
tiktoken_len('hiiii ssasd')

5

In [118]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

class TextSplitter:
    def __init__(self):
        # Set default splitter and prompt the user for a change
        print("The default splitter is RecursiveCharacterTextSplitter.")
        change_splitter = input("Do you want to use a different splitter? (yes/no): ").lower()
        if change_splitter.lower().strip() == 'yes':
            splitter_input = input("Available splitter: RecursiveCharacterTextSplitter. Please enter the splitter you want to use: ")
            if splitter_input == 'RecursiveCharacterTextSplitter':
                splitter = RecursiveCharacterTextSplitter
            else:
                raise ValueError("Unsupported splitter type")
        else:
            splitter = RecursiveCharacterTextSplitter
        
        # Prompt user for chunk size
        chunk_size = int(input("Enter chunk size (e.g., 400): "))
        
        # Prompt user for chunk overlap
        chunk_overlap = int(input("Enter chunk overlap size (e.g., 20): "))
        
        # Assuming 'tiktoken_len' is the length function to be used
        length_function = tiktoken_len
        
        # Set default separators and offer to change them
        default_separators = ["\n\n", "\n", " ", ""]
        print("Default separators are: ['\\n\\n', '\\n', ' ', '']")
        change_separators = input("Do you want to change the default separators? (yes/no): ").lower()
        if change_separators.lower().strip() == 'yes':
            separators = input("Enter separators (seprate them by space): ").split()
        else:
            separators = default_separators
        
        self.text_splitter = splitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
            separators=separators
        )
    
    def split_text(self, text):
        return self.text_splitter.split_text(text)

In [None]:
splitter = TextSplitter()
text = df['FINAL_TEXT'][0]
result = splitter.split_text(text)

In [71]:
result[1]

'Text: \n1. Christian Thomasius\n1.1 Life and Works\n\nChristian Thomasius was born on 1 January 1655 in Leipzig. He was the\nson of Jakob Thomasius (1622–84), a well-known jurist and\nphilosopher at the University of Leipzig who counted Leibniz among his\nstudents. Christian (hereafter simply ‘Thomasius’)\nmatriculated in the philosophy faculty at Leipzig in 1669, and was\npromoted to Magister artium in 1672. As a result of his\nfather’s lectures, particularly on Hugo Grotius’ De\njure belli ac pacis, and his interest in Samuel Pufendorf’s\nDe jure naturae et gentium, Thomasius took up the study of\nlaw in Frankfurt an der Oder in 1675 and was awarded a doctorate in\n1679. After a brief journey to Holland, Thomasius returned to Leipzig\nwhere he worked (unhappily) as a lawyer while also holding private\nlectures on natural jurisprudence. Thomasius attests to the\nfundamental reorientation of his thinking effected by his reading of\nPufendorf, and the Apologia pro se et suo libro (1674

In [37]:
for i in range(len(result)):
    print(tiktoken_len(result[i]))

277
579
797
783
788
480
635
782
530
643
576
526
678
577
776
787
450
668
528
496
723
613
640
298
748
729
608
678
606
667
776
768
774
719
762
772
725
743
786
519
80


# Creating Embeddings

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_cohere import CohereEmbeddings
import getpass

class EmbedCreator:
    def __init__(self):
        provider = input("Are you using OpenAI or Cohere embeddings? ")
        default_openai_api_key = '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'
        default_cohere_api_key = 'CBzSlf1OukbDlWDnAxCLjxAdwxOmDQbYc4F5b3WG'

        if provider.lower().strip() == 'openai':
            print('Available: [text-embedding-3-small, text-embedding-3-large]')
            model_name = input('Which model? ')
            use_default_key = input("Change default OpenAI API key? (yes/no): ").lower()
            OPENAI_API_KEY = default_openai_api_key if use_default_key == 'no' else getpass.getpass()
            
            self.embed = OpenAIEmbeddings(
                model=model_name,
                openai_api_key=OPENAI_API_KEY)
            
        elif provider.lower().strip() == 'cohere':
            print('Available: [embed-english-light-v2.0, embed-english-light-v3.0]')
            model_name = input('Which model? ')
            use_default_key = input("Use default Cohere API key? (yes/no): ").lower().strip()
            COHERE_API_KEY = default_cohere_api_key if use_default_key == 'yes' else getpass.getpass()
            
            self.embed = CohereEmbeddings(
                model=model_name,
                apiKey=COHERE_API_KEY)
        else:
            raise ValueError("Unsupported Provider.")
            
    def embed_documents(self, texts):
        return self.embed.embed_documents(texts)

In [182]:
# Usage example
embed = EmbedCreator()

texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

res = embed.embed_documents(texts)
print('\n')
print(len(res), len(res[0]))

Are you using OpenAI or Cohere embeddings? openai
Available: [text-embedding-3-small, text-embedding-3-large]
Which model? text-embedding-3-large
Change default OpenAI API key? (yes/no): no


2 3072


# Creating Vector Database

In [180]:
from pinecone import Pinecone, ServerlessSpec
import time
import getpass

class VectorDB:

    def __init__(self):
        # Setup API key
        default_api_key = '25959b28-fb44-44df-9371-13b27f6f3903'  # Handle your API key securely
        api = input("Do you want to change your Pinecone API key? (yes/no) ")
        if api.lower().strip() == 'yes':
            api_key = getpass.getpass()
        else:
            api_key = default_api_key

        # Initialize Pinecone client
        self.pc = Pinecone(api_key=api_key)

        # Default cloud provider and region
        print("Cloud provider default: AWS")
        print("Cloud region default: us-west-2")

        # Optionally change cloud provider or region
        cloud_specs = input("Do you want to change your Pinecone cloud provider or region? (yes/no) ")
        if cloud_specs.lower().strip() == 'yes':
            cloud = input("What provider? ")
            region = input("What region? ")
            self.spec = ServerlessSpec(cloud=cloud, region=region)
        else:
            self.spec = ServerlessSpec(cloud="aws", region="us-west-2")

    def list_indexes(self):
        # Fetching and listing indexes
        return self.pc.list_indexes()

    def list_cloud(self, index_name=None):
        # Use class instance index_name if not provided
        index_name = index_name if index_name else self.index_name
        if index_name and index_name in [index['name'] for index in self.list_indexes()]:
            print(f"Index '{index_name}' is configured on:")
            print(f"Cloud Provider: {self.spec.cloud}")
            print(f"Cloud Region: {self.spec.region}")
        else:
            print(f"Index '{index_name}' does not exist.")

    def create_index(self):
        self.index_name = input("Enter the name of the index (e.g.: naive-rag-chunk400-text-embedding-3-small-cos): ")
        dimension = int(input("Enter the dimension of the index: "))
        metric = input("Enter the metric (e.g., 'euclidean', 'cosine'): ")
        existing_indexes = [index['name'] for index in self.list_indexes()]
        if self.index_name not in existing_indexes:
            print(f"Creating index '{self.index_name}'...")
            self.pc.create_index(
                self.index_name,
                dimension=dimension,
                metric=metric,
                spec=self.spec
            )
            while not self.pc.describe_index(self.index_name).status['ready']:
                self.time.sleep(1)
            print(f"Index '{self.index_name}' created and is now ready.")
        else:
            print(f"Index '{self.index_name}' already exists. No action taken.")

    def connect_to_index(self):
        if self.index_name and self.index_name in [index['name'] for index in self.list_indexes()]:
            self.index = self.pc.Index(self.index_name)
            print(f"Connected to index '{self.index_name}'.")
            return self.index
        else:
            raise Exception(f"Index '{self.index_name}' does not exist.")

    def delete_index(self, index_name=None):
        # Use class instance index_name if not provided
        index_name = index_name if index_name else self.index_name
        if index_name and index_name in [index['name'] for index in self.list_indexes()]:
            self.pc.delete_index(index_name)
            print(f"Index '{index_name}' has been deleted.")
        else:
            print(f"Index '{index_name}' does not exist.")

In [187]:
# Usage Example

index_name = 'test-index'
vector_db = VectorDB() # Create an index named 'test-index'
vector_db.create_index()
print(vector_db.list_indexes(),'\n')
index = vector_db.connect_to_index()
print(vector_db.list_cloud(index_name))
vector_db.delete_index(index_name)
print(vector_db.list_cloud(index_name))

Do you want to change your Pinecone API key? (yes/no) no
Cloud provider default: AWS
Cloud region default: us-west-2
Do you want to change your Pinecone cloud provider or region? (yes/no) no
Enter the name of the index (e.g.: naive-rag-chunk400-text-embedding-3-small-cos): test-index
Enter the dimension of the index: 158
Enter the metric (e.g., 'euclidean', 'cosine'): euclidean
Creating index 'test-index'...
Index 'test-index' created and is now ready.
{'indexes': [{'dimension': 1536,
              'host': 'langchain-retrieval-augmentation-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'dotproduct',
              'name': 'langchain-retrieval-augmentation',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 158,
              'host': 'test-index-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'euclidean',
              'name': 'test-index'

# The Indexing Pipeline -- preview

In [136]:
# A preview of the pipeline in the next cell:
    
from tqdm import tqdm  # Make sure to import tqdm
from uuid import uuid4

print('Setting up the text splitter..')
print('------------------')

text_splitter = TextSplitter()

print('------------------')

print('Setting up the embedding model..')
print('------------------')
embed = EmbedCreator()

print('------------------')

print('Setting up the vector database..')
print('------------------')
index_name = 'rag-test-auto'
vector_db = VectorDB()
vector_db.create_index()
index = vector_db.connect_to_index(index_name)

Setting up the text splitter..
------------------
The default splitter is RecursiveCharacterTextSplitter.
Do you want to use a different splitter? (yes/no): no
Enter chunk size (e.g., 400): 800
Enter chunk overlap size (e.g., 20): 20
Default separators are: ['\n\n', '\n', ' ', '']
Do you want to change the default separators? (yes/no): no
------------------
Setting up the embedding model..
------------------
Are you using OpenAI or Cohere embeddings? openai
Available: [text-embedding-3-small, text-embedding-3-large]
Which model? text-embedding-3-small
Use default OpenAI API key? (yes/no): yes
------------------
Setting up the vector database..
------------------
Do you want to change your Pinecone API key? (yes/no) no
Cloud provider default: AWS
Cloud region default: us-west-2
Do you want to change your Pinecone cloud provider or region? (yes/no) no
Enter the name of the index: test2-april15
Enter the dimension of the index: 1890
Enter the metric (e.g., 'euclidean', 'cosine'): cosine
C

# The Indexing Pipeline

In [154]:
from tqdm import tqdm
from uuid import uuid4
import pandas as pd  # Assuming you're using a DataFrame

# Define a class to encapsulate the pipeline
class TextProcessingPipeline:
    def __init__(self, dataframe):
        self.df = dataframe
        self.setup_components()

    def setup_components(self):
        print('Setting up the text splitter..')
        print('------------------')
        self.text_splitter = TextSplitter()
        
        print('\nSetting up the embedding model..')
        print('------------------')
        self.embed = EmbedCreator()

        print('\nSetting up the vector database..')
        print('------------------')
        #self.index_name = 'xyz'
        self.vector_db = VectorDB()
        self.vector_db.create_index()
        self.index = self.vector_db.connect_to_index()

    def process_texts(self, batch_limit=100):
        texts = []
        metadatas = []
        
        # Process each row in the dataframe
        for i in tqdm(range(len(self.df))):
            metadata = {
                'article_id': str(self.df['ID'].iloc[i]),
                'source': self.df['Url'].iloc[i],
                'title': self.df['Title'].iloc[i],
                'authors': self.df['Authors'].iloc[i],
                'citation': self.df['BibURL'].iloc[i],
                'date': self.df['Date'].iloc[i]
            }
            
            # Split text into chunks and create metadata for each chunk
            record_texts = self.text_splitter.split_text(self.df['Text'].iloc[i])
            record_metadatas = [{'chunk': j, 'text': text, **metadata} for j, text in enumerate(record_texts)]
            
            texts.extend(record_texts)
            metadatas.extend(record_metadatas)
            
            # Check if batch limit is reached to process and upsert data
            if len(texts) >= batch_limit:
                self.embed_and_upsert(texts, metadatas)
                texts = []
                metadatas = []
                
        # Process any remaining texts and metadata after the loop
        if texts:
            self.embed_and_upsert(texts, metadatas)

    def embed_and_upsert(self, texts, metadatas):
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = self.embed.embed_documents(texts)
        self.index.upsert(vectors=zip(ids, embeds, metadatas))

# Example of how to use the pipeline
if __name__ == '__main__':
    #df = pd.read_csv('path_to_your_data.csv')  # Load your data into a DataFrame
    pipeline = TextProcessingPipeline(df)

Setting up the text splitter..
The default splitter is RecursiveCharacterTextSplitter.
Do you want to use a different splitter? (yes/no): no
Enter chunk size (e.g., 400): 800
Enter chunk overlap size (e.g., 20): 20
Default separators are: ['\n\n', '\n', ' ', '']
Do you want to change the default separators? (yes/no): no
Setting up the embedding model..
Are you using OpenAI or Cohere embeddings? openai
Available: [text-embedding-3-small, text-embedding-3-large]
Which model? text-embedding-3-small
Use default OpenAI API key? (yes/no): yes
Setting up the vector database..
Do you want to change your Pinecone API key? (yes/no) no
Cloud provider default: AWS
Cloud region default: us-west-2
Do you want to change your Pinecone cloud provider or region? (yes/no) no
Enter the name of the index: rag-test-3
Enter the dimension of the index: 1536
Enter the metric (e.g., 'euclidean', 'cosine'): cosine
Creating index 'rag-test-3'...
Index 'rag-test-3' created and is now ready.
Connected to index 'rag

In [155]:
pipeline.process_texts()

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:42<00:00,  1.02s/it]


# Naive RAG Pipeline

In [245]:
list(pc.list_indexes().names())

['langchain-retrieval-augmentation',
 'rag-test-3',
 'canopy--advanced-rag',
 'naive-rag-chunk400-text-embedding-3-small-cos']

In [341]:
import getpass
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain


class Naive_RAG:
    def __init__(self):
        self.pinecone_api_key = self.setup_pinecone_api_key()
        self.openai_api_key = self.setup_openai_api_key()
        self.pc = self.setup_pinecone()
        self.index = self.connect_to_index()
        self.embed = self.load_embedding_model()
        self.vectorstore = PineconeVectorStore(self.index, self.embed)
        self.llm = self.setup_llm()
        self.qa = self.setup_qa()

    def setup_pinecone_api_key(self):
        #print('API Keys: ')
        change_key = input("Do you want to change your default Pinecone API key? (yes/no): ")
        if change_key.lower() == 'yes':
            return getpass.getpass("Enter your Pinecone API key: ")
        else:
            return 'c5a1e31b-332e-4ac4-9f53-2042f80c7bfe'
        
    def setup_openai_api_key(self):
        change_key = input("Do you want to change your default OpenAI API key? (yes/no): ")
        if change_key.lower() == 'yes':
            return getpass.getpass("Enter your OpenAI API key: ")
        else:
            return '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'

    def setup_pinecone(self):
        return Pinecone(api_key=self.pinecone_api_key)

    def connect_to_index(self):
        index_names = self.pc.list_indexes().names()
        print("\nAvailable indexes:", index_names)
        index_name = input("Enter the name of the index you want to connect to: ")
        return self.pc.Index(index_name)

    def load_embedding_model(self):
        model_names = ['text-embedding-3-small', 'text-embedding-3-large', 'embed-english-light-v2.0', 'embed-english-light-v3.0']
        print("\nAvailable Embedding models:", model_names)
        model_name = input("Enter the name of the embedding model you want to use: ")
        return OpenAIEmbeddings(
            model=model_name,
            openai_api_key=self.openai_api_key
        )

    def setup_llm(self):
        llm_models = ['gpt-3.5-turbo', 'gpt-4', '...']
        print("\nAvailable LLM models:", llm_models)
        llm_model = input("Enter the name of the LLM model you want to use: ")
        temp = float(input("\nEnter a temperature value (0.0 to 1.0): "))
        return ChatOpenAI(
            openai_api_key=self.openai_api_key,
            model_name=llm_model,
            temperature=temp
        )

    def setup_qa(self):
        # Provide a choice between RetrievalQA and RetrievalQAWithSourcesChain
        print("\nAvailable QA Chain Types:")
        print("1. RetrievalQA")
        print("2. RetrievalQAWithSourcesChain")
        choice = input("Select the QA Chain type (enter 1 or 2): ")
        n=int(input("How many documents should the LLM retrieve and respond off of? "))
        
        if choice == '1':
            return RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",  # Adjust as necessary based on available chain types
                retriever=self.vectorstore.as_retriever(search_kwargs={'k': n})
            )
        elif choice == '2':
            return RetrievalQAWithSourcesChain.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.vectorstore.as_retriever(search_kwargs={'k': n}),
            )
        else:
            print("Invalid selection, defaulting to RetrievalQA.")
            return RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.vectorstore.as_retriever(search_kwargs={'k': 3})
            )

    # This only retrieves docs, nothing genrative about it. Default to 5
    #def query(self, query_text):
        #k=input("How many docs to retrieve? ")
        #results = self.vectorstore.similarity_search(
            #query_text,
            #k=int(k)
        #)
        #return results

    def invoke_qa(self, question):
        answer = self.qa.invoke(question)
        return answer

# Example usage
if __name__ == '__main__':
    krs = Naive_RAG()
    #query_result = krs.query("What does Socrates think about death?")
    #print("\nQuery Results:", query_result)
    #print('\n')
    answer = krs.invoke_qa("\nWhat does Socrates think about death?\n")
    print("\nResult:", answer)

Do you want to change your default Pinecone API key? (yes/no): 
Do you want to change your default OpenAI API key? (yes/no): 

Available indexes: ['langchain-retrieval-augmentation', 'rag-test-3', 'canopy--advanced-rag', 'naive-rag-chunk400-text-embedding-3-small-cos']
Enter the name of the index you want to connect to: rag-test-3

Available Embedding models: ['text-embedding-3-small', 'text-embedding-3-large', 'embed-english-light-v2.0', 'embed-english-light-v3.0']
Enter the name of the embedding model you want to use: text-embedding-3-small

Available LLM models: ['gpt-3.5-turbo', 'gpt-4', '...']
Enter the name of the LLM model you want to use: gpt-3.5-turbo

Enter a temperature value (0.0 to 1.0): 0

Available QA Chain Types:
1. RetrievalQA
2. RetrievalQAWithSourcesChain
Select the QA Chain type (enter 1 or 2): 2
How many documents should the LLM retrieve and respond off of? 4

Result: {'question': '\nWhat does Socrates think about death?\n', 'answer': 'Socrates believes that the so

In [342]:
krs.setup_qa().invoke("What does Scorates think about death?")


Available QA Chain Types:
1. RetrievalQA
2. RetrievalQAWithSourcesChain
Select the QA Chain type (enter 1 or 2): 2
How many documents should the LLM retrieve and respond off of? 4


{'question': 'What does Scorates think about death?',
 'answer': "Socrates' arguments for the immortality of the soul were offered to interlocutors who were not convinced of the idea. Socrates himself was presented as being noncommittal about what happens to the soul at death. There is no clear indication of what Socrates thought about death.\n",
 'sources': 'https://plato.stanford.edu/archives/spr2024/entries/ancient-soul/'}

# Adding a Reranker

In [287]:
print(answer['question'])


What does Socrates think about death?



In [300]:
#Turn the Documnet object of the reurned top results into a JSON format for the use of the reranker model

def list_to_dic(docs):
    list_of_dics = []
    
    for i in range(len(docs)):
        dic = {'text' : docs[i].page_content, 
         'article_id': docs[i].metadata['article_id'],
         'authors': docs[i].metadata['authors'],
         'chunk': docs[i].metadata['chunk'],
         'citation': docs[i].metadata['citation'],
         'date': docs[i].metadata['date'],
         'source': docs[i].metadata['source'],
         'title': docs[i].metadata['title']}

        list_of_dics.append(dic)
    
    return list_of_dics

Do you want to change your default Pinecone API key? (yes/no): 
Do you want to change your default OpenAI API key? (yes/no): 
Do you want to change your default Cohere API key? (yes/no): 

Available indexes: ['langchain-retrieval-augmentation', 'rag-test-3', 'canopy--advanced-rag', 'naive-rag-chunk400-text-embedding-3-small-cos']
Enter the name of the index you want to connect to: langchain-retrieval-augmentation

Available Embedding models: ['text-embedding-3-small', 'text-embedding-3-large', 'embed-english-light-v2.0', 'embed-english-light-v3.0']
Enter the name of the embedding model you want to use: text-embedding-3-small

Available LLM models: ['gpt-3.5-turbo', 'gpt-4', '...']
Enter the name of the LLM model you want to use: gpt-3.5-turbo

Enter a temperature value (0.0 to 1.0): 0
How many docs to retrieve? 7
Do you want to rerank the results? (yes/no)yes
Which reranker model? Options: [rerank-english-v2.0, rerank-english-v3.0] rerank-english-v3.0
Insert the number of top most rele

In [292]:
import cohere

# init client
co = cohere.Client(api_key = "7mwbGgEEMtVMZ7qIBUlGo2AAOcfPE66uCtWeqI8o")

In [293]:
vectorstore

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x23066823320>

In [308]:
query = "What does Scorates think about death?"

if __name__ == '__main__':
    krs = Naive_RAG()
    docs = krs.query(query)

docs

API Keys: 
Do you want to change your default Pinecone API key? (yes/no): no
Do you want to change your default OpenAI API key? (yes/no): no

Available indexes: ['langchain-retrieval-augmentation', 'rag-test-3', 'canopy--advanced-rag', 'naive-rag-chunk400-text-embedding-3-small-cos']
Enter the name of the index you want to connect to: rag-test-3

Available Embedding models: ['text-embedding-3-small', 'text-embedding-3-large', 'embed-english-light-v2.0', 'embed-english-light-v3.0']
Enter the name of the embedding model you want to use: text-embedding-3-small

Available LLM models: ['gpt-3.5-turbo', 'gpt-4', '...']
Enter the name of the LLM model you want to use: gpt-3.5-turbo

Enter a temperature value (0.0 to 1.0): 0

Available QA Chain Types:
1. RetrievalQA
2. RetrievalQAWithSourcesChain
Select the QA Chain type (enter 1 or 2): 1
How many documents should the LLM retrieve and respond off of? 4
How many docs to retrieve? 6


[Document(page_content='In most cultures, there is evidence of a belief in some sort of\npersonal afterlife, in which the same individual that lived and died\nnevertheless persists and continues to have new experiences. There are\nalternatives, however. The ancient Greeks are noted for having placed\na high premium on “survival” in the memory and honor of\nthe community—a practice reflected in our reference to deceased\ncelebrities as (for example) “the immortal Babe Ruth”.\n(Strictly speaking, this for the Greeks was not a replacement for a\npersonal afterlife, but rather a supplement to what was conceived as a\nrather colorless and unrewarding existence in Hades.) Such a hope, it\nwould seem, provides a major consolation only if one is optimistic\nconcerning the persistence and continued memory of the community, as\nwell as the accuracy and justice of their judgments. An interesting\nvariant of this form of immorality is found in process theology, with\nits promise of “objective immo

In [309]:
list_to_dic(docs)

[{'text': 'In most cultures, there is evidence of a belief in some sort of\npersonal afterlife, in which the same individual that lived and died\nnevertheless persists and continues to have new experiences. There are\nalternatives, however. The ancient Greeks are noted for having placed\na high premium on “survival” in the memory and honor of\nthe community—a practice reflected in our reference to deceased\ncelebrities as (for example) “the immortal Babe Ruth”.\n(Strictly speaking, this for the Greeks was not a replacement for a\npersonal afterlife, but rather a supplement to what was conceived as a\nrather colorless and unrewarding existence in Hades.) Such a hope, it\nwould seem, provides a major consolation only if one is optimistic\nconcerning the persistence and continued memory of the community, as\nwell as the accuracy and justice of their judgments. An interesting\nvariant of this form of immorality is found in process theology, with\nits promise of “objective immortality” in t

In [310]:
rerank_docs = co.rerank(query=query, documents=list_to_dic(docs), top_n=5, model='rerank-english-v3.0')

In [325]:
rerank_docs



In [312]:
rerank_docs.results

[RerankResponseResultsItem(document=None, index=4, relevance_score=0.0003029734),
 RerankResponseResultsItem(document=None, index=5, relevance_score=0.00019716943),
 RerankResponseResultsItem(document=None, index=0, relevance_score=0.0001634647),
 RerankResponseResultsItem(document=None, index=1, relevance_score=8.481104e-05),
 RerankResponseResultsItem(document=None, index=3, relevance_score=5.2252268e-05)]

In [326]:
[docs[hit.index] for hit in rerank_docs.results]

[Document(page_content="What he does, in fact, conclude is that the soul is most like,\nand most akin to, intelligible being, and that the body is most\nlike perceptible and perishable being. To say this is plainly neither\nto assert nor to imply (as Robinson 1995, 30, appears to think) that\nsoul in some way or other falls short of intelligible, imperishable\nbeing, any more than it is to assert or imply that body in some way or\nother falls short of, or rather rises above, perceptible, perishable\nbeing. The argument leaves it open whether soul is a perfectly\nrespectable member of intelligible reality, the way human bodies are\nperfectly respectable members of perceptible reality, or whether,\nalternatively, soul has some intermediate status in between\nintelligible and perceptible being, rising above the latter, but\nmerely approximating to the former. Socrates does seem to take his\nconclusion to imply, or at least strongly suggest, that it is natural\nfor the soul either “to be a

In [313]:
#After the rerankers

for hit in rerank_docs.results:
    article = docs[hit.index]
    print(article)

page_content="What he does, in fact, conclude is that the soul is most like,\nand most akin to, intelligible being, and that the body is most\nlike perceptible and perishable being. To say this is plainly neither\nto assert nor to imply (as Robinson 1995, 30, appears to think) that\nsoul in some way or other falls short of intelligible, imperishable\nbeing, any more than it is to assert or imply that body in some way or\nother falls short of, or rather rises above, perceptible, perishable\nbeing. The argument leaves it open whether soul is a perfectly\nrespectable member of intelligible reality, the way human bodies are\nperfectly respectable members of perceptible reality, or whether,\nalternatively, soul has some intermediate status in between\nintelligible and perceptible being, rising above the latter, but\nmerely approximating to the former. Socrates does seem to take his\nconclusion to imply, or at least strongly suggest, that it is natural\nfor the soul either “to be altogether 

# The Retriever_Reranker Pipeline

In [343]:
import getpass
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
import cohere
from tqdm.autonotebook import tqdm

def list_to_dic(docs):
    list_of_dics = []
    
    for i in range(len(docs)):
        dic = {'text' : docs[i].page_content, 
         'article_id': docs[i].metadata['article_id'],
         'authors': docs[i].metadata['authors'],
         'chunk': docs[i].metadata['chunk'],
         'citation': docs[i].metadata['citation'],
         'date': docs[i].metadata['date'],
         'source': docs[i].metadata['source'],
         'title': docs[i].metadata['title']}

        list_of_dics.append(dic)
    
    return list_of_dics

class Retriever_Reranker:
    def __init__(self):
        self.pinecone_api_key = self.setup_pinecone_api_key()
        self.openai_api_key = self.setup_openai_api_key()
        self.cohere_api_key = self.setup_cohere_api_key()
        self.pc = self.setup_pinecone()
        self.co = self.setup_cohere()
        self.index = self.connect_to_index()
        self.embed = self.load_embedding_model()
        self.vectorstore = PineconeVectorStore(self.index, self.embed)
        self.llm = self.setup_llm()
        #self.qa = self.setup_qa()

    def setup_pinecone_api_key(self):
        #print('API Keys: ')
        change_key = input("Do you want to change your default Pinecone API key? (yes/no): ")
        if change_key.lower() == 'yes':
            return getpass.getpass("Enter your Pinecone API key: ")
        else:
            return '25959b28-fb44-44df-9371-13b27f6f3903'
        
    def setup_openai_api_key(self):
        change_key = input("Do you want to change your default OpenAI API key? (yes/no): ")
        if change_key.lower() == 'yes':
            return getpass.getpass("Enter your OpenAI API key: ")
        else:
            return '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'
        
    def setup_cohere_api_key(self):
        change_key = input("Do you want to change your default Cohere API key? (yes/no): ")
        if change_key.lower() == 'yes':
            return getpass.getpass("Enter your OpenAI API key: ")
        else:
            return '7qCX4hvl4ywXdz92MnjWeyBxzp3aPKTZbd8vGbiu'

    def setup_pinecone(self):
        return Pinecone(api_key=self.pinecone_api_key)
    
    def setup_cohere(self):
        return cohere.Client(api_key = self.cohere_api_key)

    def connect_to_index(self):
        index_names = self.pc.list_indexes().names()
        print("\nAvailable indexes:", index_names)
        index_name = input("Enter the name of the index you want to connect to: ")
        return self.pc.Index(index_name)

    def load_embedding_model(self):
        model_names = ['text-embedding-3-small', 'text-embedding-3-large', 'embed-english-light-v2.0', 'embed-english-light-v3.0']
        print("\nAvailable Embedding models:", model_names)
        model_name = input("Enter the name of the embedding model you want to use: ")
        return OpenAIEmbeddings(
            model=model_name,
            openai_api_key=self.openai_api_key
        )

    def setup_llm(self):
        llm_models = ['gpt-3.5-turbo', 'gpt-4', '...']
        print("\nAvailable LLM models:", llm_models)
        llm_model = input("Enter the name of the LLM model you want to use: ")
        temp = float(input("\nEnter a temperature value (0.0 to 1.0): "))
        return ChatOpenAI(
            openai_api_key=self.openai_api_key,
            model_name=llm_model,
            temperature=temp
        )

    # This only retrieves docs, nothing genrative about it. Default to 5
    def query(self, query_text):
        k = input("How many docs to retrieve? ")
        naive_results = self.vectorstore.similarity_search(
            query_text,
            k=int(k)
        )

        # Convert naive results to docs for re-ranking
        docs = naive_results  # Assuming naive_results is a list of document objects

        choice = input('Do you want to rerank the results? (yes/no)').lower().strip()
        if choice == 'yes':
            re_model = input('Which reranker model? Options: [rerank-english-v2.0, rerank-english-v3.0] ').lower().strip()
            top_n = int(input('Insert the number of top most relevant docs: ').lower().strip())

            # Ensure docs is a list of dictionaries needed for re-ranking
            docs_as_dict = list_to_dic(docs)
            rerank_docs = self.co.rerank(query=query_text, documents=docs_as_dict, top_n=top_n, model=re_model)
            re_ordering = rerank_docs.results 

            results = [docs[hit.index] for hit in rerank_docs.results]
            print('\nThe Reranked Results:\n')
            print(re_ordering,'\n')
            return results

        else:
            results = naive_results
            return results

    
    
    #def rerank(self)

# Example usage
if __name__ == '__main__':
    krs = Retriever_Reranker()
    query_result = krs.query("What does Socrates think about death?")
    print("\nQuery Results:", query_result)
    #print('\n')
    #answer = krs.invoke_qa("\nWhat does Socrates think about death?\n")
    #print("Answer:", answer)

Do you want to change your default Pinecone API key? (yes/no): 
Do you want to change your default OpenAI API key? (yes/no): 
Do you want to change your default Cohere API key? (yes/no): 

Available indexes: ['langchain-retrieval-augmentation', 'rag-test-3', 'canopy--advanced-rag', 'naive-rag-chunk400-text-embedding-3-small-cos']
Enter the name of the index you want to connect to: rag-test-3

Available Embedding models: ['text-embedding-3-small', 'text-embedding-3-large', 'embed-english-light-v2.0', 'embed-english-light-v3.0']
Enter the name of the embedding model you want to use: text-embedding-3-small

Available LLM models: ['gpt-3.5-turbo', 'gpt-4', '...']
Enter the name of the LLM model you want to use: gpt-3.5-turbo

Enter a temperature value (0.0 to 1.0): 0
How many docs to retrieve? 7
Do you want to rerank the results? (yes/no)yes
Which reranker model? Options: [rerank-english-v2.0, rerank-english-v3.0] rerank-english-v3.0
Insert the number of top most relevantr docs: 3

The Rer