In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# Load the source table from the local Parquet file into `df`.
df = pd.read_parquet('SEP.parquet')

In [None]:
# Keep only the first 100 rows; this small fraction is for testing purposes.
# Change (or remove) the slice to process more of the table.
df = df[:100]

In [None]:
# Preview the first two rows.
df.head(2)

In [None]:
def _author_labels(author_records):
    """Turn a row's author records into 'name --- email' labels, one per distinct name."""
    email_by_name = {record['name']: record['email'] for record in author_records}
    labels = []
    for author_name, author_email in email_by_name.items():
        contact = 'No email provided' if author_email is None else author_email
        labels.append(author_name + ' --- ' + contact)
    return labels


# 1-based article ID derived from the frame's index.
df['ID'] = df.index + 1

# Force the free-text columns to plain strings.
for text_column in ['Title', 'Text', 'Bib_Refined', 'Other Resources', 'Related']:
    df[text_column] = df[text_column].astype(str)

# Replace each row's author records with a list of readable labels.
df['Authors'] = df['Authors'].apply(_author_labels)

# Concatenate the labelled sections into one text field per article.
# NOTE(review): 'TOC' is the only column used here without an explicit str cast —
# confirm it never holds nulls or non-string values.
df['FINAL_TEXT'] = (
    "Table of Content: " + df['TOC']
    + "\n\n" + "Text: " + df['Text']
    + "\n\n" + "Bibliography: " + df['Bib_Refined']
    + "\n\n" + "Other Resources: " + df['Other Resources']
)

# Splitting the Text

In [129]:
import re
import tiktoken

# Look up the tokenizer that tiktoken associates with gpt-3.5-turbo and read its
# registry name from the Encoding object's `name` attribute. tiktoken is imported
# here because this is the first cell that uses it; reading `.name` avoids parsing
# the object's repr with a regular expression.
result = tiktoken.encoding_for_model('gpt-3.5-turbo').name
result

'cl100k_base'

In [130]:
import tiktoken
import re

# create the length function
def tiktoken_len(text):
    """Return the number of gpt-3.5-turbo tokens in ``text``.

    The tokenizer is resolved once, on the first call, and cached on the function
    object. The text splitter calls this length function many times per document,
    so the model lookup is not repeated for every call.

    Args:
        text: The string to measure.

    Returns:
        int: Number of tokens produced by the encoder. Special-token strings that
        appear in ``text`` are encoded as ordinary text (``disallowed_special=()``)
        instead of raising.
    """
    tokenizer = getattr(tiktoken_len, '_tokenizer', None)
    if tokenizer is None:
        # encoding_for_model already returns the Encoding object, so there is no
        # need to extract its name from the repr and look it up a second time.
        tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
        tiktoken_len._tokenizer = tokenizer

    return len(tokenizer.encode(text, disallowed_special=()))

In [131]:
# Quick sanity check of the length function on a short string.
tiktoken_len('hiiii ssasd')

5

In [118]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

class TextSplitter:
    """Interactively configured wrapper around a LangChain text splitter.

    Instantiating the class prompts (via ``input``) for the splitter type, the
    chunk size, the chunk overlap and, optionally, custom separators. Chunk
    lengths are measured with ``tiktoken_len``.
    """

    # Separators used when the user keeps the defaults (or enters none).
    DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""]

    # Backslash escapes recognised in separators typed at the prompt.
    _ESCAPES = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\'}

    def __init__(self):
        """Prompt for the configuration and build the underlying splitter.

        Raises:
            ValueError: If an unsupported splitter name is entered, or if the
                chunk size / chunk overlap answers are not integers.
        """
        # Set default splitter and prompt the user for a change
        print("The default splitter is RecursiveCharacterTextSplitter.")
        change_splitter = input("Do you want to use a different splitter? (yes/no): ")
        if self._is_yes(change_splitter):
            splitter_input = input("Available splitter: RecursiveCharacterTextSplitter. Please enter the splitter you want to use: ")
            # Tolerate stray whitespace around the typed name.
            if splitter_input.strip() != 'RecursiveCharacterTextSplitter':
                raise ValueError("Unsupported splitter type")
        splitter = RecursiveCharacterTextSplitter

        chunk_size = int(input("Enter chunk size (e.g., 400): "))
        chunk_overlap = int(input("Enter chunk overlap size (e.g., 20): "))

        # Set default separators and offer to change them
        print("Default separators are: ['\\n\\n', '\\n', ' ', '']")
        change_separators = input("Do you want to change the default separators? (yes/no): ")
        if self._is_yes(change_separators):
            separators = self._parse_separators(
                input("Enter separators (seprate them by space): ")
            )
        else:
            separators = list(self.DEFAULT_SEPARATORS)

        self.text_splitter = splitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=tiktoken_len,
            separators=separators
        )

    @staticmethod
    def _is_yes(answer):
        """Return True when a yes/no answer is 'yes' (case and whitespace ignored)."""
        return answer.lower().strip() == 'yes'

    @classmethod
    def _unescape(cls, token):
        """Decode the backslash escapes \\n, \\t, \\r and \\\\ in a typed separator.

        Text typed at a prompt arrives with literal backslashes, so a user who
        enters ``\\n`` would otherwise get a two-character separator that never
        matches a real newline. Unknown escapes are kept as typed.
        """
        decoded = []
        chars = iter(token)
        for char in chars:
            if char != '\\':
                decoded.append(char)
                continue
            escaped = next(chars, None)
            if escaped is None:
                decoded.append('\\')  # trailing lone backslash: keep it
            else:
                decoded.append(cls._ESCAPES.get(escaped, '\\' + escaped))
        return ''.join(decoded)

    @classmethod
    def _parse_separators(cls, raw):
        """Split the typed answer on whitespace and unescape every separator.

        Falls back to the default separators when nothing was entered, so the
        splitter is never built with an empty separator list.
        """
        separators = [cls._unescape(token) for token in raw.split()]
        return separators or list(cls.DEFAULT_SEPARATORS)

    def split_text(self, text):
        """Split ``text`` into chunks with the configured splitter and return them."""
        return self.text_splitter.split_text(text)

In [None]:
# Build a splitter (this prompts for its settings) and chunk entry 0 of the
# combined FINAL_TEXT column; `result` holds the list of chunks.
splitter = TextSplitter()
text = df['FINAL_TEXT'][0]
result = splitter.split_text(text)

In [71]:
# Show the second chunk.
result[1]

'Text: \n1. Christian Thomasius\n1.1 Life and Works\n\nChristian Thomasius was born on 1 January 1655 in Leipzig. He was the\nson of Jakob Thomasius (1622–84), a well-known jurist and\nphilosopher at the University of Leipzig who counted Leibniz among his\nstudents. Christian (hereafter simply ‘Thomasius’)\nmatriculated in the philosophy faculty at Leipzig in 1669, and was\npromoted to Magister artium in 1672. As a result of his\nfather’s lectures, particularly on Hugo Grotius’ De\njure belli ac pacis, and his interest in Samuel Pufendorf’s\nDe jure naturae et gentium, Thomasius took up the study of\nlaw in Frankfurt an der Oder in 1675 and was awarded a doctorate in\n1679. After a brief journey to Holland, Thomasius returned to Leipzig\nwhere he worked (unhappily) as a lawyer while also holding private\nlectures on natural jurisprudence. Thomasius attests to the\nfundamental reorientation of his thinking effected by his reading of\nPufendorf, and the Apologia pro se et suo libro (1674

In [37]:
# Print the token length of every chunk, in order.
for chunk in result:
    print(tiktoken_len(chunk))

277
579
797
783
788
480
635
782
530
643
576
526
678
577
776
787
450
668
528
496
723
613
640
298
748
729
608
678
606
667
776
768
774
719
762
772
725
743
786
519
80


# Creating Embeddings

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_cohere import CohereEmbeddings
import getpass

class EmbedCreator:
    """Interactively configured wrapper around an OpenAI or Cohere embedding model.

    Instantiating the class prompts for the provider and the model name. The API
    key is taken from the provider's environment variable (``OPENAI_API_KEY`` or
    ``COHERE_API_KEY``) or typed at a hidden prompt; it is never stored in the
    notebook source.
    """

    def __init__(self):
        """Prompt for provider, model and key, then build ``self.embed``.

        Raises:
            ValueError: If the provider is neither 'openai' nor 'cohere'.
        """
        provider = input("Are you using OpenAI or Cohere embeddings? ").lower().strip()

        if provider == 'openai':
            print('Available: [text-embedding-3-small, text-embedding-3-large]')
            model_name = input('Which model? ')
            self.embed = OpenAIEmbeddings(
                model=model_name,
                openai_api_key=self._resolve_api_key('OpenAI', 'OPENAI_API_KEY'))

        elif provider == 'cohere':
            print('Available: [embed-english-light-v2.0, embed-english-light-v3.0]')
            model_name = input('Which model? ')
            # The Python class takes `cohere_api_key`; `apiKey` is not one of its fields.
            self.embed = CohereEmbeddings(
                model=model_name,
                cohere_api_key=self._resolve_api_key('Cohere', 'COHERE_API_KEY'))
        else:
            raise ValueError("Unsupported Provider.")

    @staticmethod
    def _resolve_api_key(provider_label, env_var):
        """Return the API key for a provider without hardcoding it.

        If ``env_var`` is set, the user is asked whether to use it; answering
        anything other than 'yes' (or having no variable set) falls through to a
        hidden ``getpass`` prompt. Both providers share this one code path, so
        the yes/no question means the same thing for each of them.
        """
        env_key = os.environ.get(env_var)
        if env_key:
            answer = input(f"Use the {provider_label} API key from {env_var}? (yes/no): ")
            if answer.lower().strip() == 'yes':
                return env_key
        return getpass.getpass(f"{provider_label} API key: ")

    def embed_documents(self, texts):
        """Embed a list of strings with the configured model; returns one vector per string."""
        return self.embed.embed_documents(texts)

In [182]:
# Usage example: build an embedder (this prompts for provider, model and key)
# and embed two sample strings.
embed = EmbedCreator()

texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

res = embed.embed_documents(texts)
print('\n')
# Number of embeddings returned, and the length of the first one.
print(len(res), len(res[0]))

Are you using OpenAI or Cohere embeddings? openai
Available: [text-embedding-3-small, text-embedding-3-large]
Which model? text-embedding-3-large
Change default OpenAI API key? (yes/no): no


2 3072


# Creating Vector Database

In [180]:
from pinecone import Pinecone, ServerlessSpec
import time
import getpass

class VectorDB:
    """Interactive helper for creating, connecting to and deleting Pinecone indexes.

    Instantiating the class resolves the Pinecone API key (from the
    ``PINECONE_API_KEY`` environment variable or a hidden prompt), creates the
    client and records the serverless cloud/region to use for new indexes.
    """

    def __init__(self):
        """Create the Pinecone client and the serverless spec, prompting as needed."""
        # No index is selected until create_index() / connect_to_index() runs.
        self.index_name = None
        self.index = None

        # Initialize Pinecone client
        self.pc = Pinecone(api_key=self._resolve_api_key())

        # Default cloud provider and region
        print("Cloud provider default: AWS")
        print("Cloud region default: us-west-2")

        # Optionally change cloud provider or region
        cloud_specs = input("Do you want to change your Pinecone cloud provider or region? (yes/no) ")
        if cloud_specs.lower().strip() == 'yes':
            cloud = input("What provider? ")
            region = input("What region? ")
            self.spec = ServerlessSpec(cloud=cloud, region=region)
        else:
            self.spec = ServerlessSpec(cloud="aws", region="us-west-2")

    @staticmethod
    def _resolve_api_key():
        """Return the Pinecone API key without hardcoding it in the notebook.

        Uses ``PINECONE_API_KEY`` from the environment unless the user asks to
        change it; with no variable set, the key is read from a hidden prompt.
        """
        env_key = os.environ.get('PINECONE_API_KEY')
        if not env_key:
            return getpass.getpass("Pinecone API key: ")
        api = input("Do you want to change your Pinecone API key? (yes/no) ")
        if api.lower().strip() == 'yes':
            return getpass.getpass()
        return env_key

    def _index_exists(self, index_name):
        """Return True when ``index_name`` is non-empty and listed in the project."""
        if not index_name:
            return False
        return index_name in [index['name'] for index in self.list_indexes()]

    def list_indexes(self):
        """Return the client's listing of the project's indexes."""
        return self.pc.list_indexes()

    def list_cloud(self, index_name=None):
        """Print the cloud provider and region for an index (prints only, returns None).

        Args:
            index_name: Index to report on; defaults to the instance's current index.

        NOTE(review): the provider/region printed come from this instance's
        ``spec``, not from the index's own description — confirm that is intended.
        """
        index_name = index_name if index_name else self.index_name
        if self._index_exists(index_name):
            print(f"Index '{index_name}' is configured on:")
            print(f"Cloud Provider: {self.spec.cloud}")
            print(f"Cloud Region: {self.spec.region}")
        else:
            print(f"Index '{index_name}' does not exist.")

    def create_index(self):
        """Prompt for name, dimension and metric; create the index if it is missing.

        Blocks, polling once per second, until the new index reports ready.

        Raises:
            ValueError: If the dimension entered is not an integer.
        """
        self.index_name = input("Enter the name of the index (e.g.: naive-rag-chunk400-text-embedding-3-small-cos): ")
        dimension = int(input("Enter the dimension of the index: "))
        metric = input("Enter the metric (e.g., 'euclidean', 'cosine'): ")
        if not self._index_exists(self.index_name):
            print(f"Creating index '{self.index_name}'...")
            self.pc.create_index(
                self.index_name,
                dimension=dimension,
                metric=metric,
                spec=self.spec
            )
            while not self.pc.describe_index(self.index_name).status['ready']:
                # `time` is the module imported by the notebook, not an attribute of the instance.
                time.sleep(1)
            print(f"Index '{self.index_name}' created and is now ready.")
        else:
            print(f"Index '{self.index_name}' already exists. No action taken.")

    def connect_to_index(self, index_name=None):
        """Connect to an existing index and return its handle.

        Args:
            index_name: Index to connect to; defaults to the instance's current
                index. A name passed here becomes the current index.

        Raises:
            Exception: If the index does not exist in the project.
        """
        index_name = index_name if index_name else self.index_name
        if not self._index_exists(index_name):
            raise Exception(f"Index '{index_name}' does not exist.")
        self.index_name = index_name
        self.index = self.pc.Index(index_name)
        print(f"Connected to index '{index_name}'.")
        return self.index

    def delete_index(self, index_name=None):
        """Delete an index if it exists; otherwise print that it does not.

        Args:
            index_name: Index to delete; defaults to the instance's current index.
        """
        index_name = index_name if index_name else self.index_name
        if self._index_exists(index_name):
            self.pc.delete_index(index_name)
            if index_name == self.index_name:
                # Drop the handle so it cannot be used against a deleted index.
                self.index = None
            print(f"Index '{index_name}' has been deleted.")
        else:
            print(f"Index '{index_name}' does not exist.")

In [187]:
# Usage Example

vector_db = VectorDB()
vector_db.create_index()  # prompts for the index name, dimension and metric
# Use the name that was actually entered at the prompt, so the calls below always
# refer to the index that was just created.
index_name = vector_db.index_name
print(vector_db.list_indexes(), '\n')
index = vector_db.connect_to_index()
# list_cloud() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
vector_db.list_cloud(index_name)
vector_db.delete_index(index_name)
vector_db.list_cloud(index_name)

Do you want to change your Pinecone API key? (yes/no) no
Cloud provider default: AWS
Cloud region default: us-west-2
Do you want to change your Pinecone cloud provider or region? (yes/no) no
Enter the name of the index (e.g.: naive-rag-chunk400-text-embedding-3-small-cos): test-index
Enter the dimension of the index: 158
Enter the metric (e.g., 'euclidean', 'cosine'): euclidean
Creating index 'test-index'...
Index 'test-index' created and is now ready.
{'indexes': [{'dimension': 1536,
              'host': 'langchain-retrieval-augmentation-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'dotproduct',
              'name': 'langchain-retrieval-augmentation',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 158,
              'host': 'test-index-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'euclidean',
              'name': 'test-index'

# Indexing Docs

In [136]:
# A preview of the pipeline in the next cell:
    
from tqdm import tqdm  # Make sure to import tqdm
from uuid import uuid4

print('Setting up the text splitter..')
print('------------------')

text_splitter = TextSplitter()

print('------------------')

print('Setting up the embedding model..')
print('------------------')
embed = EmbedCreator()

print('------------------')

print('Setting up the vector database..')
print('------------------')
vector_db = VectorDB()
vector_db.create_index()
# The index name is whatever was typed at the create_index() prompt; read it back
# rather than assuming a fixed name.
index_name = vector_db.index_name
# With no argument, connect_to_index() connects to the index that create_index()
# just selected.
index = vector_db.connect_to_index()

Setting up the text splitter..
------------------
The default splitter is RecursiveCharacterTextSplitter.
Do you want to use a different splitter? (yes/no): no
Enter chunk size (e.g., 400): 800
Enter chunk overlap size (e.g., 20): 20
Default separators are: ['\n\n', '\n', ' ', '']
Do you want to change the default separators? (yes/no): no
------------------
Setting up the embedding model..
------------------
Are you using OpenAI or Cohere embeddings? openai
Available: [text-embedding-3-small, text-embedding-3-large]
Which model? text-embedding-3-small
Use default OpenAI API key? (yes/no): yes
------------------
Setting up the vector database..
------------------
Do you want to change your Pinecone API key? (yes/no) no
Cloud provider default: AWS
Cloud region default: us-west-2
Do you want to change your Pinecone cloud provider or region? (yes/no) no
Enter the name of the index: test2-april15
Enter the dimension of the index: 1890
Enter the metric (e.g., 'euclidean', 'cosine'): cosine
C

In [154]:
from tqdm import tqdm
from uuid import uuid4
import pandas as pd  # Assuming you're using a DataFrame

# Define a class to encapsulate the pipeline
class TextProcessingPipeline:
    """Split every article, embed the chunks and upsert them into a Pinecone index.

    Constructing the pipeline immediately runs the interactive setup of its three
    components (text splitter, embedding model, vector database).
    """

    def __init__(self, dataframe):
        """Store the DataFrame to index and set up the components.

        Args:
            dataframe: Frame providing the 'ID', 'Url', 'Title', 'Authors',
                'BibURL', 'Date' and 'Text' columns read by ``process_texts``.
        """
        self.df = dataframe
        self.setup_components()

    def setup_components(self):
        """Interactively build the splitter, the embedder and the index connection."""
        print('Setting up the text splitter..')
        print('------------------')
        self.text_splitter = TextSplitter()

        print('\nSetting up the embedding model..')
        print('------------------')
        self.embed = EmbedCreator()

        print('\nSetting up the vector database..')
        print('------------------')
        self.vector_db = VectorDB()
        self.vector_db.create_index()
        self.index = self.vector_db.connect_to_index()

    def process_texts(self, batch_limit=100):
        """Chunk every row's text and upsert the chunks in batches.

        Args:
            batch_limit: Maximum number of chunks sent per embed/upsert call.
                Chunks accumulate across articles and are flushed in slices of
                exactly this size; the remainder is flushed at the end.

        Raises:
            ValueError: If ``batch_limit`` is smaller than 1.
        """
        if batch_limit < 1:
            raise ValueError("batch_limit must be at least 1")

        texts = []
        metadatas = []

        # Process each row in the dataframe
        for i in tqdm(range(len(self.df))):
            metadata = {
                'article_id': str(self.df['ID'].iloc[i]),
                'source': self.df['Url'].iloc[i],
                'title': self.df['Title'].iloc[i],
                'authors': self.df['Authors'].iloc[i],
                'citation': self.df['BibURL'].iloc[i],
                'date': self.df['Date'].iloc[i]
            }

            # Split text into chunks and create metadata for each chunk.
            # NOTE(review): this splits the 'Text' column, not the combined
            # 'FINAL_TEXT' column built earlier in the notebook — confirm intent.
            record_texts = self.text_splitter.split_text(self.df['Text'].iloc[i])
            record_metadatas = [{'chunk': j, 'text': text, **metadata} for j, text in enumerate(record_texts)]

            texts.extend(record_texts)
            metadatas.extend(record_metadatas)

            # Flush full batches; a long article can contribute more than one.
            while len(texts) >= batch_limit:
                self.embed_and_upsert(texts[:batch_limit], metadatas[:batch_limit])
                texts = texts[batch_limit:]
                metadatas = metadatas[batch_limit:]

        # Process any remaining texts and metadata after the loop
        if texts:
            self.embed_and_upsert(texts, metadatas)

    def embed_and_upsert(self, texts, metadatas):
        """Embed ``texts`` and upsert them with fresh UUIDs and their metadata.

        Args:
            texts: Chunk strings to embed.
            metadatas: One metadata dict per chunk, in the same order as ``texts``.
        """
        ids = [str(uuid4()) for _ in texts]
        embeds = self.embed.embed_documents(texts)
        # Materialise the (id, vector, metadata) tuples: a zip object can only be
        # consumed once, a list can be iterated again by the client.
        self.index.upsert(vectors=list(zip(ids, embeds, metadatas)))

# Example of how to use the pipeline. Constructing it runs the interactive setup
# of the splitter, the embedding model and the vector database.
if __name__ == '__main__':
    # `df` is the DataFrame prepared in the earlier cells; load a different frame
    # here to index other data.
    pipeline = TextProcessingPipeline(df)

Setting up the text splitter..
The default splitter is RecursiveCharacterTextSplitter.
Do you want to use a different splitter? (yes/no): no
Enter chunk size (e.g., 400): 800
Enter chunk overlap size (e.g., 20): 20
Default separators are: ['\n\n', '\n', ' ', '']
Do you want to change the default separators? (yes/no): no
Setting up the embedding model..
Are you using OpenAI or Cohere embeddings? openai
Available: [text-embedding-3-small, text-embedding-3-large]
Which model? text-embedding-3-small
Use default OpenAI API key? (yes/no): yes
Setting up the vector database..
Do you want to change your Pinecone API key? (yes/no) no
Cloud provider default: AWS
Cloud region default: us-west-2
Do you want to change your Pinecone cloud provider or region? (yes/no) no
Enter the name of the index: rag-test-3
Enter the dimension of the index: 1536
Enter the metric (e.g., 'euclidean', 'cosine'): cosine
Creating index 'rag-test-3'...
Index 'rag-test-3' created and is now ready.
Connected to index 'rag

In [155]:
# Split, embed and upsert every row of the DataFrame (default batch_limit=100).
pipeline.process_texts()

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:42<00:00,  1.02s/it]


# Connecting to an Existing Index Without Re-running the Pipeline

In [191]:
from pinecone import Pinecone, ServerlessSpec
import getpass
import os

# Read the Pinecone API key from the PINECONE_API_KEY environment variable, or ask
# for it at a hidden prompt, instead of storing the secret in the notebook.
# os and getpass are imported in this cell so it also runs on a fresh kernel.
default_api_key = os.environ.get('PINECONE_API_KEY') or getpass.getpass('Pinecone API key: ')
pc = Pinecone(api_key=default_api_key)

# List index names
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'langchain-retrieval-augmentation-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'dotproduct',
              'name': 'langchain-retrieval-augmentation',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1536,
              'host': 'rag-test-3-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'cosine',
              'name': 'rag-test-3',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1536,
              'host': 'canopy--advanced-rag-cion06v.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'cosine',
              'name': 'canopy--advanced-rag',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, '

In [192]:
# Connect: get a handle to the named index through the client created above.
index = pc.Index('naive-rag-chunk400-text-embedding-3-small-cos')

In [195]:
# index.upsert(vectors=zip(ids, embeds, metadatas))