In [4]:
!pip install openai
!pip install langchain
!pip install langchain-openai
!pip install langchain-experimental
!pip install langchainhub
!pip install pinecone-client
!pip install tiktoken
!pip install docx2txt
!pip install pypdf
!pip install requests
!pip install numpy
!pip install pandas
!pip install python-dotenv

Collecting langchain-openai
  Downloading langchain_openai-0.2.14-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-core<0.4.0,>=0.3.27 (from langchain-openai)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting openai<2.0.0,>=1.58.1 (from langchain-openai)
  Downloading openai-1.58.1-py3-none-any.whl.metadata (27 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading langchain_openai-0.2.14-py3-none-any.whl (50 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.3.28-py3-none-any.whl (411 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.6/411.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading openai-1.58.1-py3-none-any.whl (454 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [5]:
import os
from dotenv import load_dotenv, find_dotenv
env_file = find_dotenv()
load_dotenv(env_file, override=True)

False

In [6]:
def load_file(file_path):
    import os
    base_name, file_extension = os.path.splitext(file_path)
    if file_extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file_path}')
        loader = PyPDFLoader(file_path)
    elif file_extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file_path}')
        loader = Docx2txtLoader(file_path)
    elif file_extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file_path)
    else:
        print('Unsupported document format!')
        return None
    document_data = loader.load()
    return document_data

In [7]:
def fetch_from_wikipedia(search_query, language='en', max_docs_to_load=2):
    from langchain.document_loaders import WikipediaLoader
    wiki_loader = WikipediaLoader(query=search_query, lang=language, load_max_docs=max_docs_to_load)
    loaded_data = wiki_loader.load()
    return loaded_data

In [8]:
def split_into_chunks(document_data, chunk_length=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_length, chunk_overlap=0)
    document_chunks = splitter.split_documents(document_data)
    return document_chunks

In [9]:
def calculate_embedding_cost(documents):
    import tiktoken
    encoder = tiktoken.encoding_for_model('text-embedding-3-small')
    total_token_count = sum([len(encoder.encode(document.page_content)) for document in documents])
    print(f'Total Tokens: {total_token_count}')
    print(f'Embedding Cost in USD: {total_token_count / 1000 * 0.00002:.6f}')

In [10]:
def fetch_or_store_embeddings(index_identifier, data_chunks):
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec
    pinecone_client = pinecone.Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
    embedding_model = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536, api_key=os.environ.get("OPEN_AI_KEY"))
    if index_identifier in pinecone_client.list_indexes().names():
        print(f'Index {index_identifier} exists. Loading embeddings ... ', end='')
        vector_store_instance = Pinecone.from_existing_index(index_identifier, embedding_model)
        print('Ok')
    else:
        print(f'Creating index {index_identifier} and embeddings ...', end='')
        pinecone_client.create_index(
            name=index_identifier,
            dimension=1536,
            metric='cosine',
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
        vector_store_instance = Pinecone.from_documents(data_chunks, embedding_model, index_name=index_identifier)
        print('Ok')
    return vector_store_instance

In [11]:
def remove_pinecone_index(index_identifier='all'):
    import pinecone
    pinecone_client = pinecone.Pinecone()
    if index_identifier == 'all':
        index_list = pinecone_client.list_indexes().names()
        print('Removing all indexes ... ')
        for index in index_list:
            pinecone_client.delete_index(index)
        print('Done')
    else:
        print(f'Removing index {index_identifier} ...', end='')
        pinecone_client.delete_index(index_identifier)
        print('Done')

In [12]:
def query_and_get_response(store, query, num_results=3):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI
    language_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1, api_key=os.environ.get("OPEN_AI_KEY"))
    retriever_instance = store.as_retriever(search_type='similarity', search_kwargs={'k': num_results})
    qa_chain = RetrievalQA.from_chain_type(llm=language_model, chain_type="stuff", retriever=retriever_instance)
    response = qa_chain.invoke(query)
    return response

In [13]:
def split_content(data):
    chunk_size = 1000  # Example chunk size
    chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]
    return chunks
data = "This is some example text that we want to split into smaller chunks. " * 50  # Example large text
chunks = split_content(data)
print(len(chunks))

4


In [14]:
import tiktoken
def count_tokens(text, model="gpt-3.5"):
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(text))
    return num_tokens
def calculate_embedding_cost(chunks, price_per_1000_tokens=0.0004):
    total_tokens = 0
    for chunk in chunks:
        total_tokens += count_tokens(chunk)
    total_cost = (total_tokens / 1000) * price_per_1000_tokens
    return total_cost
chunks = [
    "This is the first chunk of text. It's just an example to demonstrate how token counting works.",
    "Here is the second chunk of text, another example with more content.",
    "Finally, this is the third chunk of text to make sure we have multiple chunks for the demonstration."
]
cost = calculate_embedding_cost(chunks)
print(f"Estimated embedding cost: ${cost:.4f}")

Estimated embedding cost: $0.0000


In [15]:
import time
counter = 1
print('Type "Quit" or "Exit" to exit the program.')
while True:
    user_question = input(f'Query #{counter}: ')
    counter += 1
    if user_question.lower() in ['quit', 'exit']:
        print('Exiting... Goodbye!')
        time.sleep(2)
        break
    response = ask_and_get_answer(vector_store, user_question)
    print(f'\nResponse: {response}')
    print(f'\n {"-" * 50} \n')

Type "Quit" or "Exit" to exit the program.
Query #1: quit
Exiting... Goodbye!
