In [50]:
import os
from dotenv import find_dotenv, load_dotenv
# Load environment variables (e.g. API keys) from the file located by
# find_dotenv("req.env"). NOTE(review): override=True presumably lets values
# from that file replace variables already set in the process - confirm
# against the python-dotenv documentation.
load_dotenv(find_dotenv("req.env"), override=True)

#os.environ.get("OPENAI_API_KEY")

True

In [57]:
pip install pypdf -q

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
pip install docx2txt -q

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [66]:
def load_document(file):
    """Load a PDF or DOCX file with the matching LangChain document loader.

    Args:
        file: Path to the document. The loader is chosen from the file
            extension, compared case-insensitively ('.pdf' or '.docx').

    Returns:
        The value returned by the loader's ``load()`` call, or ``None`` when
        the extension is not supported or the loader raises an exception.
    """
    import os

    # Normalise the extension so 'Report.PDF' is handled like 'report.pdf'.
    extension = os.path.splitext(file)[1].lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        try:
            return PyPDFLoader(file_path=file).load()
        except Exception as e:
            # Best-effort loading: report the problem and signal failure with None.
            print(f"Error loading PDF file: {e}")
            return None
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        try:
            return Docx2txtLoader(file).load()
        except Exception as e:
            # Best-effort loading: report the problem and signal failure with None.
            print(f"Error loading DOCX file: {e}")
            return None
    else:
        print('Document format is not supported')
        return None


In [60]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split, passed to ``split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 0, which matches the previous hard-coded behaviour.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [61]:
import tiktoken
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and an estimated embedding cost for ``texts``.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to pick the tiktoken encoding. The hyphenated
            spelling is required; the former 'text_embedding-ada-002'
            (underscore) is not a known model name.
        price_per_1k_tokens: USD price per 1000 tokens used for the estimate.
            NOTE(review): the default is the value previously hard-coded here;
            verify it against current pricing.
    """
    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

# Embedding and Uploading to a Vector Database (Pinecone)

In [84]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If an index with this name already exists it is opened with
    ``from_existing_index``. Otherwise a 1536-dimensional cosine index is
    created in the 'gcp-starter' pod environment and ``chunks`` are embedded
    and uploaded with ``from_documents``.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed; only used when the index is created.

    Returns:
        The LangChain Pinecone vector store bound to the index.
    """
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    pc = pinecone.Pinecone()
    # The model id uses hyphens and the keyword is `dimensions`; the earlier
    # 'text_embedding-3-small' / `dimension` spellings are not recognised.
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings...', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('ok')
    else:
        print(f'Creating index {index_name} and embeddings...', end='')
        pc.create_index(
            name=index_name,
            dimension=1536,  # must match the embedding size requested above
            metric='cosine',
            spec=PodSpec(environment='gcp-starter'),
        )
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')

    # Previously the store was built but never handed back to the caller.
    return vector_store

In [94]:
def insert_or_fetch_embeddings1(index_name, chunks):
    """Return a Pinecone vector store for ``index_name`` using ada-002 embeddings.

    If an index with this name already exists it is opened with
    ``from_existing_index``. Otherwise a 1536-dimensional cosine index is
    created in the 'gcp-starter' pod environment and ``chunks`` are embedded
    and uploaded with ``from_documents``.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed; only used when the index is created.

    Returns:
        The LangChain Pinecone vector store bound to the index.
    """
    import pinecone
    from langchain_community.vectorstores import Pinecone
    # Same import path the rest of the notebook uses for OpenAI classes.
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    # No API key is passed here, so the client must obtain it from its own
    # configuration (presumably the environment loaded earlier - confirm).
    pc = pinecone.Pinecone()
    embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

    # list_indexes() yields index descriptions, not names, so compare against
    # .names(); a bare `in pc.list_indexes()` never matches a name string.
    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings...', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('ok')
    else:
        print(f'Creating index {index_name} and embeddings...', end='')
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=PodSpec(environment='gcp-starter'),
        )
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')

    return vector_store

In [63]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default 'all' deletes
            every index returned by ``list_indexes().names()``.
    """
    import pinecone

    pc = pinecone.Pinecone()
    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print('Deleting all indexes...')
        for index in indexes:
            pc.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name} ...', end='')
        # This branch used to print the message without deleting anything.
        pc.delete_index(index_name)
        print('Ok')
        

# Asking and Getting Answers

In [96]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer question ``q`` using documents retrieved from ``vector_store``.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        q: The question to ask.
        k: Number of most similar chunks to retrieve for the query
            (defaults to 3, the value previously hard-coded).

    Returns:
        The answer text, i.e. the 'result' entry of the RetrievalQA output.

    Raises:
        ValueError: If ``vector_store`` is None.
    """
    # Validate first so a missing store fails fast, before any imports or clients.
    if vector_store is None:
        raise ValueError("Vector store is None, cannot proceed with the retrieval.")

    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

    # invoke() replaces the deprecated run(); RetrievalQA takes the question
    # under 'query' and returns the answer under 'result'.
    return chain.invoke({'query': q})['result']

## Running Code

In [88]:
data = load_document('den.pdf')

# load_document returns None on failure; stop with a clear message instead of
# failing on len(None) below.
if data is None:
    raise RuntimeError("Could not load 'den.pdf'; see the message printed above.")

print(f'we have {len(data)} pages')

Loading den.pdf
we have 1 pages


In [89]:
chunks = chunk_data(data)
print(len(chunks))
# Preview one chunk; clamp the index so documents with fewer than 11 chunks
# do not raise IndexError.
if chunks:
    print(chunks[min(10, len(chunks) - 1)].page_content)

14
testing. She used unit test, selenium test, white box test, and black box test. The 
company name is BİMSER. She has new internship about Python backend development.


In [90]:
# WARNING: called with the default index_name='all', which deletes every
# index returned by the Pinecone client, not only this notebook's index.
delete_pinecone_index()

Deleting all indexes...
Ok


In [95]:
# Open the 'askdocument' index, or create it and upload the chunks; keep the
# returned store for the question cells that follow.
index_name= 'askdocument'
vector_store =insert_or_fetch_embeddings1(index_name, chunks)

Creating index askdocument and embeddings...Ok


In [102]:
# Ask a question about a specific fact and print the returned answer.
q = 'What is the topic of presentation of ilayda öcal'
answer= ask_and_get_answer(vector_store, q)
print(answer)

The topic of İlayda Öcal's presentation is cyber security in avionic systems.


In [103]:
# Second question; each call to ask_and_get_answer builds a fresh chain, so
# no chat history is carried over from the previous question.
q2= 'what is my secondary field'
answer2= ask_and_get_answer(vector_store, q2)
print(answer2)

Your secondary field is applied data science.


In [104]:
# The retriever inside ask_and_get_answer only fetches k=3 chunks, so a
# "list all" question is answered from those chunks, not the whole document.
q3='list all people that are mentioned in the context'
answer3=ask_and_get_answer(vector_store,q3)
print(answer3)

The people mentioned in the context are:
1. Rufat Naghiyev
2. Ayman Hamdan


In [105]:
# As above, the answer is based on the k=3 retrieved chunks only, so this
# summary reflects those chunks rather than the full document.
q4='what is the whole document about?'
answer4=ask_and_get_answer(vector_store,q4)
print(answer4)

The document seems to be about the user's internship experiences and projects related to artificial intelligence, cyber security in avionic systems, machine learning concepts, and various technologies such as LLMs, NLP, LangChain, and vector databases. It appears that the document may also include a presentation prepared for the company administrator.


# Using ChromaDB as Vector Database

In [108]:
pip install -q chromadb

Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  Building wheel for chroma-hnswlib (pyproject.toml) did not run successfully.
  exit code: 1
  
  [5 lines of output]
  running bdist_wheel
  running build
  running build_ext
  building 'hnswlib' extension
  error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for chroma-hnswlib
ERROR: Could not build wheels for chroma-hnswlib, which is required to install pyproject.toml-based projects

[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Adding Memory (Chat History)

In [109]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Chat model used to answer questions; temperature=0 is passed to the model.
llm = ChatOpenAI(model_name='gpt-4-turbo-preview', temperature=0)
# Similarity retriever over the existing vector store, requesting k=5 results per query.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':5})

# Conversation memory stored under the 'chat_history' key, kept as message objects.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Conversational retrieval chain wiring the model, retriever and memory together.
crc= ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff', # 'stuff': use all of the text from the retrieved documents
    verbose=True
)

In [110]:
def ask_question(q, chain):
    """Send question ``q`` to ``chain`` and return the chain's output.

    Args:
        q: The question text, passed to the chain under the 'question' key.
        chain: Object exposing ``invoke``.

    Returns:
        Whatever ``chain.invoke`` returns.
    """
    # The invoke() result used to be discarded, so the `return result` that
    # followed raised NameError.
    result = chain.invoke({'question': q})
    return result

# Using a Custom Prompt

In [112]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# Rebuild the model, retriever and memory; this replaces the objects of the
# same names created in the previous section (including its chat history).
llm = ChatOpenAI(model_name='gpt-4-turbo-preview', temperature=0)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':5})
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# System message template: instruction text plus the {context} placeholder.
system_template = '''
Use The following pieces of context to answer the user's question.
---------------------
Context: {context}
'''

# Human message template: carries the {question} and {chat_history} placeholders.
user_template = '''
Question: {question}
Chat History: {chat_history}
'''

# Order matters: the system message comes first, then the human message.
messages=[
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]

qa_prompt= ChatPromptTemplate.from_messages(messages)

# Same chain as before, except the document-combining step uses qa_prompt.
crc= ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff', # 'stuff': use all of the text from the retrieved documents
    combine_docs_chain_kwargs={'prompt':qa_prompt},
    verbose=True
)