In [2]:
# import Libraries
import hashlib

from pinecone import Pinecone
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

In [3]:
# Load environment variables before any later cell reads os.environ
# (presumably from a local .env file via python-dotenv — confirm the file's location).
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
## Load every PDF in a directory into documents
def read_doc(directory):
    """Load the PDF files found in ``directory``.

    Args:
        directory: Path handed unchanged to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call returns (the notebook treats it
        as a list of documents).
    """
    return PyPDFDirectoryLoader(directory).load()

In [5]:
# Load the PDFs found under the relative 'documents/' directory.
doc = read_doc('documents/')
# Display how many Document objects came back (presumably one per PDF page — confirm).
len(doc)

3

In [6]:
## Split the loaded documents into overlapping chunks

def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split ``docs`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split; passed straight to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 800).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap`` (default 50).

    Returns:
        The value returned by the splitter's ``split_documents`` call.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)


In [7]:
# Split the loaded documents using chunk_data's defaults (chunk_size=800, chunk_overlap=50).
chunks = chunk_data(docs=doc)
# NOTE(review): displaying the whole list prints the raw document text, which here
# includes personal contact details; consider showing len(chunks) or chunks[:2] and
# clearing this output before sharing the notebook.
chunks

[Document(page_content='8 Years Experience\nShan Ali Khan@ ADDO AI\nSenior Data Scientist\nContact\nPhone\n+92 346 467 4070\nAddress\nWapda Town, Lahore - PK\nEmail\nshanalikhan@hotmail.com\nExpertise\nSkill setData Analysis & Visualization\nMLOpsNatural Language Processing\nExperiencesEducational\n2016 - 2018 Masters in Computer Science\nMachine Learning, Natural Language Processing, Statistical Modeling, ...\nPunjab University College Of Information Technology\nLahore, Pakistan \nSystem Analysis & Design\nAzure\nSynapse\nData Bricks, ML Studio\nML\nLanguage Modeling\nNER\nSpeech AnalysisDocument Classification\nTime-series Analysis2010 - 2014 Bachelors in Computer Science\nObject Orient Programming, Data Structures, Software Engineering\nUniversity of Central Punjab\nLahore, Pakistan\nSenior Data Scientist - ADDO AI', metadata={'source': 'documents/Shan+Ali+Khan.pdf', 'page': 0}),
 Document(page_content='Lahore, Pakistan\nSenior Data Scientist - ADDO AI\nApril 2021 - PresentTools / T

In [8]:
import os

## OpenAI embedding client. The API key is read from the environment, so this
## raises KeyError if OPENAI_API_KEY was not loaded by the dotenv cell.
embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
# Display the client configuration (the key itself is shown masked in the repr).
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7b5d1032aef0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7b5ce1e612a0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [9]:
# Sanity check: embed one sample sentence and display the length of the returned vector.
# NOTE(review): the Pinecone index presumably has to be created with this same
# dimension — confirm against the index configuration.
vectors = embeddings.embed_query("I just want to ask you that how are you doing?")
len(vectors)

1536

In [11]:
# Connect to the Pinecone index and insert the chunked docs.
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
index_name = os.environ['PINECONE_INDEX_NAME']
# NOTE(review): `index` is not passed to the vector store below, which is given
# the index by name instead; it is kept in case other cells rely on it.
index = pc.Index(index_name)


def _stable_chunk_id(chunk):
    """Return a deterministic vector ID built from a chunk's source, page and text."""
    fingerprint = "\x1f".join((
        str(chunk.metadata.get('source', '')),
        str(chunk.metadata.get('page', '')),
        chunk.page_content,
    ))
    return hashlib.sha256(fingerprint.encode('utf-8', errors='replace')).hexdigest()


# Key every chunk by a stable ID. When no ids are supplied the vector store
# assigns fresh random ones, so each re-run of this cell would insert the same
# chunks again and the k=2 retriever could return two copies of one passage.
# With deterministic IDs a re-run overwrites the existing vectors instead.
# Vectors inserted by earlier runs under random IDs are NOT removed by this cell.
chunks_by_id = {_stable_chunk_id(chunk): chunk for chunk in chunks}

docsearch = PineconeVectorStore.from_documents(
    documents=list(chunks_by_id.values()),
    embedding=embeddings,
    index_name=index_name,
    ids=list(chunks_by_id),
)

docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x7b5ce18399f0>

In [13]:
# Build a retrieval QA chain over the vector store created above.
# NOTE(review): temperature=0.5 means repeated runs can produce different answers,
# and no model name is given so the library default is used — consider pinning both.
chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0.5),
    # "stuff": the retrieved chunks are placed into the prompt's {context} slot
    # (see the StuffDocumentsChain prompt in this cell's output).
    chain_type="stuff",
    # Ask the retriever for 2 chunks per query.
    # NOTE(review): with only 2 chunks of context, broad questions may miss detail.
    retriever=docsearch.as_retriever(search_kwargs={"k": 2}),
)
chain

RetrievalQA(combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7b5ce0b07160>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7b5ce20e1bd0>, temperature=0.5, openai_api_key=SecretStr('**********'), openai_proxy='')), document_variable_name='context'), retriever=VectorStoreRetriever(tags=['PineconeVectorStore', 'OpenAIEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x7b

In [15]:
# Query 1: candidate name and total experience.
our_query = "whats the name of the candidate and how much experience does it have overall?"
answer = chain.invoke(our_query)
# Prints the full result dict (the output shows 'query' and 'result' keys).
print(answer)

{'query': 'whats the name of the candidate and how much experience does it have overall?', 'result': 'The name of the candidate is Shan Ali Khan, and they have 8 years of overall experience.'}


In [17]:
# Query 2: skills/technologies that recur across the candidate's roles.
# NOTE(review): this cell repeats the previous one with a new question; a small
# helper taking the question as a parameter would remove the copy-paste.
our_query = "Tell some of the candidate's skill/technologies which is common in his multiple experiences?"
answer = chain.invoke(our_query)
# Prints the full result dict (the output shows 'query' and 'result' keys).
print(answer)

{'query': "Tell some of the candidate's skill/technologies which is common in his multiple experiences?", 'result': "Based on the provided context, some of the common skills/technologies used in the candidate's multiple experiences include:\n\n1. Database Migration: The candidate has experience in migrating databases, such as from SAP HCM Oracle to Elastic Search, indicating proficiency in database migration techniques.\n\n2. Integration of Tools: The candidate has integrated various tools like OpenXML for Powerpoint and PDF conversion, indicating expertise in tool integration.\n\n3. Machine Learning: The candidate has experience as a Principal Machine Learning Engineer, showcasing skills in machine learning techniques, data analysis, and model training.\n\n4. Data Visualization: The candidate has developed and maintained Kibana dashboards for visualization, demonstrating proficiency in data visualization tools.\n\n5. Automation: The candidate has built automated Time Series Analysis a

In [18]:
# Query 3: does the work experience match the candidate's formal education?
# NOTE(review): the chain only sees the 2 retrieved chunks, so an answer of
# "nothing irrelevant" reflects that limited context, not the whole document set.
our_query = "Is candidate's work experience is relevant to his education / formal study? If there is any irrelevant experience or task please list it."
answer = chain.invoke(our_query)
# Prints the full result dict (the output shows 'query' and 'result' keys).
print(answer)

{'query': "Is candidate's work experience is relevant to his education / formal study? If there is any irrelevant experience or task please list it.", 'result': "Based on the provided work experience, it seems that the candidate's work aligns well with their education/formal study. The candidate has experience in EDA, domain understanding, leading teams of data scientists and Python engineers, model building, deployment, database migration, and developing various tools and techniques that are relevant to their field of study.\n\nThere is no clear indication of any irrelevant experience or tasks listed in the provided context."}
