In [1]:
from langchain.document_loaders import PyPDFLoader

In [2]:
# Build the path with pathlib so the notebook is not tied to Windows-style
# backslash separators (the literal "docs\\..." only resolves on Windows).
from pathlib import Path

pdf_path = Path("docs") / "CV_AnishChapagain_2024.pdf"

# Load the PDF; the displayed result is a list of Document objects.
pdf_loader = PyPDFLoader(str(pdf_path))
pdf_doc = pdf_loader.load()

In [3]:
pdf_doc

[Document(page_content='Chapagain, Anish   \nGreen Hill City  #498 , Kageshwori Manohara -06, Mulpani , Kathmandu, Nepal.  \nanishchapagain@gmail.com  | Linkedin  \n+977 01  9840065449  \n \nObjective                                                                                                                                                                \nTo gain experience & challenging position in the field of AI/ML/Data Science conducive to growth, where I will \nconsolidate, utilize, and further explore my professional skills with , and learning the latest technologies . \n  \nTechnical Skills  \n\uf0b7 Team  & project management, leading, m onitoring , training and c oaching , analysis & repor ting. \n\uf0b7 Experience  in Knowledge Management, AI/ML, Data Science, Cybersecurity, e -Commerce review platform . \n\uf0b7 PySpark, Pandas, PyTorch, OCR, Django , Web Scraping, Anti -Scraping, Anti -bot testing , Selenium . \n\uf0b7 Data & Text A nalysis , Regular Expressions , Analy

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Splitter producing small chunks: at most 100 characters each, with a
# 5-character overlap between neighbouring chunks.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=5,
    # Separators are tried in the order listed. The original list contained "."
    # twice; the repeated entry could never split anything the first one had
    # not already split, so it is dropped.
    separators=["\n\n", "\n", " \n", ".", " ", ",", "●"],
)

In [6]:
# Split the loaded PDF pages into chunk Documents with `r_splitter`; the output
# below shows each chunk keeping its 'source' and 'page' metadata.
docs = r_splitter.split_documents(pdf_doc)

In [7]:
docs

[Document(page_content='Chapagain, Anish   \nGreen Hill City  #498 , Kageshwori Manohara -06, Mulpani , Kathmandu, Nepal.', metadata={'source': 'docs\\CV_AnishChapagain_2024.pdf', 'page': 0}),
 Document(page_content='anishchapagain@gmail.com  | Linkedin  \n+977 01  9840065449', metadata={'source': 'docs\\CV_AnishChapagain_2024.pdf', 'page': 0}),
 Document(page_content='Objective', metadata={'source': 'docs\\CV_AnishChapagain_2024.pdf', 'page': 0}),
 Document(page_content='To gain experience & challenging position in the field of AI/ML/Data Science conducive to growth,', metadata={'source': 'docs\\CV_AnishChapagain_2024.pdf', 'page': 0}),
 Document(page_content='where I will', metadata={'source': 'docs\\CV_AnishChapagain_2024.pdf', 'page': 0}),
 Document(page_content='consolidate, utilize, and further explore my professional skills with , and learning the latest', metadata={'source': 'docs\\CV_AnishChapagain_2024.pdf', 'page': 0}),
 Document(page_content='technologies', metadata={'sourc

In [8]:
len(docs)

179

In [9]:
docs[0].page_content

'Chapagain, Anish   \nGreen Hill City  #498 , Kageshwori Manohara -06, Mulpani , Kathmandu, Nepal.'

In [10]:
# Read API KEY from file
import json

file_path = 'llms_vault.json'
llm_key = 'GOOGLE_API_KEY'


def _read_api_key(path, key_name):
    """Return the non-empty value stored under ``key_name`` in the JSON file at ``path``.

    The parsed file content stays local to this function so that any other
    secrets stored in the same file are not kept in a notebook-level variable.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if the file is not valid JSON, is not a JSON object, or has
            no non-empty value for ``key_name``.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            vault = json.load(f)
    except FileNotFoundError as err:
        # Chain explicitly so the original OS error stays visible in the traceback.
        raise FileNotFoundError(f"File related issue encountered, verify file path and name '{path}'") from err
    except json.JSONDecodeError as err:
        raise ValueError(f"'{path}' is not valid JSON: {err}") from err

    # A top-level list/string/number has no .get(); fail with a clear message instead.
    if not isinstance(vault, dict):
        raise ValueError(f"'{path}' must contain a JSON object mapping key names to API keys.")

    value = vault.get(key_name)
    if not value:
        raise ValueError(f"Some issue with '{key_name}' from '{path}'. Verify the key:'{key_name}' if it exists!")
    return value


api_key = _read_api_key(file_path, llm_key)

# Configure the google.generativeai SDK with the key read from the vault file.
# API reference: https://ai.google.dev/api
import google.generativeai as genai
genai.configure(api_key=api_key) # alternative, left disabled: genai.configure(api_key=os.environ[llm_key])

In [11]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings # models/text-embedding-004	

import os
# Also publish the key as the GOOGLE_API_KEY environment variable.
# NOTE(review): the embedding and chat model constructors in this notebook
# receive the key explicitly via google_api_key=..., so this is presumably only
# a fallback for code that reads the environment — confirm it is still needed.
os.environ["GOOGLE_API_KEY"] = api_key

In [12]:
# Direct SDK alternative, left disabled.
# NOTE(review): `docsr` is not defined anywhere in this notebook — likely a typo.
#result = genai.embed_content(model="models/text-embedding-004", content=docsr)

# Embedding model later passed to FAISS.from_documents to vectorize the chunks.
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001", # text-embedding-004
    google_api_key=api_key,
    )

In [13]:
embedding_model

GoogleGenerativeAIEmbeddings(client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x00000249A9D68E60>, model='models/embedding-001', task_type=None, google_api_key=SecretStr('**********'), credentials=None, client_options=None, transport=None, request_options=None)

In [14]:
from langchain.vectorstores import FAISS

In [15]:
# Build the FAISS vector store from the chunk Documents using `embedding_model`.
# NOTE(review): presumably this calls the remote embedding API for every chunk,
# so re-running the cell costs one embedding request batch — confirm.
vectordb = FAISS.from_documents(
    documents=docs,
    embedding=embedding_model,
)

In [16]:
vectordb
# Optional persistence of the index, left disabled:
# vectordb.save_local("faiss_index")
# NOTE(review): `embeddings` below is not defined in this notebook;
# `embedding_model` is presumably what was intended.
# new_db = FAISS.load_local("faiss_index", embeddings)

# docs = new_db.similarity_search(query)

<langchain_community.vectorstores.faiss.FAISS at 0x249aca9bfb0>

In [111]:
print(vectordb.index.ntotal)

179


In [18]:
# Expose the vector store as a retriever returning k=3 chunks per query with the
# "mmr" search type (maximal marginal relevance in LangChain's retriever API).
# NOTE(review): despite its name, `pdf` holds a retriever, not the PDF itself;
# reply() refers to it by this name, so rename both together if at all.
pdf = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 3})

In [19]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import StrOutputParser

In [20]:
# Chat model used as the generation step of the chain built in reply();
# the API key is passed explicitly rather than read from the environment.
llm_model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-latest",
    google_api_key=api_key,
    )

### Chat

In [21]:
from langchain import PromptTemplate

In [40]:
# Prompt text handed to PromptTemplate.from_template. It is spelled out as
# adjacent string literals so that every character that reaches the model —
# the leading newline, the four-space indents, the trailing spaces before some
# newlines and the trailing "\n    " — is visible in the source.
llm_prompt_template = (
    "\n"
    "    You are an assistant for question-answering tasks. Use the following pieces of retrieved context \n"
    "    to answer the question. If you don't know the answer, just say that you don't know. \n"
    "    Use two to three sentences maximum and keep the answer concise.\n"
    "\n"
    "    Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
    "    "
)

# Shorter variant without the "say that you don't know" instruction.
# It is not referenced by any other cell visible in this notebook.
llm_prompt_template_new = (
    "You are an assistant, use the following context to answer the question.\n"
    "Use two to three sentences maximum and keep the answer concise.\n"
    "\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:"
)


In [41]:
# Build the prompt object from `llm_prompt_template` and print it; the printed
# repr lists the detected input variables ('context', 'question').
llm_prompt = PromptTemplate.from_template(llm_prompt_template)
print(llm_prompt)

input_variables=['context', 'question'] template="\n    You are an assistant for question-answering tasks. Use the following pieces of retrieved context \n    to answer the question. If you don't know the answer, just say that you don't know. \n    Use two to three sentences maximum and keep the answer concise.\n\n    Question: {question} \nContext: {context} \nAnswer:\n    "


In [34]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each item in ``docs`` into one string.

    Items are separated by a blank line ("\\n\\n"); an empty iterable yields "".
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

In [35]:
from langchain.schema.runnable import RunnablePassthrough

In [42]:
def reply(question=''):
    """Answer ``question`` from the indexed document and print the answer.

    The chain feeds the question to the `pdf` retriever, joins the retrieved
    chunks with `format_docs` to form the prompt's ``context``, passes the
    question through unchanged as ``question``, then runs `llm_prompt`,
    `llm_model` and a string output parser.

    Args:
        question: Non-empty question text.

    Returns:
        None. The answer is printed, not returned.

    Raises:
        ValueError: if ``question`` is not a string or is empty/whitespace-only.
    """
    # The default '' (or a blank string) would send an empty query through the
    # retriever and the LLM; reject it before any remote call is made.
    if not isinstance(question, str) or not question.strip():
        raise ValueError("reply() needs a non-empty question string.")

    rag_chain = (
        {"context": pdf | format_docs, "question": RunnablePassthrough()}
        | llm_prompt
        | llm_model
        | StrOutputParser()
    )
    print(rag_chain.invoke(question))

In [139]:
pdf.invoke("cities")[0].page_content

'●  “Element of AI” – University of Helsinki (Online 2 ECTS) , 2018'

In [144]:
vectordb.similarity_search("cities")[0]

Document(page_content='●  “Element of AI” – University of Helsinki (Online 2 ECTS) , 2018', metadata={'source': 'docs\\CV_AnishChapagain_2024.pdf', 'page': 3})

In [44]:
reply("list some cities found in the document")

The provided context mentions two cities: Jawalakhel and Lalitpur. 



In [49]:
reply("provide the list of colleges, universities with their address and name found in the document")

The document mentions one college: Campion College, located in Kupondole, Lalitpur, Nepal. 



In [48]:
reply("list some programming languages found in the document")

The document mentions Python, Shell Scripts, and C.  The context also mentions MongoDB, Terraform, Docker, and Jenkins, but these are not programming languages. 



In [52]:
reply("summarize the whole document in less than 200 words")

The document outlines tasks related to software development and project management. It mentions building applications for data collection, text summarization, translation, and paraphrasing. Additionally, it highlights daily responsibilities like system maintenance and backup, project management using BaseCamp, and version control using Git or SVN. 



In [53]:
reply("provide me linkedin URL found in the document")

The LinkedIn URL provided in the context is: https://www.linkedin.com/in/anish -chapagain -a581386/. 



In [54]:
reply("where is anish chpagain located, provide me his address")

Anish Chapagain's address is Green Hill City #498, Kageshwori Manohara -06, Mulpani, Kathmandu, Nepal. This information was found on his LinkedIn profile. 



In [61]:
reply("list country name that is available in the document")

The document mentions the country **Nepal**. 



In [69]:
reply("provide the earliest date")

The earliest date provided in the context is February 23, 2003. 



In [70]:
reply("provide the latest date")

The latest date provided in the context is May, 2024. 



In [71]:
reply("list some professional courses found")

The provided context lists professional courses such as MS-DOS, Microsoft Office, Adobe Photoshop, Oracle, MySQL, PL/SQL, C, C++, Java, and HTML.  These courses cover a range of software applications, programming languages, and database management systems. 



In [75]:
reply("list udemy courses")

I can only identify one Udemy course from the provided context: "LLMs with Google Cloud and Python". 



In [79]:
reply("list titles for LLM or Python or anyone of them")

The provided context mentions a Udemy course titled "LLMs with Google Cloud and Python".  This suggests a title related to LLMs and Python, such as "LLM and Python for Beginners" or "Building LLM Applications with Python". 



In [80]:
reply("is anish used to MongoDb")

The provided context indicates that Anish has experience with MongoDB. His LinkedIn profile lists MongoDB as a skill, suggesting he has used it in the past. 



In [88]:
reply("is anish used to PySpark")

Based on the provided context, Anish has taken a course on "Data Engineering Essentials using SQL, Python, and PySpark" on Udemy in 2022.  This suggests that Anish has been exposed to PySpark. 



In [100]:
reply("provide me titles and when did anish has taken from Udemy")

Anish Chapagain has taken the course "LLMs with Google Cloud and Python" on Udemy in 2024.  The LinkedIn profile you provided doesn't list any other courses taken. 



In [96]:
reply("when did anish obtained course or certification from Udemy")

I'm sorry, but the provided context does not include information about Anish obtaining any courses or certifications from Udemy. 



In [93]:
reply("courses or certification from LinkedIn")

The provided context only mentions "Developing your Team Members" as a LinkedIn Learning course. It doesn't list any other LinkedIn Learning courses or certifications. 



In [87]:
reply("lists some skills found in job")

The job requires technical skills in software development lifecycle (SDLC), technical writing, documentation, and presentations.  It also involves data science and web development skills, including PySpark, Pandas, PyTorch, OCR, Django, web scraping, anti-scraping, and anti-bot testing. 



In [108]:
reply("list some job skills")

Some job skills include proficiency in SDLC, technical writing and support, documentation, and delivering presentations. Technical skills include PySpark, Pandas, PyTorch, OCR, Django, web scraping, anti-scraping, and anti-bot testing. 



In [131]:
reply("publication and book title")

The publication is "Hands-On Web Scraping with Python" and the book title is "Hands-On Web Scraping with Python". 



In [136]:
reply("ISBNs")

The provided context mentions the ISBN for the book "Hands-On Web Scraping with Python" as 9781789533392. This ISBN is a unique identifier for the book. 



In [159]:
reply("provide all publication title from the document")

The only publication title provided in the context is "Hands-On Web Scraping with Python". 



## Similarity

In [118]:
def search(question='', k=3, max_chars=200) -> list:
    """Return truncated text of the chunks most similar to ``question``.

    Runs ``vectordb.similarity_search`` directly (no LLM involved).

    Args:
        question: Query text passed to the vector store.
        k: Number of chunks to request. Defaults to 3.
        max_chars: Maximum number of characters kept from each chunk's
            ``page_content``. Defaults to 200; ``None`` keeps the full text.

    Returns:
        A list of strings, one per chunk, in the order the vector store
        returned them.
    """
    # Use a distinct local name so the notebook-level `docs` list is not shadowed.
    matches = vectordb.similarity_search(question, k=k)
    return [match.page_content[:max_chars] for match in matches]

In [122]:
search(question="email address and phone number")

['anishchapagain@gmail.com  | Linkedin  \n+977 01  9840065449',
 'References are available upon request and as required.',
 'Chapagain, Anish   \nGreen Hill City  #498 , Kageshwori Manohara -06, Mulpani , Kathmandu, Nepal.']

In [119]:
search(question="list me some places where can i contribute to")

['● Coding support to developers/clients site (error testing/fixing, database maintenance, site',
 'LinkedIn Profile : https://www.linkedin.com/in/anish -chapagain -a581386/',
 '\uf0b7 Projects related to scraping and ML, Code reviews & testing.']

In [121]:
search(question="job title and skill")

['\uf0b7 Proficiency in SDLC, T echnical writing/s upport , Documentation and delivering presentations  &',
 '\uf0b7 Team  & project management, leading, m onitoring , training and c oaching , analysis & repor ting',
 'To gain experience & challenging position in the field of AI/ML/Data Science conducive to growth,']

In [123]:
search(question="last university")

['●  “Element of AI” – University of Helsinki (Online 2 ECTS) , 2018',
 'M.Sc Computer System (2007 -2008), Bangor University, Wales (U.K)',
 '● Subjects : Accounting, Organization Behavior, Research Methodology, TQM, HRM, Economics etc.']

In [132]:
search("publication and book title")

['Publications',
 '2nd Edition” . ISBN:  9781837636211 , Oct-2023.',
 '● “Hands -On Web Scraping with Python” . ISBN : 9781789533392 , Jul-2019 . \n \n \nEducation']

In [145]:
search("list all ISBN and date")

['Dec, 2005',
 '● “Hands -On Web Scraping with Python” . ISBN : 9781789533392 , Jul-2019 . \n \n \nEducation',
 '2nd Edition” . ISBN:  9781837636211 , Oct-2023.']

In [146]:
search("when was 2nd edition published")

['2nd Edition” . ISBN:  9781837636211 , Oct-2023.',
 '.E)                                                                      23rd Feb, 2003 – 30th Dec,',
 'Dec, 2005']

In [152]:
search("publication dates")

['May, 2024',
 '2nd Edition” . ISBN:  9781837636211 , Oct-2023.',
 '.E)                                                                      23rd Feb, 2003 – 30th Dec,']