In [2]:
import bs4
import chromadb 
from langchain import hub
from langchain_chroma import Chroma
from langchain_ollama import ChatOllama
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableLambda
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader


import os
import glob

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
TAVILY_API_KEY = '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'
os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY

In [4]:
llm = ChatOllama(name="chat_llama3", model="krith/meta-llama-3.1-8b-instruct:IQ2_M", temperature = 0)

In [5]:
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

In [6]:
documents_path = "data/documents/*"
filepaths = glob.glob(documents_path)
documents = []
for filepath in filepaths:
    loader = PyPDFLoader(filepath)
    docs = loader.load()
    splits = text_splitter.split_documents(docs)
    documents.extend(splits)

In [7]:
len(documents)

55

In [8]:
db = Chroma(persist_directory="data/chroma_db", embedding_function=embeddings)
db.delete_collection()
db = Chroma.from_documents(documents, embeddings, persist_directory="data/chroma_db")

In [9]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":5})

In [10]:
retrieved_documents = retriever.invoke("Insan Ramadhan")
retrieved_documents

[Document(metadata={'page': 0, 'source': 'data/documents\\Resume_Dev._2024_Ari Maulana.pdf'}, page_content='ARI MAULANA HARDAN \n+6287818901144 | arimaulanahardan@gmail.com | https://www.linkedin.com/in/ari-maulana-hardan/ | \nhttps://github.com/arimaulanahardan | @arimaulanahardan'),
 Document(metadata={'page': 1, 'source': 'data/documents\\Resume_Dev._2024_Ari Maulana.pdf'}, page_content='Language : Indonesia (Native) & English (proficient)'),
 Document(metadata={'page': 1, 'source': 'data/documents\\Resume_Dev._2024_Ari Maulana.pdf'}, page_content='Ditjen Diktiristek, Ministry of Education, Culture, Research, and Technology (https://shorturl.at/kjqAI)'),
 Document(metadata={'page': 1, 'source': 'data/documents\\Resume_Dev._2024_Ari Maulana.pdf'}, page_content='Ditjen Diktiristek, Ministry of Education, Culture, Research, and Technology (https://shorturl.at/nnxaC)'),
 Document(metadata={'page': 0, 'source': 'data/documents\\Resume_Dev._2024_Ari Maulana.pdf'}, page_content='with Keras

In [11]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def inspect(state):
    """Print the state passed between Runnables in a langchain and pass it on"""
    print(state)
    return state

prompt = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:
"""

rag_prompt = ChatPromptTemplate.from_messages({'system_message',prompt})

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | RunnableLambda(inspect)
    | rag_prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("who is Ari Maulana ?")

{'context': 'ARI MAULANA HARDAN \n+6287818901144 | arimaulanahardan@gmail.com | https://www.linkedin.com/in/ari-maulana-hardan/ | \nhttps://github.com/arimaulanahardan | @arimaulanahardan\n\nLanguage : Indonesia (Native) & English (proficient)\n\nDitjen Diktiristek, Ministry of Education, Culture, Research, and Technology (https://shorturl.at/kjqAI)\n\nFullstack Developer (Part-Time) \n• Developed and deployed a comprehensive web application for HIPMI E -Katalog, an online platform facilitating the promotion \nand sale of products by HIPMI members.\n\n• Coordinated business initiatives, organized events, and facilitated partnerships with external stakeholders.  \n \nHead of External Relations Division, Computer Students Association 2022 – 2023', 'question': 'who is Ari Maulana ?'}


"I don't have enough information to determine who Ari Maulana is. However, based on the context provided, it appears that Ari Maulana Hardan is a fullstack developer and has held various roles, including Head of External Relations Division at Computer Students Association from 2022-2023."

In [12]:
rag_chain.invoke("How long Ari works at Bandung State Polytechnic ?")

{'context': 'Available on : (It will be displayed if needed because this is an internal company application. ) \n \nFREELANCER PROJECT  Bandung, 2021-2024\n\nARI MAULANA HARDAN \n+6287818901144 | arimaulanahardan@gmail.com | https://www.linkedin.com/in/ari-maulana-hardan/ | \nhttps://github.com/arimaulanahardan | @arimaulanahardan\n\nFunded Student Creativity Program PKM-PI schema Bandung, 04/2023 \nBELMAWA of Kemendikbudristek (https://shorturl.at/GQjTH)\n\nBandung, West java, Indonesia 40559 \n \nPortfolio : https://shorturl.at/PepIz \n \nPROFESSIONAL SUMMARY\n\nPROFESSIONAL SUMMARY \nI am a fresh Computer Science graduate from Bandung State Polytechnic (Polban) with 2 years of internship experience  as a Software', 'question': 'How long Ari works at Bandung State Polytechnic ?'}


'Based on the context, Ari works at Bandung State Polytechnic for 2 years.'

In [13]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)


In [14]:
query = "which coutry Ari Maulana from?"
result = qa({"query": query})
print(result)

  result = qa({"query": query})


{'query': 'which coutry Ari Maulana from?', 'result': 'Based on the context provided, I believe Ari Maulana is from Indonesia. His LinkedIn profile mentions that he speaks Indonesian as his native language and English as a proficient language, suggesting that he is from Indonesia. Additionally, his email address and GitHub link are also related to Indonesia.', 'source_documents': [Document(metadata={'page': 0, 'source': 'data/documents\\Resume_Dev._2024_Ari Maulana.pdf'}, page_content='ARI MAULANA HARDAN \n+6287818901144 | arimaulanahardan@gmail.com | https://www.linkedin.com/in/ari-maulana-hardan/ | \nhttps://github.com/arimaulanahardan | @arimaulanahardan'), Document(metadata={'page': 1, 'source': 'data/documents\\Resume_Dev._2024_Ari Maulana.pdf'}, page_content='Language : Indonesia (Native) & English (proficient)'), Document(metadata={'page': 1, 'source': 'data/documents\\Resume_Dev._2024_Ari Maulana.pdf'}, page_content='Ditjen Diktiristek, Ministry of Education, Culture, Research,

In [15]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

question = "which country is Ari Maulana from ? is he speak english ?"
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=db.as_retriever(), llm=llm
)

In [16]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def inspect(state):
    """Print the state passed between Runnables in a langchain and pass it on"""
    print(state)
    return state

prompt = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:
"""

rag_prompt = ChatPromptTemplate.from_messages({'system_message',prompt})

rag_chain = (
    {"context": retriever_from_llm | format_docs, "question": RunnablePassthrough()}
    | RunnableLambda(inspect)
    | rag_prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("which country is Ari Maulana from ? is he speak english ?")

{'context': '• Implemented fuzzy logic to handle uncertainty and imprecision in risk assessment. \n• Available on : https://risiko.spi-polban.site/home  \n \nTRAINING AND CERTIFICATION\n\n• Improved the quality of business education and increased student participation in entrepreneurial activities. \n \nADDITIONAL INFORMATIONS\n\nDatabase : MongoDB, MySQL, PostgreSQL, Firebase, SQLite, Oracle \nTools : Github, Postman, Notion, Trello, Figma, VS Code, Selenium, Draw.io, PowerBI, Bizagi, Google Collab.\n\n• Significantly improved data loading speed from >20 seconds to <3 seconds by optimizing data fetching and rendering.\n\nLanguage : Indonesia (Native) & English (proficient)\n\nARI MAULANA HARDAN \n+6287818901144 | arimaulanahardan@gmail.com | https://www.linkedin.com/in/ari-maulana-hardan/ | \nhttps://github.com/arimaulanahardan | @arimaulanahardan\n\n• Achieved 1st place among 812 teams in a national competition hosted by Politeknik Negeri Batam. Developed a chatbot integrated\n\n• Se

'Ari Maulana is from Indonesia. He speaks English proficiently as he has mentioned it in his profile.'