In [5]:
import os
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter  import RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain_community.llms.ollama import Ollama
from langchain.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [6]:
path = "./data/cv"
persist_directory= "./vectorstor/db"
n_doc = 3 ##number of documents return

loader = DirectoryLoader(path = path, show_progress=True)
documents = loader.load()

embeddings = OllamaEmbeddings(base_url=os.getenv("OLLAMA_ADDR"), model=os.getenv("OLLAMA_MODEL"))

llm = Ollama(base_url=os.getenv("OLLAMA_ADDR"), model=os.getenv("OLLAMA_MODEL"))

100%|██████████| 10/10 [00:00<00:00, 24.65it/s]


In [7]:
text_splitter  = RecursiveCharacterTextSplitter(chunk_size=10000)
text_splitter.split_documents(documents)
vectorstore = Chroma.from_documents(persist_directory=persist_directory, embedding=embeddings, documents=documents)
retriver = vectorstore.as_retriever(search_type="mmr",search_kwargs= {"k": n_doc} )

In [8]:
def file_reader(path):
    with open(path, mode="r+", encoding= "utf-8") as f:
        return f.read()

In [10]:
jd_path = "./data/jd/job_description.txt"
jd=file_reader(jd_path)
print(jd)

responsibilities:
• Design, develop, and maintain multiple databases within the data warehouse. 
• Create, implement, and test data models and database management systems.
• Conduct research and support internal/external groups with the selection and implementation of applications and database management tools.
• Identify and utilize database management systems to aggregate and analyze data. 
• In collaboration with the VP, IBT and the Senior Data Warehouse Engineer, develop and implement data administration policies, standards, and models. 
• Work with internal departments and colleagues to understand and document specific data and user requirements, data collection and administration policies, and data access rules. 
• Develop and implement procedures for determining network access and usage and for the backup and recovery of data. 
• Write scripts to support automation, data extraction, and reporting.
• Design and automate routine and self-service reporting solutions (i.e., Power BI

In [11]:
screen_result = retriver.invoke(F"""find the CV with the best match with the JD as below:
   {jd}
    """)

Number of requested results 20 is greater than number of elements in index 10, updating n_results = 10


In [12]:
screen_result

[Document(page_content='Name: Sarah Miller\n\nEmail: sarah.miller@example.com\n\nPhone: +1-555-678-9012\n\nLocation: Edmonton, AB\n\nProfessional Summary\n\nDetail-oriented data analyst with 4 years of experience in data analysis, visualization,\n\nand reporting. Expertise in extracting insights from complex datasets to support\n\nstrategic business decisions.\n\nSkills\n\nData Analysis (Python, R)\n\nData Visualization (Tableau, Power BI)\n\nSQL, Excel\n\nStatistical Analysis\n\nData Cleaning and Preparation • Business Intelligence Tools\n\nWork Experience\n\nData Analyst\n\nMarket Insights Inc., Edmonton, AB\n\nFebruary 2020 - Present\n\nAnalyzed market trends and customer behavior to inform marketing strategies.\n\nCreated interactive dashboards and visualizations to present data insights.\n\nConducted statistical analyses to support product development decisions.\n\nJunior Data Analyst\n\nInsight Analytics, Edmonton, AB\n\nJuly 2018 - January 2020\n\nAssisted in data cleaning and p

In [13]:
retriever = Chroma.from_documents(documents=screen_result, embedding=embeddings).as_retriever()

In [14]:

template = """Summarize and highlight the experience, certificate or education in the CV which match with the JD as below:

CV:
{context}

JD:
{job_description}

"""
prompt = ChatPromptTemplate.from_template(template)
model = llm


def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])


chain = (
    {"context": retriever , "job_description": lambda x: jd}
    | prompt
    | model
    | StrOutputParser()
)

res =chain.invoke("")

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


In [15]:
print(res)

Based on the provided CVs and JD, I've highlighted the relevant experience, certificate, or education that match with the responsibilities mentioned in the JD:

**CV 1: Sarah Miller**

* Experience:
	+ Analyzed market trends and customer behavior to inform marketing strategies (matches with "Conduct research and support internal/external groups with the selection and implementation of applications and database management tools.")
	+ Created interactive dashboards and visualizations to present data insights (matches with "Design and automate routine and self-service reporting solutions (i.e., Power BI).")
* Education:
	+ Bachelor of Science in Statistics (relevant to the statistical analysis mentioned in the JD)

**CV 2: Daniel Williams**

* Experience:
	+ Designed and implemented scalable data pipelines on AWS (matches with "Identify and utilize database management systems to aggregate and analyze data.")
	+ Optimized ETL workflows for better performance (matches with "Write scripts to