https://python.langchain.com/docs/tutorials/rag/
</br>https://python.langchain.com/docs/integrations/text_embedding/
</br>https://platform.openai.com/settings/organization/billing/overview

In [8]:
import json

from pathlib import Path

import pandas as pd

from langchain_chroma import Chroma

from langchain_core.prompts.chat import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from settings import OPENAI_API_KEY

# Loading docs

In [9]:
PROMPT_TEMPLATE = Path('/Users/caiopavesi/Code/bmw-job-matcher/data/prompt.jinja').read_text()
APPLICANT_DOCUMENTS_FILES_PATH = '/Users/caiopavesi/Library/Mobile Documents/com~apple~CloudDocs/0/Work/Job applications/Templates/Latest'

In [10]:
llm = ChatOpenAI(
    model = "gpt-4.1",
    temperature = 0,
    api_key = OPENAI_API_KEY
)

In [11]:
embeddings = OpenAIEmbeddings(model = "text-embedding-3-large")

In [12]:
vector_store = Chroma(
    collection_name = "example_collection",
    embedding_function = embeddings,
    persist_directory =  "../data/chroma",
)

In [13]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200,
    add_start_index = True,
)

In [14]:
loader = DirectoryLoader(APPLICANT_DOCUMENTS_FILES_PATH, glob = "*.pdf", loader_cls = PyPDFLoader)
documents = loader.load()
all_splits = text_splitter.split_documents(documents)
document_ids = vector_store.add_documents(documents = all_splits)

# Creating the RAG

In [25]:
prompt = PromptTemplate(
    template = PROMPT_TEMPLATE,
    template_format = "jinja2"
)

In [16]:
job_description = pd.read_excel('../data/bmw_jobs.xlsx').loc[3, ('description')]

In [17]:
retrieved_docs = vector_store.similarity_search(prompt.template)

In [18]:
docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [19]:
prompt = prompt.invoke({"question": job_description, "context": docs_content})
answer = llm.invoke(prompt)

In [20]:
rate = json.loads(answer.content)

In [21]:
rate

{'score': 62.5,
 'strengths': "The candidate demonstrates strong leadership, project management, and communication skills, as evidenced by leading a university BAJA SAE team to a top national finish and securing significant sponsorships. Experience in process improvement and workflow automation in a high-stakes environment shows analytical ability and adaptability. The candidate expresses clear alignment with BMW's culture of innovation and quality.",
 'weaknesses': 'The application does not explicitly confirm enrollment in a relevant degree (Maschinenbau, Elektrotechnik, Mechatronik, Wirtschaftsingenieurwesen) or fluency in German, both of which are mandatory for the role. There is no mention of experience with AUTOCAD or direct exposure to quality planning, metrology, or e-mobility. The resume lacks detail on technical hard skills and specific academic background.',
 'improvement_points': 'Explicitly state current field of study and confirm it matches the required disciplines. Highli