In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationSummaryMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.llms import Ollama
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from PyPDF2 import PdfReader

In [3]:
# Step 1: Load & extract text
pdf_path = "..\\data\\Yeins_Yefferson_Aristizabal_López_CV - English.pdf"
reader = PdfReader(pdf_path)
# Extract each page exactly once: text extraction is the expensive step, and
# the previous one-liner called extract_text() twice per page (once to filter,
# once to collect the value).
page_texts = [page.extract_text() for page in reader.pages]
# Pages that yield no text (None or "") are skipped so they do not add
# blank lines to the joined document.
raw_text = "\n".join(text for text in page_texts if text)

In [4]:
# Step 2: Detect the document language.
# langdetect is non-deterministic unless the factory is seeded; pinning the
# seed makes the detected language reproducible across kernel restarts.
DetectorFactory.seed = 0
try:
    # Only the first 1000 characters are sampled for detection.
    detected_lang = detect(raw_text[:1000])
except LangDetectException:
    # Raised by langdetect when the sample has no usable features (e.g. an
    # empty string). Other errors (missing raw_text, interrupts) now surface
    # instead of being silently mapped to "unknown".
    detected_lang = "unknown"
language = detected_lang

In [6]:
# --- Pipeline configuration (tune here) ---
# Model name handed to the Ollama client by load_llm().
OLLAMA_MODEL = "gemma:2b"
# Text-splitter settings used by process_pdf(): chunk length and overlap.
CHUNK_SIZE = 1_000
CHUNK_OVERLAP = 100
# Number of top documents to retrieve per query (going by the name;
# TODO confirm where this value is consumed).
NUM_TOP_DOCS = 3

def load_llm():
    """Create the local LLM client.

    Returns:
        An ``Ollama`` instance built with ``model=OLLAMA_MODEL`` and
        ``temperature=0.1``.
    """
    llm_settings = {"model": OLLAMA_MODEL, "temperature": 0.1}
    return Ollama(**llm_settings)

def process_pdf(path):
    """Load a PDF, split it into chunks, embed them, and return a retriever.

    Args:
        path: Location of the PDF file; passed straight to ``PyPDFLoader``.

    Returns:
        A retriever over a FAISS index of the embedded chunks, configured to
        return the ``NUM_TOP_DOCS`` best matches per query.

    Raises:
        ValueError: If the PDF produced no text chunks (for example a
            scanned, image-only document), since there is nothing to index.
    """
    pages = PyPDFLoader(path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    docs = splitter.split_documents(pages)
    if not docs:
        # Fail early with a readable message instead of an opaque error from
        # the index build further down.
        raise ValueError(f"No extractable text chunks found in PDF: {path!r}")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(docs, embeddings)
    # NUM_TOP_DOCS was defined in the config but never applied, so the
    # retriever silently used the library's default k. Wire it in explicitly.
    return vectorstore.as_retriever(search_kwargs={"k": NUM_TOP_DOCS})

In [9]:
# Instantiate the two building blocks the rest of the notebook relies on.
# 1) the local language model client
llm = load_llm()
# 2) the retriever built from the same PDF that was read into raw_text
retriever = process_pdf(pdf_path)

In [10]:
# Get summary
# Only the first 3000 characters of the extracted text are sent to the model.
summary_prompt = f"Summarize the document in 5 bullet points:\n{raw_text[:3000]}"
# `predict` is deprecated in LangChain; `invoke` is the supported entry point
# and, for an LLM (as opposed to a chat model), returns the completion string.
summary = llm.invoke(summary_prompt)
summary

"Sure, here's a summary of the document in 5 bullet points:\n\n- The document is about a Data Scientist and AI Engineer with a focus on machine learning and data analysis.\n\n\n- The data scientist has experience in developing chatbots, forecasting models, and analyzing lead comments.\n\n\n- They have worked on projects related to automotive parts, retail products, and oil wells.\n\n\n- The document highlights the data scientist's skills and experience in programming languages, libraries, and data manipulation techniques.\n\n\n- The document provides a brief overview of the data scientist's professional experience and education."

In [11]:
# Template for the answer-generation prompt. It exposes three placeholders,
# {context}, {chat_history} and {question}, which must stay in sync with the
# `input_variables` list given to PromptTemplate where this string is used.
# The text below is sent to the model verbatim, so edits change behaviour.
CUSTOM_PROMPT = """
You are an intelligent assistant with access to a document and general knowledge.
Always try to answer using the document, but if the document doesn't contain the answer,
feel free to respond with helpful general knowledge.

Maintain a friendly, conversational tone, and keep answers short and clear.

{context}

Chat History:
{chat_history}

User: {question}
"""

In [None]:
# Conversation memory: summary-based memory driven by the same LLM, exposed
# to the chain under the "chat_history" key and returned as message objects.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

# Prompt object wrapping CUSTOM_PROMPT; the variable names listed here are the
# placeholders that appear in the template text.
prompt = PromptTemplate(
    template=CUSTOM_PROMPT,
    input_variables=["chat_history", "question", "context"],
)

In [15]:
# Assemble the conversational retrieval chain from the pieces built above.
# The custom prompt is forwarded to the document-combining step via
# `combine_docs_chain_kwargs`.
docs_chain_kwargs = {"prompt": prompt}
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs=docs_chain_kwargs,
    verbose=True,  # print the formatted prompts of each sub-chain while running
)

In [14]:
# Example question that is sent to the QA chain in the next cell.
user_input = "Please, Tell about the Skills of the applicant"

In [17]:
# Local transcript of this exchange as role/content dicts. This list is
# separate from the chain's own `memory` object and is reset each time the
# cell runs.
chat_history = []

# `Chain.run` is deprecated in LangChain; `invoke` takes the input mapping and
# returns a dict of outputs, where "answer" holds the generated reply for a
# ConversationalRetrievalChain. The chain's memory supplies the chat history,
# so only the question has to be passed in.
result = qa_chain.invoke({"question": user_input})
response = result["answer"]

chat_history.append({"role": "user", "content": user_input})
chat_history.append({"role": "assistant", "content": response})



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

system: The human asks what the AI thinks of artificial intelligence. The AI thinks artificial intelligence is a force for good because it will help humans reach their full potential. The applicant has a strong background in data science and AI, with experience in various domains such as data analysis, data wrangling, modeling, and visualization. They have expertise in programming languages such as Python, SQL, and C++, and libraries and frameworks like Scikit-Learn, Keras, TensorFlow, and OpenCV. Additionally, they have experience in data cleaning, analysis, and storytelling, which are essential skills for any data scientist.
Follow Up Input: Please, Tell about the Skills of the applicant
Standalone question:[0m

[1m> Finished chain.[0m


[1m

In [27]:
# First transcript entry: the user's question appended after the chain call.
chat_history[0]["content"]

'Please, Tell about the Skills of the applicant'

In [26]:
# Second transcript entry: the assistant's answer returned by the QA chain.
chat_history[1]["content"]

"The applicant's skills that would be most relevant to the role of a data scientist are:\n\n• Programming Languages: Python, SQL, C++\n• Libraries & Frameworks: Scikit-Learn, Keras, TensorFlow, OpenCV , Numpy, Pandas, SciPy, spaCy, NLTK, Matplotlib, Seaborn, Streamlit, HugginFace, LangChain.\n• Data Cleaning, Analysis, and Storytelling\n• Machine Learning Techniques"