In [1]:
!pip install langchain-google-genai langchain-together pypdf docx2txt





## Imports

In [2]:
import json
import os
from dotenv import load_dotenv
from langchain_together import ChatTogether
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.prompts import PromptTemplate

In [3]:
load_dotenv()

True

In [4]:
model = ChatGoogleGenerativeAI(model='gemini-1.5-flash',api_key=os.getenv('GOOGLE_API_KEY'))

In [5]:
# model.invoke('Summarize the bias-variance tradeoff.').content

In [6]:
PROMPT_TEMPLATE = """
You are an expert resume parser. Your task is to extract structured information from the resume text below.

Return the output as a **single valid JSON object** with the exact following schema:

{{
  "Name": "string",
  "Email": "string",
  "Phone": "string",
  "LinkedIn": "string",
  "Skills": ["string"],
  "Education": ["string"],
  "Experience": ["string"],
  "Projects": ["string"],
  "Certifications": ["string"],
  "Languages": ["string"]
}}

Rules:
- If a field cannot be found, set its value to "No idea".
- Do not add explanations, notes, or extra text — output JSON only.
- For lists (Skills, Education, Experience, Projects, Certifications, Languages), return an array of short strings.
- Keep the JSON compact and properly formatted.

Resume text:
{text}
"""


In [7]:
# prompt = PromptTemplate(
#     template=PROMPT_TEMPLATE,
#     input_variables=["text"])
# formatted_prompt = prompt.format(text="Jane Doe, ML Engineer skilled in TensorFlow, PyTorch...")


# Instantiation using from_template (recommended, automatically detect variables)
# build prompt
prompt = PromptTemplate.from_template(PROMPT_TEMPLATE)

In [8]:
# formatted_prompt = prompt.format(text="John Doe, Software Engineer with skills in Python, SQL...")

In [9]:
# print(formatted_prompt)

In [10]:
def load_resume(file_path):
    if file_path.lower().endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif file_path.lower().endswith(".docx"):
        loader = Docx2txtLoader(file_path)
    elif file_path.lower().endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        return None
    return loader.load()

In [36]:
extracted_text = load_resume("complex_resume.docx")

In [37]:
extracted_text

[Document(metadata={'source': 'complex_resume.docx'}, page_content='Johnathan A. Smith\nEmail: johnsmith@example.com | Phone: +1 (555) 123-4567 | LinkedIn: linkedin.com/in/johnsmith\n\nSkills:\n- Python, Java, C++, SQL, R, JavaScript, React, Node.js, Django, Flask, TensorFlow, PyTorch, Git, Docker, Kubernetes\n\nEducation:\n- MSc in Artificial Intelligence, Stanford University, 2020\n- BSc in Computer Science, University of California, Berkeley, 2018\n\nExperience:\n- Senior Machine Learning Engineer | OpenAI (2021-Present)\n  Leading a team of 5 engineers to build scalable NLP models for enterprise applications.\n- Data Scientist | Google (2018-2021)\n  Developed predictive analytics solutions for Google Ads, improving CTR by 15%.\n\nProjects:\n- Smart Resume Parser: Built an NLP-based system to extract structured information from resumes using Python and spaCy.\n- AI Chatbot: Designed a conversational AI using Transformer models, deployed on AWS.\n\nCertifications:\n- AWS Certified M

In [38]:
# [str(doc) for doc in extracted_text]

In [39]:
extracted_text = "\n\n".join([str(doc) for doc in extracted_text])

In [40]:
extracted_text

"page_content='Johnathan A. Smith\nEmail: johnsmith@example.com | Phone: +1 (555) 123-4567 | LinkedIn: linkedin.com/in/johnsmith\n\nSkills:\n- Python, Java, C++, SQL, R, JavaScript, React, Node.js, Django, Flask, TensorFlow, PyTorch, Git, Docker, Kubernetes\n\nEducation:\n- MSc in Artificial Intelligence, Stanford University, 2020\n- BSc in Computer Science, University of California, Berkeley, 2018\n\nExperience:\n- Senior Machine Learning Engineer | OpenAI (2021-Present)\n  Leading a team of 5 engineers to build scalable NLP models for enterprise applications.\n- Data Scientist | Google (2018-2021)\n  Developed predictive analytics solutions for Google Ads, improving CTR by 15%.\n\nProjects:\n- Smart Resume Parser: Built an NLP-based system to extract structured information from resumes using Python and spaCy.\n- AI Chatbot: Designed a conversational AI using Transformer models, deployed on AWS.\n\nCertifications:\n- AWS Certified Machine Learning – Specialty\n- Google Cloud Professio

In [41]:
formated_text = prompt.format(text = extracted_text)
response = model.invoke(formated_text).content

In [42]:
import re
def extract_json(response):
    cleaned = re.sub(r"```(json)?", "", response).strip()
    
    try:
        structured_output = json.loads(cleaned)
    except json.JSONDecodeError:
        print("Model returned invalid JSON. Raw output:")
        print(response)
        structured_output = None
    return structured_output

In [43]:
print(extract_json(response))

{'Name': 'Johnathan A. Smith', 'Email': 'johnsmith@example.com', 'Phone': '+1 (555) 123-4567', 'LinkedIn': 'linkedin.com/in/johnsmith', 'Skills': ['Python', 'Java', 'C++', 'SQL', 'R', 'JavaScript', 'React', 'Node.js', 'Django', 'Flask', 'TensorFlow', 'PyTorch', 'Git', 'Docker', 'Kubernetes'], 'Education': ['MSc in Artificial Intelligence, Stanford University, 2020', 'BSc in Computer Science, University of California, Berkeley, 2018'], 'Experience': ['Senior Machine Learning Engineer | OpenAI (2021-Present)', 'Data Scientist | Google (2018-2021)'], 'Projects': ['Smart Resume Parser: Built an NLP-based system to extract structured information from resumes using Python and spaCy.', 'AI Chatbot: Designed a conversational AI using Transformer models, deployed on AWS.'], 'Certifications': ['AWS Certified Machine Learning – Specialty', 'Google Cloud Professional Data Engineer', 'TensorFlow Developer Certificate'], 'Languages': ['English (Native)', 'French (Intermediate)', 'Spanish (Basic)']}


In [44]:
print(extract_json(response)['Skills'])

['Python', 'Java', 'C++', 'SQL', 'R', 'JavaScript', 'React', 'Node.js', 'Django', 'Flask', 'TensorFlow', 'PyTorch', 'Git', 'Docker', 'Kubernetes']
