In [1]:
!pip install langchain-google-genai langchain-together pypdf docx2txt





## Imports

In [2]:
import json
import os
from dotenv import load_dotenv
from langchain_together import ChatTogether
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.prompts import PromptTemplate

In [3]:
# Load environment variables (e.g. GOOGLE_API_KEY, read when the model is
# created) from a .env file before anything tries to use them.
load_dotenv()

True

In [4]:
# Gemini chat model used for the resume extraction; the API key is taken from
# the environment rather than being hardcoded in the notebook.
model = ChatGoogleGenerativeAI(
    model='gemini-1.5-flash',
    api_key=os.environ.get('GOOGLE_API_KEY'),
)

In [5]:
# model.invoke('Summarize the bias-variance tradeoff.').content

In [6]:
# Prompt sent to the model for every resume.
# - {text} is the only template variable; it is filled with the resume text.
# - The doubled braces ({{ and }}) are escapes: after formatting they become the
#   literal braces of the JSON schema shown to the model.
# NOTE(review): the rules ask for the string "No idea" on missing fields while
# the schema declares several fields as arrays; downstream code should expect
# either "No idea" or ["No idea"] for those fields -- confirm desired behavior.
PROMPT_TEMPLATE = """
You are an expert resume parser. Your task is to extract structured information from the resume text below.

Return the output as a **single valid JSON object** with the exact following schema:

{{
  "Name": "string",
  "Email": "string",
  "Phone": "string",
  "LinkedIn": "string",
  "Skills": ["string"],
  "Education": ["string"],
  "Experience": ["string"],
  "Projects": ["string"],
  "Certifications": ["string"],
  "Languages": ["string"]
}}

Rules:
- If a field cannot be found, set its value to "No idea".
- Do not add explanations, notes, or extra text — output JSON only.
- For lists (Skills, Education, Experience, Projects, Certifications, Languages), return an array of short strings.
- Keep the JSON compact and properly formatted.

Resume text:
{text}
"""


In [7]:
# prompt = PromptTemplate(
#     template=PROMPT_TEMPLATE,
#     input_variables=["text"])
# formatted_prompt = prompt.format(text="Jane Doe, ML Engineer skilled in TensorFlow, PyTorch...")


# Build the prompt with from_template (recommended): it detects the template's
# input variables ("text" here) automatically, so they need not be listed.
prompt = PromptTemplate.from_template(PROMPT_TEMPLATE)

In [8]:
# formatted_prompt = prompt.format(text="John Doe, Software Engineer with skills in Python, SQL...")

In [9]:
# print(formatted_prompt)

In [10]:
def load_resume(file_path):
    """Load a resume file with the LangChain loader matching its extension.

    Args:
        file_path: Location of the resume, as a ``str`` or any ``os.PathLike``
            object (e.g. ``pathlib.Path``). The extension is matched
            case-insensitively; ``.pdf``, ``.docx`` and ``.txt`` are supported.

    Returns:
        Whatever the selected loader's ``load()`` returns (a list of Document
        objects, judging by the notebook output), or ``None`` when the
        extension is not supported. Callers must check for ``None`` before
        iterating over the result.
    """
    # Normalise path-like objects to a plain string so the suffix checks below
    # work for pathlib.Path inputs as well as for str.
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        loader = PyPDFLoader(path)
    elif lowered.endswith(".docx"):
        loader = Docx2txtLoader(path)
    elif lowered.endswith(".txt"):
        loader = TextLoader(path)
    else:
        # Unsupported file type: signal it with None instead of raising.
        return None
    return loader.load()

In [54]:
# Load the sample resume. load_resume returns None for unsupported extensions,
# which cannot happen for this hardcoded .pdf path.
# NOTE(review): the saved output below reports 'data-science-roadmap.pdf' as
# its source, so it appears to come from an earlier run with a different file --
# re-run the notebook from a fresh kernel to refresh it.
extracted_text = load_resume("resume_sample.pdf")

In [55]:
extracted_text

[Document(metadata={'producer': 'macOS Version 14.2 (Build 23C64) Quartz PDFContext', 'creator': 'Pages', 'creationdate': "D:20240725211807Z00'00'", 'title': 'Data Science Roadmap', 'moddate': "D:20240725211807Z00'00'", 'source': 'data-science-roadmap.pdf', 'total_pages': 21, 'page': 0, 'page_label': '1'}, page_content='THE COMPLETE \nDATA SCIENCE ROADMAP\nGo From Zero to a Data Scientist  \nin 12 Months \nMosh Hamedani'),
 Document(metadata={'producer': 'macOS Version 14.2 (Build 23C64) Quartz PDFContext', 'creator': 'Pages', 'creationdate': "D:20240725211807Z00'00'", 'title': 'Data Science Roadmap', 'moddate': "D:20240725211807Z00'00'", 'source': 'data-science-roadmap.pdf', 'total_pages': 21, 'page': 1, 'page_label': '2'}, page_content='2\nCopyright 2024 Code with Mosh codewithmosh.com\nHi! I am Mosh Hamedani, a software engineer with over 20 \nyears of experience.  \nOver the past 10 years, I’ve had the privilege of teaching \nmillions of people how to code and become professional \

In [56]:
# [str(doc) for doc in extracted_text]

In [57]:
# Merge the page texts into one string for the prompt. page_content is used
# instead of str(doc) so that only the resume text reaches the model, without
# the loader metadata that the Document string form also carries.
# The isinstance guard keeps this cell idempotent: without it, re-running the
# cell would iterate over the characters of the already-joined string.
if not isinstance(extracted_text, str):
    extracted_text = "\n\n".join(doc.page_content for doc in extracted_text)

In [58]:
# extracted_text

In [59]:
# Fill the {text} slot of the prompt with the resume text.
formated_text = prompt.format(text = extracted_text)
# Send the prompt to the model and keep only the text of its reply (.content).
response = model.invoke(formated_text).content

In [60]:
import re
def extract_json(response):
    """Decode the JSON payload contained in a model reply.

    Markdown code fences (```` ``` ```` / ```` ```json ````, any letter case)
    are removed first. If the remaining text is not pure JSON -- e.g. the model
    added a sentence before or after the object -- the first decodable JSON
    object found in the text is returned instead.

    Args:
        response: The model's reply as a string.

    Returns:
        The decoded JSON value (a dict for the resume schema), or ``None`` if
        no valid JSON could be recovered. In that case the raw reply is printed
        to help with debugging.
    """
    # Drop fence markers wherever they occur; the JSON between them is kept.
    cleaned = re.sub(r"```(json)?", "", response, flags=re.IGNORECASE).strip()

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # Fallback for replies with surrounding prose: try to decode an object
    # starting at each "{"; raw_decode ignores whatever follows the object.
    decoder = json.JSONDecoder()
    start = cleaned.find("{")
    while start != -1:
        try:
            structured_output, _end = decoder.raw_decode(cleaned, start)
            return structured_output
        except json.JSONDecodeError:
            start = cleaned.find("{", start + 1)

    print("Model returned invalid JSON. Raw output:")
    print(response)
    return None

In [61]:
# Show the full parsed resume (prints None if the reply could not be parsed).
print(extract_json(response))

{'Name': 'Mosh Hamedani', 'Email': 'No idea', 'Phone': 'No idea', 'LinkedIn': 'No idea', 'Skills': ['Python', 'Git', 'SQL', 'Data Structures & Algorithms', 'Mathematics and Statistics', 'Data Collection and Visualization', 'Machine Learning', 'Deep Learning', 'NLP', 'Computer Vision', 'Big Data', 'NumPy', 'Pandas', 'Matplotlib', 'TensorFlow', 'PyTorch', 'Keras', 'Hadoop', 'Spark', 'NLTK', 'SpaCy', 'Hugging Face'], 'Education': ['No idea'], 'Experience': ['Software engineer with over 20 years of experience', 'Teaching millions of people how to code and become professional software engineers through YouTube channel and online courses'], 'Projects': ['No idea'], 'Certifications': ['No idea'], 'Languages': ['No idea']}


In [62]:
# Show only the extracted skills.
# NOTE(review): extract_json returns None for an unparseable reply, in which
# case this subscript raises TypeError -- check the result first if that matters.
print(extract_json(response)['Skills'])

['Python', 'Git', 'SQL', 'Data Structures & Algorithms', 'Mathematics and Statistics', 'Data Collection and Visualization', 'Machine Learning', 'Deep Learning', 'NLP', 'Computer Vision', 'Big Data', 'NumPy', 'Pandas', 'Matplotlib', 'TensorFlow', 'PyTorch', 'Keras', 'Hadoop', 'Spark', 'NLTK', 'SpaCy', 'Hugging Face']
