In [1]:
!pip install langchain-google-genai langchain-together pypdf docx2txt





## Imports

In [2]:
import json
import os
from dotenv import load_dotenv
from langchain_together import ChatTogether
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.prompts import PromptTemplate

In [3]:
load_dotenv()

True

In [4]:
model = ChatGoogleGenerativeAI(model='gemini-1.5-flash',api_key=os.getenv('GOOGLE_API_KEY'))

In [5]:
# model.invoke('Summarize the bias-variance tradeoff.').content

In [6]:
PROMPT_TEMPLATE = """
You are an expert resume parser. Your task is to extract structured information from the resume text below.

Return the output as a **single valid JSON object** with the exact following schema:

{{
  "Name": "string",
  "Email": "string",
  "Phone": "string",
  "LinkedIn": "string",
  "Skills": ["string"],
  "Education": ["string"],
  "Experience": ["string"],
  "Projects": ["string"],
  "Certifications": ["string"],
  "Languages": ["string"]
}}

Rules:
- If a field cannot be found, set its value to "No idea".
- Do not add explanations, notes, or extra text — output JSON only.
- For lists (Skills, Education, Experience, Projects, Certifications, Languages), return an array of short strings.
- Keep the JSON compact and properly formatted.

Resume text:
{text}
"""


In [7]:
# prompt = PromptTemplate(
#     template=PROMPT_TEMPLATE,
#     input_variables=["text"])
# formatted_prompt = prompt.format(text="Jane Doe, ML Engineer skilled in TensorFlow, PyTorch...")


# Instantiation using from_template (recommended, automatically detect variables)
# build prompt
prompt = PromptTemplate.from_template(PROMPT_TEMPLATE)

In [8]:
# formatted_prompt = prompt.format(text="John Doe, Software Engineer with skills in Python, SQL...")

In [9]:
# print(formatted_prompt)

In [10]:
def load_resume(file_path):
    if file_path.lower().endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif file_path.lower().endswith(".docx"):
        loader = Docx2txtLoader(file_path)
    elif file_path.lower().endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        return None
    return loader.load()

In [45]:
extracted_text = load_resume("Ayesha_Saleem.pdf")

In [46]:
extracted_text

[Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-08-22T12:26:38+05:00', 'author': 'Ayesha Saleem', 'moddate': '2025-08-22T12:26:38+05:00', 'source': 'Ayesha_Saleem.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='AYESHA SALEEM  \n+92 318 6706803 | ayeshasaleem853@gmail.com | Portfolio | GitHub | Kaggle  | LinkedIn  \nACHIEVEMENTS & COMPETITIONS \n• @Kaggle Expert: Published datasets, notebooks, and competed in real-world ML challenges. \n• @Harvard CS50x Puzzle Day Winner (2025): Solved all 9/9 puzzles; secured 1st place globally with \nteam. \n• @Meta Hacker Cup (2024) Qualifier: Competed in Meta’s global programming competition, showcasing \nstrong algorithmic and problem-solving skills. \n• @UC Berkeley CALICO Informatics Competition (2024): Participated to enhance data handling and \ncomputational thinking. \n• @LabLab.ai AI Hackathons: Participated in multiple international hackathons focused on ge

In [47]:
# [str(doc) for doc in extracted_text]

In [48]:
extracted_text = "\n\n".join([str(doc) for doc in extracted_text])

In [49]:
extracted_text

"page_content='AYESHA SALEEM  \n+92 318 6706803 | ayeshasaleem853@gmail.com | Portfolio | GitHub | Kaggle  | LinkedIn  \nACHIEVEMENTS & COMPETITIONS \n• @Kaggle Expert: Published datasets, notebooks, and competed in real-world ML challenges. \n• @Harvard CS50x Puzzle Day Winner (2025): Solved all 9/9 puzzles; secured 1st place globally with \nteam. \n• @Meta Hacker Cup (2024) Qualifier: Competed in Meta’s global programming competition, showcasing \nstrong algorithmic and problem-solving skills. \n• @UC Berkeley CALICO Informatics Competition (2024): Participated to enhance data handling and \ncomputational thinking. \n• @LabLab.ai AI Hackathons: Participated in multiple international hackathons focused on generative \nand applied AI. \n• @LeetCode 230+ DSA Problems Solved: Practiced advanced algorithmic skills and competitive \ncoding. \n \nTECHNICAL SKILLS \n• Programming Languages: C++, Python  \n• Data Science and Machine Learning: NumPy, Pandas, Matplotlib, Seaborn, Scikit-learn, 

In [50]:
formated_text = prompt.format(text = extracted_text)
response = model.invoke(formated_text).content

In [51]:
import re
def extract_json(response):
    cleaned = re.sub(r"```(json)?", "", response).strip()
    
    try:
        structured_output = json.loads(cleaned)
    except json.JSONDecodeError:
        print("Model returned invalid JSON. Raw output:")
        print(response)
        structured_output = None
    return structured_output

In [52]:
print(extract_json(response))

{'Name': 'AYESHA SALEEM', 'Email': 'ayeshasaleem853@gmail.com', 'Phone': '+92 318 6706803', 'LinkedIn': 'No idea', 'Skills': ['C++', 'Python', 'NumPy', 'Pandas', 'Matplotlib', 'Seaborn', 'Scikit-learn', 'Plotly', 'TensorFlow', 'PyTorch', 'Keras', 'OpenCV', 'Hugging face', 'Streamlit', 'Gradio', 'Git', 'Github', 'VS Code', 'GoogleCollab', 'Anaconda', 'Jupyter Notebook', 'Canva', 'NLP', 'Sentence-BERT', 'Cosine Similarity', 'Logistic Regression', 'Random Forest', 'EDA', 'XGBoost Regressor', 'LangChain', 'ChromaDB', 'APIs'], 'Education': ['Emerson University Multan\nBachelor of Science in Computer Science\nCGPA: 3.86/4.0'], 'Experience': ['C++ Programming Intern\nCodeAlpha\nDeveloped hands-on projects using C++ programming language. \nGained experience in Object-Oriented Programming (OOP) and Data Structures.\nContributed to real-world applications by writing clean and efficient code.', 'Kaggle Professional\nData Science Community Expert\nPublished datasets and notebooks, contributing to 

In [53]:
print(extract_json(response)['Skills'])

['C++', 'Python', 'NumPy', 'Pandas', 'Matplotlib', 'Seaborn', 'Scikit-learn', 'Plotly', 'TensorFlow', 'PyTorch', 'Keras', 'OpenCV', 'Hugging face', 'Streamlit', 'Gradio', 'Git', 'Github', 'VS Code', 'GoogleCollab', 'Anaconda', 'Jupyter Notebook', 'Canva', 'NLP', 'Sentence-BERT', 'Cosine Similarity', 'Logistic Regression', 'Random Forest', 'EDA', 'XGBoost Regressor', 'LangChain', 'ChromaDB', 'APIs']
