In [1]:
import json
import os

import docx
import PyPDF2
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq

In [2]:
# Path of the sample PDF resume that is passed to read_pdf_text below.
resume_path = "Sample-Resume.pdf"

In [3]:
def read_pdf_text(resume_path):
    """
    Extracts text from a PDF file.

    Each page's text is stripped of surrounding whitespace and the non-empty
    pages are joined with a newline, so the last word of one page does not
    run into the first word of the next.

    Args:
        resume_path (file_path): The PDF file to be read.

    Returns:
        str: The extracted text from the PDF ("" if no page yields text).
    """
    pdf_reader = PyPDF2.PdfReader(resume_path)
    # `or ""` guards against a falsy extraction result (e.g. a page with no
    # text layer) so that .strip() never runs on a non-string.
    page_texts = [(page.extract_text() or "").strip() for page in pdf_reader.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

In [4]:
# Try the PDF reader on the sample resume and print the extracted text.
data = read_pdf_text(resume_path=resume_path)
print(data)

9868949825    Sanjay Kumar  
1 
  
 
 
AREAS OF EXPERTISE  
Client Relations Management  
Employee Relations  
Record Keeping  
Contract document generation  
Creative Problem Solver  
Business Administration 
Note taking  
Answering  queries  
PROFESSIONAL  
MS-Office  
Photoshop  
 Internet  Savvy  
PERSONAL SKILLS  
Decision making  
Fast Learning  
Hard Working  
Leadership  
 Attention to detail  
Co-ordination  
Self- Motivated  
PERSONAL DETAILS  
Sanjay Kumar  
H. No. - D-2 
M.R. Dayalpur  
P.O. Gokul Pur  
Delhi - 110094  
M: 9868  9498  25 
E: sanjay.kr114@gmail.com  
 
 
 Sanjay Kumar  
HR Executive  
5 years of experience including 3+ years’ professional experience as an  HR Executive  in 
handling human resource projects . 
 
PERSONAL SUMMARY  
I am a  competent and organized  individual who is able to work as part of a team  or 
individual  and manage several priorities . I have  a positive attitude, strong work 
ethic, and a keen desire to learn and grow within a firm. I

In [5]:
def read_docx_text(word_file_path):
    """
    Reads a DOCX file and returns its paragraph text.

    Every paragraph is stripped of surrounding whitespace and terminated
    with a newline, so empty paragraphs show up as blank lines.

    Args:
        word_file_path (path for word file): The DOCX file to be read.

    Returns:
        str: The paragraph text of the document, one paragraph per line.
    """
    document = docx.Document(word_file_path)
    stripped_paragraphs = [para.text.strip() for para in document.paragraphs]
    return "".join(line + "\n" for line in stripped_paragraphs)

In [6]:
# Try the DOCX reader on a sample Word resume and print the extracted text.
data = read_docx_text("Sample-Resume-Word.docx")
print(data)

SAMPLE RESUME #1 – Optional format with no objective

Your Name
Street Address • City, State, Zip • Telephone number • E-mail



EDUCATION

University of California, Santa Cruz 	Santa Cruz, CA Master of Science in Applied Economics and Finance 	Expected June 2017
Current GPA
	List any honors or awards
	Thesis or special project title can be listed here

Related Course Work (Add left tabs at 4 1/4 and 4 1/2)
•   Course Name 	•   Course Name
•   Course Name 	•   Course Name

List Undergraduate College or University 	City, State
Degree 	Date Received
•   Related awards or honors can be mentioned here

RELATED EXPERIENCE

Name of Company 	City, State
Title 	Dates
•   Information about what you did and accomplished
•   Start each phrase with action words
•   If job is current use present tense -  If job is over use past tense

Name of Company (Don’t forget academic experience) 	City, State
Title 	Dates
•   What you did for company or client
•   More information about what you did
Prior 

In [7]:
def extract_resume_text(resume_path):
    """
    Extracts text from a resume, choosing the reader by file extension.

    The extension is compared case-insensitively, so "CV.PDF" and "cv.pdf"
    are handled the same way.

    Args:
        resume_path (str): Path to a .pdf or .docx resume.

    Returns:
        str: The text returned by read_pdf_text or read_docx_text.

    Raises:
        ValueError: If the extension is neither .pdf nor .docx.
    """
    # splitext only looks at the final path component, so dots in directory
    # names cannot be mistaken for the file extension.
    file_type = os.path.splitext(resume_path)[1].lower()
    if file_type == ".pdf":
        return read_pdf_text(resume_path)
    if file_type == ".docx":
        return read_docx_text(resume_path)
    # Fail loudly instead of returning None, which would otherwise be
    # rendered into the prompt as the literal text "None".
    raise ValueError(
        f"Unsupported resume file type {file_type!r} for {resume_path!r}; "
        "expected a .pdf or .docx file."
    )

In [8]:
# Reuse the path configured near the top of the notebook rather than repeating
# the file name, so the sample resume is chosen in exactly one place.
resume_text = extract_resume_text(resume_path)

In [9]:
# Groq chat model used for resume parsing.
# - temperature=0 minimises sampling randomness so repeated extractions of the
#   same resume stay consistent.
# - response_format "json_object" asks the API for a JSON object response.
llm = ChatGroq(
    temperature=0,
    model_name="llama3-70b-8192",
    # Read the key from the environment so no credential is stored in the
    # notebook; a missing GROQ_API_KEY raises KeyError here.
    api_key=os.environ["GROQ_API_KEY"],
    model_kwargs={"response_format": {"type": "json_object"}},
)

In [10]:
# Example of the JSON object the model should return; each value describes what
# belongs in that field. The text is kept as valid JSON (no trailing commas) so
# the example itself can be decoded with json.loads.
format_instruction = """
{
  "name": "Candidate's full name",
  "email": "Candidate's email address",
  "phone_number": "Candidate's phone number",
  "location": {
    "address": "Candidate's full address",
    "city": "Candidate's city",
    "country": "Candidate's country"
  },
  "linkedin_profile": "URL to LinkedIn profile",
  "github_profile": "URL to GitHub profile (if applicable)",
  "portfolio_website": "URL to personal portfolio or website (if applicable)",
  "career_objective": "Candidate's career objective or summary",
  "total_experience": "Total years of work experience",
  "relevant_experience": "Years of relevant experience",
  "current_job_title": "Candidate's current job title",
  "current_company": "Candidate's current company",
  "previous_job_titles": [
    "List of previous job titles"
  ],
  "previous_companies": [
    "List of previous companies"
  ],
  "skills": {
    "technical_skills": [
      "List of technical skills"
    ],
    "soft_skills": [
      "List of soft skills"
    ]
  },
  "education": [
    {
      "degree": "Degree obtained",
      "institution": "Institution name",
      "year_of_passing": "Year of passing",
      "division": "Division/Grade/CGPA"
    }
  ],
  "certifications": "List of certifications (if any)",
  "projects": [
    {
      "project_name": "Name of the project",
      "description": "Brief description of the project",
      "technologies_used": "List of technologies/tools used",
      "role": "Role in the project"
    }
  ],
  "achievements": "List of major achievements (if any)",
  "publications": "List of publications (if applicable)",
  "languages": [
    "List of languages"
  ]
}
"""

In [11]:
# Prompt skeleton: {format_instruction} receives the JSON example and
# {resume_text} receives the extracted resume text when the chain is invoked.
prompt_template = """
You are tasked with extracting data from a resume and returning a JSON structure.
{format_instruction}
Resume Text:
{resume_text}
"""

In [12]:
# Build the prompt object from the template string; the two placeholders
# ({format_instruction} and {resume_text}) become its input variables.
prompt = PromptTemplate.from_template(prompt_template)

In [13]:
# Compose the chain: the filled-in prompt is piped into the Groq chat model.
llm_resume_parser = prompt | llm

In [14]:
# Run the chain on the first resume; the reply's text is read from `.content`.
parsed_candidate_data = llm_resume_parser.invoke(
    {
        "format_instruction": format_instruction,
        "resume_text": resume_text,
    }
)

In [15]:
# Show the raw response text returned for the first resume.
print(parsed_candidate_data.content)

{
  "name": "Sanjay Kumar",
  "email": "sanjay.kr114@gmail.com",
  "phone_number": "9868949825",
  "location": {
    "address": "H. No. - D-2, M.R. Dayalpur, P.O. Gokul Pur, Delhi - 110094",
    "city": "Delhi",
    "country": "India"
  },
  "linkedin_profile": "",
  "github_profile": "",
  "portfolio_website": "",
  "career_objective": "I am a competent and organized individual who is able to work as part of a team or individual and manage several priorities. I have a positive attitude, strong work ethic, and a keen desire to learn and grow within a firm.",
  "total_experience": 5,
  "relevant_experience": 3,
  "current_job_title": "HR Executive",
  "current_company": "Sirius Global Limited",
  "previous_job_titles": [
    "HR-ADMIN EXECUTIVE",
    "HR-ADMIN EXECUTIVE/ OFFICE COORDINATOR",
    "LOWER DIVISIONAL CLERK",
    "STORE IN-CHARGE",
    "FINANCIAL ADVISOR"
  ],
  "previous_companies": [
    "Lion Manpower Solutions Private Limited",
    "1 Delhi Air Sqn. (Flg.)",
    "Prakash

In [16]:
# Extract text from a second resume to try the parser on different input.
# NOTE(review): the recorded run shows an "incorrect startxref pointer" message
# for this file, yet text was still extracted (see the printed output below).
new_resume = extract_resume_text("rishabh_Maheshwari_resume_2024.pdf")

incorrect startxref pointer(1)


In [17]:
# Inspect the text extracted from the second resume.
print(new_resume)

RISHABH MAHESHWARI                                                                   Email: rishabh21071993@gmail.com  
 linkedin.com/in/rishabh -maheshwari -750a76181                                 Contact No: +91 -9643879864                                                                    
  
Professional Summary:  
Dedicated and results -oriented Data Engineer with over 8 + years of experience in IT industry. Proficient 
in managing data engineering projects across diverse domains including Automotive, Banking, Financial 
Services, and HealthCare  sectors. Skilled in designing, developing, and implementing robust data 
solutions to support business objectives. Proven track record of success in optimizing data infrastructure 
and driving efficiency through innovative technologies.  
 
WORK EXPERIENCE:  
EPAM SYSTEMS (May -2021 – Current ): Senior Data Engineer  
• Contributed to the development of data -driven solutions for the pharmaceutical industry, 
specializing in healthcare 

In [18]:
# Run the same chain on the second resume's text.
new = llm_resume_parser.invoke(
    {
        "format_instruction": format_instruction,
        "resume_text": new_resume,
    }
)

In [19]:
# Show the raw response text returned for the second resume.
print(new.content)

{
  "name": "Rishabh Maheshwari",
  "email": "rishabh21071993@gmail.com",
  "phone_number": "+91-9643879864",
  "location": {
    "address": "",
    "city": "",
    "country": ""
  },
  "linkedin_profile": "linkedin.com/in/rishabh-maheshwari-750a76181",
  "github_profile": "",
  "portfolio_website": "",
  "career_objective": "Dedicated and results-oriented Data Engineer",
  "total_experience": "8+ years",
  "relevant_experience": "8+ years",
  "current_job_title": "Senior Data Engineer",
  "current_company": "EPAM SYSTEMS",
  "previous_job_titles": [
    "Data Engineer",
    "Data Engineer"
  ],
  "previous_companies": [
    "Cognizant Technology Solutions",
    "Tata Consultancy Services"
  ],
  "skills": {
    "technical_skills": [
      "Python",
      "SQL",
      "Scala",
      "Hadoop",
      "Spark",
      "Kafka",
      "Hive",
      "Sqoop",
      "Airflow",
      "Azure",
      "Snowflake",
      "MySQL",
      "PostgreSQL",
      "Power BI"
    ],
    "soft_skills": [
      

In [20]:
# Check the type of the response payload before decoding it as JSON.
print(type(new.content))

<class 'str'>


In [21]:
# Decode both raw JSON responses into Python objects. `json` is imported in the
# notebook's import cell at the top rather than here.
candidate_data_1 = json.loads(parsed_candidate_data.content)
candidate_data_2 = json.loads(new.content)

In [25]:
# Check the types of the two decoded payloads.
print(type(candidate_data_1), type(candidate_data_2))

<class 'dict'> <class 'dict'>


In [22]:
# Display the decoded fields for the first resume.
candidate_data_1

{'name': 'Sanjay Kumar',
 'email': 'sanjay.kr114@gmail.com',
 'phone_number': '9868949825',
 'location': {'address': 'H. No. - D-2, M.R. Dayalpur, P.O. Gokul Pur, Delhi - 110094',
  'city': 'Delhi',
  'country': 'India'},
 'linkedin_profile': '',
 'github_profile': '',
 'portfolio_website': '',
 'career_objective': 'I am a competent and organized individual who is able to work as part of a team or individual and manage several priorities. I have a positive attitude, strong work ethic, and a keen desire to learn and grow within a firm.',
 'total_experience': 5,
 'relevant_experience': 3,
 'current_job_title': 'HR Executive',
 'current_company': 'Sirius Global Limited',
 'previous_job_titles': ['HR-ADMIN EXECUTIVE',
  'HR-ADMIN EXECUTIVE/ OFFICE COORDINATOR',
  'LOWER DIVISIONAL CLERK',
  'STORE IN-CHARGE',
  'FINANCIAL ADVISOR'],
 'previous_companies': ['Lion Manpower Solutions Private Limited',
  '1 Delhi Air Sqn. (Flg.)',
  'Prakash Switchgear',
  'Met Life India Insurance Co. Limited

In [23]:
# Display the decoded fields for the second resume.
candidate_data_2

{'name': 'Rishabh Maheshwari',
 'email': 'rishabh21071993@gmail.com',
 'phone_number': '+91-9643879864',
 'location': {'address': '', 'city': '', 'country': ''},
 'linkedin_profile': 'linkedin.com/in/rishabh-maheshwari-750a76181',
 'github_profile': '',
 'portfolio_website': '',
 'career_objective': 'Dedicated and results-oriented Data Engineer',
 'total_experience': '8+ years',
 'relevant_experience': '8+ years',
 'current_job_title': 'Senior Data Engineer',
 'current_company': 'EPAM SYSTEMS',
 'previous_job_titles': ['Data Engineer', 'Data Engineer'],
 'previous_companies': ['Cognizant Technology Solutions',
  'Tata Consultancy Services'],
 'skills': {'technical_skills': ['Python',
   'SQL',
   'Scala',
   'Hadoop',
   'Spark',
   'Kafka',
   'Hive',
   'Sqoop',
   'Airflow',
   'Azure',
   'Snowflake',
   'MySQL',
   'PostgreSQL',
   'Power BI'],
  'soft_skills': ['Excellent Problem-Solving and Analytical Skills']},
 'education': [{'degree': 'B.Tech',
   'institution': 'Meerut Insti