In [None]:
# Install the requirements

In [None]:
pip install openai pyPDF2 python-docx

In [None]:
import PyPDF2, openai, json
from docx import Document

In [None]:
# Prefer the OPENAI_API_KEY environment variable so a real secret never has to
# be pasted into (and saved with) the notebook. The original placeholder string
# is kept as the fallback, so replacing it by hand still works as before.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", 'OPENAI API KEY')

In [None]:
# Name of the chat model; passed as `model=` to the chat-completion calls in
# the parsing functions of this notebook.
MODEL = "gpt-4o-mini"

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        A single string with the text of all pages in order, each page
        followed by a newline. A page that yields no text contributes only
        that newline.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        # Extract while the file is still open. `or ""` keeps the
        # concatenation safe if a page yields a falsy/None result (e.g. a
        # page with no text layer) instead of raising TypeError.
        page_texts = [
            (page.extract_text() or "") + "\n"  # newline after each page
            for page in pdf_reader.pages
        ]

    # Join once rather than growing a string with `+=` page by page.
    return "".join(page_texts)

In [None]:
def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file as one string.

    Args:
        file_path: Path of the document, handed to ``Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, in
        order, separated by newlines.
    """
    document = Document(file_path)
    return '\n'.join(para.text for para in document.paragraphs)

In [None]:
# Define the prompts

In [None]:
# System prompt for resume parsing: asks the model to mask PII, generate a
# unique id, and return the resume as a JSON object. The key names in the
# bullet list and in the example must agree, and the example itself must be
# valid JSON, so that the model is given one unambiguous schema.
SYSTEM_PROMPT = """
You are a highly skilled assistant specialized in analyzing resumes for hiring purposes. Your tasks are as follows:

    1. **Mask Personal Information:** 
       - When you receive a resume text, your first task is to identify and mask all personal identifiable information (PII). This includes, but is not limited to, names, addresses, phone numbers, email addresses, and any other sensitive details that could identify the person.
       - Replace the masked information with a placeholder in the format `[MASKED]`.

    2. **Generate Unique ID:**
       - After masking the PII, generate a unique identifier (UUID) for the person. This ID should be used to reference the person in the JSON output and should be included in the `masked_info` property.

    3. **Parse Resume into JSON:**
       - Structure the resume into a JSON file with the following main keys:
         - `summary`: A brief overview of the person’s qualifications and career goals.
         - `education`: Details of the person's educational background.
         - `work_experience`: Information about the person's professional experience.
         - `academic_experience`: Any additional academic achievements or experiences.
         - `skills`: A list of the person's skills and competencies.
         - `projects`: A list of the person's projects and descriptions.
         - `certifications`: A list of the person's certifications with specific details.
         - `questions`: A list of questions and applicant answers at the end of the file. 
             The questions start in the section with the name "Questionnaire Results". Do not truncate answers. Include all of them.
         - `masked_info`: A section that includes the masked PII and the unique ID generated.

    4. **Output:**
       - Ensure that the entire resume content, including the masked information and structured sections, is output as a JSON file.

    Your JSON output should be structured like this:

    ```json
    {
      "summary": "Masked summary information.",
      "education": "Masked education details.",
      "work_experience": "Masked work experience details.",
      "academic_experience": "Masked academic experience details.",
      "skills": "skills details.",
      "projects": "projects details",
      "certifications": "certifications",
      "questions": [{
          "q": "question asked from applicant",
          "a": "applicant's answer"
      }],
      "masked_info": {
        "unique_id": "generated-unique-id",
        "original_text": "Original PII. DO NOT MASK THE DATA FOR THIS FIELD. Create a key for each of the PIIs according to the original text. For example, first name, city, etc. If any of them needs trimming or formatting do so. For example no space in the email."
      }
    }
    ```
"""

In [None]:
# System prompt for job-description parsing: asks the model to return the
# posting as a JSON object with the sections listed below. The example is kept
# as valid JSON, and the closing sentence names the key that actually carries
# the AND/OR groups (`related_experience`).
JOB_DESCRIPTION_SYSTEM_PROMPT = """
You are an assistant tasked with parsing job descriptions into a structured JSON format for further analysis. 
The job descriptions contain multiple sections such as job context, accountabilities, requirements, preferences, and behavioral competencies. These sections should be extracted and organized in the JSON output.

Here’s how the parsing should be structured:
1. **Job Title**: Title of the job. 
2. **Job Context**: Extract the general context or summary of the job.
3. **Accountabilities**: List the responsibilities and tasks associated with the role.
4. **Requirements**:
   - **Education and Experience**: List of education and experience required.
   - **Related Experience**: List required professional experience, if mentioned.
       - **AND Group**: If it is mentioned that the experience 'MUST' include the following, they should be here as a list.
       - **OR Group**: If it is not mentioned that the experience 'MUST' include the following, they should be here as a list.
   - **Skills**: Extract necessary skills, separating them into technical and soft skills when possible.
   - **Certifications**: Extract any required certifications.
5. **Preferences**:
   - List of skills, experience, and education that is not mandatory but it is mentioned that it is preferred or preference may be given to applicants with those. 
6. **Behavioral Competencies**: Extract any competencies or traits the candidate is expected to display, such as teamwork, communication, or problem-solving.
7. **Other Sections**: Include any other sections such as company values or benefits that are mentioned in the job description.

For each section, the parsed data should be outputted in a clean JSON format with relevant properties and descriptions. Example:
```json
    {
      "title": "title of the job",
      "job_context": "A brief summary of the job's purpose and overview.",
      "accountabilities": [
        "Responsibility 1",
        "Responsibility 2"
      ],
      "requirements": {
        "education_experience": [
            "education/experience_1",
            "education/experience_2"
        ],
        "related_experience": {
          "AND": [
            "Requirement 1",
            "Requirement 2"
          ],
          "OR": [
            "Requirement 3",
            "Requirement 4"
          ]
        },
        "skills": {
          "technical": ["Skill 1", "Skill 2"],
          "soft": ["Skill 3", "Skill 4"]
        },
        "certifications": ["Certification 1"]
      },
      "preferences": {
        "education": ["Preferred qualifications."],
        "experience": ["Preferred experience"]
      },
      "behavioral_competencies": [
        "Competency 1",
        "Competency 2"
      ],
      "other_sections": {
        "values": "Company values or benefits."
      }
    }
```
Ensure that all relevant sections are parsed accurately and that the AND/OR logic is respected in the `related_experience` section.
"""

In [None]:
def mask_pii_and_parse_resume_to_json(text):
    """Ask the chat model to mask PII in a resume and structure it as JSON.

    Args:
        text: Resume text, sent as the user message.

    Returns:
        The message content of the first choice in the API response. The
        request asks for ``json_object`` output, so this is expected to be
        a JSON string that the caller decodes.
    """
    api = openai.OpenAI(api_key=openai.api_key)
    request = {
        "model": MODEL,
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ],
    }
    completion = api.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def parse_job_description_to_json(text):
    """Ask the chat model to structure a job description as JSON.

    Args:
        text: Job description text, sent as the user message.

    Returns:
        The message content of the first choice in the API response. The
        request asks for ``json_object`` output, so this is expected to be
        a JSON string that the caller decodes.
    """
    api = openai.OpenAI(api_key=openai.api_key)
    request = {
        "model": MODEL,
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": JOB_DESCRIPTION_SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ],
    }
    completion = api.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Read the resume PDF into plain text.
# NOTE(review): 'RESUME_PDF_PATH' looks like a placeholder to be replaced
# with a real file path — confirm before running.
resume_text = extract_text_from_pdf('RESUME_PDF_PATH')

In [None]:
# Display the extracted text for inspection.
resume_text

In [None]:
# Send the resume text to the model; the result is the raw response string.
resume_response = mask_pii_and_parse_resume_to_json(resume_text)

In [None]:
# Decode the response string into Python objects.
parsed_resume_response = json.loads(resume_response)

In [None]:
# Display the decoded resume structure.
parsed_resume_response

In [None]:
# Read the job description .docx into plain text.
# NOTE(review): 'JOB_DESCRIPTION_DOCX' looks like a placeholder path — confirm.
job_description_text = extract_text_from_docx('JOB_DESCRIPTION_DOCX')

In [None]:
# Send the job description text to the model; the result is the raw response string.
job_description_response = parse_job_description_to_json(job_description_text)

In [None]:
# Decode the response string into Python objects.
job_description_response_parsed = json.loads(job_description_response)

In [None]:
# Display the decoded job description structure.
job_description_response_parsed