### Installations:

In [85]:
!pip install openai pandas langdetect PyPDF2



### Imports:

In [1]:
import openai
from langdetect import detect, DetectorFactory
import os
from PyPDF2 import PdfReader
import pandas as pd

### Set the environment variable for the OpenAI client:

In [2]:
# SECURITY: never store an API key in the notebook. Any key that was ever committed
# here must be treated as leaked and revoked.
# The key is read from the environment; it is prompted for (without echo) only if missing.
import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

env: OPENAI_API_KEY=<redacted>


### Create basic variables and paths, and set up the OpenAI client

In [3]:
# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# NOTE(review): absolute, machine-specific Windows paths - adjust before running elsewhere.
directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"
# Chunking parameters handed to process_pdfs (chunk size is measured in characters).
max_chunk_size = 1000
overlap_size = 50

# Fix langdetect's seed so that repeated detections of the same text give the same result
DetectorFactory.seed = 0

In [4]:
# Read a PDF and return the text of all its pages
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped; the text of every other page is followed by a newline.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
    return "".join(page_text + "\n" for page_text in page_texts if page_text)

In [5]:
def split_text(text, max_chunk_size, overlap_size=50):
    """Split ``text`` into chunks of whole words, each at most ``max_chunk_size`` characters.

    Words are separated by any whitespace and re-joined with single spaces;
    every chunk keeps a trailing space. A single word longer than
    ``max_chunk_size`` becomes a chunk of its own (it is never cut).

    Args:
        text: The text to split.
        max_chunk_size: Maximum chunk length in characters, trailing space included.
        overlap_size: Accepted for interface compatibility only; this
            implementation produces non-overlapping chunks.

    Returns:
        A list of non-empty chunk strings; ``[]`` when ``text`` contains no words.
    """
    chunks = []
    current_chunk = ""
    for word in text.split():
        if len(current_chunk) + len(word) + 1 <= max_chunk_size:
            current_chunk += word + " "
            continue
        # The word does not fit. Flush what has been collected - but never an empty
        # chunk, which used to be emitted when the very first word was too long.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = word + " "
    if current_chunk:
        chunks.append(current_chunk)  # Add the last chunk
    return chunks

In [6]:
# Translate a piece of text through OpenAI's completions API (openai>=1.0.0 client)
def translate_text(client, text, target_language="en"):
    """Ask the model to translate ``text`` into ``target_language``.

    Args:
        client: Object exposing ``completions.create(...)``.
        text: The text to translate.
        target_language: Language named in the prompt (default ``"en"``).

    Returns:
        The text of the first completion choice, stripped of surrounding whitespace.
    """
    translation_prompt = f"Translate the following text to {target_language}:\n\n{text}"
    completion = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=translation_prompt,
        max_tokens=500,  # upper bound on the length of the completion
    )
    first_choice = completion.choices[0]
    return first_choice.text.strip()

In [7]:

def process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client):
    """Translate every non-English PDF resume in ``directory_path`` into English.

    For each ``*.pdf`` file (extension matched case-insensitively) the text is
    extracted and its language detected. Non-English documents are split into
    chunks, each non-English chunk is translated, and the result is written to
    ``translated_<name>.txt`` in ``translated_output_directory``. Two index
    files, ``translated_files_list.txt`` and ``english_files_list.txt``, are
    written to the same directory.

    Args:
        directory_path: Directory containing the source PDFs.
        max_chunk_size: Maximum chunk size passed to ``split_text``.
        overlap_size: Overlap passed to ``split_text``.
        translated_output_directory: Output directory; created if missing.
        client: OpenAI client handed to ``translate_text``.
    """
    # exist_ok avoids the race between a separate existence check and the creation.
    os.makedirs(translated_output_directory, exist_ok=True)

    translated_files_list = []
    english_files_list = []

    for filename in os.listdir(directory_path):
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(directory_path, filename)
        text = extract_text_from_pdf(pdf_path)

        if not text.strip():
            print(f"Document {filename} is empty or contains very little text.")
            continue

        if detect(text) == 'en':
            english_files_list.append(filename)
            continue

        translated_parts = []
        for chunk in split_text(text, max_chunk_size, overlap_size):
            # Chunks that are already English inside a mixed document are kept verbatim.
            if detect(chunk) != 'en':
                chunk = translate_text(client, chunk, target_language="en")
            translated_parts.append(chunk + " ")
        translated_text = "".join(translated_parts)

        # splitext swaps only the real extension and works for any capitalisation
        # ('.PDF', '.Pdf'), unlike a case-sensitive str.replace('.pdf', '.txt').
        stem = os.path.splitext(filename)[0]
        translated_filename = f"translated_{stem}.txt"
        translated_path = os.path.join(translated_output_directory, translated_filename)
        save_text_to_file(translated_text, translated_path)

        translated_files_list.append(translated_filename)

    # Save the list of translated files to a text file for reference
    save_file_list(translated_files_list, translated_output_directory, 'translated_files_list.txt')
    # Save the list of English files as well
    save_file_list(english_files_list, translated_output_directory, 'english_files_list.txt')

def save_file_list(file_list, directory, filename):
    """Write every entry of ``file_list`` on its own line to ``directory/filename`` (UTF-8).

    An existing file is overwritten; an empty list produces an empty file.
    """
    target_path = os.path.join(directory, filename)
    with open(target_path, 'w', encoding='utf-8') as handle:
        handle.writelines(f"{entry}\n" for entry in file_list)

# Persist a string to disk as UTF-8
def save_text_to_file(text, file_path):
    """Write ``text`` to ``file_path`` using UTF-8, replacing any existing content."""
    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.write(text)

In [100]:
# Run the translation pass over the resume dataset; non-English chunks are sent to the OpenAI API.
process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client)

Document 12632728.pdf is empty or contains very little text.


### Create named entities to look for in resumes

In [8]:
# Field names requested from the LLM; process_resume also uses them as the keys (columns) of each report row.
entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

In [181]:
import re

# NOTE(review): rebinds the notebook-level chunking globals set in the configuration cell;
# process_resume reads these globals when it calls extract_entities_with_llm.
max_chunk_size = 3500  
overlap_size = 50      


def extract_text_from_txt(file_path):
    """Read a text file, trying several encodings in turn.

    Order matters: UTF-8 is strict, cp1252 rejects only a handful of undefined
    bytes, and latin1 accepts every byte sequence, so latin1 must be tried
    last. With latin1 ahead of cp1252 the cp1252 attempt could never run, and
    Windows-1252 text (curly quotes, dashes) was decoded into control characters.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content.

    Raises:
        ValueError: If no encoding can decode the file. This cannot happen
            while latin1 is in the list; it is kept as a safety net.
    """
    encodings = ['utf-8', 'cp1252', 'latin1']  # strictest first, catch-all last
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    raise ValueError(f"Cannot decode file {file_path} with any of the provided encodings.")

def split_into_chunks(text, max_chunk_size, overlap_size):
    """Split ``text`` into chunks of whole words with a word overlap between neighbours.

    Each chunk is a run of whitespace-separated words joined by single spaces
    and is at most ``max_chunk_size`` characters long, unless a single word is
    longer than that (words are never cut). Each chunk after the first starts
    with up to ``overlap_size`` trailing words of the previous chunk; the
    overlap is shortened when it would not leave room for the next word.

    The earlier implementation removed the last ``overlap_size`` words from a
    chunk instead of repeating them, and with ``overlap_size=0`` it never reset
    the buffer, so it emitted an ever-growing chunk for every further word.

    Args:
        text: The text to split.
        max_chunk_size: Maximum chunk length in characters.
        overlap_size: Number of words repeated at the start of the next chunk.

    Returns:
        A list of non-empty chunk strings; ``[]`` when ``text`` contains no words.
    """
    chunks = []
    current_chunk = []   # words of the chunk being built
    current_len = 0      # len(' '.join(current_chunk)), maintained incrementally

    for word in text.split():
        extra = len(word) + (1 if current_chunk else 0)
        if current_chunk and current_len + extra > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            # Carry over at most len - 1 words so every chunk ends on a later word
            # than the previous one (guarantees progress).
            keep = min(max(overlap_size, 0), len(current_chunk) - 1)
            tail = current_chunk[len(current_chunk) - keep:]
            # Shrink the overlap if it would leave no room for the incoming word.
            while tail and len(' '.join(tail)) + 1 + len(word) > max_chunk_size:
                tail = tail[1:]
            current_chunk = tail
            current_len = len(' '.join(current_chunk))
            extra = len(word) + (1 if current_chunk else 0)
        current_chunk.append(word)
        current_len += extra

    # Add the last chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
    
def extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size):
    """Ask the model to extract ``entities`` from ``text``, one request per chunk.

    The text is split with ``split_into_chunks``; each chunk is sent with an
    instruction listing the entity names. The stripped answers are returned
    as one string, each followed by a newline.

    Args:
        client: Object exposing ``completions.create(...)``.
        text: Resume text to analyse.
        entities: Names of the fields to extract.
        max_chunk_size: Chunk size passed to ``split_into_chunks``.
        overlap_size: Overlap passed to ``split_into_chunks``.

    Returns:
        The concatenated model answers (an empty string when there are no chunks).
    """
    answers = []

    for piece in split_into_chunks(text, max_chunk_size, overlap_size):
        # Years of experience are requested as a decimal number (months as a fraction of a year).
        prompt = "".join([
            "Extract the following entities from this text, calculating years of experience as a decimal number where months are converted to a fractional year without any additional info: ",
            ", ".join(entities),
            ".\n\n",
            piece,
        ])

        # Whitespace-separated word count, used as a rough stand-in for the token count.
        word_count = len(prompt.split())
        # Room left in a 4097-token window, capped at 300 tokens.
        completion_budget = min(4097 - word_count, 300)

        completion = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=completion_budget,
        )
        answers.append(completion.choices[0].text.strip() + "\n")

    return "".join(answers)

def process_resume(directory, filename, client, entities, data_list, is_txt=False):
    """Extract the requested entities from one resume and append them to ``data_list``.

    Args:
        directory: Directory containing the file.
        filename: Name of the resume file.
        client: OpenAI client passed to ``extract_entities_with_llm``.
        entities: Names of the fields to look for in the model's answer.
        data_list: List that receives one dict per processed resume, with a
            ``'Filename'`` key plus one key per entity (``None`` when not found).
        is_txt: Read the file as plain text instead of as a PDF.

    Files whose extracted text is empty or whitespace-only are skipped.
    """
    file_path = os.path.join(directory, filename)
    text = extract_text_from_txt(file_path) if is_txt else extract_text_from_pdf(file_path)

    if not text.strip():
        return

    # max_chunk_size and overlap_size are the notebook-level globals.
    extracted_info = extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size)
    info_dict = {'Filename': filename}

    for entity in entities:
        # re.escape keeps entity names containing regex metacharacters
        # (e.g. "C++ skills") from breaking or altering the pattern.
        pattern = re.compile(rf"{re.escape(entity)}\s*:\s*(.*)", re.IGNORECASE)
        match = pattern.search(extracted_info)
        info_dict[entity] = match.group(1).strip() if match else None

    data_list.append(info_dict)


def create_entities_report(directory_path, translated_output_directory, client, entities):
    """Build ``resume_entities_report.xlsx`` from the original and the translated resumes.

    Every ``*.pdf`` in ``directory_path`` and every ``translated_*.txt`` in
    ``translated_output_directory`` is passed to ``process_resume``; the
    collected rows are written to an Excel file in ``translated_output_directory``.

    Args:
        directory_path: Directory with the original PDF resumes.
        translated_output_directory: Directory with the translated ``.txt``
            resumes; also the destination of the report.
        client: OpenAI client passed to ``process_resume``.
        entities: Names of the fields to extract.
    """
    # process_pdfs writes this index file into the same directory; it matches the
    # 'translated_*.txt' pattern but is not a resume, so it must be excluded.
    non_resume_files = {'translated_files_list.txt'}

    data = []
    # Process PDFs in the original directory
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            process_resume(directory_path, filename, client, entities, data)

    # Process translated resumes in the output directory
    for filename in os.listdir(translated_output_directory):
        lowered = filename.lower()
        if lowered in non_resume_files:
            continue
        if lowered.startswith('translated_') and lowered.endswith('.txt'):
            process_resume(translated_output_directory, filename, client, entities, data, is_txt=True)

    df = pd.DataFrame(data)
    df.to_excel(os.path.join(translated_output_directory, 'resume_entities_report.xlsx'), index=False)

# Build the entity report; this sends one or more completion requests per resume.
create_entities_report(directory_path, translated_output_directory, client, entities)


### DataFrame of the named entities extracted from the CVs

In [9]:
import os
# Load the report written by create_entities_report and index it by file name.
file_path = os.path.join(translated_output_directory, 'resume_entities_report.xlsx')
df = pd.read_excel(file_path)
df.set_index('Filename', inplace=True)
df.sort_index(inplace=True)
# Display the first ten rows
df.head(10)

Unnamed: 0_level_0,job title,years of experience,highest level of education,language skills,key skills
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10276858.pdf,Food Prep Chef,5+ years,None mentioned,,"Cooking, food preparation, sanitation, kitchen..."
10329506.pdf,"Registered Client Service Associate, Relations...",10.5 years,,,"Microsoft Word for Windows, Excel, Power Point..."
10344379.pdf,- Administrative support professional,- 5.8 years as an Administrative support profe...,Not specified.,Not specified.,- Organizational skills
10395944.pdf,Line Service Technician,3 years and 3 months,Associate's degree,English,"Great People Skills, Microsoft Office, Fueling..."
10428916.pdf,"Recreation & Sports Coordinator, Senior Health...","Recreation & Sports Coordinator (4.5 years), S...",ACSM Exercise Physiologist,,"Program development and implementation, custom..."
10466583.pdf,Floral designer,9 years and 9 months (as of July 2020),Medical Assistant certificate from Northwester...,None mentioned,"Customer service, inventory control, employee ..."
10527994.pdf,Substitute Teacher,4.75 years,Bachelor of Arts,English,"Outlook, Excel, Word, PowerPoint, QuickBooks, ..."
10554236.pdf,Accountant,11 years,Bachelor's degree,None mentioned,"Financial planning, reporting, analysis, accou..."
10603337.pdf,Sales Associate,4.5 years,Associates degree,None mentioned in text,"Customer service, sales, inventory management,..."
10641230.pdf,IT Manager/Network Administrator,8.5 years,Some college/Associate's degree,Proficient in English,"Hardware and software troubleshooting, network..."


In [10]:
# Reload the report (file_path comes from the previous cell) and inspect the raw
# 'years of experience' answers, which are free text rather than numbers.
df = pd.read_excel(file_path)
df.set_index('Filename', inplace=True)
df.sort_index(inplace=True)
years_of_experience = df['years of experience']
# NOTE(review): 'index' is not used again in the visible part of the notebook.
index = df.index
years_of_experience.head(30)

Filename
10276858.pdf                                                        5+ years
10329506.pdf                                                      10.5 years
10344379.pdf               - 5.8 years as an Administrative support profe...
10395944.pdf                                            3 years and 3 months
10428916.pdf               Recreation & Sports Coordinator (4.5 years), S...
10466583.pdf                          9 years and 9 months (as of July 2020)
10527994.pdf                                                      4.75 years
10554236.pdf                                                        11 years
10603337.pdf                                                       4.5 years
10641230.pdf                                                       8.5 years
10724818.pdf                                                       3.5 years
10816645.pdf                                 05/2012 to 10/2015 = 3.33 years
10818478.pdf                                           3 years and 

In [184]:
import re
import os

# Destination of the report extended with a numeric years-of-experience column.
new_file_name = 'updated_years_resume_entities_report.xlsx'
new_file_path = os.path.join(translated_output_directory, new_file_name)
# Writes an empty workbook to the destination; the same path is written again
# with the full DataFrame at the end of this cell.
pd.DataFrame().to_excel(new_file_path)

def calculate_years_of_experience(client, text_descriptions):
    """Convert free-text experience descriptions into numbers of years via the LLM.

    Args:
        client: Object exposing ``completions.create(...)``.
        text_descriptions: Iterable of descriptions such as ``"3 years and 3 months"``.

    Returns:
        A list of floats, one per description, in the same order. ``NaN`` marks
        entries that need manual review: missing descriptions, answers without
        a number, or answers that could not be parsed.
    """
    # Matches integers and decimals, e.g. "3" or "3.5".
    number_pattern = re.compile(r'\b\d+\.?\d*\b')
    # Missing DataFrame cells become these strings after ``astype(str)``. Sending
    # them to the model makes it invent a number, so they are answered with NaN
    # without an API call.
    missing_markers = {'', 'nan', 'none'}

    numeric_experience_list = []

    for text in text_descriptions:
        if text is None or str(text).strip().lower() in missing_markers:
            numeric_experience_list.append(float('nan'))
            continue

        prompt = f"Convert the following description of work experience '{text}' into a numeric value representing total years of experience."

        response = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=50,
            temperature=0.3
        )

        try:
            # Take the first number that appears in the answer.
            extracted_numbers = number_pattern.findall(response.choices[0].text.strip())
            if extracted_numbers:
                numeric_experience_list.append(float(extracted_numbers[0]))
            else:
                # No number in the answer: leave the entry for manual review.
                numeric_experience_list.append(float('nan'))
        except Exception as e:
            # Best effort: a malformed response must not abort the whole batch.
            print(f"An error occurred: {e}")
            numeric_experience_list.append(float('nan'))

    return numeric_experience_list

# Requires 'client' and the report DataFrame 'df' from the earlier cells.
# astype(str) turns missing cells into the literal string 'nan'.
text_descriptions = df['years of experience'].astype(str).tolist()
numeric_years_of_experience = calculate_years_of_experience(client, text_descriptions)

# Add the numeric years of experience back to the DataFrame
df['numeric_years_of_experience'] = numeric_years_of_experience
df.to_excel(new_file_path)

# Output confirmation
print(f"The updated DataFrame has been saved to {new_file_path}.")

The updated DataFrame has been saved to C:\Users\apleczkan\PycharmProjects\task1-cv-resumes\resumes_translated\updated_years_resume_entities_report.xlsx.


### CV Summarization

In [104]:
import openai

def summarize_text(client, text, max_chunk_size=3000, overlap_size=50):
    """Summarise a resume with the completions model, one request per chunk.

    The text is split with ``split_text``. Each chunk is summarised separately
    and the stripped summaries are concatenated, each followed by a newline.
    A chunk whose request fails is reported on stdout and left out.

    Args:
        client: Object exposing ``completions.create(...)``.
        text: Resume text to summarise.
        max_chunk_size: Chunk size passed to ``split_text``.
        overlap_size: Overlap passed to ``split_text``.

    Returns:
        The concatenated chunk summaries (an empty string if none succeeded).
    """
    instruction = (
        "Please summarize the following resume into a short paragraph that includes "
        "the job title, years of experience, highest level of education, language skills, "
        "and key skills:\n\n"
    )
    partial_summaries = []

    for piece in split_text(text, max_chunk_size, overlap_size):
        prompt = instruction + piece  # one prompt per chunk, not per document

        try:
            completion = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                max_tokens=150,  # caps the length of each chunk summary
                temperature=0.5,
            )
            partial_summaries.append(completion.choices[0].text.strip() + "\n")
        except Exception as e:
            # Best effort: report the failure and continue with the next chunk.
            print(f"An error occurred: {e}")

    return "".join(partial_summaries)



In [105]:
import os
import pandas as pd

# Directory paths
# NOTE(review): repeats the machine-specific absolute paths from the configuration cell.
directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"
xlsx_file_path = os.path.join(translated_output_directory, "resume_entities_report.xlsx")

# Load the entity report and index it by file name
df = pd.read_excel(xlsx_file_path)
df.set_index('Filename', inplace=True)

# Summarize every resume listed in the report
for filename in df.index:
    # Translated resumes live in the output directory, originals in the dataset directory
    if filename.startswith('translated_'):
        resume_path = os.path.join(translated_output_directory, filename)
    else:
        resume_path = os.path.join(directory_path, filename)

    # Skip non-resume files like 'translated_files_list.txt'
    if 'translated_files_list' in filename:
        continue

    # Pick the reader from the file extension; unreadable files are reported and skipped
    if resume_path.lower().endswith('.pdf'):
        try:
            resume_text = extract_text_from_pdf(resume_path)
        except Exception as e:
            print(f"An error occurred while reading PDF file: {e}")
            continue
    elif resume_path.lower().endswith('.txt'):
        try:
            resume_text = extract_text_from_txt(resume_path)
        except Exception as e:
            print(f"An error occurred while reading text file: {e}")
            continue
    else:
        print(f"Unsupported file format for file: {resume_path}")
        continue

    # Generate the summary (summarize_text is defined in the previous cell) and
    # store it in the 'Summary' column of the row for this file
    summary = summarize_text(client, resume_text)
    df.at[filename, 'Summary'] = summary

# Show the first rows and save the DataFrame, including the 'Summary' column
print(df.head())
df.to_excel(os.path.join(translated_output_directory, 'updated_resume_summaries.xlsx'))


                                                      job title  \
Filename                                                          
10276858.pdf                                    Cook, Line Cook   
10329506.pdf  Registered Client Service Associate, Relations...   
10344379.pdf                Administrative support professional   
10395944.pdf  Line Service Technician, Ramp Agent, Team Memb...   
10428916.pdf                    Recreation & Sports Coordinator   

                                       years of experience  \
Filename                                                     
10276858.pdf                          6 years and 5 months   
10329506.pdf                          9 years (as of 2012)   
10344379.pdf                                          0.00   
10395944.pdf  0.48 years (calculated from 42 flying hours)   
10428916.pdf               5.333333 (5 years and 4 months)   

                                     highest level of education  \
Filename                    

In [109]:
df.head()

Unnamed: 0_level_0,job title,years of experience,highest level of education,language skills,key skills,Summary
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10276858.pdf,"Cook, Line Cook",6 years and 5 months,High School Diploma,,"Food preparation, kitchen management, patisser...",Experienced Food Prep Chef with 5+ years of fo...
10329506.pdf,"Registered Client Service Associate, Relations...",9 years (as of 2012),Not mentioned,None mentioned,"Client support, maintaining and developing new...",Experienced Registered Client Service Associat...
10344379.pdf,Administrative support professional,0.00,,None mentioned,"Organizational skills, technical skills, inter...",This experienced administrative support profes...
10395944.pdf,"Line Service Technician, Ramp Agent, Team Memb...",0.48 years (calculated from 42 flying hours),"High School Diploma, currently pursuing Associ...",None mentioned,"Microsoft Office, Fueling Aircraft, Airport Ra...",This resume belongs to a Line Service Technici...
10428916.pdf,Recreation & Sports Coordinator,5.333333 (5 years and 4 months),"ACSM Exercise Physiologist, TRX Qualified Inst...",None mentioned,"Recreation facility management, program planni...",This candidate is a highly qualified Recreatio...


### Scoring criteria based on provided vacancy:

### Job requirements from job description:

In [190]:
job_description = """
FullStack(NodeJS, ReactJS), Online Genealogy Service
Client
The client is an international company that provides an online genealogy service that helps its clients understand their past and family history.

Project overview
The core programming language is JavaScript (ES2020), a website running on React.js and GraphQL and the back-end platform is based on Node.js (Express). Microservices running under Kubernetes. The project methodology is Scrum.

Team
There are a few Full Stack teams, up to 8 people each. Each team has a team lead and a product owner.

Position overview
We are looking for a specialist to join one of the teams (which is more Frontend oriented) is working on the further development of existing platforms. Regarding the work schedule, each employee should be available till 4 pm UK time.

Technology stack
JavaScript, React.js, GraphQL, Node.js (Express), Kubernetes.
 
Requirements
Development experience using a Node.js (Express) + React.js stack
Experience with SQL Server
Experience with PostgreSQL
Knowledge of Kafka
Knowledge of RabbitMQ
Dev-level experience with K8s/Docker
Knowledge of sound engineering practices like pair programming, upfront automated testing, continuous deployment, and trunk-based development
Spoken English

Nice to have
Knowledge of Apollo engine, Kafka, Postgres
Experience with microservices architecture development
Experience with GraphQL
Experience with RabbitMQ, SQL Server
Experience in development with C#
Experience with SOLR
Software development experience in Python
"""

entities = [
    "job title", "years of experience", "highest level of education", "language skills", "key skills"
]


# Ask the model to restructure the free-text vacancy into the same categories
# that are extracted from the resumes.
prompt = (
    "Please structure the job requirements from the following text into a JSON-like format with these categories: "
    + ", ".join(entities)
    + ".\n\n"
    + job_description
)

response = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
    max_tokens=300  # Adjust as needed
)
# NOTE(review): the answer is free text ("JSON-like"); nothing here validates that it parses as JSON.
extracted_requirements = response.choices[0].text.strip()
print(extracted_requirements)


{
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": [
        "Node.js",
        "React.js",
        "GraphQL",
        "Kubernetes",
        "SQL Server",
        "PostgreSQL",
        "Kafka",
        "RabbitMQ",
        "C#",
        "SOLR",
        "Python",
        "Sound engineering practices",
        "Pair programming",
        "Automated testing",
        "Continuous deployment",
        "Trunk-based development"
    ]
}


In [191]:
# Prints the same value as the previous cell.
print(extracted_requirements)

{
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": [
        "Node.js",
        "React.js",
        "GraphQL",
        "Kubernetes",
        "SQL Server",
        "PostgreSQL",
        "Kafka",
        "RabbitMQ",
        "C#",
        "SOLR",
        "Python",
        "Sound engineering practices",
        "Pair programming",
        "Automated testing",
        "Continuous deployment",
        "Trunk-based development"
    ]
}


### Scoring function:

In [None]:
import pandas as pd
import os
import re
import json

job_requirements_str = """
{
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": [
        "Node.js",
        "React.js",
        "GraphQL",
        "Kubernetes",
        "SQL Server",
        "PostgreSQL",
        "Kafka",
        "RabbitMQ",
        "C#",
        "SOLR",
        "Python",
        "Sound engineering practices",
        "Pair programming",
        "Automated testing",
        "Continuous deployment",
        "Trunk-based development"
    ]
}
"""

job_requirements = json.loads(job_requirements_str)

# Maps every accepted category name - the entity names used in this notebook and
# the legacy short names - to the scoring rule applied to it.
_CATEGORY_RULES = {
    'key skills': 'skills',
    'skills': 'skills',
    'years of experience': 'experience',
    'experience_years': 'experience',
    'highest level of education': 'education',
    'education_level': 'education',
    'language skills': 'languages',
    'languages': 'languages',
}


def _as_items(value):
    """Normalise a string or a collection into a list of stripped, lower-cased items."""
    if value is None:
        return []
    if isinstance(value, str):
        parts = re.split(r'[,;\n]', value)
    elif isinstance(value, (list, tuple, set)):
        parts = value
    else:
        parts = [value]
    cleaned = (str(part).strip().lower() for part in parts)
    return [item for item in cleaned if item]


def _first_number(value):
    """Return the first integer or decimal found in ``value`` as a float, else None."""
    match = re.search(r'\d+(?:\.\d+)?', '' if value is None else str(value))
    return float(match.group()) if match else None


def _contains_term(text, term):
    """True if ``term`` occurs in ``text`` as a whole term, not inside a longer word."""
    if not term:
        return False
    return re.search(r'(?<!\w)' + re.escape(term) + r'(?!\w)', text) is not None


def calculate_matching_score(resume_info, job_requirements):
    """Score how well a resume matches the job requirements.

    Each scored category contributes between 0 and 1:

    * skills - fraction of the required skills found in the resume's skills.
    * experience - resume years divided by required years, capped at 1.
    * education - 1 if the requirement text occurs in the resume's education text.
    * languages - 1 if any required language and any resume language contain
      one another (so "English" matches "Fluent in spoken English").

    Categories are looked up under the same key in ``resume_info`` and
    ``job_requirements``. Both the notebook's entity names ("key skills",
    "years of experience", ...) and the legacy names ("skills",
    "experience_years", ...) are recognised; the previous version recognised
    only the legacy names and therefore scored every resume 0. Categories
    without a rule (e.g. "job title") add nothing but still count in the
    denominator. Values may be strings, lists, or ``None``.

    NOTE(review): the education rule is a plain substring test and will rarely
    match free-text requirements such as "Bachelor's or higher in ...".

    Args:
        resume_info: Dict of values extracted from one resume.
        job_requirements: Dict of requirement values keyed by category.

    Returns:
        The sum of the category scores divided by the number of requirement
        categories, or 0 when there are no requirements.
    """
    total_criteria = len(job_requirements)
    if not total_criteria:
        return 0

    matching_criteria = 0.0

    for category, requirement in job_requirements.items():
        rule = _CATEGORY_RULES.get(category)
        resume_value = resume_info.get(category)

        if rule == 'skills':
            required_skills = _as_items(requirement)
            resume_skills = ', '.join(_as_items(resume_value))
            if required_skills:
                matched = sum(1 for skill in required_skills if _contains_term(resume_skills, skill))
                matching_criteria += matched / len(required_skills)
        elif rule == 'experience':
            required_years = _first_number(requirement)
            resume_years = _first_number(resume_value)
            if required_years and resume_years is not None:
                # Capped so that a long career cannot outweigh every other category.
                matching_criteria += min(resume_years / required_years, 1.0)
        elif rule == 'education':
            required_text = '' if requirement is None else str(requirement).strip().lower()
            resume_text = '' if resume_value is None else str(resume_value).lower()
            matching_criteria += int(bool(required_text) and required_text in resume_text)
        elif rule == 'languages':
            required_languages = _as_items(requirement)
            resume_languages = _as_items(resume_value)
            matching_criteria += int(any(
                _contains_term(offered, wanted) or _contains_term(wanted, offered)
                for wanted in required_languages
                for offered in resume_languages
            ))

    return matching_criteria / total_criteria


def score_resumes(directory_path, job_requirements, entities, client):
    """Extract entities from every resume in ``directory_path`` and score each one.

    ``*.pdf`` files are read with ``extract_text_from_pdf`` and ``*.txt``
    files with ``extract_text_from_txt``; other files are ignored. Reading
    PDFs with the plain-text reader, as before, fed raw PDF bytes to the model.

    Args:
        directory_path: Directory containing the resumes.
        job_requirements: Requirements dict passed to ``calculate_matching_score``.
        entities: Names of the fields to extract from each resume.
        client: OpenAI client passed to ``extract_entities_with_llm``.

    Returns:
        A DataFrame with one row per resume: ``'Filename'``, one column per
        entity (``None`` when not found) and ``'Score'``. It is empty, with no
        columns, when the directory holds no resume files.
    """
    data = []
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        lowered = filename.lower()
        if lowered.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        elif lowered.endswith('.txt'):
            text = extract_text_from_txt(file_path)
        else:
            continue

        extracted_info = extract_entities_with_llm(client, text, entities, 3500, 50)
        info_dict = {'Filename': filename}

        for entity in entities:
            # re.escape keeps entity names with regex metacharacters from altering the pattern.
            pattern = re.compile(rf"{re.escape(entity)}\s*:\s*(.*)", re.IGNORECASE)
            match = pattern.search(extracted_info)
            info_dict[entity] = match.group(1).strip() if match else None

        info_dict['Score'] = calculate_matching_score(info_dict, job_requirements)
        data.append(info_dict)

    return pd.DataFrame(data)
    
# Score every resume in the dataset directory and show the ten best matches.
scored_resumes_df = score_resumes(directory_path, job_requirements, entities, client)
scored_resumes_df.sort_values(by='Score', ascending=False, inplace=True)
print(scored_resumes_df.head(10))


Debugging: Type of resume_info: <class 'dict'>
Debugging: Content of resume_info: {'Filename': '10276858.pdf', 'job title': 'not mentioned in the text', 'years of experience': 'not mentioned in the text', 'highest level of education': 'not mentioned in the text', 'language skills': 'not mentioned in the text', 'key skills': 'not mentioned in the text'}
Debugging: Type of job_requirements: <class 'dict'>
Debugging: Content of job_requirements: {'job title': 'FullStack Developer', 'years of experience': 'At least 2 years of development experience', 'highest level of education': "Bachelor's or higher in Computer Science or related field", 'language skills': 'Fluent in spoken English', 'key skills': ['Node.js', 'React.js', 'GraphQL', 'Kubernetes', 'SQL Server', 'PostgreSQL', 'Kafka', 'RabbitMQ', 'C#', 'SOLR', 'Python', 'Sound engineering practices', 'Pair programming', 'Automated testing', 'Continuous deployment', 'Trunk-based development']}
Debugging: Type of resume_info: <class 'dict'>
D

In [1]:
from openai import OpenAI

job_requirements_str = """
{
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": [
        "Node.js",
        "React.js",
        "GraphQL",
        "Kubernetes",
        "SQL Server",
        "PostgreSQL",
        "Kafka",
        "RabbitMQ",
        "C#",
        "SOLR",
        "Python",
        "Sound engineering practices",
        "Pair programming",
        "Automated testing",
        "Continuous deployment",
        "Trunk-based development"
    ]
}
"""

job_requirements = json.loads(job_requirements_str)

def split_text(text, max_chunk_size, overlap_size=50):
    """Split ``text`` into chunks of whole words with a word overlap between neighbours.

    A chunk is a run of whitespace-separated words joined by single spaces.
    Sizes are counted as ``len(word) + 1`` per word (the separator included),
    and a chunk never exceeds ``max_chunk_size`` by that measure unless a
    single word is longer (words are never cut). Each chunk after the first
    starts with up to ``overlap_size`` trailing words of the previous chunk;
    the overlap is shortened when it would not leave room for the next word.

    The earlier implementation overwrote the collected chunk with the overlap
    before storing it, and located the overlap with ``words.index(word)``,
    which finds only the first occurrence of a repeated word.

    Args:
        text: The text to split.
        max_chunk_size: Maximum chunk size in characters.
        overlap_size: Number of words repeated at the start of the next chunk.

    Returns:
        A list of non-empty, stripped chunk strings; ``[]`` for text without words.
    """
    chunks = []
    current = []      # words of the chunk being built
    current_len = 0   # sum(len(w) + 1 for w in current)

    for word in text.split():
        cost = len(word) + 1
        if current and current_len + cost > max_chunk_size:
            chunks.append(" ".join(current))
            # Carry over at most len - 1 words so that every chunk ends on a later
            # word than the previous one (guarantees progress).
            keep = min(max(overlap_size, 0), len(current) - 1)
            tail = current[len(current) - keep:]
            tail_len = sum(len(w) + 1 for w in tail)
            # Shrink the overlap if it would leave no room for the incoming word.
            while tail and tail_len + cost > max_chunk_size:
                tail_len -= len(tail[0]) + 1
                tail = tail[1:]
            current, current_len = tail, tail_len
        current.append(word)
        current_len += cost

    if current:
        chunks.append(" ".join(current))  # Add the last chunk
    return chunks

# Initialize the OpenAI client. With no api_key argument the client reads the key
# from the OPENAI_API_KEY environment variable, so no key literal appears in the notebook.
client = OpenAI()

def call_llm(prompt):
    """Send ``prompt`` to the completions model through the notebook-level ``client``.

    Returns:
        The text of the first completion choice, stripped of surrounding whitespace.
    """
    request = dict(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=50,
        temperature=0.3,
    )
    completion = client.completions.create(**request)
    return completion.choices[0].text.strip()

def extract_entities_with_model(text_chunks, job_requirements):
    """Build one extraction result per text chunk.

    Every result is a dict with one key per entry of ``job_requirements``,
    initialised to an empty list. When ``'years of experience'`` is among the
    requirements, that key is replaced by the model's answer for the chunk
    (obtained through ``call_llm``). Other fields are left as empty lists.

    Args:
        text_chunks: Iterable of text chunks.
        job_requirements: Container of requirement names (e.g. a dict).

    Returns:
        A list with one result dict per chunk, in input order.
    """
    results = []

    for piece in text_chunks:
        # Start every requirement with an empty result list.
        found = {name: list() for name in job_requirements}

        # Only the years of experience are extracted so far.
        if 'years of experience' in job_requirements:
            found['years of experience'] = call_llm(
                f"Convert the following description of work experience '{piece}' into "
                "a numeric value representing total years of experience."
            )

        # ... additional entity extractions for other fields would go here ...

        results.append(found)

    return results

In [40]:
import os
import re
import json
import pandas as pd
from collections import Counter
from fuzzywuzzy import fuzz  # For fuzzy string matching

max_chunk_size = 3000  
overlap_size = 50    

entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

# Define the job description as a string
job_description_str = """
{
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": [
        "Node.js", "React.js", "GraphQL", "Kubernetes", "SQL Server",
        "PostgreSQL", "Kafka", "RabbitMQ", "C#", "SOLR", "Python",
        "Sound engineering practices", "Pair programming",
        "Automated testing", "Continuous deployment", "Trunk-based development"
    ]
}
"""

def split_into_chunks(text, max_chunk_size, overlap_size):
    """Split ``text`` into chunks of whole words with a word overlap between neighbours.

    Each chunk is a run of whitespace-separated words joined by single spaces
    and is at most ``max_chunk_size`` characters long, unless a single word is
    longer than that (words are never cut). Each chunk after the first starts
    with up to ``overlap_size`` trailing words of the previous chunk; the
    overlap is shortened when it would not leave room for the next word.

    The earlier implementation removed the last ``overlap_size`` words from a
    chunk instead of repeating them, and with ``overlap_size=0`` it never reset
    the buffer, so it emitted an ever-growing chunk for every further word.

    Args:
        text: The text to split.
        max_chunk_size: Maximum chunk length in characters.
        overlap_size: Number of words repeated at the start of the next chunk.

    Returns:
        A list of non-empty chunk strings; ``[]`` when ``text`` contains no words.
    """
    chunks = []
    current_chunk = []   # words of the chunk being built
    current_len = 0      # len(' '.join(current_chunk)), maintained incrementally

    for word in text.split():
        extra = len(word) + (1 if current_chunk else 0)
        if current_chunk and current_len + extra > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            # Carry over at most len - 1 words so every chunk ends on a later word
            # than the previous one (guarantees progress).
            keep = min(max(overlap_size, 0), len(current_chunk) - 1)
            tail = current_chunk[len(current_chunk) - keep:]
            # Shrink the overlap if it would leave no room for the incoming word.
            while tail and len(' '.join(tail)) + 1 + len(word) > max_chunk_size:
                tail = tail[1:]
            current_chunk = tail
            current_len = len(' '.join(current_chunk))
            extra = len(word) + (1 if current_chunk else 0)
        current_chunk.append(word)
        current_len += extra

    # Add the last chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks


def _split_skill_list(value):
    """Split a raw 'key skills' value into individual skill names.

    Only commas and semicolons act as separators, so names containing dots,
    '#', spaces or hyphens ("Node.js", "C#", "SQL Server") stay intact.
    Surrounding brackets and quotes from list-style answers are removed.
    """
    candidates = (part.strip(" \t[]'\"") for part in re.split(r"[,;]", value))
    return [skill for skill in candidates if skill]


def extract_entities_with_llm(client, text, entities, max_chunk_size=3000, overlap_size=50,
                              job_description=None):
    """Extract the requested entities from ``text`` with a completion model.

    The text is split with ``split_into_chunks``; one completion request is
    issued per non-blank chunk and the model is asked to answer with
    'Entity: Value' lines, which are then parsed and aggregated.

    :param client: Object exposing ``completions.create(model=..., prompt=...,
        max_tokens=...)`` whose result offers ``choices[0].text``.
    :param text: Resume text to analyse.
    :param entities: Entity names to extract, e.g. 'job title', 'key skills'.
    :param max_chunk_size: Maximum chunk size passed to ``split_into_chunks``.
    :param overlap_size: Overlap size passed to ``split_into_chunks``.
    :param job_description: Optional dict whose 'key skills' entry lists the
        skills to look for. When omitted, the module-level ``job_description``
        is used if it exists (the behaviour of the earlier version).
    :return: Dict keyed by entity. 'key skills' maps to a de-duplicated list,
        other entities to their most frequently returned value (a string);
        entities that were never found map to an empty list.
    """
    if job_description is None:
        # Backward compatibility: this function used to read the global directly.
        job_description = globals().get('job_description') or {}

    # Initialize each entity in our collection dictionary
    extracted_info = {entity: [] for entity in entities}
    chunks = split_into_chunks(text, max_chunk_size, overlap_size)  # Ensure chunks are small enough

    # The instructions do not depend on the chunk, so build them once.
    prompt_builder = []
    for entity in entities:
        if entity == 'years of experience':
            prompt_builder.append(f"{entity} as a numeric value where months are considered as fractions of a year")
        elif entity == 'key skills':
            wanted_skills = job_description.get(entity) or []
            if isinstance(wanted_skills, str):
                # Joining a plain string would interleave ', ' between its characters.
                wanted_skills = [wanted_skills]
            if wanted_skills:
                prompt_builder.append(f"{entity} as a list matching any of: {', '.join(wanted_skills)}")
            else:
                prompt_builder.append(f"{entity} as a list")
        else:
            prompt_builder.append(entity)

    instructions = (
        "From the following text excerpt, extract the entities and "
        "present them in a structured way using the format 'Entity: Value'.\n\n"
        "Please extract the entities "
        + ", ".join(prompt_builder)
        + ".\n\n"
    )

    for chunk in chunks:
        if not chunk.strip():
            continue  # nothing to analyse; do not spend a request on it

        prompt = instructions + chunk

        # Word count serves as a rough proxy for the prompt's token count.
        prompt_length = len(prompt.split())
        # Budget kept from the original code: 4097 total, at most 300 for the
        # answer. Clamp to 1 so an over-long prompt never yields a zero or
        # negative max_tokens value.
        max_tokens_for_completion = max(1, min(4097 - prompt_length, 300))

        response = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=max_tokens_for_completion
        )

        # Process the structured response and fill the extracted_info dict
        for line in response.choices[0].text.strip().split('\n'):
            # Tolerate indentation and list bullets in front of 'Entity: Value'.
            cleaned_line = line.strip().lstrip('-*• ').strip()
            lowered_line = cleaned_line.lower()
            for entity in entities:
                if lowered_line.startswith(entity.lower() + ':'):
                    value = cleaned_line.split(':', 1)[1].strip()
                    if value:
                        if entity == 'key skills':  # Special case as we expect a list
                            extracted_info[entity].extend(_split_skill_list(value))
                        else:
                            extracted_info[entity].append(value)
                    break  # Move on to the next line once the entity is found

    # Aggregate what was collected across all chunks.
    for entity, values in extracted_info.items():
        if not values:
            continue
        if entity == 'key skills':
            # Keep every distinct skill once, in order of first appearance.
            extracted_info[entity] = list(dict.fromkeys(values))
        else:
            # A single value is expected, so keep the most frequent answer.
            extracted_info[entity] = Counter(values).most_common(1)[0][0]

    # Now, the extracted_info is ready for scoring against the job_description
    return extracted_info

# Input directory scanned for the original .pdf resumes, and the directory
# scanned for translated .txt resumes (the Excel report is written there too).
# NOTE(review): absolute, machine-specific paths that repeat the values set in
# the notebook's earlier configuration cell — consider one configurable base dir.
directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"

def extract_text_from_txt(file_path):
    """Read a text file, trying several encodings until one decodes cleanly.

    Order matters: 'latin1' maps every possible byte to a character and thus
    never fails, so it has to come last. Any encoding listed after it could
    never be reached.

    :param file_path: Path of the text file to read.
    :return: The decoded file content.
    :raises ValueError: If none of the encodings can decode the file. With
        'latin1' as the final fallback this is not expected to happen; the
        raise is kept as a safeguard should the list be changed.
    """
    encodings = [
        'utf-8-sig',  # UTF-8, also dropping a leading byte-order mark if present
        'cp1252',     # Windows-1252; rejects the few byte values it leaves undefined
        'latin1',     # last resort, accepts any byte sequence
    ]
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    raise ValueError(f"Cannot decode file {file_path} with any of the provided encodings.")

# extract_text_from_txt and split_into_chunks are defined above in this cell;
# extract_text_from_pdf is reused from the earlier PDF-reading cell.

def score_resume(extracted_info, job_description):
    """
    Compute a naive presence-based score for a resume.

    Each entity named in ``job_description`` contributes one point when
    ``extracted_info`` holds a value other than ``None`` for it. The quality
    or relevance of the value is not inspected; a finer-grained, per-entity
    comparison would be the natural place to extend this function.

    :param extracted_info: Mapping of entity name to the value found in the resume.
    :param job_description: Mapping whose keys are the entities to score.
    :return: Number of job-description entities present in ``extracted_info``.
    """
    return sum(
        1
        for required_entity in job_description
        if required_entity in extracted_info and extracted_info[required_entity] is not None
    )

def extract_and_score_resume(client, text, job_description, entities, max_chunk_size, overlap_size):
    """
    Extract the requested entities from one resume and score the result.

    :return: Tuple ``(info_dict, resume_score)``. ``info_dict`` has one key per
        requested entity, holding the extracted value, or ``None`` when the
        extraction produced nothing (missing key or empty value) for it.
    """
    llm_result = extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size)

    # Empty or missing extractions collapse to None so the scorer treats them as absent.
    info_dict = {name: llm_result.get(name) or None for name in entities}

    return info_dict, score_resume(info_dict, job_description)

def create_entities_report(
    directory_path=directory_path,
    translated_output_directory=translated_output_directory,
    client=client,
    job_description=job_description,
    entities=entities,
    max_chunk_size=max_chunk_size,
    overlap_size=overlap_size
):
    """
    This function processes all resumes in the given directories, extracting entities and scoring them against a job description.
    It generates a report that ranks the resumes based on their scores and saves it as
    'resume_scoring_report.xlsx' inside ``translated_output_directory``.

    Note: the default values are bound when this ``def`` statement runs, so all of the
    module-level names used as defaults must already exist at that point.

    :param directory_path: Path to the directory containing the original resumes (.pdf files are read).
    :param translated_output_directory: Path to the directory where the translated resumes are stored (.txt files are read).
    :param client: OpenAI client initialized with an API key.
    :param job_description: Dictionary containing the job description to score against.
    :param entities: List of entity types to extract from the resumes.
    :param max_chunk_size: Maximum size of the text chunk to be processed by the LLM in a single request.
    :param overlap_size: Size of the overlap between chunks of text to ensure continuity is maintained in entity extraction.
    :return: None. When no resume with non-blank text is found, no report file is written.
    """
    data = []

    # Each source directory is paired with the extension it holds and the matching reader.
    sources = [
        (directory_path, '.pdf', extract_text_from_pdf),
        (translated_output_directory, '.txt', extract_text_from_txt),
    ]

    # Process all resumes in the directories
    for directory, file_extension, read_text in sources:
        # os.listdir returns entries in arbitrary order; sort for reproducible runs.
        for filename in sorted(os.listdir(directory)):
            if not filename.lower().endswith(file_extension):
                continue

            text = read_text(os.path.join(directory, filename))
            if not text.strip():
                continue  # nothing extractable in this file

            info_dict, resume_score = extract_and_score_resume(
                client, text, job_description, entities, max_chunk_size, overlap_size
            )
            info_dict['Filename'] = filename
            info_dict['Score'] = resume_score
            data.append(info_dict)

    if not data:
        # An empty DataFrame has no 'Score' column, so sorting by it would raise KeyError.
        print("No resumes with extractable text were found; no report generated.")
        return

    # Creating a DataFrame from the extracted data and scores, best score first
    df = pd.DataFrame(data).sort_values(by='Score', ascending=False)
    output_file_path = os.path.join(translated_output_directory, 'resume_scoring_report.xlsx')
    df.to_excel(output_file_path, index=False)

    print(f"Report generated and saved to {output_file_path}")

# Parse the JSON job description string into the dictionary that the
# extraction prompt and the scoring function work with.
job_description = json.loads(job_description_str)

# Run the whole pipeline: read every resume, extract and score its entities,
# and write the ranked Excel report. All arguments are passed explicitly so the
# call does not depend on the function's definition-time defaults.
create_entities_report(
    directory_path=directory_path,
    translated_output_directory=translated_output_directory,
    client=client,
    job_description=job_description,
    entities=entities,
    max_chunk_size=max_chunk_size,
    overlap_size=overlap_size
)

Report generated and saved to C:\Users\apleczkan\PycharmProjects\task1-cv-resumes\resumes_translated\resume_scoring_report.xlsx
