#### The code is organized in such a way that it can be run independently for each specific part of the task. This is also due to issues with running individual cells in Jupyter Notebook v.7, which, for some reason, were not remembering cells that had already been executed once.

### Installations:

In [34]:
!pip install openai pandas langdetect PyPDF2 transformers plotly matplotlib scikit-learn torch torchvision scipy



### Imports:

In [1]:
import openai
from langdetect import detect, DetectorFactory
import os
from PyPDF2 import PdfReader
import pandas as pd

### Set environment variable for Open Ai client:

In [2]:
%env OPENAI_API_KEY=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

env: OPENAI_API_KEY=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


### Create basic Variables, paths and set Open AI client

In [3]:
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"
logs_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs"
max_chunk_size = 1000
overlap_size = 50

DetectorFactory.seed = 0

### extract_text_from_pdf

In [4]:

def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

### Split text into chunks with overlap

In [5]:
def split_text(text, max_chunk_size, overlap_size=50):
    words = text.split()
    chunks = []
    current_chunk = ""
    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_chunk_size:
            current_chunk += word + " "
        else:
            chunks.append(current_chunk)
            current_chunk = word + " "
    chunks.append(current_chunk)
    return chunks

In [6]:
# Function to translate text using OpenAI's API  for openai>=1.0.0
def translate_text(client, text, target_language="en"):
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=f"Translate the following text to {target_language}:\n\n{text}",
        max_tokens=500
    )
    return response.choices[0].text.strip()

In [7]:

def process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client):
    if not os.path.exists(translated_output_directory):
        os.makedirs(translated_output_directory)

    translated_files_list = []
    english_files_list = []

    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(directory_path, filename)
            text = extract_text_from_pdf(pdf_path)

            if text.strip():
                if detect(text) != 'en':
                    chunks = split_text(text, max_chunk_size, overlap_size)
                    translated_text = ""

                    for chunk in chunks:
                        if detect(chunk) != 'en':
                            chunk = translate_text(client, chunk, target_language="en")
                        translated_text += chunk + " "
                    
                    # Save the translated text to a .txt file
                    translated_filename = f"translated_{filename.replace('.pdf', '.txt')}"
                    translated_path = os.path.join(translated_output_directory, translated_filename)
                    save_text_to_file(translated_text, translated_path)

                    translated_files_list.append(translated_filename)
                else:
                    english_files_list.append(filename)
            else:
                print(f"Document {filename} is empty or contains very little text.")

    # Save the list of translated files to a text file for reference
    save_file_list(translated_files_list, logs_directory, 'translated_files_list.txt')
    # Optionally save the list of English files as well
    save_file_list(english_files_list, logs_directory, 'english_files_list.txt')

def save_file_list(file_list, directory, filename):
    with open(os.path.join(directory, filename), 'w', encoding='utf-8') as f:
        for file in file_list:
            f.write(f"{file}\n")

# Function to save text to a file
def save_text_to_file(text, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)

In [42]:
process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client)

Document 12632728.pdf is empty or contains very little text.


### Create named entities to look for in resumes

In [8]:
entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

In [44]:
import re

max_chunk_size = 3500  
overlap_size = 50      

# added encodings because I encoutered problems with reading all files
def extract_text_from_txt(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252'] 
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    raise ValueError(f"Cannot decode file {file_path} with any of the provided encodings.")

def split_into_chunks(text, max_chunk_size, overlap_size):
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(' '.join(current_chunk)) > max_chunk_size:
            # Split the chunk at the max chunk size
            chunk = ' '.join(current_chunk[:len(current_chunk)-overlap_size])
            chunks.append(chunk)
            # Start the next chunk with the overlap
            current_chunk = current_chunk[-overlap_size:]
    
    # Add the last chunk
    chunks.append(' '.join(current_chunk))
    return chunks
    
def extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size):
    extracted_info = ""
    chunks = split_into_chunks(text, max_chunk_size, overlap_size)  # Ensure chunks are small enough

    for chunk in chunks:

        prompt = (
            "Extract the following entities from this text, calculating years of experience as a decimal number where months are converted to a fractional year without any additional info: "
            + ", ".join(entities)
            + ".\n\n"
            + chunk
        )
        prompt_length = len(prompt.split()) 

        max_tokens_for_completion = 4097 - prompt_length 
        max_tokens_for_completion = min(max_tokens_for_completion, 300) 

        response = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=max_tokens_for_completion
        )
        extracted_info += response.choices[0].text.strip() + "\n"

    return extracted_info

def process_resume(directory, filename, client, entities, data_list, is_txt=False):
    file_path = os.path.join(directory, filename)
    text = extract_text_from_txt(file_path) if is_txt else extract_text_from_pdf(file_path)
    
    if text.strip():
        extracted_info = extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size)
        info_dict = {'Filename': filename}
        
        for entity in entities:
            pattern = re.compile(rf"{entity}\s*:\s*(.*)", re.IGNORECASE)
            match = pattern.search(extracted_info)
            if match:
                info_dict[entity] = match.group(1).strip()
            else:
                info_dict[entity] = None  
        
        data_list.append(info_dict)


def create_entities_report(directory_path, translated_output_directory, client, entities):
    data = []
    # Process PDFs in the original directory
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            process_resume(directory_path, filename, client, entities, data)

    # Process translated PDFs in the output directory
    for filename in os.listdir(translated_output_directory):
        if filename.lower().startswith('translated_') and filename.lower().endswith('.txt'):
            process_resume(translated_output_directory, filename, client, entities, data, is_txt=True)

    df = pd.DataFrame(data)
    df.to_excel(os.path.join(logs_directory, 'resume_entities_report.xlsx'), index=False)

create_entities_report(directory_path, translated_output_directory, client, entities)

### Data Frame from CV's for named entities

In [45]:
import os
file_path = os.path.join(translated_output_directory, 'resume_entities_report.xlsx')
df = pd.read_excel(file_path)
df.set_index('Filename', inplace=True)
df.sort_index(inplace=True)
df.head(10)

Unnamed: 0_level_0,job title,years of experience,highest level of education,language skills,key skills
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10276858.pdf,Food Prep Chef,5+ years,None mentioned,,"Cooking, food preparation, sanitation, kitchen..."
10329506.pdf,"Registered Client Service Associate, Relations...",10.5 years,,,"Microsoft Word for Windows, Excel, Power Point..."
10344379.pdf,- Administrative support professional,- 5.8 years as an Administrative support profe...,Not specified.,Not specified.,- Organizational skills
10395944.pdf,Line Service Technician,3 years and 3 months,Associate's degree,English,"Great People Skills, Microsoft Office, Fueling..."
10428916.pdf,"Recreation & Sports Coordinator, Senior Health...","Recreation & Sports Coordinator (4.5 years), S...",ACSM Exercise Physiologist,,"Program development and implementation, custom..."
10466583.pdf,Floral designer,9 years and 9 months (as of July 2020),Medical Assistant certificate from Northwester...,None mentioned,"Customer service, inventory control, employee ..."
10527994.pdf,Substitute Teacher,4.75 years,Bachelor of Arts,English,"Outlook, Excel, Word, PowerPoint, QuickBooks, ..."
10554236.pdf,Accountant,11 years,Bachelor's degree,None mentioned,"Financial planning, reporting, analysis, accou..."
10603337.pdf,Sales Associate,4.5 years,Associates degree,None mentioned in text,"Customer service, sales, inventory management,..."
10641230.pdf,IT Manager/Network Administrator,8.5 years,Some college/Associate's degree,Proficient in English,"Hardware and software troubleshooting, network..."


In [46]:
df = pd.read_excel(file_path)
df.set_index('Filename', inplace=True)
df.sort_index(inplace=True)
years_of_experience = df['years of experience']
index = df.index
years_of_experience.head(30)

Filename
10276858.pdf                                                        5+ years
10329506.pdf                                                      10.5 years
10344379.pdf               - 5.8 years as an Administrative support profe...
10395944.pdf                                            3 years and 3 months
10428916.pdf               Recreation & Sports Coordinator (4.5 years), S...
10466583.pdf                          9 years and 9 months (as of July 2020)
10527994.pdf                                                      4.75 years
10554236.pdf                                                        11 years
10603337.pdf                                                       4.5 years
10641230.pdf                                                       8.5 years
10724818.pdf                                                       3.5 years
10816645.pdf                                 05/2012 to 10/2015 = 3.33 years
10818478.pdf                                           3 years and 

In [58]:
import re
import os

new_file_name = 'updated_years_resume_entities_report.xlsx'
new_file_path = os.path.join(logs_directory, new_file_name)
pd.DataFrame().to_excel(new_file_path)

def calculate_years_of_experience(client, text_descriptions):
    numeric_experience_list = []

    for text in text_descriptions:
        prompt = f"Convert the following description of work experience '{text}' into a numeric value representing total years of experience."

        response = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=50,
            temperature=0.5
        )
        
        try:
            # Find all numeric values in the response and take the first one
            # This regex matches numbers with optional decimal points
            extracted_numbers = re.findall(r'\b\d+\.?\d*\b', response.choices[0].text.strip())
            if extracted_numbers:
                # Convert the first extracted number to a float
                numeric_experience = float(extracted_numbers[0])
                numeric_experience_list.append(numeric_experience)
            else:
                # If no numbers are found, it may not be possible to calculate experience
                numeric_experience_list.append(float('nan'))  # Append NaN for manual review
        except Exception as e:
            print(f"An error occurred: {e}")
            numeric_experience_list.append(float('nan'))  # Append NaN for manual review

    return numeric_experience_list

# Assuming you have set up the 'client' and have the 'years_of_experience' from the DataFrame
text_descriptions = df['years of experience'].astype(str).tolist()
numeric_years_of_experience = calculate_years_of_experience(client, text_descriptions)

# Add the numeric years of experience back to the DataFrame
df['numeric_years_of_experience'] = numeric_years_of_experience
df.to_excel(new_file_path)

# Output confirmation
print(f"The updated DataFrame has been saved to {new_file_path}.")

The updated DataFrame has been saved to C:\Users\apleczkan\PycharmProjects\task1-cv-resumes\logs\updated_years_resume_entities_report.xlsx.


### CV Summarization

In [48]:
import openai

def summarize_text(client, text, max_chunk_size=3000, overlap_size=50):
    """
    This function uses OpenAI's GPT-3 model to generate a summary of the resume text.
    """

    chunks = split_text(text, max_chunk_size, overlap_size)
    summary = ""

    for chunk in chunks:
        prompt = (
            "Please summarize the following resume into a short paragraph that includes "
            "the job title, years of experience, highest level of education, language skills, "
            "and key skills:\n\n" + chunk  # Use the current chunk, not the entire text
        )
        
        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",  # Use the latest available model
                prompt=prompt,
                max_tokens=150,  # Adjust as needed for the summary length
                temperature=0.5
            )
            chunk_summary = response.choices[0].text.strip()
            summary += chunk_summary + "\n"  # Concatenate the summaries from different chunks
        except Exception as e:
            # Handle any exception that occurs
            print(f"An error occurred: {e}")
    
    return summary



In [49]:
import os
import pandas as pd

# Directory paths
directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"
xlsx_file_path = os.path.join(logs_directory, "resume_entities_report.xlsx")

# Load the DataFrame from the Excel file
df = pd.read_excel(xlsx_file_path)
df.set_index('Filename', inplace=True)

# Iterate over the DataFrame and summarize each resume
for filename in df.index:
    # Determine the correct file path
    if filename.startswith('translated_'):
        resume_path = os.path.join(translated_output_directory, filename)
    else:
        resume_path = os.path.join(directory_path, filename)

    # Skip non-resume files like 'translated_files_list.txt'
    if 'translated_files_list' in filename:
        continue

    # Check the file extension and read the content
    if resume_path.lower().endswith('.pdf'):
        try:
            resume_text = extract_text_from_pdf(resume_path)
        except Exception as e:
            print(f"An error occurred while reading PDF file: {e}")
            continue
    elif resume_path.lower().endswith('.txt'):
        try:
            resume_text = extract_text_from_txt(resume_path)
        except Exception as e:
            print(f"An error occurred while reading text file: {e}")
            continue
    else:
        print(f"Unsupported file format for file: {resume_path}")
        continue

    # Generate a summary for the resume (assuming summarize_text function is defined)
    summary = summarize_text(client, resume_text)
    df.at[filename, 'Summary'] = summary

# Uncomment the below lines to see the DataFrame and save it
print(df.head())
df.to_excel(os.path.join(logs_directory, 'updated_resume_summaries.xlsx'))


                                                      job title  \
Filename                                                          
10276858.pdf                                     Food Prep Chef   
10329506.pdf  Registered Client Service Associate, Relations...   
10344379.pdf              - Administrative support professional   
10395944.pdf                            Line Service Technician   
10428916.pdf                    Recreation & Sports Coordinator   

                                            years of experience  \
Filename                                                          
10276858.pdf                  5+ years (casual and fine dining)   
10329506.pdf                    8.5 years (June 2012 - current)   
10344379.pdf  5 years (assuming they started in Jan 2015 and...   
10395944.pdf                                            3 years   
10428916.pdf  5 years and 3 months (calculated from 03/2015 ...   

                                     highest level of educat

In [50]:
df.head()

Unnamed: 0_level_0,job title,years of experience,highest level of education,language skills,key skills,Summary
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10276858.pdf,Food Prep Chef,5+ years (casual and fine dining),,,"Culinary insight, standard food preparation, h...",Experienced Food Prep Chef with 5+ years of fo...
10329506.pdf,"Registered Client Service Associate, Relations...",8.5 years (June 2012 - current),Bachelor's degree (year not specified),None mentioned,"Client support, operational services, marketin...",Experienced Registered Client Service Associat...
10344379.pdf,- Administrative support professional,5 years (assuming they started in Jan 2015 and...,Not specified,Not specified,- Organizational skills,This administrative support professional has s...
10395944.pdf,Line Service Technician,3 years,Associate of Applied Science in Aviation Pilot...,None mentioned,"Great People Skills, Microsoft Office, Fueling...",This candidate is a sophomore student at South...
10428916.pdf,Recreation & Sports Coordinator,5 years and 3 months (calculated from 03/2015 ...,"TRX Qualified Instructor, ACSM Exercise Physio...",None mentioned,Assisting in daily supervision and administrat...,This candidate is a Recreation & Sports Coordi...


### Scoring criteria based on provided vacancy:

### Job requirements from job description:

In [51]:
job_description = """
FullStack(NodeJS, ReactJS), Online Genealogy Service
Client
The client is an international company that provides an online genealogy service that helps its clients understand their past and family history.

Project overview
The core programming language is JavaScript (ES2020), a website running on React.js and GraphQL and the back-end platform is based on Node.js (Express). Microservices running under Kubernetes. The project methodology is Scrum.

Team
There are a few Full Stack teams, up to 8 people each. Each team has a team lead and a product owner.

Position overview
We are looking for a specialist to join one of the teams (which is more Frontend oriented) is working on the further development of existing platforms. Regarding the work schedule, each employee should be available till 4 pm UK time.

Technology stack
JavaScript, React.js, GraphQL, Node.js (Express), Kubernetes.
 
Requirements
Development experience using a Node.js (Express) + React.js stack
Experience with SQL Server
Experience with PostgreSQL
Knowledge of Kafka
Knowledge of RabbitMQ
Dev-level experience with K8s/Docker
Knowledge of sound engineering practices like pair programming, upfront automated testing, continuous deployment, and trunk-based development
Spoken English

Nice to have
Knowledge of Apollo engine, Kafka, Postgres
Experience with microservices architecture development
Experience with GraphQL
Experience with RabbitMQ, SQL Server
Experience in development with C#
Experience with SOLR
Software development experience in Python
"""

entities = [
    "job title", "years of experience", "highest level of education", "language skills", "key skills"
]


prompt = (
    "Please structure the job requirements from the following text into a JSON-like format with these categories: "
    + ", ".join(entities)
    + ".\n\n"
    + job_description
)

response = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
    max_tokens=300  # Adjust as needed
)
extracted_requirements = response.choices[0].text.strip()
print(extracted_requirements)


{
    "job title": "Full Stack Developer",
    "years of experience": "At least 3 years of experience in Full Stack development",
    "highest level of education": "Bachelor's degree in Computer Science or related field",
    "language skills": {
        "English": "Spoken"
    },
    "key skills": [
        "Node.js",
        "React.js",
        "GraphQL",
        "Express",
        "Kubernetes",
        "SQL Server",
        "PostgreSQL",
        "Kafka",
        "RabbitMQ",
        "Sound engineering practices",
        "Pair programming",
        "Automated testing",
        "Continuous deployment",
        "Trunk-based development"
    ]
}


### Scoring function:

In [16]:
pip install fuzzywuzzy python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.23.0-py3-none-any.whl.metadata (3.8 kB)
Collecting Levenshtein==0.23.0 (from python-Levenshtein)
  Downloading Levenshtein-0.23.0-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Collecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.23.0->python-Levenshtein)
  Downloading rapidfuzz-3.6.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.23.0-py3-none-any.whl (9.4 kB)
Downloading Levenshtein-0.23.0-cp311-cp311-win_amd64.whl (101 kB)
   ---------------------------------------- 0.0/101.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/101.1 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/101.1 kB ? eta -:--:--
   ----------- --------------------------- 30.7/101.1 kB 325.1 kB/s eta 0:00:01
   --------------- ----------------------- 41.0/101.1 kB 326.8 kB/s eta 0:00:01
   ----------------------- --------------- 61.4/101.1 kB 363.1 kB/s eta 0:00:01
   -------------------

In [59]:
import os
import re
import json
import pandas as pd
from collections import Counter
from fuzzywuzzy import fuzz  # For fuzzy string matching

max_chunk_size = 3000  
overlap_size = 50    
directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"

entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

# Define the job description as a string
job_description_str = """
{
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": [
        "Node.js", "React.js", "GraphQL", "Kubernetes", "SQL Server",
        "PostgreSQL", "Kafka", "RabbitMQ", "C#", "SOLR", "Python",
        "Sound engineering practices", "Pair programming",
        "Automated testing", "Continuous deployment", "Trunk-based development"
    ]
}
"""

# The `job_description` variable should be a dictionary parsed from JSON.
job_description = json.loads(job_description_str)

def split_into_chunks(text, max_chunk_size, overlap_size):
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(' '.join(current_chunk)) > max_chunk_size:
            # Split the chunk at the max chunk size
            chunk = ' '.join(current_chunk[:len(current_chunk)-overlap_size])
            chunks.append(chunk)
            # Start the next chunk with the overlap
            current_chunk = current_chunk[-overlap_size:]
    
    # Add the last chunk
    chunks.append(' '.join(current_chunk))
    return chunks

def extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size):
    extracted_info = {entity: [] for entity in entities}  # Initialize as a dictionary
    chunks = split_into_chunks(text, max_chunk_size, overlap_size)  # Ensure chunks are small enough

    for chunk in chunks:
        prompt = f"Please extract the following entities from this text: {', '.join(entities)}.\n\n{chunk}"
        prompt_length = len(prompt.split())  # Calculate the prompt length in tokens

        max_tokens_for_completion = 4097 - prompt_length  # Adjust max tokens based on prompt length
        max_tokens_for_completion = min(max_tokens_for_completion, 300)  # Limit to 300 or less

        response = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=max_tokens_for_completion
        )

        # Process the structured response and fill the extracted_info dict
        structured_response = response.choices[0].text.strip().split('\n')
        for line in structured_response:
            for entity in entities:
                if line.lower().startswith(entity.lower() + ':'):
                    split_line = line.split(':', 1)
                    if len(split_line) > 1:
                        value = split_line[1].strip()
                        if value:
                            if entity == 'key skills':  # Special case as we expect a list
                                skills = re.findall(r"[\w']+", value)
                                extracted_info[entity].extend(skills)
                            else:
                                extracted_info[entity].append(value)
                    break  # Move on to the next line once the entity is found

    # Aggregate the extracted information by combining or choosing the most mentioned entity
    for entity, values in extracted_info.items():
        if values:
            if entity == 'key skills':
                skills_counter = Counter(values)
                # Select skills that are most frequently mentioned
                extracted_info[entity] = [skill for skill, count in skills_counter.items() if count > 1]  # Adjusted to filter skills mentioned more than once
            else:
                # For other entities, we expect a single value, so we take the most frequent one
                value_counter = Counter(values)
                extracted_info[entity], _ = value_counter.most_common(1)[0]

    # Now, the extracted_info is ready for scoring against the job_description
    return extracted_info



def extract_text_from_txt(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']  # Common encodings
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    raise ValueError(f"Cannot decode file {file_path} with any of the provided encodings.")

# We keep the existing functions extract_text_from_txt, extract_text_from_pdf, and split_into_chunks unchanged.
# ...

def score_resume(extracted_info, job_description):
    score = 0
    total_weight = 0

    # Define weights for each entity
    weights = {
        "job title": 2,
        "years of experience": 1.5,
        "highest level of education": 1,
        "language skills": 1,
        "key skills": 3
    }

    def get_first_value(entity):
        return ' '.join(extracted_info[entity]).lower() if entity in extracted_info and extracted_info[entity] else ''

    # Function to calculate and weight individual scores
    def calculate_weighted_score(entity, extracted_value, required_value):
        if entity == "key skills":
            total_skills = len(required_value)
            matching_skills = sum(skill in extracted_value for skill in required_value)
            return (matching_skills / total_skills) * weights[entity]
        else:
            return (1 if fuzz.partial_ratio(extracted_value, required_value) > 80 else 0) * weights[entity]

    # Scoring for Years of Experience
    extracted_years_list = re.findall(r"\d+", get_first_value('years of experience'))
    required_years = float(re.findall(r"\d+", job_description['years of experience'])[0])
    extracted_years = float(extracted_years_list[0]) if extracted_years_list else 0
    experience_score = min(extracted_years / required_years, 1) * weights['years of experience']
    score += experience_score

    # Scoring for other entities
    for entity in ['job title', 'highest level of education', 'language skills']:
        extracted_value = get_first_value(entity)
        required_value = job_description[entity].lower()
        score += calculate_weighted_score(entity, extracted_value, required_value)

    # Special handling for 'key skills' as it's a list
    if 'key skills' in extracted_info:
        score += calculate_weighted_score('key skills', extracted_info['key skills'], job_description['key skills'])

    total_weight = sum(weights.values())
    return score / total_weight  # Normalize score based on total weight


def extract_and_score_resume(client, text, job_description, entities, max_chunk_size, overlap_size):
    extracted_entities = extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size)
    
    # info_dict to collect each entity's most common or first occurrence in the resume
    info_dict = {entity: [] for entity in entities}  # Initialize with empty lists

    for entity in entities:
        # Directly access the entity's value from the extracted_entities dictionary
        if entity in extracted_entities and extracted_entities[entity]:
            info_dict[entity] = extracted_entities[entity]

    # Now you can score the resume based on the content of info_dict and the job_description
    resume_score = score_resume(info_dict, job_description)
    return info_dict, resume_score

def create_entities_report(
    directory_path=directory_path,
    translated_output_directory=translated_output_directory,
    client=client,
    job_description=job_description,
    entities=entities,
    max_chunk_size=max_chunk_size,
    overlap_size=overlap_size
):
    """
    This function processes all resumes in the given directories, extracting entities and scoring them against a job description.
    It generates a report that ranks the resumes based on their scores.

    :param directory_path: Path to the directory containing the original resumes.
    :param translated_output_directory: Path to the directory where the translated resumes are stored.
    :param client: OpenAI client initialized with an API key.
    :param job_description: Dictionary containing the job description to score against.
    :param entities: List of entity types to extract from the resumes.
    :param max_chunk_size: Maximum size of the text chunk to be processed by the LLM in a single request.
    :param overlap_size:Size of the overlap between chunks of text to ensure continuity is maintained in entity extraction.
    """
    data = []
    
    # Process all resumes in the directories
    for directory, is_txt in [(directory_path, False), (translated_output_directory, True)]:
        for filename in os.listdir(directory):
            file_extension = '.txt' if is_txt else '.pdf'
            if filename.lower().endswith(file_extension):
                file_path = os.path.join(directory, filename)
                
                if is_txt:
                    text = extract_text_from_txt(file_path)
                else:
                    text = extract_text_from_pdf(file_path)

                if text.strip():
                    # Assuming that extract_and_score_resume is a function defined elsewhere 
                    # that takes the following arguments in the order given below.
                    info_dict, resume_score = extract_and_score_resume(client, text, job_description, entities, max_chunk_size, overlap_size)
                    info_dict['Filename'] = filename
                    info_dict['Score'] = resume_score
                    data.append(info_dict)

    # Creating a DataFrame from the extracted data and scores
    df = pd.DataFrame(data)
    df = df.sort_values(by='Score', ascending=False)  # Sort dataframe by score in descending order
    output_file_path = os.path.join(logs_directory, 'resume_scoring_report.xlsx')
    df.to_excel(output_file_path, index=False)

    print(f"Report generated and saved to {output_file_path}")



# Call the main function with the correct parameters.
create_entities_report(
    directory_path=directory_path,
    translated_output_directory=translated_output_directory,
    client=client,
    job_description=job_description,
    entities=entities,
    max_chunk_size=max_chunk_size,
    overlap_size=overlap_size
)

Report generated and saved to C:\Users\apleczkan\PycharmProjects\task1-cv-resumes\logs\resume_scoring_report.xlsx


### Because previous solution was not giving appropriate scoring mechanism I switched to embeddings

### Using embeddings:

In [20]:
import pandas as pd

# Define the file paths for the Excel files
excel_file_1 = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_resume_summaries.xlsx"
excel_file_2 = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_years_resume_entities_report.xlsx"

# Load the Excel files into DataFrames
df1 = pd.read_excel(excel_file_1)
df2 = pd.read_excel(excel_file_2)

# Ensure that "Filename" is set as the index for both DataFrames to use for alignment
df1.set_index('Filename', inplace=True)
df2.set_index('Filename', inplace=True)

# Replace the "ABC" column in df1 with the "DEF" column from df2
df1['years of experience'] = df2['numeric_years_of_experience']

# Reset the index if you want "Filename" back as a column
df1.reset_index(inplace=True)

# Define the complete path for saving the modified DataFrame to a new Excel file
save_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_years_of_exp_and_summary.xlsx"

# Save the modified DataFrame back to an Excel file with the specified path
df1.to_excel(save_path, index=True)


In [27]:
df1.head(20)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Filename                    30 non-null     object 
 1   job title                   30 non-null     object 
 2   years of experience         28 non-null     float64
 3   highest level of education  28 non-null     object 
 4   language skills             26 non-null     object 
 5   key skills                  30 non-null     object 
 6   Summary                     30 non-null     object 
dtypes: float64(1), object(6)
memory usage: 1.8+ KB


In [None]:
import pandas as pd
from transformers import pipeline

# Load the DataFrame from the Excel file
df = pd.read_excel('your_excel_file.xlsx')

# Initialize the text embedding model
embedding_model = "text-embedding-ada-002"
text_embedding = pipeline(task="feature-extraction", model=embedding_model)

# Create a dictionary to store embeddings for each column
column_embeddings = {}

# Loop through each column and generate embeddings
for column in df.columns:
    if df[column].dtype == 'O':  # Check if the column contains text (object type)
        text_data = df[column].astype(str)
        embeddings = text_embedding(text_data.tolist())
        column_embeddings[column] = embeddings

print(embeddings)

### working embeddings

In [62]:
import pandas as pd
import openai
import os

# Set up the OpenAI client with your API key
openai.api_key = os.getenv('OPENAI_API_KEY')

# Initialize the OpenAI client
client = openai.OpenAI(api_key=openai.api_key)

def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    # Access the embedding using dot notation
    return response.data[0].embedding

# Define the path to your Excel file
excel_file_path = "C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_years_of_exp_and_summary.xlsx"

# Load the Excel file into a DataFrame
df = pd.read_excel(excel_file_path, index_col='Filename')

# Define which columns contain text that you want to embed
text_columns = ['job title', 'years of experience', 'highest level of education', 'language skills', 'key skills', 'Summary']

# Generate embeddings for the specified text columns
for column in text_columns:
    # Skip columns with non-text data
    if df[column].dtype == 'object':
        # Use the get_embedding function directly
        df[column + ' embedding'] = df[column].apply(lambda x: get_embedding(x) if pd.notnull(x) else np.nan)

# Save the DataFrame, including the embeddings, back to an Excel file
save_path = "C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_with_embeddings.xlsx"
df.to_excel(save_path, index=True)

### Read file as xlsx and save as CSV

In [5]:
import pandas as pd
import numpy as np
import openai
import os

# Set up the OpenAI client with your API key
openai.api_key = os.getenv('OPENAI_API_KEY')

# Initialize the OpenAI client
client = openai.OpenAI(api_key=openai.api_key)

# Embedding model parameters
embedding_model = "text-embedding-ada-002"

def get_embedding(text, model=embedding_model):
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    # Access the embedding using dot notation
    return response.data[0].embedding

# Define the path to your Excel file
excel_file_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_years_of_exp_and_summary.xlsx"  # Update this path

# Load the Excel file into a DataFrame
df = pd.read_excel(excel_file_path, index_col='Filename')

# Define which columns contain text that you want to embed
text_columns = ['job title', 'years of experience', 'highest level of education', 'language skills', 'key skills', 'Summary']

# Generate embeddings for the specified text columns
for column in text_columns:
    # Skip columns with non-text data
    if df[column].dtype == 'object':
        # Use the get_embedding function directly
        df[column + ' embedding'] = df[column].apply(lambda x: get_embedding(x) if pd.notnull(x) else np.nan)

# Save the DataFrame, including the embeddings, back to a CSV file
save_path = "C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_with_embeddings.csv"  # Update this path
df.to_csv(save_path, index=True)


### Check if everything worked as expected

In [7]:


df = pd.read_csv("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_with_embeddings.csv")
df.head()
df.tail()

Head of the DataFrame:
Tail of the DataFrame:


Unnamed: 0.1,Filename,Unnamed: 0,job title,years of experience,highest level of education,language skills,key skills,Summary,job title embedding,highest level of education embedding,language skills embedding,key skills embedding,Summary embedding
25,11409460.pdf,25,"Buyer/Planner, Logistics Analyst, Warehouse Ex...",9.0,Bachelor of Science in Petroleum Engineering,"English, Portuguese, Spanish (trilingual)","Solid Works, CAD, Matlab, MS Office, ERP, fore...","5 years of experience, Bachelor of Science in ...","[-0.008133571594953537, -0.018270084634423256,...","[-0.0014201682060956955, -0.01565457321703434,...","[-0.019281834363937378, 0.014315702952444553, ...","[-0.015969468280673027, -0.004412265028804541,...","[-0.026880642399191856, -0.007857105694711208,..."
26,11522068.pdf,26,Group Fitness Instructor,0.25,high school diploma,basic Spanish,"coaching, leadership, communication, planning,...",Experienced Training and Development professio...,"[-0.02366562932729721, -0.005919776391237974, ...","[-0.022974243387579918, 0.01006863173097372, 0...","[-0.01946578547358513, 0.01046796515583992, 0....","[-0.0179140605032444, -0.012485937215387821, -...","[-0.028901871293783188, -0.002812109887599945,..."
27,11555549.pdf,27,"Visual Arts Specialist, Student Teacher, Visua...",11.0,"Master's degree in Education, Bachelor's of Ar...",None mentioned,"Adobe Creative Cloud (Illustrator, InDesign, P...","2006, 2007, 2008, 2009, 2010, 2011\n\nThis res...","[-0.02232000231742859, -0.008892614394426346, ...","[0.008514679968357086, -0.0018213900038972497,...","[-0.009927770122885704, -0.0036321107763797045...","[0.0030750473961234093, -0.006319214124232531,...","[-0.0040991054847836494, -0.004221417475491762..."
28,12491898.pdf,28,Construction Laborer,,Not specified,Not specified,"Construction, labor, equal opportunity, opport...",Experienced Construction Laborer with a strong...,"[-0.005029403138905764, -0.008206220343708992,...","[-0.017151663079857826, -0.017191031947731972,...","[-0.017093688249588013, -0.017185518518090248,...","[-0.018421413376927376, -0.016304463148117065,...","[-0.022693656384944916, -0.01898142322897911, ..."
29,translated_12491898.txt,29,Construction laborer,,Not specified,Not specified,"Construction, manual labor, technical skills",Experienced construction laborer with a strong...,"[-0.006934361066669226, -0.011867663823068142,...","[-0.017093688249588013, -0.017185518518090248,...","[-0.017093688249588013, -0.017185518518090248,...","[0.0037867920473217964, -0.01434994861483574, ...","[-0.01902652159333229, -0.023665789514780045, ..."


In [66]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from ast import literal_eval  # Import literal_eval from the ast module
import openai
import os


# Custom function to convert string representation of a list to an actual list of floats
def string_to_float_list(s):
    try:
        return np.array(literal_eval(s))
    except:
        return np.nan

def can_convert_to_list(s):
    try:
        _ = literal_eval(s)
        return True
    except:
        return False

# Convert string representations of lists back into actual lists
for column in df.columns:
    if 'embedding' in column:
        # Check if all the values in the column can be converted
        can_convert = df[column].apply(can_convert_to_list)
        if not can_convert.all():
            print(f"Cannot convert all values in column {column}.")
            # Output some of the problematic strings
            print(df[column][~can_convert].head())

# Set up the OpenAI client with your API key
openai.api_key = os.getenv('OPENAI_API_KEY')

# Initialize the OpenAI client
client = openai.OpenAI(api_key=openai.api_key)

def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    # Access the embedding using dot notation
    return response.data[0].embedding

# Define a utility function for cosine similarity
def cosine_similarity(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))

# Load the DataFrame with embeddings
df_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_with_embeddings.xlsx"
df = pd.read_excel(df_path, index_col='Filename')

print("Initial DataFrame head:", df.head())
print("Initial DataFrame size:", len(df))

if df.empty:
    print("The DataFrame is empty immediately after loading from the file.")
# Convert string representations of lists back into actual lists
# Assume that each column embedding is stored as a string representation of a list

embedding_columns = [col for col in df.columns if 'embedding' in col]
for column in embedding_columns:
    df[column] = df[column].apply(string_to_float_list)

for column in embedding_columns:
    nan_count = df[column].isna().sum()
    print(f"NaNs in {column}: {nan_count}")
    if nan_count > 0:
        # If there are NaNs, let's see which rows have NaNs
        print(f"Rows with NaNs in {column}:\n", df[df[column].isna()])

df.dropna(subset=[col for col in df.columns if 'embedding' in col], inplace=True)
if df.empty:
    print("The DataFrame is empty after dropping NaN values.")

for column in df.columns:
    if 'embedding' in column:
        print(f"NaNs in {column}: {df[column].isna().sum()}")

# Drop NaN values only if all embedding columns are NaN for a row
df.dropna(how='all', subset=[col for col in df.columns if 'embedding' in col], inplace=True)

# Check the DataFrame size after dropping NaNs
print(f"DataFrame size after dropping rows with all NaN embeddings: {len(df)}")

# Define the job description
job_description_str = """
{
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": "Node.js, React.js, GraphQL, Kubernetes, SQL Server,
        PostgreSQL, Kafka, RabbitMQ, C#, SOLR, Python,
        Sound engineering practices, Pair programming,
        Automated testing, Continuous deployment, Trunk-based development"
}
"""

# Embed the job description
job_description_embedding = get_embedding(job_description_str)

print(f"Number of rows in DataFrame after NaN removal: {len(df)}")
print("Sample embeddings:", df[[col for col in df.columns if 'embedding' in col]].iloc[0])

# Define which columns contain text that you want to search
text_columns = ['job title', 'years of experience', 'highest level of education', 'language skills', 'key skills', 'Summary']

# Function to search resumes based on job description
def search_resumes(df, job_description_embedding, text_columns, n=3):
    # Calculate similarity for each resume
    similarities = []  # List to store similarities
    for index, row in df.iterrows():
        try:
            # Compute the mean embedding for the current row
            embeddings = [row[col + ' embedding'] for col in text_columns if col + ' embedding' in row and not pd.isna(row[col + ' embedding'])]
            if embeddings:  # Only proceed if there are valid embeddings
                mean_embedding = np.mean(embeddings, axis=0)
                similarity = cosine_similarity(mean_embedding, job_description_embedding)
                similarities.append((index, similarity))
            else:
                print(f"Row {index} has no valid embeddings.")
        except Exception as e:
            print(f"An error occurred at row {index}: {e}")

    # Create a DataFrame from the similarities
    similarity_df = pd.DataFrame(similarities, columns=['Filename', 'similarity'])

    # Sort by similarity
    results = similarity_df.sort_values('similarity', ascending=False).head(n)

    return results

top_matches = search_resumes(df, job_description_embedding, text_columns, n=5)

# Check if the DataFrame is empty
if top_matches.empty:
    print("No matches found.")
else:
    # Print out the top matches
    print(top_matches)


Cannot convert all values in column job title embedding.
Filename
10276858.pdf    [0.014491122215986252, -0.006481783930212259, ...
10329506.pdf    [-0.028984833508729935, -0.018259640783071518,...
10344379.pdf    [-0.010848717764019966, 0.01684059016406536, 0...
10395944.pdf    [-0.01303948275744915, -0.007236499339342117, ...
10428916.pdf    [-0.010918157175183296, -0.02344488352537155, ...
Name: job title embedding, dtype: object
Cannot convert all values in column highest level of education embedding.
Filename
10276858.pdf                                                  NaN
10329506.pdf    [0.007109690923243761, -0.029303627088665962, ...
10344379.pdf    [-0.017093688249588013, -0.017185518518090248,...
10395944.pdf    [-0.013130523264408112, -0.016788125038146973,...
10428916.pdf    [-0.010102304629981518, -0.002048614900559187,...
Name: highest level of education embedding, dtype: object
Cannot convert all values in column language skills embedding.
Filename
10276858.pdf        

IndexError: single positional indexer is out-of-bounds

In [31]:
# Necessary imports
import pandas as pd
import numpy as np
from numpy.linalg import norm
from ast import literal_eval
import openai
import os

# Function to get embedding
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

# Function to convert string representation of a list to an actual list of floats
# Custom function to convert string representation of a list to an actual list of floats
def string_to_float_list(s):
    try:
        return np.array(literal_eval(s))
    except:
        return np.nan

# Function to check if a string can be evaluated to a list
def can_convert_to_list(s):
    try:
        _ = eval(s)
        return True
    except:
        return False

# Set up the OpenAI client with your API key
openai.api_key = os.getenv('OPENAI_API_KEY')

# Define a utility function for cosine similarity
def cosine_similarity(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))

# Break down the job description
job_description = {
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": "Node.js, React.js, GraphQL, Kubernetes, SQL Server, "
                 "PostgreSQL, Kafka, RabbitMQ, C#, SOLR, Python, "
                 "Sound engineering practices, Pair programming, "
                 "Automated testing, Continuous deployment, Trunk-based development"
}

job_description_embeddings = {}
for key, value in job_description.items():
    job_description_embeddings[key] = get_embedding(value)



# Embed the job description
job_description_embedding = get_embedding(job_description_str)

# Load the DataFrame with embeddings from a CSV file
df_path = "C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_with_embeddings.csv"
df = pd.read_csv(df_path)

# Convert string representations of lists back into actual lists
embedding_columns = [col for col in df.columns if 'embedding' in col]
for column in embedding_columns:
    df[column] = df[column].apply(string_to_float_list)

# Drop rows with NaN values in embedding columns after conversion
df.dropna(subset=embedding_columns, inplace=True)

# Define which columns contain embeddings that you want to compare
resume_embedding_columns = [
    'job title embedding',
    'years of experience embedding',
    'highest level of education embedding',
    'language skills embedding',
    'key skills embedding',
    'Summary embedding'
]

def search_resumes(df):
    similarities = []
    for _, row in df.iterrows():
        # Compute similarity for each aspect of the job description
        similarity_scores = []
        for key in job_description_embeddings.keys():
            resume_embedding = row.get(f'{key} embedding')
            if isinstance(resume_embedding, np.ndarray):
                job_embedding = job_description_embeddings[key]
                similarity = cosine_similarity(resume_embedding, job_embedding)
                similarity_scores.append(similarity)

        # Average the similarity scores if there are valid scores
        if similarity_scores:
            avg_similarity = np.nanmean(similarity_scores)
            similarities.append((row['Filename'], avg_similarity))

    similarity_df = pd.DataFrame(similarities, columns=['Filename', 'similarity'])
    
    csv_file_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\scores.csv"
    similarity_df.to_csv(csv_file_path, index=True)
    print(f'DataFrame saved to {csv_file_path}')


    return similarity_df.sort_values('similarity', ascending=False)


def average_embedding(row):
    embeddings = [row[col] for col in resume_embedding_columns if isinstance(row[col], np.ndarray)]
    if embeddings:
        return np.mean(np.stack(embeddings), axis=0)
    else:
        return np.nan

top_matches = search_resumes(df)

print(top_matches.head(10))

DataFrame saved to C:\Users\apleczkan\PycharmProjects\task1-cv-resumes\logs\scores.csv
        Filename  similarity
8   10641230.pdf    0.839038
21  11409460.pdf    0.823826
7   10603337.pdf    0.811665
9   10724818.pdf    0.808522
11  10818478.pdf    0.808050
0   10329506.pdf    0.806979
22  11522068.pdf    0.805057
6   10554236.pdf    0.804116
10  10816645.pdf    0.802921
5   10527994.pdf    0.799554


### Combine scores with existing excel file and save under new name

In [39]:
import pandas as pd

# Specify the file paths
excel_file = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_years_of_exp_and_summary.xlsx"
csv_file_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\scores.csv"
output_excel_file = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\resumes_summary_scores_sorted.xlsx"

excel_df = pd.read_excel(excel_file)
scores_df = pd.read_csv(csv_file_path)

scores_df = scores_df.rename(columns={'similarity': 'scores'})
excel_df['scores'] = scores_df['scores']

# Sort the combined DataFrame by the 'scores' column
sorted_df = excel_df.sort_values(by='scores', ascending=False)
sorted_df = sorted_df.drop(columns=["Unnamed: 0"])
sorted_df = sorted_df.set_index("Filename")
sorted_df.to_excel(output_excel_file, index=False)

print(f"Sorted and saved DataFrame to {output_excel_file}")


Sorted and saved DataFrame to C:\Users\apleczkan\PycharmProjects\task1-cv-resumes\logs\resumes_summary_scores_sorted.xlsx


In [40]:
print("Sorted DataFrame:")
sorted_df.head(20)

Sorted DataFrame:


Unnamed: 0_level_0,job title,years of experience,highest level of education,language skills,key skills,Summary,scores
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10603337.pdf,Sales Associate,6.66,High School Diploma,English (assuming since text is in English),"customer service, sales, inventory management,...",This sales associate has 5 years of experience...,0.839038
11188218.pdf,Customer Advocate,7.5,"Formally educated in illustration, photography...",None mentioned,"Outstanding customer service, technologically ...",This customer advocate has eight years of expe...,0.823826
10554236.pdf,Financial Accountant,0.66,Bachelor's degree,None mentioned.,"Financial planning, reporting, analysis, criti...",Experienced Financial Accountant with a Bachel...,0.811665
10641230.pdf,IT Management,5.8,Bachelor's degree,English,"hardware troubleshooting, software management,...",Experienced IT Management professional with a ...,0.808522
10816645.pdf,Group Fitness Coordinator,3.42,Bachelor's Degree,None mentioned,"Ability to design, develop, implement, and eva...",Experienced Group Fitness Coordinator with ove...,0.80805
10276858.pdf,Food Prep Chef,5.0,,,"Culinary insight, standard food preparation, h...",Experienced Food Prep Chef with 5+ years of fo...,0.806979
11257723.pdf,"General Liability Claims Representative, Auto ...",12.0,B.S. in Journalism,None mentioned,"Claims file management processes, litigation r...",This resume outlines the experience and skills...,0.805057
10527994.pdf,"Substitute Teacher, Maintenance Technician, Sa...",4.75,Bachelor of Arts in Business Administration Ma...,None mentioned,"Outlook, Excel, Word, PowerPoint, QuickBooks, ...",Experienced Substitute Teacher with over 4 yea...,0.804116
10724818.pdf,"Executive Assistant, Sales",5.0,Not specified,Highly versed in linguistics,"Time management, attention to detail, Microsof...",This executive assistant has over 5 years of e...,0.802921
10466583.pdf,Floral Designer,6.75,Medical Assistant from Northwestern College,None mentioned,"Customer service, inventory control, employee ...",This is a Floral Designer with over 10 years o...,0.799554
