### Installations:

In [1]:
!pip install openai pandas langdetect PyPDF2



### Imports:

In [50]:
import openai
from langdetect import detect, DetectorFactory
import os
from PyPDF2 import PdfReader
import pandas as pd

### Set environment variable for Open Ai client:

In [51]:
%env OPENAI_API_KEY=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

env: OPENAI_API_KEY=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


### Create basic Variables, paths and set Open AI client

In [134]:
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"
max_chunk_size = 1000
overlap_size = 50

# Make langdetect non-deterministic results predictable
DetectorFactory.seed = 0

In [53]:
# Function to read PDF and extract text
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

In [54]:
def split_text(text, max_chunk_size, overlap_size=50):
    words = text.split()
    chunks = []
    current_chunk = ""
    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_chunk_size:
            current_chunk += word + " "
        else:
            chunks.append(current_chunk)
            current_chunk = word + " "
    chunks.append(current_chunk)  # Add the last chunk
    return chunks

In [55]:
# Function to translate text using OpenAI's API (Updated for openai>=1.0.0)
def translate_text(client, text, target_language="en"):
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=f"Translate the following text to {target_language}:\n\n{text}",
        max_tokens=500  # Adjust as needed
    )
    # Access the text from the response
    return response.choices[0].text.strip()

In [56]:

def process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client):
    if not os.path.exists(translated_output_directory):
        os.makedirs(translated_output_directory)

    translated_files_list = []
    english_files_list = []

    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(directory_path, filename)
            text = extract_text_from_pdf(pdf_path)

            if text.strip():
                if detect(text) != 'en':
                    chunks = split_text(text, max_chunk_size, overlap_size)
                    translated_text = ""

                    for chunk in chunks:
                        if detect(chunk) != 'en':
                            chunk = translate_text(client, chunk, target_language="en")
                        translated_text += chunk + " "
                    
                    # Save the translated text to a .txt file
                    translated_filename = f"translated_{filename.replace('.pdf', '.txt')}"
                    translated_path = os.path.join(translated_output_directory, translated_filename)
                    save_text_to_file(translated_text, translated_path)

                    translated_files_list.append(translated_filename)
                else:
                    english_files_list.append(filename)
            else:
                print(f"Document {filename} is empty or contains very little text.")

    # Save the list of translated files to a text file for reference
    save_file_list(translated_files_list, translated_output_directory, 'translated_files_list.txt')
    # Optionally save the list of English files as well
    save_file_list(english_files_list, translated_output_directory, 'english_files_list.txt')

def save_file_list(file_list, directory, filename):
    with open(os.path.join(directory, filename), 'w', encoding='utf-8') as f:
        for file in file_list:
            f.write(f"{file}\n")

# Function to save text to a file
def save_text_to_file(text, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)

In [57]:
process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client)

Document 12632728.pdf is empty or contains very little text.


### Create named entities to look for in resumes

In [85]:
entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

In [140]:
import re

max_chunk_size = 3500  # Adjust as needed
overlap_size = 50      # Adjust as needed


def extract_text_from_txt(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']  # Common encodings
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    raise ValueError(f"Cannot decode file {file_path} with any of the provided encodings.")

def split_into_chunks(text, max_chunk_size, overlap_size):
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(' '.join(current_chunk)) > max_chunk_size:
            # Split the chunk at the max chunk size
            chunk = ' '.join(current_chunk[:len(current_chunk)-overlap_size])
            chunks.append(chunk)
            # Start the next chunk with the overlap
            current_chunk = current_chunk[-overlap_size:]
    
    # Add the last chunk
    chunks.append(' '.join(current_chunk))
    return chunks
    
# def extract_entities_with_llm(client, text, entities, max_chunk_size=3500, overlap_size=50):
#     """
#     Extract entities from the text using OpenAI's API, handling long texts by splitting into chunks.
#     """
#     # Split the text into chunks
#     chunks = split_text(text, max_chunk_size, overlap_size)

#     # Process each chunk and concatenate the results
#     extracted_info = ""
#     for chunk in chunks:
#         prompt = f"Please extract the following entities from this text: {', '.join(entities)}.\n\n{chunk}"
#         response = client.completions.create(
#             model="gpt-3.5-turbo-instruct",
#             prompt=prompt,
#             max_tokens=300  # Adjust as needed
#         )
#         extracted_info += response.choices[0].text.strip() + "\n"

#     return extracted_info

def extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size):
    extracted_info = ""
    chunks = split_into_chunks(text, max_chunk_size, overlap_size)  # Ensure chunks are small enough

    for chunk in chunks:
        prompt = f"Please extract the following entities from this text: {', '.join(entities)}.\n\n{chunk}"
        prompt_length = len(prompt.split())  # Calculate the prompt length in tokens

        max_tokens_for_completion = 4097 - prompt_length  # Adjust max tokens based on prompt length
        max_tokens_for_completion = min(max_tokens_for_completion, 300)  # Limit to 300 or less

        response = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=max_tokens_for_completion
        )
        extracted_info += response.choices[0].text.strip() + "\n"

    return extracted_info

def process_resume(directory, filename, client, entities, data_list, is_txt=False):
    file_path = os.path.join(directory, filename)
    text = extract_text_from_txt(file_path) if is_txt else extract_text_from_pdf(file_path)
    
    if text.strip():
        extracted_info = extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size)
        info_dict = {'Filename': filename}
        
        for entity in entities:
            # Use regex to find the entity and its value
            pattern = re.compile(rf"{entity}\s*:\s*(.*)", re.IGNORECASE)
            match = pattern.search(extracted_info)
            if match:
                info_dict[entity] = match.group(1).strip()
            else:
                info_dict[entity] = None  # Or an appropriate placeholder if not found
        
        data_list.append(info_dict)


def create_entities_report(directory_path, translated_output_directory, client, entities):
    data = []
    # Process PDFs in the original directory
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf'):
            process_resume(directory_path, filename, client, entities, data)

    # Process translated PDFs in the output directory
    for filename in os.listdir(translated_output_directory):
        if filename.lower().startswith('translated_') and filename.lower().endswith('.txt'):
            process_resume(translated_output_directory, filename, client, entities, data, is_txt=True)

    df = pd.DataFrame(data)
    df.to_excel(os.path.join(translated_output_directory, 'resume_entities_report.xlsx'), index=False)

create_entities_report(directory_path, translated_output_directory, client, entities)


### Data Frame from CV's for named entities

In [127]:
import os
file_path = os.path.join(translated_output_directory, 'resume_entities_report.xlsx')
df = pd.read_excel(file_path)
df.set_index('Filename', inplace=True)
df.sort_index(inplace=True)
df.head(10)

Unnamed: 0_level_0,skills,experience_years,education_level,languages
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10276858.pdf,"Culinary insight, food preparation, kitchen pr...",5+ years in food and beverage experience,Not mentioned,Not mentioned
10329506.pdf,"Dreamweaver, Adobe PageMaker 6.5, Adobe Photos...",19 (from 2001 to 2020),B.S in Computer Information Systems from Stray...,None mentioned
10344379.pdf,"Strong organizational, technical, and interper...","3 years as a customer service advocate, 1 year...",None mentioned.,None mentioned.
10395944.pdf,"Customer service, leadership, team leadership,...","1 year as Line Service Technician, 6 months as...",Associate of Applied Science in Aviation Pilot...,English
10428916.pdf,"ACSM Exercise Physiologist, TRX Qualified Inst...",9 years (Recreation & Sports Coordinator: 6 ye...,Bachelor of Science in Kinesiology,None mentioned in text
10466583.pdf,"customer service, inventory control, employee ...","June 2013 to March 2016, January 2011 to Decem...",Medical Assistant degree from Northwestern Col...,None mentioned in text
10527994.pdf,"Outlook, Excel, Word, PowerPoint, QuickBooks, ...",,,
10554236.pdf,"financial planning, reporting and analysis, ac...","July 2011 to November 2012 (1 year, 5 months);...",Bachelor's degree in Accounting,None mentioned in the text.
10603337.pdf,"customer service, photo, credit, editing, fash...",5,Associates Degree and High School Diploma,None mentioned in the text.
10641230.pdf,"Troubleshooting, Networking, Server Technologi...",8 years (July 2011 to present),Associate of Science in Information Technology...,"HTML, HTML5, XML, CSS, CSS3, JavaScript, TCP/IP."


### CV Summarization

In [128]:
import openai

def summarize_text(client, text, max_chunk_size=3000, overlap_size=50):
    """
    This function uses OpenAI's GPT-3 model to generate a summary of the resume text.
    """

    chunks = split_text(text, max_chunk_size, overlap_size)
    summary = ""

    for chunk in chunks:
        prompt = (
            "Please summarize the following resume into a short paragraph that includes "
            "the job title, years of experience, highest level of education, language skills, "
            "and key skills:\n\n" + chunk  # Use the current chunk, not the entire text
        )
        
        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",  # Use the latest available model
                prompt=prompt,
                max_tokens=150,  # Adjust as needed for the summary length
                temperature=0.5
            )
            chunk_summary = response.choices[0].text.strip()
            summary += chunk_summary + "\n"  # Concatenate the summaries from different chunks
        except Exception as e:
            # Handle any exception that occurs
            print(f"An error occurred: {e}")
    
    return summary



In [161]:
import os
import pandas as pd

# Directory paths
directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"
xlsx_file_path = os.path.join(translated_output_directory, "resume_entities_report.xlsx")

# Load the DataFrame from the Excel file
df = pd.read_excel(xlsx_file_path)
df.set_index('Filename', inplace=True)

# Iterate over the DataFrame and summarize each resume
for filename in df.index:
    # Determine the correct file path
    if filename.startswith('translated_'):
        resume_path = os.path.join(translated_output_directory, filename)
    else:
        resume_path = os.path.join(directory_path, filename)

    # Skip non-resume files like 'translated_files_list.txt'
    if 'translated_files_list' in filename:
        continue

    # Check the file extension and read the content
    if resume_path.lower().endswith('.pdf'):
        try:
            resume_text = extract_text_from_pdf(resume_path)
        except Exception as e:
            print(f"An error occurred while reading PDF file: {e}")
            continue
    elif resume_path.lower().endswith('.txt'):
        try:
            resume_text = extract_text_from_txt(resume_path)
        except Exception as e:
            print(f"An error occurred while reading text file: {e}")
            continue
    else:
        print(f"Unsupported file format for file: {resume_path}")
        continue

    # Generate a summary for the resume (assuming summarize_text function is defined)
    summary = summarize_text(client, resume_text)
    df.at[filename, 'Summary'] = summary

# Uncomment the below lines to see the DataFrame and save it
print(df.head())
df.to_excel(os.path.join(translated_output_directory, 'updated_resume_summaries.xlsx'))


                                                         skills  \
Filename                                                          
10276858.pdf  Culinary insight, food preparation, kitchen pr...   
10329506.pdf  Microsoft Word, Excel, Power Point, Access, Ad...   
10344379.pdf  organizational, technical, interpersonal, lead...   
10395944.pdf  Great People Skills, Microsoft Office, Airport...   
10428916.pdf  Recreation coordination, staff management, bud...   

                                               experience_years  \
Filename                                                          
10276858.pdf  4 years (from 01/2014 to 05/2015 in the food s...   
10329506.pdf  Registered Client Service Associate for 9 year...   
10344379.pdf  Over 5 years (Jan 2015 to Current as a custome...   
10395944.pdf  03/2017 to Current Line Service Technician Com...   
10428916.pdf  Not mentioned explicitly, but can be inferred ...   

                                                education_le

In [163]:
df.head()

Unnamed: 0_level_0,skills,experience_years,education_level,languages,Summary
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10276858.pdf,"Culinary insight, food preparation, kitchen pr...",4 years (from 01/2014 to 05/2015 in the food s...,High School Diploma in Culinary/Auto Body Cour...,Not mentioned in the text.,Experienced and highly skilled Food Prep Chef ...
10329506.pdf,"Microsoft Word, Excel, Power Point, Access, Ad...",Registered Client Service Associate for 9 year...,No specific education level mentioned.,,Experienced Registered Client Service Associat...
10344379.pdf,"organizational, technical, interpersonal, lead...",Over 5 years (Jan 2015 to Current as a custome...,"Not specified, but likely some level of techni...",No specific languages are mentioned,Experienced administrative support professiona...
10395944.pdf,"Great People Skills, Microsoft Office, Airport...",03/2017 to Current Line Service Technician Com...,2018 Associate of Applied Science : Aviation P...,None mentioned in the text.,This individual is a Line Service Technician w...
10428916.pdf,"Recreation coordination, staff management, bud...","Not mentioned explicitly, but can be inferred ...",Bachelor of Science in Kinesiology,None mentioned in the text,The candidate is a highly qualified Recreation...


### Scoring criteria based on provided vacancy:

### Job requirements from job description:

In [164]:
job_description = """
FullStack(NodeJS, ReactJS), Online Genealogy Service
Client
The client is an international company that provides an online genealogy service that helps its clients understand their past and family history.

Project overview
The core programming language is JavaScript (ES2020), a website running on React.js and GraphQL and the back-end platform is based on Node.js (Express). Microservices running under Kubernetes. The project methodology is Scrum.

Team
There are a few Full Stack teams, up to 8 people each. Each team has a team lead and a product owner.

Position overview
We are looking for a specialist to join one of the teams (which is more Frontend oriented) is working on the further development of existing platforms. Regarding the work schedule, each employee should be available till 4 pm UK time.

Technology stack
JavaScript, React.js, GraphQL, Node.js (Express), Kubernetes.
 
Requirements
Development experience using a Node.js (Express) + React.js stack
Experience with SQL Server
Experience with PostgreSQL
Knowledge of Kafka
Knowledge of RabbitMQ
Dev-level experience with K8s/Docker
Knowledge of sound engineering practices like pair programming, upfront automated testing, continuous deployment, and trunk-based development
Spoken English

Nice to have
Knowledge of Apollo engine, Kafka, Postgres
Experience with microservices architecture development
Experience with GraphQL
Experience with RabbitMQ, SQL Server
Experience in development with C#
Experience with SOLR
Software development experience in Python
"""

entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

entity_categories = {
    "job title": "Job Title",
    "years of experience": "Experience Years",
    "highest level of education": "Education Level",
    "language skills": "Languages",
    "key skills": "Skills"
}

prompt = (
    "Please structure the job requirements from the text into the following categories: "
    + ", ".join([entity_categories[entity] for entity in entities])
    + ". Provide the response in a format that can be easily parsed into a dictionary.\n\n"
    + job_description
)

response = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
    max_tokens=300  # Adjust as needed
)
extracted_requirements = response.choices[0].text.strip()
print(extracted_requirements)


{
    'Job Title': 'Full Stack Developer',
    'Experience Years': 'At least 2 years',
    'Education Level': 'Bachelor's degree in Computer Science or related field',
    'Languages': 'English',
    'Skills': ['Node.js', 'React.js', 'GraphQL', 'Express', 'Kubernetes', 'SQL Server', 'PostgreSQL', 'Kafka', 'RabbitMQ', 'C#', 'SOLR', 'Python', 'Sound engineering practices']
}


In [165]:
print(extracted_requirements)

{
    'Job Title': 'Full Stack Developer',
    'Experience Years': 'At least 2 years',
    'Education Level': 'Bachelor's degree in Computer Science or related field',
    'Languages': 'English',
    'Skills': ['Node.js', 'React.js', 'GraphQL', 'Express', 'Kubernetes', 'SQL Server', 'PostgreSQL', 'Kafka', 'RabbitMQ', 'C#', 'SOLR', 'Python', 'Sound engineering practices']
}


### Scoring function:

In [144]:
import pandas as pd
import os
import re

def calculate_matching_score(resume_info, job_requirements):
    score = 0
    num_criteria = 4  # Four criteria: Skills, Experience, Education, Languages

    # Scoring for skills
    if 'skills' in resume_info and job_requirements.get('skills'):
        if len(job_requirements['skills']) > 0:
            matched_skills = sum(skill.lower() in resume_info['skills'].lower() for skill in job_requirements['skills'])
            score += (matched_skills / len(job_requirements['skills'])) / num_criteria
        else:
            num_criteria -= 1

    # Scoring for experience
    if 'experience_years' in resume_info and resume_info['experience_years']:
        try:
            experience_years = int(resume_info['experience_years'])
            score += (1 if experience_years >= job_requirements['experience_years'] else 0) / num_criteria
        except ValueError:
            # If conversion to int fails, handle appropriately (e.g., score 0 for this criterion)
            pass

    # Scoring for education
    if 'education_level' in resume_info:
        score += (1 if job_requirements['education_level'].lower() in resume_info['education_level'].lower() else 0) / num_criteria

    # Scoring for languages
    if 'languages' in resume_info:
        score += (1 if any(lang.lower() in resume_info['languages'].lower() for lang in job_requirements['languages']) else 0) / num_criteria

    # Adjust the total score if any criteria were not applicable
    if num_criteria < 4:
        score = score * (4 / num_criteria)

    return score

def score_resumes(directory_path, job_requirements, entities):
    data = []
    # Iterate over each resume in the directory
    for filename in os.listdir(directory_path):
        if filename.lower().endswith('.pdf') or filename.lower().endswith('.txt'):
            file_path = os.path.join(directory_path, filename)
            text = extract_text_from_txt(file_path)  # Assuming this function exists
            extracted_info = extract_entities_with_llm(client, text, entities, max_chunk_size=3500, overlap_size=50)  # Assuming this function exists
            info_dict = {'Filename': filename}

            for entity in entities:
                pattern = re.compile(rf"{entity}\s*:\s*(.*)", re.IGNORECASE)
                match = pattern.search(extracted_info)
                if match:
                    info_dict[entity] = match.group(1).strip()
                else:
                    info_dict[entity] = None

            resume_score = calculate_matching_score(info_dict, job_requirements)
            info_dict['Score'] = resume_score
            data.append(info_dict)

    return pd.DataFrame(data)

# Assuming directory_path and job_requirements are defined elsewhere
entities = ['skills', 'experience_years', 'education_level', 'languages']

scored_resumes_df = score_resumes(directory_path, job_requirements, entities)
scored_resumes_df.sort_values(by='Score', ascending=False, inplace=True)  # Sorting by score in descending order
print(scored_resumes_df.head(10))  # Display top 10 candidates


        Filename                                             skills  \
15  10898339.pdf          communication, multitasking, organization   
28  12491898.pdf                   HTML, PDF conversion, Qt, coding   
4   10428916.pdf                    NLP, programming, data analysis   
25  11409460.pdf      C++, Java, Python, SQL, HTML, CSS, JavaScript   
22  11257723.pdf                                               Å¯   
11  10816645.pdf     HTML, PDF, Qt, CreationDate, ExtGState, Stream   
16  10909720.pdf                                            unknown   
27  11555549.pdf     No specific skills are mentioned in this text.   
26  11522068.pdf  language processing, computer programming, dat...   
24  11360471.pdf  HTML, Qt, CSS, JavaScript, HTML to PDF conversion   

                   experience_years  \
15                                5   
28                               12   
4                                 5   
25                                8   
22                       

In [146]:
scored_resumes_df.head(10)

Unnamed: 0,Filename,skills,experience_years,education_level,languages,Score
15,10898339.pdf,"communication, multitasking, organization",5,Bachelor's degree,"English, Spanish, French",0.5
28,12491898.pdf,"HTML, PDF conversion, Qt, coding",12,Not mentioned in the text,Not mentioned in the text,0.5
4,10428916.pdf,"NLP, programming, data analysis",5,Masters,"English, Spanish, French",0.5
25,11409460.pdf,"C++, Java, Python, SQL, HTML, CSS, JavaScript",8,Bachelor's degree in Computer Science or relat...,"English, Spanish, French, Mandarin Chinese",0.5
22,11257723.pdf,nÅ¯,5,A,english,0.5
11,10816645.pdf,"HTML, PDF, Qt, CreationDate, ExtGState, Stream",4,,None.,0.5
16,10909720.pdf,unknown,unknown,unknown,unknown,0.25
27,11555549.pdf,No specific skills are mentioned in this text.,No mention of experience years.,None mentioned.,None mentioned.,0.25
26,11522068.pdf,"language processing, computer programming, dat...",7 years,Master's degree in Computer Science,"English, German, French, Spanish",0.25
24,11360471.pdf,"HTML, Qt, CSS, JavaScript, HTML to PDF conversion",Unknown,Unknown,None mentioned,0.25
