# 1.4.1. Task 1: Working with the documents 

## 1. Translation.
Find the resumes (or parts of them) that are not in English, and translate them into English using LLM.

## 2. Entities extraction.
Extract useful named entities from the resume using LLM. For example, you can extract the job title, years of experience, highest level of education, language skills, and key skills, or define any entities that you find interesting. As an additional task, you may create an Excel report that contains entities from 20-30 resumes.

## 3. Summarisation.
Make a short summary of the resume. You may choose any size you find useful. Defining the structure of the summary (adding the obligatory entities) or just getting it from LLM is up to you. The general idea is to provide an opportunity for recruiters to read it quickly and not scan 2-3 pages.

## 4. Resume scoring.
Develop a mechanism to provide a ranking of the resumes for a vacancy by providing a score (float value from 0 to 1). A particular vacancy can be found at [https://www.dataart.team/vacancies](https://www.dataart.team/vacancies) (or on LinkedIn). It should work in 2 modes: calculate the score for the provided vacancy and resume, and present the top 10 candidates for the vacancy.
You may use any data and algorithms for scoring.

### Installations:

In [None]:
!pip install openai pandas langdetect PyPDF2 transformers plotly matplotlib scikit-learn torch torchvision scipy

### Imports

In [None]:
import openai
import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from langdetect import DetectorFactory, detect

### Set the environment variable for the OpenAI client:

In [None]:
%env OPENAI_API_KEY=

### Create basic variables and paths, and set up the OpenAI client

In [None]:
# Shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# Folder with the input resumes (PDF files are read from here).
directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
# Folder that receives the translated resumes as .txt files.
translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"
# Folder for file lists and Excel reports.
logs_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs"
max_chunk_size = 3500  # chunk limit handed to split_text (measured in characters there)
overlap_size = 50  # forwarded to the chunking helpers as overlap_size

# Fix langdetect's seed so language detection gives the same result on every run.
DetectorFactory.seed = 0

### extract_text_from_pdf

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing are skipped; the text of
    every other page is followed by a newline.
    """
    page_contents = (page.extract_text() for page in PdfReader(pdf_path).pages)
    return "".join(content + "\n" for content in page_contents if content)

### Split text into chunks with overlap

In [None]:
def split_text(text, max_chunk_size, overlap_size=50):
    """Split ``text`` into whitespace-delimited chunks of at most ``max_chunk_size`` characters.

    Words are packed greedily; every packed word is followed by a single space,
    so regular chunks end with a trailing space (as before).

    Args:
        text: Text to split. Runs of whitespace are collapsed by ``str.split``.
        max_chunk_size: Maximum chunk length in characters; must be positive.
        overlap_size: Accepted for interface compatibility but not applied.
            Chunks never overlap, because ``process_pdfs`` joins the chunks
            back together and overlapping text would be duplicated.

    Returns:
        A list of non-empty chunks. Empty or whitespace-only text gives ``[]``.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")

    chunks = []
    current_chunk = ""
    for word in text.split():
        # A token that cannot fit even in an empty chunk (common for text
        # without spaces, e.g. CJK) is cut into hard slices so that the size
        # limit still holds and no empty chunk is emitted in front of it.
        while len(word) + 1 > max_chunk_size:
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.append(word[:max_chunk_size])
            word = word[max_chunk_size:]
        if not word:
            continue

        if len(current_chunk) + len(word) + 1 <= max_chunk_size:
            current_chunk += word + " "
        else:
            chunks.append(current_chunk)
            current_chunk = word + " "

    if current_chunk:
        chunks.append(current_chunk)
    return chunks

### Function to translate text using OpenAI's API  for openai>=1.0.0

In [None]:

def translate_text(client, text, target_language="en",
                   model="gpt-3.5-turbo-instruct", max_tokens=500):
    """Translate ``text`` into ``target_language`` with the OpenAI completions API.

    Args:
        client: OpenAI client exposing ``completions.create``.
        text: Text to translate.
        target_language: Language name or code inserted into the prompt.
        model: Completion model to use.
        max_tokens: Upper bound on the length of the generated translation.

    Returns:
        The translated text with surrounding whitespace removed.
    """
    response = client.completions.create(
        model=model,
        prompt=f"Translate the following text to {target_language}:\n\n{text}",
        max_tokens=max_tokens,
    )
    choice = response.choices[0]
    # "length" means the model stopped because it hit max_tokens, i.e. the
    # translation is incomplete; make that visible instead of failing silently.
    if getattr(choice, "finish_reason", None) == "length":
        print(
            f"Warning: translation was cut off at max_tokens={max_tokens}; "
            "use smaller chunks or a larger limit."
        )
    return choice.text.strip()

### Function to process PDFs and save translated versions

In [None]:

def process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client):
    """Translate every non-English PDF resume in ``directory_path`` into English.

    For each ``*.pdf`` file (extension matched case-insensitively) the text is
    extracted and its language detected. Non-English documents are split into
    chunks, each non-English chunk is translated, and the result is written to
    ``translated_<name>.txt`` in ``translated_output_directory``. The names of
    translated and of already-English files are written to two list files in
    the module-level ``logs_directory``.

    Args:
        directory_path: Folder containing the source PDF files.
        max_chunk_size: Chunk size passed to ``split_text``.
        overlap_size: Overlap value passed to ``split_text``.
        translated_output_directory: Folder for the translated ``.txt`` files.
        client: OpenAI client passed to ``translate_text``.
    """
    # Local import: only this function needs the exception type.
    from langdetect.lang_detect_exception import LangDetectException

    def _is_english(fragment):
        # langdetect raises when a fragment has no usable features (for
        # example digits or punctuation only); such text needs no translation.
        try:
            return detect(fragment) == 'en'
        except LangDetectException:
            return True

    os.makedirs(translated_output_directory, exist_ok=True)
    os.makedirs(logs_directory, exist_ok=True)

    translated_files_list = []
    english_files_list = []

    for filename in os.listdir(directory_path):
        if not filename.lower().endswith('.pdf'):
            continue

        text = extract_text_from_pdf(os.path.join(directory_path, filename))
        if not text.strip():
            print(f"Document {filename} is empty or contains very little text.")
            continue

        if _is_english(text):
            english_files_list.append(filename)
            continue

        translated_parts = []
        for chunk in split_text(text, max_chunk_size, overlap_size):
            # Mixed-language resumes: only translate the chunks that need it.
            if not _is_english(chunk):
                chunk = translate_text(client, chunk, target_language="en")
            translated_parts.append(chunk + " ")

        # splitext handles any extension casing ('.PDF'), unlike replace('.pdf', ...).
        translated_filename = f"translated_{os.path.splitext(filename)[0]}.txt"
        translated_path = os.path.join(translated_output_directory, translated_filename)
        save_text_to_file("".join(translated_parts), translated_path)
        translated_files_list.append(translated_filename)

    # Save the list of translated files to a text file for reference
    save_file_list(translated_files_list, logs_directory, 'translated_files_list.txt')
    # Save the list of English files as well
    save_file_list(english_files_list, logs_directory, 'english_files_list.txt')

def save_file_list(file_list, directory, filename):
    """Write the entries of ``file_list`` to ``directory/filename``, one per line (UTF-8).

    ``directory`` is created first if it does not exist, so the call also
    works on a fresh checkout without a logs folder. An existing file is
    overwritten.
    """
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, filename), 'w', encoding='utf-8') as f:
        f.writelines(f"{entry}\n" for entry in file_list)

# Persist a string to disk as UTF-8 text
def save_text_to_file(text, file_path):
    """Write ``text`` to ``file_path`` using UTF-8, replacing any existing file."""
    destination = open(file_path, mode='w', encoding='utf-8')
    with destination:
        destination.write(text)


### Create named entities to look for in resumes

In [None]:
# Entity names requested from the LLM; process_resume also uses them as keys of each result row.
entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

### Function to extract text from TXT files with different encodings

In [None]:
def extract_text_from_txt(file_path):
    """Read a text file, trying several encodings in turn.

    The encodings are ordered from strictest to most permissive: UTF-8 first,
    then cp1252 (which rejects a handful of undefined bytes), and finally
    latin1, which accepts every byte sequence. Putting latin1 earlier would
    make the encodings after it unreachable.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file content.

    Raises:
        ValueError: If none of the encodings can decode the file. With latin1
            as the last fallback this is not expected to happen; the raise is
            kept so callers see the same exception type as before.
    """
    encodings = ('utf-8', 'cp1252', 'latin1')
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    raise ValueError(f"Cannot decode file {file_path} with any of the provided encodings.")


### Function to extract entities using the language model

In [None]:
def extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size):
    """Ask the LLM to extract ``entities`` from ``text`` and return its raw answers.

    The text is split with ``split_text`` and one completion request is sent
    per chunk.

    Args:
        client: OpenAI client exposing ``completions.create``.
        text: Resume text.
        entities: Entity names listed in the prompt.
        max_chunk_size: Chunk size passed to ``split_text``.
        overlap_size: Overlap value passed to ``split_text``.

    Returns:
        The stripped model answers for all chunks, each followed by a newline.
    """
    instruction = (
        "Extract the following entities from this text, calculating years of experience as a decimal number where months are converted to a fractional year without any additional info: "
        + ", ".join(entities)
        + ".\n\n"
    )
    answers = []

    # split_text is the chunking helper defined in this file; the previously
    # referenced split_into_chunks exists only in commented-out code.
    for chunk in split_text(text, max_chunk_size, overlap_size):
        prompt = instruction + chunk
        # Rough budget: the word count stands in for the token count.
        prompt_length = len(prompt.split())
        # Never request more than 300 tokens, and never fewer than 1.
        max_tokens_for_completion = max(1, min(4097 - prompt_length, 300))

        response = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=max_tokens_for_completion
        )
        answers.append(response.choices[0].text.strip() + "\n")

    return "".join(answers)


### Function to process resumes and extract named entities

In [None]:
def process_resume(directory, filename, client, entities, data_list, is_txt=False):
    """Extract the named entities of one resume and append them to ``data_list``.

    Args:
        directory: Folder containing the resume.
        filename: File name of the resume inside ``directory``.
        client: OpenAI client passed to ``extract_entities_with_llm``.
        entities: Entity names to look for.
        data_list: List that receives one dict per processed resume, with the
            key ``'Filename'`` plus one key per entity (``None`` if not found).
        is_txt: Read the file as text when true, as PDF otherwise.

    Resumes without any text are skipped and nothing is appended. Chunking
    uses the module-level ``max_chunk_size`` and ``overlap_size``.
    """
    file_path = os.path.join(directory, filename)
    text = extract_text_from_txt(file_path) if is_txt else extract_text_from_pdf(file_path)

    if not text.strip():
        return

    extracted_info = extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size)
    info_dict = {'Filename': filename}

    for entity in entities:
        # Escape the name so entities containing regex metacharacters
        # (e.g. "C++ skills") are matched literally: "<entity>: <value>".
        match = re.search(rf"{re.escape(entity)}\s*:\s*(.*)", extracted_info, re.IGNORECASE)
        info_dict[entity] = match.group(1).strip() if match else None

    data_list.append(info_dict)


### Function to create a report of named entities from resumes

In [None]:

def create_entities_report(directory_path, translated_output_directory, client, entities):
    """Build an Excel report with the extracted entities of all resumes.

    Every ``*.pdf`` in ``directory_path`` and every ``translated_*.txt`` in
    ``translated_output_directory`` is passed to ``process_resume``; the rows
    collected that way are written to ``resume_entities_report.xlsx`` in the
    module-level ``logs_directory``.
    """
    rows = []

    # Original PDF resumes
    pdf_names = [name for name in os.listdir(directory_path) if name.lower().endswith('.pdf')]
    for pdf_name in pdf_names:
        process_resume(directory_path, pdf_name, client, entities, rows)

    # Translated resumes stored as text files
    for txt_name in os.listdir(translated_output_directory):
        lowered = txt_name.lower()
        if not (lowered.startswith('translated_') and lowered.endswith('.txt')):
            continue
        process_resume(translated_output_directory, txt_name, client, entities, rows, is_txt=True)

    report_path = os.path.join(logs_directory, 'resume_entities_report.xlsx')
    pd.DataFrame(rows).to_excel(report_path, index=False)


### Load the DataFrame from the Excel file

In [None]:
# create_entities_report writes the report into logs_directory, so it has to
# be loaded from there (not from translated_output_directory).
file_path = os.path.join(logs_directory, 'resume_entities_report.xlsx')
df = pd.read_excel(file_path)
# Index and sort by file name so rows can be addressed per resume.
df.set_index('Filename', inplace=True)
df.sort_index(inplace=True)

# Extract 'years of experience' data from the DataFrame
years_of_experience = df['years of experience']

### Function to calculate numeric years of experience from text descriptions

In [None]:

def calculate_years_of_experience(client, text_descriptions):
    """Convert free-text experience descriptions into numeric years via the LLM.

    Args:
        client: OpenAI client exposing ``completions.create``.
        text_descriptions: Iterable of experience descriptions.

    Returns:
        A list with one float per description, in input order. The value is
        NaN (for manual review) when the description is empty or a missing
        value placeholder (``'nan'``/``'none'``), when the request fails, or
        when the answer contains no number.
    """
    number_pattern = re.compile(r'\b\d+\.?\d*\b')
    missing_markers = {"", "nan", "none"}
    numeric_experience_list = []

    for text in text_descriptions:
        # Missing values arrive as the string 'nan' after astype(str); there
        # is nothing to convert, so do not spend an API call on them.
        if str(text).strip().lower() in missing_markers:
            numeric_experience_list.append(float('nan'))
            continue

        prompt = f"Convert the following description of work experience '{text}' into a numeric value representing total years of experience."

        try:
            # The request is inside the try block so that one failed call
            # yields NaN for that row instead of aborting the whole batch.
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                max_tokens=50,
                temperature=0.5
            )
            extracted_numbers = number_pattern.findall(response.choices[0].text.strip())
        except Exception as e:
            print(f"An error occurred: {e}")
            numeric_experience_list.append(float('nan'))
            continue

        # The first number in the answer is taken as the result.
        numeric_experience_list.append(
            float(extracted_numbers[0]) if extracted_numbers else float('nan')
        )

    return numeric_experience_list

# Requires 'client' and the 'df' loaded from the entities report above
text_descriptions = df['years of experience'].astype(str).tolist()
numeric_years_of_experience = calculate_years_of_experience(client, text_descriptions)

# Add the numeric years of experience back to the DataFrame
df['numeric_years_of_experience'] = numeric_years_of_experience

# Save the result; the embeddings section later reads this file from the logs
# folder and expects the 'Filename' and 'numeric_years_of_experience' columns.
new_file_path = os.path.join(logs_directory, 'updated_years_resume_entities_report.xlsx')
df.to_excel(new_file_path)

# Output confirmation
print(f"The updated DataFrame has been saved to {new_file_path}.")


### Function to summarize resumes

In [None]:
# Summarize a resume chunk by chunk with the OpenAI completions API
def summarize_text(client, text, max_chunk_size=3000, overlap_size=50):
    """Return a short summary of ``text``, built from one LLM summary per chunk.

    The text is split with ``split_text``. Each chunk is summarized on its
    own and the stripped summaries are concatenated, each followed by a
    newline. A chunk whose request fails is reported on stdout and left out.
    """
    instruction = (
        "Please summarize the following resume into a short paragraph that includes "
        "the job title, years of experience, highest level of education, language skills, "
        "and key skills:\n\n"
    )
    chunk_summaries = []

    for piece in split_text(text, max_chunk_size, overlap_size):
        prompt = instruction + piece
        try:
            completion = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                max_tokens=150,  # upper bound for the length of one chunk summary
                temperature=0.5,
            )
            chunk_summaries.append(completion.choices[0].text.strip())
        except Exception as e:
            # Keep going with the remaining chunks
            print(f"An error occurred: {e}")

    return "".join(chunk_summary + "\n" for chunk_summary in chunk_summaries)


### Process and Summarize Resumes

In [None]:
# Summarize every resume listed in the DataFrame index and store it in a 'Summary' column
for filename in df.index:
    # Translated resumes live in the translated-output folder, originals in the dataset folder
    base_directory = translated_output_directory if filename.startswith('translated_') else directory_path
    resume_path = os.path.join(base_directory, filename)

    # Skip non-resume files like 'translated_files_list.txt'
    if 'translated_files_list' in filename:
        continue

    # Pick the reader that matches the file extension
    lowered_path = resume_path.lower()
    if lowered_path.endswith('.pdf'):
        read_resume, kind_label = extract_text_from_pdf, "PDF"
    elif lowered_path.endswith('.txt'):
        read_resume, kind_label = extract_text_from_txt, "text"
    else:
        print(f"Unsupported file format for file: {resume_path}")
        continue

    try:
        resume_text = read_resume(resume_path)
    except Exception as e:
        print(f"An error occurred while reading {kind_label} file: {e}")
        continue

    # Generate the summary and store it next to the resume's other columns
    df.at[filename, 'Summary'] = summarize_text(client, resume_text)

# Preview the result and save it to the logs folder
print(df.head())
df.to_excel(os.path.join(logs_directory, 'updated_resume_summaries.xlsx'))
df.head()


### Scoring criteria based on provided vacancy:

### Job requirements from job description:

In [None]:
# Raw vacancy text; it is inserted verbatim into the requirement-extraction prompt below.
job_description = """
FullStack(NodeJS, ReactJS), Online Genealogy Service
Client
The client is an international company that provides an online genealogy service that helps its clients understand their past and family history.

Project overview
The core programming language is JavaScript (ES2020), a website running on React.js and GraphQL and the back-end platform is based on Node.js (Express). Microservices running under Kubernetes. The project methodology is Scrum.

Team
There are a few Full Stack teams, up to 8 people each. Each team has a team lead and a product owner.

Position overview
We are looking for a specialist to join one of the teams (which is more Frontend oriented) is working on the further development of existing platforms. Regarding the work schedule, each employee should be available till 4 pm UK time.

Technology stack
JavaScript, React.js, GraphQL, Node.js (Express), Kubernetes.
 
Requirements
Development experience using a Node.js (Express) + React.js stack
Experience with SQL Server
Experience with PostgreSQL
Knowledge of Kafka
Knowledge of RabbitMQ
Dev-level experience with K8s/Docker
Knowledge of sound engineering practices like pair programming, upfront automated testing, continuous deployment, and trunk-based development
Spoken English

Nice to have
Knowledge of Apollo engine, Kafka, Postgres
Experience with microservices architecture development
Experience with GraphQL
Experience with RabbitMQ, SQL Server
Experience in development with C#
Experience with SOLR
Software development experience in Python
"""

# Categories the LLM should organise the vacancy text into
entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

# One prompt: the instruction, the category list, then the raw vacancy text
prompt = (
    "Please structure the job requirements from the following text into a JSON-like format with these categories: "
    f"{', '.join(entities)}.\n\n{job_description}"
)

request_options = {
    "model": "gpt-3.5-turbo-instruct",
    "prompt": prompt,
    "max_tokens": 300,  # Adjust as needed
}
response = client.completions.create(**request_options)
extracted_requirements = response.choices[0].text.strip()
print(extracted_requirements)


### Scoring function - this solution did not give very good results:

In [None]:
# pip install fuzzywuzzy python-Levenshtein
# import os
# import re
# import json
# import pandas as pd
# from collections import Counter
# from fuzzywuzzy import fuzz  # For fuzzy string matching

# max_chunk_size = 3000  
# overlap_size = 50    
# directory_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\test_resumes_dataset"
# translated_output_directory = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\resumes_translated"

# entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

# # Define the job description as a string
# job_description_str = """
# {
#     "job title": "FullStack Developer",
#     "years of experience": "At least 2 years of development experience",
#     "highest level of education": "Bachelor's or higher in Computer Science or related field",
#     "language skills": "Fluent in spoken English",
#     "key skills": [
#         "Node.js", "React.js", "GraphQL", "Kubernetes", "SQL Server",
#         "PostgreSQL", "Kafka", "RabbitMQ", "C#", "SOLR", "Python",
#         "Sound engineering practices", "Pair programming",
#         "Automated testing", "Continuous deployment", "Trunk-based development"
#     ]
# }
# """

# # The `job_description` variable should be a dictionary parsed from JSON.
# job_description = json.loads(job_description_str)

# def split_into_chunks(text, max_chunk_size, overlap_size):
#     words = text.split()
#     chunks = []
#     current_chunk = []

#     for word in words:
#         current_chunk.append(word)
#         if len(' '.join(current_chunk)) > max_chunk_size:
#             # Split the chunk at the max chunk size
#             chunk = ' '.join(current_chunk[:len(current_chunk)-overlap_size])
#             chunks.append(chunk)
#             # Start the next chunk with the overlap
#             current_chunk = current_chunk[-overlap_size:]
    
#     # Add the last chunk
#     chunks.append(' '.join(current_chunk))
#     return chunks

# def extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size):
#     extracted_info = {entity: [] for entity in entities}  # Initialize as a dictionary
#     chunks = split_into_chunks(text, max_chunk_size, overlap_size)  # Ensure chunks are small enough

#     for chunk in chunks:
#         prompt = f"Please extract the following entities from this text: {', '.join(entities)}.\n\n{chunk}"
#         prompt_length = len(prompt.split())  # Calculate the prompt length in tokens

#         max_tokens_for_completion = 4097 - prompt_length  # Adjust max tokens based on prompt length
#         max_tokens_for_completion = min(max_tokens_for_completion, 300)  # Limit to 300 or less

#         response = client.completions.create(
#             model="gpt-3.5-turbo-instruct",
#             prompt=prompt,
#             max_tokens=max_tokens_for_completion
#         )

#         # Process the structured response and fill the extracted_info dict
#         structured_response = response.choices[0].text.strip().split('\n')
#         for line in structured_response:
#             for entity in entities:
#                 if line.lower().startswith(entity.lower() + ':'):
#                     split_line = line.split(':', 1)
#                     if len(split_line) > 1:
#                         value = split_line[1].strip()
#                         if value:
#                             if entity == 'key skills':  # Special case as we expect a list
#                                 skills = re.findall(r"[\w']+", value)
#                                 extracted_info[entity].extend(skills)
#                             else:
#                                 extracted_info[entity].append(value)
#                     break  # Move on to the next line once the entity is found

#     # Aggregate the extracted information by combining or choosing the most mentioned entity
#     for entity, values in extracted_info.items():
#         if values:
#             if entity == 'key skills':
#                 skills_counter = Counter(values)
#                 # Select skills that are most frequently mentioned
#                 extracted_info[entity] = [skill for skill, count in skills_counter.items() if count > 1]  # Adjusted to filter skills mentioned more than once
#             else:
#                 # For other entities, we expect a single value, so we take the most frequent one
#                 value_counter = Counter(values)
#                 extracted_info[entity], _ = value_counter.most_common(1)[0]

#     # Now, the extracted_info is ready for scoring against the job_description
#     return extracted_info



# def extract_text_from_txt(file_path):
#     encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']  # Common encodings
#     for encoding in encodings:
#         try:
#             with open(file_path, 'r', encoding=encoding) as file:
#                 return file.read()
#         except UnicodeDecodeError:
#             continue
#     raise ValueError(f"Cannot decode file {file_path} with any of the provided encodings.")

# # We keep the existing functions extract_text_from_txt, extract_text_from_pdf, and split_into_chunks unchanged.
# # ...

# def score_resume(extracted_info, job_description):
#     score = 0
#     total_weight = 0

#     # Define weights for each entity
#     weights = {
#         "job title": 2,
#         "years of experience": 1.5,
#         "highest level of education": 1,
#         "language skills": 1,
#         "key skills": 3
#     }

#     def get_first_value(entity):
#         return ' '.join(extracted_info[entity]).lower() if entity in extracted_info and extracted_info[entity] else ''

#     # Function to calculate and weight individual scores
#     def calculate_weighted_score(entity, extracted_value, required_value):
#         if entity == "key skills":
#             total_skills = len(required_value)
#             matching_skills = sum(skill in extracted_value for skill in required_value)
#             return (matching_skills / total_skills) * weights[entity]
#         else:
#             return (1 if fuzz.partial_ratio(extracted_value, required_value) > 80 else 0) * weights[entity]

#     # Scoring for Years of Experience
#     extracted_years_list = re.findall(r"\d+", get_first_value('years of experience'))
#     required_years = float(re.findall(r"\d+", job_description['years of experience'])[0])
#     extracted_years = float(extracted_years_list[0]) if extracted_years_list else 0
#     experience_score = min(extracted_years / required_years, 1) * weights['years of experience']
#     score += experience_score

#     # Scoring for other entities
#     for entity in ['job title', 'highest level of education', 'language skills']:
#         extracted_value = get_first_value(entity)
#         required_value = job_description[entity].lower()
#         score += calculate_weighted_score(entity, extracted_value, required_value)

#     # Special handling for 'key skills' as it's a list
#     if 'key skills' in extracted_info:
#         score += calculate_weighted_score('key skills', extracted_info['key skills'], job_description['key skills'])

#     total_weight = sum(weights.values())
#     return score / total_weight  # Normalize score based on total weight


# def extract_and_score_resume(client, text, job_description, entities, max_chunk_size, overlap_size):
#     extracted_entities = extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size)
    
#     # info_dict to collect each entity's most common or first occurrence in the resume
#     info_dict = {entity: [] for entity in entities}  # Initialize with empty lists

#     for entity in entities:
#         # Directly access the entity's value from the extracted_entities dictionary
#         if entity in extracted_entities and extracted_entities[entity]:
#             info_dict[entity] = extracted_entities[entity]

#     # Now you can score the resume based on the content of info_dict and the job_description
#     resume_score = score_resume(info_dict, job_description)
#     return info_dict, resume_score

# def create_entities_report(
#     directory_path=directory_path,
#     translated_output_directory=translated_output_directory,
#     client=client,
#     job_description=job_description,
#     entities=entities,
#     max_chunk_size=max_chunk_size,
#     overlap_size=overlap_size
# ):
#     """
#     This function processes all resumes in the given directories, extracting entities and scoring them against a job description.
#     It generates a report that ranks the resumes based on their scores.

#     :param directory_path: Path to the directory containing the original resumes.
#     :param translated_output_directory: Path to the directory where the translated resumes are stored.
#     :param client: OpenAI client initialized with an API key.
#     :param job_description: Dictionary containing the job description to score against.
#     :param entities: List of entity types to extract from the resumes.
#     :param max_chunk_size: Maximum size of the text chunk to be processed by the LLM in a single request.
#     :param overlap_size:Size of the overlap between chunks of text to ensure continuity is maintained in entity extraction.
#     """
#     data = []
    
#     # Process all resumes in the directories
#     for directory, is_txt in [(directory_path, False), (translated_output_directory, True)]:
#         for filename in os.listdir(directory):
#             file_extension = '.txt' if is_txt else '.pdf'
#             if filename.lower().endswith(file_extension):
#                 file_path = os.path.join(directory, filename)
                
#                 if is_txt:
#                     text = extract_text_from_txt(file_path)
#                 else:
#                     text = extract_text_from_pdf(file_path)

#                 if text.strip():
#                     # Assuming that extract_and_score_resume is a function defined elsewhere 
#                     # that takes the following arguments in the order given below.
#                     info_dict, resume_score = extract_and_score_resume(client, text, job_description, entities, max_chunk_size, overlap_size)
#                     info_dict['Filename'] = filename
#                     info_dict['Score'] = resume_score
#                     data.append(info_dict)

#     # Creating a DataFrame from the extracted data and scores
#     df = pd.DataFrame(data)
#     df = df.sort_values(by='Score', ascending=False)  # Sort dataframe by score in descending order
#     output_file_path = os.path.join(logs_directory, 'resume_scoring_report.xlsx')
#     df.to_excel(output_file_path, index=False)

#     print(f"Report generated and saved to {output_file_path}")



# # Call the main function with the correct parameters.
# create_entities_report(
#     directory_path=directory_path,
#     translated_output_directory=translated_output_directory,
#     client=client,
#     job_description=job_description,
#     entities=entities,
#     max_chunk_size=max_chunk_size,
#     overlap_size=overlap_size
# )

### Because the previous solution did not provide an appropriate scoring mechanism, I switched to embeddings

### Using embeddings:

In [None]:
import pandas as pd

# Define the file paths for the Excel files
excel_file_1 = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_resume_summaries.xlsx"
excel_file_2 = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_years_resume_entities_report.xlsx"

# Load the Excel files into DataFrames
df1 = pd.read_excel(excel_file_1)
df2 = pd.read_excel(excel_file_2)

# Ensure that "Filename" is set as the index for both DataFrames to use for alignment
df1.set_index('Filename', inplace=True)
df2.set_index('Filename', inplace=True)

# Replace the free-text 'years of experience' column in df1 with the numeric
# values from df2; the assignment aligns rows on the 'Filename' index
df1['years of experience'] = df2['numeric_years_of_experience']

# Turn "Filename" back into a regular column
df1.reset_index(inplace=True)

# Define the complete path for saving the modified DataFrame to a new Excel file
save_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_years_of_exp_and_summary.xlsx"

# "Filename" is a column again, so the positional index is not written;
# writing it would add a meaningless unnamed column to the file
df1.to_excel(save_path, index=False)


In [None]:
# Inspect the merged DataFrame: first 20 rows, then column names, dtypes and non-null counts.
df1.head(20)
df1.info()

### working embeddings

In [None]:
import pandas as pd
import openai
import os

# Read the API key from the OPENAI_API_KEY environment variable and store it on the openai module
openai.api_key = os.getenv('OPENAI_API_KEY')

# Create the OpenAI client that get_embedding below uses as a module-level global
client = openai.OpenAI(api_key=openai.api_key)

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is sent
    through the module-level ``client``.
    """
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(input=[single_line], model=model)
    first_item = result.data[0]
    return first_item.embedding

# Define the path to your Excel file
excel_file_path = "C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_years_of_exp_and_summary.xlsx"

# Load the Excel file into a DataFrame
df = pd.read_excel(excel_file_path, index_col='Filename')

# Define which columns contain text that you want to embed
text_columns = ['job title', 'years of experience', 'highest level of education', 'language skills', 'key skills', 'Summary']

# Generate embeddings for the specified text columns
for column in text_columns:
    # Skip columns with non-text data
    if df[column].dtype == 'object':
        # Missing cells get NaN. float('nan') is used instead of np.nan
        # because numpy has not been imported at this point of the file.
        df[column + ' embedding'] = df[column].apply(
            lambda x: get_embedding(x) if pd.notnull(x) else float('nan')
        )

# Save the DataFrame, including the embeddings, back to an Excel file
save_path = "C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_with_embeddings.xlsx"
df.to_excel(save_path, index=True)

### Read file as xlsx and save as CSV

In [None]:
import pandas as pd
import numpy as np
import openai
import os

# Read the API key from the OPENAI_API_KEY environment variable and store it on the openai module
openai.api_key = os.getenv('OPENAI_API_KEY')

# Create the OpenAI client that get_embedding below uses as a module-level global
client = openai.OpenAI(api_key=openai.api_key)

# Default model name for get_embedding in this cell
embedding_model = "text-embedding-ada-002"

def get_embedding(text, model=embedding_model):
    """Embed ``text`` with the OpenAI embeddings endpoint and return the vector.

    Line breaks are turned into spaces first. The request goes through the
    module-level ``client``; ``model`` defaults to ``embedding_model``.
    """
    flattened = text.replace("\n", " ")
    embeddings_api = client.embeddings
    reply = embeddings_api.create(input=[flattened], model=model)
    return reply.data[0].embedding

# Source workbook with the summaries and numeric years of experience
excel_file_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_years_of_exp_and_summary.xlsx"  # Update this path

# Load it with the file name as index
df = pd.read_excel(excel_file_path, index_col='Filename')

# Candidate columns for embedding
text_columns = ['job title', 'years of experience', 'highest level of education', 'language skills', 'key skills', 'Summary']

# Add one '<column> embedding' column per text column
for column in text_columns:
    # Only object-dtype (text) columns are embedded
    if df[column].dtype != 'object':
        continue
    # Missing cells stay NaN instead of being sent to the API
    df[f"{column} embedding"] = df[column].apply(
        lambda cell: np.nan if not pd.notnull(cell) else get_embedding(cell)
    )

# Write the DataFrame, embeddings included, to CSV
save_path = "C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_with_embeddings.csv"  # Update this path
df.to_csv(save_path, index=True)


### Check if everything worked as expected

In [None]:


# Reload the CSV written above and look at its first and last rows as a sanity check.
df = pd.read_csv("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_with_embeddings.csv")
df.head()
df.tail()

In [None]:
# Necessary imports
import pandas as pd
import numpy as np
from numpy.linalg import norm
from ast import literal_eval
import openai
import os

# Embedding helper
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for ``text`` (newlines become spaces).

    Uses the module-level ``client``.
    """
    cleaned = text.replace("\n", " ")
    response = client.embeddings.create(input=[cleaned], model=model)
    return response.data[0].embedding

# Convert the string representation of a list (as stored in the CSV) back into a NumPy array
def string_to_float_list(s):
    """Parse ``s`` with ``ast.literal_eval`` and return the result as a NumPy array.

    Args:
        s: String such as ``"[0.1, 0.2]"``.

    Returns:
        ``np.array`` of the parsed literal, or ``np.nan`` when ``s`` is not a
        valid Python literal (for example a missing cell read as NaN).
    """
    try:
        return np.array(literal_eval(s))
    except (ValueError, SyntaxError, TypeError):
        # Only parsing problems mean "no embedding"; anything else
        # (e.g. KeyboardInterrupt) must not be swallowed.
        return np.nan

def can_convert_to_list(s):
    """Report whether ``s`` is the text form of a list (or tuple) literal.

    The string is parsed with ``ast.literal_eval`` rather than ``eval`` so
    that text read from a file can never execute code.

    Args:
        s: The value to check; non-string values yield ``False``.

    Returns:
        ``True`` if ``s`` parses to a list or tuple, ``False`` otherwise.
    """
    from ast import literal_eval

    try:
        parsed = literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return False
    return isinstance(parsed, (list, tuple))

# Register the API key on the openai module, taking it from the
# OPENAI_API_KEY environment variable (None when the variable is unset)
openai.api_key = os.environ.get('OPENAI_API_KEY')

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    """
    dot_product = np.dot(a, b)
    magnitude = norm(a) * norm(b)
    return dot_product / magnitude

# Break the vacancy down into aspects; the keys match the resume column
# names so that '<key> embedding' identifies the column to compare against
job_description = {
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": "Node.js, React.js, GraphQL, Kubernetes, SQL Server, "
                 "PostgreSQL, Kafka, RabbitMQ, C#, SOLR, Python, "
                 "Sound engineering practices, Pair programming, "
                 "Automated testing, Continuous deployment, Trunk-based development"
}

# One embedding per aspect of the vacancy
job_description_embeddings = {
    aspect: get_embedding(text) for aspect, text in job_description.items()
}

# Flatten the vacancy into a single text. This name was previously used
# without ever being defined, which raised NameError on the next statement.
job_description_str = "\n".join(
    f"{aspect}: {text}" for aspect, text in job_description.items()
)

# Embed the job description as a whole
job_description_embedding = get_embedding(job_description_str)

# Embedding columns that are compared against the job description
resume_embedding_columns = [
    'job title embedding',
    'years of experience embedding',
    'highest level of education embedding',
    'language skills embedding',
    'key skills embedding',
    'Summary embedding'
]

# Read the CSV that stores the resumes together with their embeddings
df_path = "C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/updated_with_embeddings.csv"
df = pd.read_csv(df_path)

# Every column whose name contains 'embedding' holds a vector stored as text
embedding_columns = [name for name in df.columns if 'embedding' in name]

# Parse each stored vector back into an array (string_to_float_list yields
# NaN for cells it cannot parse)
for column in embedding_columns:
    df[column] = df[column].apply(string_to_float_list)

# Keep only the rows in which every embedding column was parsed
df = df.dropna(subset=embedding_columns)

def search_resumes(
    df,
    job_embeddings=None,
    csv_file_path="C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\scores.csv",
):
    """Score every resume against a vacancy and return the ranking.

    For each row, the cosine similarity between the resume's
    ``'<key> embedding'`` value and the vacancy embedding for ``<key>`` is
    computed for every key of ``job_embeddings``; the row's score is the
    NaN-ignoring mean of those similarities. Rows with no usable embedding
    (no value that is an ``np.ndarray``) are left out of the result.

    Args:
        df: DataFrame with a ``'Filename'`` column and ``'<key> embedding'``
            columns holding ``np.ndarray`` vectors.
        job_embeddings: Mapping of aspect name to vacancy embedding. When
            ``None``, the module-level ``job_description_embeddings`` is used.
        csv_file_path: Where the (unsorted) scores are written as CSV.
            Pass ``None`` to skip writing the file.

    Returns:
        DataFrame with columns ``'Filename'`` and ``'similarity'``, sorted by
        similarity in descending order.
    """
    if job_embeddings is None:
        job_embeddings = job_description_embeddings

    similarities = []
    for _, row in df.iterrows():
        # Compute similarity for each aspect of the job description
        similarity_scores = []
        for key, job_embedding in job_embeddings.items():
            resume_embedding = row.get(f'{key} embedding')
            # Missing columns give None and unparsed cells give NaN; skip both
            if isinstance(resume_embedding, np.ndarray):
                similarity_scores.append(
                    cosine_similarity(resume_embedding, job_embedding)
                )

        # Average the similarity scores if there are valid scores
        if similarity_scores:
            similarities.append((row['Filename'], np.nanmean(similarity_scores)))

    similarity_df = pd.DataFrame(similarities, columns=['Filename', 'similarity'])

    if csv_file_path is not None:
        similarity_df.to_csv(csv_file_path, index=True)
        print(f'DataFrame saved to {csv_file_path}')

    return similarity_df.sort_values('similarity', ascending=False)


def average_embedding(row):
    """Return the element-wise mean of a row's resume embeddings.

    Only values in ``resume_embedding_columns`` that are ``np.ndarray``
    instances take part; ``np.nan`` is returned when there are none.
    """
    vectors = []
    for col in resume_embedding_columns:
        value = row[col]
        if isinstance(value, np.ndarray):
            vectors.append(value)

    if not vectors:
        return np.nan
    stacked = np.stack(vectors)
    return np.mean(stacked, axis=0)

# Score all loaded resumes against the vacancy (ranking is best-first)
top_matches = search_resumes(df)

# Show the ten best-scoring candidates
top_ten = top_matches.head(10)
print(top_ten)

### Combine the scores with the existing Excel file and save under a new name

In [None]:
import pandas as pd

# Specify the file paths
excel_file = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\updated_years_of_exp_and_summary.xlsx"
csv_file_path = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\scores.csv"
output_excel_file = "C:\\Users\\apleczkan\\PycharmProjects\\task1-cv-resumes\\logs\\resumes_summary_scores_sorted.xlsx"

excel_df = pd.read_excel(excel_file)
scores_df = pd.read_csv(csv_file_path)

scores_df = scores_df.rename(columns={'similarity': 'scores'})

# Attach each score to its resume by Filename rather than by row position:
# the scores file can contain fewer rows than the workbook, so a positional
# assignment would give scores to the wrong resumes.
scores_by_file = scores_df.drop_duplicates(subset='Filename').set_index('Filename')['scores']
excel_df['scores'] = excel_df['Filename'].map(scores_by_file)

# Sort the combined DataFrame by the 'scores' column (unscored resumes go last)
sorted_df = excel_df.sort_values(by='scores', ascending=False)
# The helper index column exists only if the workbook was saved with one
sorted_df = sorted_df.drop(columns=["Unnamed: 0"], errors='ignore')
sorted_df = sorted_df.set_index("Filename")
# Filename is the index now, so it must be written or the output loses it
sorted_df.to_excel(output_excel_file, index=True)

print(f"Sorted and saved DataFrame to {output_excel_file}")


In [4]:
# Show the first ten rows of the score-sorted DataFrame.
# NOTE(review): the NameError recorded in this cell's output suggests it was
# run in a kernel where `sorted_df` had not been created yet — presumably the
# cell that builds `sorted_df` must be executed first; confirm.
print("Sorted DataFrame:")
sorted_df.head(10)

Sorted DataFrame:


NameError: name 'sorted_df' is not defined