# 1.4.1. Task 1: Working with the documents 

## 1. Translation.
Find the resumes (or parts of them) that are not in English, and translate them into English using LLM.

## 2. Entities extraction.
Extract useful named entities from the resume using LLM. For example, you can extract the job title, years of experience, highest level of education, language skills, and key skills, or define any entities that you find interesting. As an additional task, you may create an Excel report that contains entities from 20-30 resumes.

## 3. Summarisation.
Make a short summary of the resume. You may choose any size you find useful. Defining the structure of the summary (adding the obligatory entities) or just getting it from LLM is up to you. The general idea is to provide an opportunity for recruiters to read it quickly and not scan 2-3 pages.

## 4. Resume scoring.
Develop a mechanism to provide a ranking of the resumes for a vacancy by providing a score (float value from 0 to 1). A particular vacancy can be found at [https://www.dataart.team/vacancies](https://www.dataart.team/vacancies) (or on LinkedIn). It should work in 2 modes: calculate the score for the provided vacancy and resume, and present the top 10 candidates for the vacancy.
You may choose any criteria and algorithms for scoring.

### Installations:

In [None]:
!pip install -q openai pandas langdetect PyPDF2 faiss-cpu

### Imports

In [None]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
from PyPDF2 import PdfReader
from langdetect import detect, DetectorFactory
import openai

### Set the environment variable for the OpenAI client:

In [None]:
%env OPENAI_API_KEY=

### Create basic Variables, paths and set Open AI client

In [None]:
from pathlib import Path
# Shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# Folder that is scanned for the original *.pdf resumes.
directory_path = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/test_resumes_dataset")
# Folder that receives the translated .txt versions of non-English resumes.
translated_output_directory = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/resumes_translated")
# Folder for the file lists and the Excel/CSV reports written by later cells.
logs_directory = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs")
# Chunking parameters handed to split_text; the chunk size is measured in characters.
max_chunk_size = 3500  
overlap_size = 50  

# NOTE(review): presumably fixes langdetect's random seed so detect() is repeatable — confirm.
DetectorFactory.seed = 0

### extract_text_from_pdf

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped;
    the text of every other page is followed by a newline.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

### Split text into chunks (the overlap parameter is accepted but not applied)

In [None]:
def split_text(text, max_chunk_size, overlap_size=50):
    """Split *text* into whitespace-delimited chunks of limited length.

    Words are packed greedily; every word is followed by a single space, and
    a chunk (including those spaces) never exceeds ``max_chunk_size``
    characters unless a single word is longer than the limit, in which case
    that word forms a chunk on its own.

    Args:
        text: The text to split. Runs of whitespace are collapsed.
        max_chunk_size: Maximum chunk length in characters.
        overlap_size: Accepted for interface compatibility but NOT applied:
            consecutive chunks never share text. Callers that concatenate
            processed chunks rely on this.

    Returns:
        A list of non-empty chunk strings; an empty list when *text*
        contains no words.
    """
    chunks = []
    current_chunk = ""
    for word in text.split():
        # Flush only a non-empty chunk, so an over-long first word does not
        # produce a leading empty chunk.
        if current_chunk and len(current_chunk) + len(word) + 1 > max_chunk_size:
            chunks.append(current_chunk)
            current_chunk = ""
        current_chunk += word + " "
    # Empty input used to yield [""], which downstream code sent to the
    # language detector and the LLM.
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

### Function to translate text using OpenAI's API  for openai>=1.0.0

In [None]:
def translate_text(client, text, target_language="en", model="gpt-3.5-turbo-instruct", max_tokens=500):
    """Translate *text* into *target_language* with the completions API.

    Args:
        client: Object exposing ``completions.create(...)`` (an OpenAI client).
        text: The text to translate.
        target_language: Language name or code inserted into the prompt.
        model: Completion model to use.
        max_tokens: Upper bound on the length of the generated translation.

    Returns:
        The translated text with surrounding whitespace removed. If the model
        stopped because it hit ``max_tokens`` a warning is printed, because
        the returned translation is then incomplete.
    """
    response = client.completions.create(
        model=model,
        prompt=f"Translate the following text to {target_language}:\n\n{text}",
        max_tokens=max_tokens,
    )
    choice = response.choices[0]
    # A long chunk can need more than max_tokens of output; make that
    # truncation visible instead of silently losing resume content.
    if getattr(choice, "finish_reason", None) == "length":
        print(f"Warning: translation was cut off at max_tokens={max_tokens}; the result is incomplete.")
    return choice.text.strip()

### Function to process PDFs and save translated versions

In [None]:
def process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client):
    """Translate every non-English PDF resume in *directory_path* to English.

    For each ``*.pdf`` file the text is extracted and its language detected.
    Non-English documents are split into chunks, each non-English chunk is
    translated, and the result is saved as ``translated_<stem>.txt`` in
    *translated_output_directory*. Two lists (translated files and files that
    were already English) are written to the module-level ``logs_directory``.

    Args:
        directory_path: Folder containing the PDF resumes.
        max_chunk_size: Maximum chunk length passed to ``split_text``.
        overlap_size: Overlap value passed to ``split_text``.
        translated_output_directory: Folder for the translated text files.
        client: OpenAI client passed to ``translate_text``.
    """
    # Imported here so the function works without touching the imports cell.
    from langdetect.lang_detect_exception import LangDetectException

    def detect_language(sample):
        """Return the detected language code, or None when detection fails."""
        try:
            return detect(sample)
        except LangDetectException:
            # Raised for text without usable features (digits, symbols, "").
            return None

    directory_path = Path(directory_path)
    translated_output_directory = Path(translated_output_directory)
    logs_path = Path(logs_directory)

    translated_output_directory.mkdir(parents=True, exist_ok=True)
    # Create the logs folder up front; otherwise the run fails only at the
    # very end, after all translation requests have been made.
    logs_path.mkdir(parents=True, exist_ok=True)

    translated_files_list = []
    english_files_list = []

    for pdf_path in directory_path.glob('*.pdf'):
        text = extract_text_from_pdf(str(pdf_path))
        language = detect_language(text) if text.strip() else None

        if language is None:
            print(f"Document {pdf_path.name} is empty or contains very little text.")
            continue
        if language == 'en':
            english_files_list.append(pdf_path.name)
            continue

        translated_parts = []
        for chunk in split_text(text, max_chunk_size, overlap_size):
            # Chunks that are already English, or whose language cannot be
            # determined, are kept as they are.
            if detect_language(chunk) not in ('en', None):
                chunk = translate_text(client, chunk, target_language="en")
            translated_parts.append(chunk + " ")

        translated_filename = f"translated_{pdf_path.stem}.txt"
        translated_path = translated_output_directory / translated_filename
        save_text_to_file("".join(translated_parts), translated_path)
        translated_files_list.append(translated_filename)

    save_file_list(translated_files_list, logs_path, 'translated_files_list.txt')
    save_file_list(english_files_list, logs_path, 'english_files_list.txt')

def save_file_list(file_list, directory, filename):
    """Write the entries of *file_list* to ``directory/filename``, one per line.

    Args:
        file_list: Iterable of names to write.
        directory: Target folder; it is created (with parents) if missing.
        filename: Name of the file to create or overwrite inside *directory*.
    """
    directory = Path(directory)
    # Without this, a missing folder raised FileNotFoundError on open().
    directory.mkdir(parents=True, exist_ok=True)
    file_path = directory / filename
    with file_path.open('w', encoding='utf-8') as f:
        f.writelines(f"{name}\n" for name in file_list)


def save_text_to_file(text, file_path):
    """Write *text* to *file_path* as UTF-8, replacing any existing content."""
    Path(file_path).write_text(text, encoding='utf-8')


In [None]:
# Run the translation step over the resume folder using the settings defined above.
process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client)

### Create named entities to look for in resumes

In [None]:
# Entity names requested from the LLM; each one also becomes a column of the entities report.
entities = ["job title", "years of experience", "highest level of education", "language skills", "key skills"]

### Function to extract text from TXT files with different encodings

In [None]:
def extract_text_from_txt(file_path):
    """Read a text file, trying several encodings in order.

    The encodings are tried from strictest to most permissive: UTF-8, then
    cp1252, then latin-1.

    Args:
        file_path: Path of the text file.

    Returns:
        The decoded file content.

    Raises:
        ValueError: If no encoding can decode the file. With latin-1 as the
            final fallback this is not expected to happen; it is kept as a
            safeguard.
    """
    file_path = Path(file_path)
    # latin-1 accepts every byte sequence, so it must come last; placed
    # earlier it made the cp1252 attempt and the ValueError unreachable.
    encodings = ['utf-8', 'cp1252', 'latin1']
    for encoding in encodings:
        try:
            with file_path.open('r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    raise ValueError(f"Cannot decode file {file_path} with any of the provided encodings.")


### Function to extract entities using the language model

In [None]:
def extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size):
    """Ask the completion model to extract *entities* from *text*.

    The text is split with ``split_text`` and one request is sent per chunk.

    Returns:
        The stripped model answers for all chunks, each followed by a newline.
    """
    answers = []

    for piece in split_text(text, max_chunk_size, overlap_size):
        prompt = (
                "Extract the following entities from this text, calculating years of experience as a decimal number where months are converted to a fractional year without any additional info: "
                + ", ".join(entities)
                + ". If a value for an entity is not present or cannot be extracted, fill in with just: NaN."
                + "\n\n"
                + piece
            )
        # Completion budget: 4097 minus the prompt's word count, capped at 300.
        completion_budget = min(4097 - len(prompt.split()), 300)

        response = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=completion_budget,
            temperature=0.35
        )
        answers.append(response.choices[0].text.strip() + "\n")

    return "".join(answers)


### Function to process resumes and extract named entities

In [None]:
def process_resume(directory, filename, client, entities, data_list, is_txt=False):
    """Extract the entities of one resume and append them to *data_list*.

    Args:
        directory: Folder containing the resume.
        filename: Name of the resume file inside *directory*.
        client: OpenAI client passed to ``extract_entities_with_llm``.
        entities: Entity names to look for in the model output.
        data_list: List that receives one dict per processed resume, with a
            ``'Filename'`` key plus one key per entity.
        is_txt: Read the file as text instead of as a PDF.

    Nothing is appended when the resume contains no text. Chunking uses the
    module-level ``max_chunk_size`` and ``overlap_size``.
    """
    file_path = Path(directory) / filename
    text = extract_text_from_txt(file_path) if is_txt else extract_text_from_pdf(file_path)

    if not text.strip():
        return

    extracted_info = extract_entities_with_llm(client, text, entities, max_chunk_size, overlap_size)
    info_dict = {'Filename': filename}

    for entity in entities:
        # Escape the name so entities containing regex metacharacters
        # (e.g. "C++ skills") are matched literally.
        pattern = re.compile(rf"{re.escape(entity)}\s*:\s*(.*)", re.IGNORECASE)
        values = [match.group(1).strip() for match in pattern.finditer(extracted_info)]
        # A long resume yields one answer per chunk; prefer the first real
        # value over a "NaN" placeholder reported for an earlier chunk.
        informative = [value for value in values if value and value.rstrip('.').lower() != 'nan']
        if informative:
            info_dict[entity] = informative[0]
        else:
            info_dict[entity] = values[0] if values else None

    data_list.append(info_dict)


### Function to create a report of named entities from resumes

In [None]:
def create_entities_report(directory_path, translated_output_directory, client, entities):
    """Build the Excel entities report for all resumes.

    English resumes are read from the PDFs in *directory_path*; resumes that
    have a translation in *translated_output_directory* are read from that
    translated text file instead. The report is written to
    ``1_resumes_entities_report.xlsx`` in the module-level ``logs_directory``.

    Args:
        directory_path: Folder containing the original PDF resumes.
        translated_output_directory: Folder containing translated text files.
        client: OpenAI client passed to ``process_resume``.
        entities: Entity names to extract.
    """
    data = []
    directory_path = Path(directory_path)
    translated_output_directory = Path(translated_output_directory)

    # The translation step names its files "translated_<stem>.txt"; the old
    # pattern "0_translated_*.txt" never matched them. The leading wildcard
    # keeps files with an extra prefix working as well.
    translated_paths = list(translated_output_directory.glob('*translated_*.txt'))
    translated_stems = {path.stem.split('translated_', 1)[1] for path in translated_paths}

    for pdf_path in directory_path.glob('*.pdf'):
        # Skip originals that have a translation so each resume is listed once.
        if pdf_path.stem in translated_stems:
            continue
        process_resume(directory_path, pdf_path.name, client, entities, data)

    for txt_path in translated_paths:
        process_resume(translated_output_directory, txt_path.name, client, entities, data, is_txt=True)

    df = pd.DataFrame(data)
    report_directory = Path(logs_directory)
    report_directory.mkdir(parents=True, exist_ok=True)
    report_path = report_directory / '1_resumes_entities_report.xlsx'
    df.to_excel(report_path, index=False)

# Build the entities report, then load it back for inspection.
create_entities_report(directory_path, translated_output_directory, client, entities)

file_path = logs_directory / '1_resumes_entities_report.xlsx'

if file_path.is_file():
    df = pd.read_excel(file_path)
    # Index and order the rows by resume file name.
    df.set_index('Filename', inplace=True)
    df.sort_index(inplace=True)
    years_of_experience = df['years of experience']
else:
    print(f"Error: The file {file_path} does not exist.")

### Load the DataFrame from the Excel file

In [None]:
# Reload the entities report, indexed and sorted by resume file name.
file_path = os.path.join(logs_directory, '1_resumes_entities_report.xlsx')
df = pd.read_excel(file_path)
df.set_index('Filename', inplace=True)
df.sort_index(inplace=True)

# Raw "years of experience" values as extracted by the LLM.
years_of_experience = df['years of experience']

In [None]:
# Show the entities report as this cell's output.
df

### Function to calculate numeric years of experience from text descriptions

In [None]:
# Load the entities report again; here 'Filename' stays a regular column (no index is set).
file_path = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/1_resumes_entities_report.xlsx")
df = pd.read_excel(file_path)

# Every value is converted to str, so missing values become the string "nan".
text_descriptions = df['years of experience'].astype(str).tolist()

# Destination of the report extended with a numeric years-of-experience column.
new_file_path = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/2_resumes_updated_years.xlsx")

def calculate_years_of_experience(client, text_descriptions):
    """Convert textual experience descriptions into numbers of years.

    Descriptions that are empty, already numeric, or a NaN placeholder are
    handled locally; only free-text descriptions are sent to the completion
    model, and the first number in its answer is used.

    Args:
        client: Object exposing ``completions.create(...)`` (an OpenAI client).
        text_descriptions: Iterable of descriptions (converted with ``str``).

    Returns:
        A list of floats, one per description; ``nan`` where no value could
        be determined or the request failed.
    """
    number_pattern = re.compile(r'\b\d+\.?\d*\b')
    numeric_experience_list = []

    for text in text_descriptions:
        description = str(text).strip()

        if not description:
            numeric_experience_list.append(float('nan'))
            continue

        # "nan" placeholders and plain numbers need no model call; previously
        # the string "nan" was sent to the model as if it were a description.
        try:
            parsed = float(description)
        except ValueError:
            parsed = None
        if parsed is not None:
            numeric_experience_list.append(parsed if np.isfinite(parsed) else float('nan'))
            continue

        prompt = f"Convert the following description of work experience '{text}' into a numeric value representing total years of experience."

        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                max_tokens=50,
                temperature=0.1
            )
            extracted_numbers = number_pattern.findall(response.choices[0].text.strip())
        except Exception as e:
            # Best effort: a failed request must not abort the whole batch.
            print(f"An error occurred: {e}")
            numeric_experience_list.append(float('nan'))
            continue

        if extracted_numbers:
            numeric_experience_list.append(float(extracted_numbers[0]))
        else:
            numeric_experience_list.append(float('nan'))

    return numeric_experience_list

# Convert the extracted descriptions to numbers and store them as a new column.
numeric_years_of_experience = calculate_years_of_experience(client, text_descriptions)

df['numeric_years_of_experience'] = numeric_years_of_experience

# Save the extended report without writing the row index.
df.to_excel(new_file_path, index=False)

print(f"The updated DataFrame has been saved to {new_file_path}.")

### Function to summarize resumes

In [None]:
def summarize_text(client, text, max_chunk_size=3000, overlap_size=50):
    """Summarise a resume with the completion model, chunk by chunk.

    The text is split with ``split_text`` and each chunk is summarised in its
    own request. A failed request is reported with ``print`` and its chunk is
    left out.

    Returns:
        The stripped chunk summaries, each followed by a newline.
    """
    pieces = []

    for piece in split_text(text, max_chunk_size, overlap_size):
        prompt = (
            "Please summarize the following resume into a short paragraph that includes "
            "the job title, years of experience, highest level of education, language skills, "
            "and key skills:\n\n" + piece
        )

        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                max_tokens=150,  
                temperature=0.2
            )
            pieces.append(response.choices[0].text.strip() + "\n")
        except Exception as e:
            print(f"An error occurred: {e}")

    return "".join(pieces)

### Process and Summarize Resumes

In [None]:
# Summarise every resume listed in the report and store the text in a 'Summary' column.

# The loop looks rows up by file name. The previous cell loads the report
# with 'Filename' as a regular column, so promote it to the index if needed.
if 'Filename' in df.columns:
    df = df.set_index('Filename')

# File suffix -> (label used in the error message, reader function).
text_readers = {
    '.pdf': ('PDF', extract_text_from_pdf),
    '.txt': ('text', extract_text_from_txt),
}

for filename in df.index:
    if 'translated_files_list' in filename:
        continue

    # Translated resumes live in their own folder.
    if filename.startswith('translated_'):
        resume_path = translated_output_directory / filename
    else:
        resume_path = directory_path / filename

    reader = text_readers.get(resume_path.suffix.lower())
    if reader is None:
        print(f"Unsupported file format for file: {resume_path}")
        continue

    file_kind, read_text = reader
    try:
        resume_text = read_text(str(resume_path))
    except Exception as e:
        print(f"An error occurred while reading {file_kind} file: {e}")
        continue

    # Do not ask the model to summarise an empty document.
    if not resume_text.strip():
        print(f"No text could be extracted from file: {resume_path}")
        continue

    summary = summarize_text(client, resume_text)
    df.at[filename, 'Summary'] = summary

print(df.head())
updated_file_path = logs_directory / '3_resumes_summaries.xlsx'
# The index (file names) is written as the first column.
df.to_excel(str(updated_file_path))
df.head()

### Scoring criteria based on provided vacancy:

### job description from vacancies.DataArt:

In [None]:
# Full text of the vacancy used for scoring.
# Note: a later cell rebinds `job_description` to a dict of per-entity requirements.
job_description = """
FullStack(NodeJS, ReactJS), Online Genealogy Service
Client
The client is an international company that provides an online genealogy service that helps its clients understand their past and family history.

Project overview
The core programming language is JavaScript (ES2020), a website running on React.js and GraphQL and the back-end platform is based on Node.js (Express). Microservices running under Kubernetes. The project methodology is Scrum.

Team
There are a few Full Stack teams, up to 8 people each. Each team has a team lead and a product owner.

Position overview
We are looking for a specialist to join one of the teams (which is more Frontend oriented) is working on the further development of existing platforms. Regarding the work schedule, each employee should be available till 4 pm UK time.

Technology stack
JavaScript, React.js, GraphQL, Node.js (Express), Kubernetes.
 
Requirements
Development experience using a Node.js (Express) + React.js stack
Experience with SQL Server
Experience with PostgreSQL
Knowledge of Kafka
Knowledge of RabbitMQ
Dev-level experience with K8s/Docker
Knowledge of sound engineering practices like pair programming, upfront automated testing, continuous deployment, and trunk-based development
Spoken English

Nice to have
Knowledge of Apollo engine, Kafka, Postgres
Experience with microservices architecture development
Experience with GraphQL
Experience with RabbitMQ, SQL Server
Experience in development with C#
Experience with SOLR
Software development experience in Python
"""

### combine columns in excel:

In [None]:
# Combine the summaries report with the numeric years-of-experience report.
excel_file_1 = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/3_resumes_summaries.xlsx")
excel_file_2 = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/2_resumes_updated_years.xlsx")

df1 = pd.read_excel(excel_file_1)
df2 = pd.read_excel(excel_file_2)

# Index both tables by file name so the assignment below aligns rows by resume.
df1.set_index('Filename', inplace=True)
df2.set_index('Filename', inplace=True)

# Replace the textual years of experience with the numeric values.
df1['years of experience'] = df2['numeric_years_of_experience']
df1.reset_index(inplace=True)

save_path = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/4_resumes_years_summaries.xlsx")

df1.to_excel(save_path, index=False)

In [None]:
# Preview the first ten rows of the combined table.
df1.head(10)

### Working with embeddings

### Read file as xlsx and save as CSV, clean excel file

In [None]:
import json

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of *text* from the module-level ``client``.

    Newlines in *text* are replaced with spaces before the request is sent.
    """
    single_line = text.replace("\n", " ")
    result = client.embeddings.create(input=[single_line], model=model)
    first_item = result.data[0]
    return first_item.embedding

def list_to_json_str(lst):
    """Serialise *lst* to a JSON string."""
    serialized = json.dumps(lst)
    return serialized

# Compute an embedding for each text column of the combined report and save the result.
excel_file_path = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/4_resumes_years_summaries.xlsx")

df = pd.read_excel(excel_file_path, index_col='Filename')

text_columns = ['job title', 'years of experience', 'highest level of education', 'language skills', 'key skills', 'Summary']

for column in text_columns:
    # Only object-dtype columns are embedded; numeric columns are skipped.
    if df[column].dtype == 'object':
        # One request per non-null cell; the vector is stored as a JSON string.
        df[column + ' embedding'] = df[column].apply(lambda x: list_to_json_str(get_embedding(x)) if pd.notnull(x) else np.nan)

save_csv_path = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/5_resumes_embeddings.csv")
df.to_csv(save_csv_path, index=True)

save_excel_path = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/5_resumes_embeddings.xlsx")
df.to_excel(save_excel_path, index=True)

print(f"DataFrame saved to {save_csv_path} and {save_excel_path}")

### Check if everything worked as expected

In [None]:
# Reload the embeddings CSV written by the previous cell and inspect it.
df = pd.read_csv("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/5_resumes_embeddings.csv")
df.head()
df.tail()
df.info()

### Scoring using vector similarity search

In [None]:
from numpy.linalg import norm
from ast import literal_eval

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of *text* from the module-level ``client``.

    Newlines in *text* are replaced with spaces before the request is sent.
    """
    flattened = text.replace("\n", " ")
    response = client.embeddings.create(input = [flattened], model=model)
    return response.data[0].embedding

def string_to_float_list(s):
    """Parse the string form of a list of numbers into a NumPy array.

    Args:
        s: A Python/JSON list literal such as ``"[0.1, 0.2]"``.

    Returns:
        ``np.array`` of the parsed literal, or ``np.nan`` when *s* is not a
        parsable literal (for example a missing value read from CSV).
    """
    try:
        return np.array(literal_eval(s))
    # Only parsing failures are treated as "no embedding"; a bare except also
    # swallowed KeyboardInterrupt and unrelated programming errors.
    except (ValueError, SyntaxError, TypeError):
        return np.nan

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Returns ``0.0`` when either vector has zero length, where the similarity
    is undefined (previously this produced ``nan`` from a 0/0 division).
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Vacancy requirements per entity; the keys match the entity columns of the report.
# This rebinds `job_description`, which an earlier cell set to the raw vacancy text.
job_description = {
    "job title": "FullStack Developer",
    "years of experience": "At least 2 years of development experience",
    "highest level of education": "Bachelor's or higher in Computer Science or related field",
    "language skills": "Fluent in spoken English",
    "key skills": "Node.js, React.js, GraphQL, Kubernetes, SQL Server, PostgreSQL, Kafka, RabbitMQ, C#, SOLR, Python, Sound engineering practices, Pair programming, Automated testing, Continuous deployment, Trunk-based development"
}

# One embedding request per requirement.
job_description_embeddings = {key: get_embedding(value) for key, value in job_description.items()}

df_path = "C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/5_resumes_embeddings.csv"
df = pd.read_csv(df_path)

# Turn the stored embedding strings back into arrays (np.nan where parsing fails).
embedding_columns = [col for col in df.columns if 'embedding' in col]
df[embedding_columns] = df[embedding_columns].applymap(string_to_float_list)

def search_resumes(df, job_description_embeddings,
                   csv_file_path="C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/6_scores.csv"):
    """Score every resume in *df* against the vacancy and rank the result.

    For each row, the cosine similarity between the row's
    ``"<key> embedding"`` value and the vacancy embedding of the same key is
    computed for every key that has a usable array; the score is the mean of
    those similarities, limited to the range 0..1. Rows without any usable
    embedding get 0.0.

    Args:
        df: Resume table; a ``'similarity'`` column is added to it in place.
        job_description_embeddings: Mapping of entity name to vacancy embedding.
        csv_file_path: Where to save the ranked table; ``None`` skips saving.

    Returns:
        A copy of *df* sorted by ``'similarity'`` in descending order.
    """
    df['similarity'] = 0.0

    for index, row in df.iterrows():
        similarity_scores = []
        for key, job_embedding in job_description_embeddings.items():
            # row.get returns None for a missing column, which fails the
            # isinstance check just like a missing (NaN) embedding does.
            resume_embedding = row.get(f'{key} embedding')
            if not isinstance(resume_embedding, np.ndarray) or np.isnan(resume_embedding).any():
                continue
            score = cosine_similarity(resume_embedding, job_embedding)
            # An undefined similarity must not turn the whole mean into NaN.
            if not np.isnan(score):
                similarity_scores.append(score)

        if similarity_scores:
            # Cosine similarity can be negative; the score must stay in 0..1.
            df.at[index, 'similarity'] = float(np.clip(np.mean(similarity_scores), 0.0, 1.0))

    sorted_df = df.sort_values('similarity', ascending=False)

    if csv_file_path is not None:
        sorted_df.to_csv(csv_file_path, index=False)
        print(f"DataFrame saved to {csv_file_path}")

    return sorted_df

# Score all resumes against the vacancy embeddings and show the ten best matches.
top_matches = search_resumes(df, job_description_embeddings)
top_matches.head(10)

In [None]:
# Show the ten best-scoring resumes again.
top_matches.head(10)

### Combine scores with existing excel file and save under new name

In [None]:
# Attach the similarity scores to the summaries report and save it ranked by score.
excel_file = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/4_resumes_years_summaries.xlsx")
csv_file_path = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/6_scores.csv")
output_excel_file = Path("C:/Users/apleczkan/PycharmProjects/task1-cv-resumes/logs/7_resumes_summary_scores_sorted.xlsx")

excel_df = pd.read_excel(excel_file)
scores_df = pd.read_csv(csv_file_path)

excel_df = excel_df.sort_values(by='Filename')
scores_df = scores_df.sort_values(by='Filename')

scores_df = scores_df.rename(columns={'similarity': 'scores'})

# Match scores to resumes by file name rather than by row position, so a
# missing or extra row in either file cannot shift scores onto the wrong resume.
scores_by_filename = scores_df.drop_duplicates(subset='Filename').set_index('Filename')['scores']
excel_df['scores'] = excel_df['Filename'].map(scores_by_filename)

if "Unnamed: 0" in excel_df.columns:
    excel_df = excel_df.drop(columns=["Unnamed: 0"])

sorted_df = excel_df.sort_values(by='scores', ascending=False)

sorted_df.to_excel(output_excel_file, index=False)

print(f"Sorted and saved DataFrame to {output_excel_file}")

In [None]:
# Show the ten highest-scoring resumes from the final report.
print("Sorted DataFrame:")
sorted_df.head(10)