### Installations:

In [67]:
!pip install openai pandas langdetect PyPDF2



### Imports:

In [69]:
import os

import openai
import pandas as pd
from langdetect import detect, DetectorFactory
from PyPDF2 import PdfReader

### Set the environment variable for the OpenAI client:

In [70]:
# Never hardcode the API key in the notebook: it ends up in version control and in
# saved cell outputs. Any key that was ever committed here must be revoked and rotated.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    # Prompt without echoing; the value is stored only in this kernel's environment.
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


### Create basic variables and paths, and set up the OpenAI client

In [71]:
# Seed langdetect so that repeated runs classify the same text the same way.
DetectorFactory.seed = 0

# Chunking parameters passed to process_pdfs.
max_chunk_size = 1000
overlap_size = 50

# Folder holding the input resumes, and folder receiving the translated output.
directory_path = r"C:\Users\apleczkan\PycharmProjects\task1-cv-resumes\test_resumes_dataset"
translated_output_directory = r"C:\Users\apleczkan\PycharmProjects\task1-cv-resumes\resumes_translated"

# OpenAI client authenticated with the key read from the environment.
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [72]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` returns nothing are skipped; the text of
    every other page is followed by a newline.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
    return "".join(page_text + "\n" for page_text in page_texts if page_text)

In [73]:
def split_text(text, max_chunk_size, overlap_size=50):
    """Split ``text`` into chunks of whitespace-separated words.

    Words are packed greedily. Each word is stored followed by a single space,
    and the current chunk is closed as soon as appending the next word (plus
    its space) would make it longer than ``max_chunk_size`` characters. Words
    are never split, so a single word longer than ``max_chunk_size`` becomes a
    chunk of its own that exceeds the limit.

    Args:
        text: Text to split. It is tokenised with ``str.split()``, so runs of
            whitespace (including newlines) collapse to single spaces.
        max_chunk_size: Target maximum chunk length in characters, counting the
            trailing space after each word.
        overlap_size: Accepted for interface compatibility but currently
            unused; the returned chunks never overlap.

    Returns:
        A list of non-empty chunk strings, each ending with a space. The list
        is empty when ``text`` contains no words.
    """
    chunks = []
    current_chunk = ""
    for word in text.split():
        if len(current_chunk) + len(word) + 1 <= max_chunk_size:
            current_chunk += word + " "
            continue
        # Flush only chunks that have content: when the very first word is
        # already too long, current_chunk is still empty and must not be emitted.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = word + " "
    # Add the last chunk, unless the text had no words at all.
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

In [74]:
def translate_text(client, text, target_language="en", model="gpt-3.5-turbo-instruct", max_tokens=500):
    """Translate ``text`` with the OpenAI completions endpoint (openai>=1.0.0 client).

    Args:
        client: Object exposing ``completions.create(...)``, e.g. ``openai.OpenAI()``.
        text: Text to translate.
        target_language: Language name or code interpolated into the prompt.
        model: Completion model to use.
        max_tokens: Upper bound on the length of the generated translation.
            NOTE(review): a translation longer than this is cut off by the API;
            raise it if chunks come back truncated.

    Returns:
        The first completion's text with surrounding whitespace stripped, or
        ``""`` when ``text`` is empty or whitespace-only (no API call is made).
    """
    # Nothing to translate: avoid a pointless (and billable) request.
    if not text or not text.strip():
        return ""

    response = client.completions.create(
        model=model,
        prompt=f"Translate the following text to {target_language}:\n\n{text}",
        max_tokens=max_tokens,
    )
    return response.choices[0].text.strip()

In [75]:

def process_pdfs(directory_path, max_chunk_size, overlap_size, translated_output_directory, client):
    """Translate every non-English PDF in ``directory_path`` into an English ``.txt`` file.

    For each file whose name ends in ``.pdf`` (case-insensitive) the text is
    extracted and its language detected. English documents are only recorded.
    Other documents are split into chunks, each non-English chunk is translated,
    and the result is written to ``translated_<name>.txt`` in
    ``translated_output_directory``. Documents without text are reported with
    a printed message and skipped.

    Two index files are written to ``translated_output_directory`` at the end:
    ``translated_files_list.txt`` (names of the generated ``.txt`` files) and
    ``english_files_list.txt`` (names of the PDFs detected as English).

    Args:
        directory_path: Folder containing the source PDFs.
        max_chunk_size: Forwarded to ``split_text``.
        overlap_size: Forwarded to ``split_text``.
        translated_output_directory: Output folder; created if missing.
        client: Forwarded to ``translate_text``.
    """
    # exist_ok avoids the race between a separate existence check and the creation.
    os.makedirs(translated_output_directory, exist_ok=True)

    translated_files_list = []
    english_files_list = []

    # sorted() makes the processing order, and therefore the index files, deterministic.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith('.pdf'):
            continue

        text = extract_text_from_pdf(os.path.join(directory_path, filename))
        if not text.strip():
            print(f"Document {filename} is empty or contains very little text.")
            continue

        if detect(text) == 'en':
            english_files_list.append(filename)
            continue

        translated_parts = []
        for chunk in split_text(text, max_chunk_size, overlap_size):
            # Skip blank chunks: langdetect's detect() raises on text with no features.
            if not chunk.strip():
                continue
            # Mixed-language documents: only translate the chunks that need it.
            if detect(chunk) != 'en':
                chunk = translate_text(client, chunk, target_language="en")
            translated_parts.append(chunk)
        translated_text = "".join(part + " " for part in translated_parts)

        # Swap the extension with splitext so that it works for any casing
        # (".PDF", ".Pdf") and never touches ".pdf" elsewhere in the name.
        stem, _extension = os.path.splitext(filename)
        translated_filename = f"translated_{stem}.txt"
        translated_path = os.path.join(translated_output_directory, translated_filename)
        save_text_to_file(translated_text, translated_path)

        translated_files_list.append(translated_filename)

    # Save the list of translated files to a text file for reference
    save_file_list(translated_files_list, translated_output_directory, 'translated_files_list.txt')
    # Save the list of English files as well
    save_file_list(english_files_list, translated_output_directory, 'english_files_list.txt')

def save_file_list(file_list, directory, filename):
    """Write every entry of ``file_list`` on its own line to ``directory/filename`` (UTF-8)."""
    target_path = os.path.join(directory, filename)
    with open(target_path, 'w', encoding='utf-8') as handle:
        handle.writelines(f"{entry}\n" for entry in file_list)

def save_text_to_file(text, file_path):
    """Write ``text`` to ``file_path`` as UTF-8, replacing any existing content."""
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)

In [76]:
# Translate the non-English resumes into the output directory.
process_pdfs(
    directory_path=directory_path,
    max_chunk_size=max_chunk_size,
    overlap_size=overlap_size,
    translated_output_directory=translated_output_directory,
    client=client,
)

Document 12632728.pdf is empty or contains very little text.


### Create named entities to look for in resumes

In [78]:
# Fields to extract from each resume; each one becomes a key of the per-resume record.
entities = [
    "job title",
    "years of experience",
    "highest level of education",
    "language skills",
    "key skills",
]

In [83]:
import re
def extract_text_from_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

def process_resume(directory, filename, client, entities, data_list, is_txt=False):
    """Extract the requested entities from one resume and append them to ``data_list``.

    The resume text is read from ``directory/filename`` (as plain text when
    ``is_txt`` is true, otherwise as a PDF) and passed to
    ``extract_entities_with_llm``. Each entity is then looked up in the reply
    as a case-insensitive ``<entity> : <value>`` pair; the value is the rest
    of that line, stripped.

    Args:
        directory: Folder containing the resume.
        filename: Resume file name; stored under the ``'Filename'`` key.
        client: Forwarded to ``extract_entities_with_llm``.
        entities: Entity names to look for; each becomes a key of the record.
        data_list: List mutated in place: one dict is appended per resume that
            contains text. Entities not found in the reply map to ``None``.
        is_txt: Read the file as UTF-8 text instead of as a PDF.
    """
    file_path = os.path.join(directory, filename)
    text = extract_text_from_txt(file_path) if is_txt else extract_text_from_pdf(file_path)

    # Resumes without any text produce no record.
    if not text.strip():
        return

    # NOTE(review): extract_entities_with_llm is not defined in any visible cell of
    # this notebook; it must be defined before this function is called.
    extracted_info = extract_entities_with_llm(client, text, entities)

    info_dict = {'Filename': filename}
    for entity in entities:
        # re.escape: the entity name is data, not a pattern. Without it, names
        # containing regex metacharacters ("c++", "(...)") fail or mis-match.
        pattern = re.compile(rf"{re.escape(entity)}\s*:\s*(.*)", re.IGNORECASE)
        match = pattern.search(extracted_info)
        info_dict[entity] = match.group(1).strip() if match else None

    data_list.append(info_dict)


def create_entities_report(directory_path, translated_output_directory, client, entities):
    """Build ``resume_entities_report.xlsx`` with one row per resume.

    Resumes come from two places: the ``translated_*.txt`` files in
    ``translated_output_directory`` and the PDFs in ``directory_path``. A PDF
    whose translation exists is represented by the translation only, so each
    resume yields a single row. The report is written to
    ``translated_output_directory``.

    Args:
        directory_path: Folder containing the original PDF resumes.
        translated_output_directory: Folder containing the translated ``.txt``
            resumes; also receives the report.
        client: Forwarded to ``process_resume``.
        entities: Entity names forwarded to ``process_resume``.
    """
    prefix, suffix = 'translated_', '.txt'
    # process_pdfs writes its index file into this same directory under a name
    # that matches the translated-resume pattern; it is not a resume.
    index_file = 'translated_files_list.txt'

    translated_files = sorted(
        name for name in os.listdir(translated_output_directory)
        if name.lower().startswith(prefix)
        and name.lower().endswith(suffix)
        and name != index_file
    )
    # Base names of the PDFs that already have a translation.
    translated_stems = {name[len(prefix):-len(suffix)] for name in translated_files}

    data = []
    # Process PDFs in the original directory
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        # Skip the untranslated original, otherwise the resume is reported twice.
        if os.path.splitext(filename)[0] in translated_stems:
            continue
        process_resume(directory_path, filename, client, entities, data)

    # Process translated resumes in the output directory
    for filename in translated_files:
        process_resume(translated_output_directory, filename, client, entities, data, is_txt=True)

    df = pd.DataFrame(data)
    df.to_excel(os.path.join(translated_output_directory, 'resume_entities_report.xlsx'), index=False)

# Build the Excel report from the original and the translated resumes.
create_entities_report(
    directory_path=directory_path,
    translated_output_directory=translated_output_directory,
    client=client,
    entities=entities,
)
