In [1]:
!pip install pdfplumber
!pip install pdf2image
!pip install easyocr
!pip install deep_translator
!pip install langdetect
!pip install --upgrade openai

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m969.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.1

In [2]:
import pdfplumber
from pdf2image import convert_from_path
import easyocr
import io
import numpy as np
import re
import unicodedata
from deep_translator import GoogleTranslator
from nltk.tokenize import sent_tokenize
import pandas as pd
import nltk
import os

In [3]:
nltk.download('punkt', quiet=True)
reader = easyocr.Reader(['en'], gpu=False)
translator = GoogleTranslator(source='auto', target='en')



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

In [4]:

def extract_text_from_pdf_file(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = "\n".join([page.extract_text() or '' for page in pdf.pages])
        if text.strip():
            return text.strip()
    except Exception as e:
        print(f"pdfplumber failed: {e}")

    # Fallback to OCR
    try:
        images = convert_from_path(file_path)
        text = ""
        for image in images:
            result = reader.readtext(np.array(image), detail=0, paragraph=True)
            text += " ".join(result) + "\n"
        return text.strip()
    except Exception as e:
        print(f"OCR failed: {e}")
        return ""

# Reuse the same preprocess function
def preprocess_text(text):
    text = unicodedata.normalize('NFKD', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = text.strip().lower()
    return text

def force_split_text(text, max_len):
    return [text[i:i+max_len] for i in range(0, len(text), max_len)]

def safe_translate(text, max_chunk_size=4500):
    try:
        # Ensure input is string
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)

        text = text.strip()
        if not text:
            return ''

        if len(text) <= max_chunk_size:
            return translator.translate(text)

        # Tokenize into sentences
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = ''

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            if len(sentence) > max_chunk_size:
                sub_chunks = force_split_text(sentence, max_chunk_size)
                chunks.extend(sub_chunks)
                continue

            if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk += ' ' + sentence

        if current_chunk:
            chunks.append(current_chunk.strip())

        # Translate chunks safely
        translated_chunks = []
        for i, chunk in enumerate(chunks):
            if len(chunk) > max_chunk_size:
                print(f"[SKIPPED] Chunk {i} exceeds {max_chunk_size} characters: {len(chunk)} chars")
                sub_chunks = force_split_text(chunk, max_chunk_size)
                for sub_chunk in sub_chunks:
                    translated_chunks.append(translator.translate(sub_chunk))
            else:
                translated_chunks.append(translator.translate(chunk))

        return ' '.join(translated_chunks)

    except Exception as e:
        snippet = str(text)[:100].replace('\n', ' ')
        print(f"Translation error for: {snippet}... — {e}")
        return text


In [5]:
# Directory containing PDF files
pdf_directory = "/content/"  # Adjust this path

# List of PDF files
pdf_files = ["Scrum.pdf", "Data Engineer.pdf", "Data Analyst.pdf"]

# Initialize an empty list to store data
data = []

# Process each PDF file
for pdf in pdf_files:
    pdf_path = os.path.join(pdf_directory, pdf)

    # Extract text using OCR
    extracted_text = preprocess_text(extract_text_from_pdf_file(pdf_path))

    # Translate to English
    translated_text = safe_translate(extracted_text)

    # Append data to the list
    data.append({
        "demand_id": os.path.splitext(pdf)[0],
        "extracted_text": extracted_text,
        "translated_text": translated_text
    })

# Convert list to DataFrame
df = pd.DataFrame(data)



Translation error for: poste : data engineer • type de contrat : contrat • date de début : dès que possible • durée du m... — 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************





In [6]:
df

Unnamed: 0,demand_id,extracted_text,translated_text
0,Scrum,scrum master – contrat de 12 mois lieu : montr...,Scrum Master - 12 -month contract Place: Montr...
1,Data Engineer,poste : data engineer • type de contrat : cont...,poste : data engineer • type de contrat : cont...
2,Data Analyst,profil data analyst – marketing analytics à p...,Profile Data Analyst - Marketing Analytics abo...


In [20]:
from openai import OpenAI
import json

client = OpenAI(api_key="key")


def extract_features_openai_v1(job_description):
    prompt = f"""
    You are an expert in job description analysis. Extract structured information from the following job description and categorize skills and languages using a 1–3 scale as per the given guidelines.

    ### Job Description:
    {job_description}

    ### Guidelines for Scaling (1-3)

    - **Required & Preferred Skills**:
      - **3** = Critical expertise (e.g., "Expert in Python", "2+ years experience in Kubernetes")
      - **2** = Required but not expert level (e.g., "Required: Java, C++")
      - **1** = Mentioned but not explicitly required (e.g., "Nice to have: AWS")

    - **Languages**:
      - **3** = Critical requirement (e.g., "Fluency in French is essential")
      - **2** = Important but secondary (e.g., "Functional English required")
      - **1** = Nice to have (e.g., "Basic German knowledge preferred")

    ### Output Format (Strict JSON):
    {{
      "Job Title": "<Job Title>",
      "Required Skills": [{{"skill": "<Skill>", "level": <1|2|3>}}],
      "Preferred Skills": [{{"skill": "<Skill>", "level": <1|2|3>}}],
      "Experience Required": <float>,
      "Languages": [{{"language": "<Language>", "level": <1|2|3>}}],
      "Responsibilities": ["<bullet point responsibility 1>", "..."],
      "Location": "<City or leave empty>",
      "Salary": "<Value or empty>",
      "Additional Notes": {{
        "Duration": "<value>",
        "Type": "<value>",
        "Mode of work": "<value>"
      }}
    }}

    Ensure the JSON is properly formatted, strictly follows the structure, and only includes relevant data.
    **Translate all extracted information into English, even if the job description is written in another language.**
        """

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-16k",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )
        content = response.choices[0].message.content
        return json.loads(content)
    except Exception as e:
        return {"error": str(e)}


#         # response = openai.ChatCompletion.create(
#         #     model="gpt-4",
#         #     messages=[{"role": "user", "content": prompt}],
#         #     temperature=0.2,
#         # )
#         response = openai.ChatCompletion.create(
#             model="gpt-3.5-turbo-16k",  # <- switch here
#             messages=[{"role": "user", "content": prompt}],
#             temperature=0.2,
#         )

#         content = response.choices[0].message["content"]
#         extracted_features = json.loads(content)
#     except Exception as e:
#         extracted_features = {"error": str(e)}

    return extracted_features

# Apply to DataFrame
df["extracted_features"] = df["extracted_text"].apply(extract_features_openai_v1)


In [43]:
df

Unnamed: 0,demand_id,extracted_text,translated_text,extracted_features
0,Scrum,scrum master – contrat de 12 mois lieu : montr...,Scrum Master - 12 -month contract Place: Montr...,"{'Job Title': 'Scrum Master', 'Required Skills..."
1,Data Engineer,poste : data engineer • type de contrat : cont...,poste : data engineer • type de contrat : cont...,"{'Job Title': 'Data Engineer', 'Required Skill..."
2,Data Analyst,profil data analyst – marketing analytics à p...,Profile Data Analyst - Marketing Analytics abo...,{'Job Title': 'Data Analyst - Marketing Analyt...


In [23]:
# df.to_csv('features_extract_v2.csv')

In [None]:
import ast
import json
import os

# Optional: Create output directory
output_dir = "job_features_json"
os.makedirs(output_dir, exist_ok=True)

for idx, row in df.iterrows():
    features_data = row['extracted_features']
    demand_id = row['demand_id']

    # Sanitize filename
    filename = '_'.join(demand_id.split()) + '_features.json'
    filepath = os.path.join(output_dir, filename)

    # Safely parse the string to dict
    if isinstance(features_data, str):
        try:
            features_data = ast.literal_eval(features_data)
        except Exception as e:
            print(f"[ERROR] Failed to parse row {idx}: {e}")
            features_data = {}

    # Save as JSON
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(features_data, f, indent=4, ensure_ascii=False)

    print(f"✅ Saved: {filepath}")
