In [1]:
import spacy
from datetime import datetime
import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import json
import os

In [2]:
# This model contains the rules and algorithms needed to process English text.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Keywords that mark the start of the work-experience section of a resume.
experience_keywords = [
    "experience",
    "Experience",
    "work history",
    "employment",
    "career",
    "professional experience",
    "work experience",
    "employment history",
    "career history",
    "Professional Experience",
    "WORKING EXPERIENCE",
]
# Multiline pattern: optional leading whitespace at the start of a line,
# followed by any one of the (regex-escaped) keywords above.
experience_keywords_regex = r"(?m)^\s*({})".format(
    "|".join(re.escape(keyword) for keyword in experience_keywords)
)

In [4]:
# List of common words associated with company names
company_keywords = [
    "Employed","Bank", "Boutique","Designer","Secretary","Security",
    "Inc.","inc", "Incorporated", "Ltd.","ltd", "Limited", "LLC", "LLP", "PLC",
    "Corp", "Corporation", "Group", "Agency", "Consultancy", "Partners",
    "Holdings", "Associates", "Independent", "Innovative", "GmbH",
    # These two entries must stay separate strings: without the comma between
    # them Python concatenates adjacent literals into one bogus keyword.
    "american express", "ENTERPRISES",
    "Institut","Institutes","Institute","Mediatech"
]
# A keyword (captured in group 1, delimited by word boundaries) together with
# up to three words of context on each side.
company_keywords_regex = r"(?:\w+\s+){0,3}\b(" + "|".join(map(re.escape, company_keywords)) + r")\b(?:\s+\w+){0,3}"

In [5]:
# List of common job position keywords
job_position_keywords = [
    # "Fixed" and "Analyst" must be separate entries: without the comma between
    # them Python concatenates adjacent literals into "FixedAnalyst".
    "Developer", "Engineer", "Manager", "Consultant", "Technician","Fixed",
    "Analyst", "Specialist", "Supervisor", "Coordinator", "HR", "Recruiter",
    "Director", "Driver", "Sales", "Customer Support", "Account Executive",
    "Data Scientist", "Software", "Product Manager", "Project Manager",
    "Freelance", "Software", "Administrator", "Technician", "Team Leader",
    "Designer","Secretary","Security","associate","Designers","business",
    # NOTE(review): "Mecanical" is kept exactly as spelled in the original
    # list; confirm whether "Mechanical" was intended.
    "analyst","Junior","Senior","Mecanical"
]
# A keyword (captured in group 1, delimited by word boundaries) together with
# up to three words of context on each side.
job_position_keywords_regex = r"(?:\w+\s+){0,3}\b(" + "|".join(map(re.escape, job_position_keywords)) + r")\b(?:\s+\w+){0,3}"

In [6]:
# Character class matching one punctuation/symbol character (dash, colon,
# slash, dot, underscore, brackets, quotes, and similar).
# NOTE(review): no code visible in this notebook references `pattern` -- confirm
# whether it is still needed.
pattern = r"[-:/\._;!?@#\$%\^&\*\(\)\[\]{}<>'\"\\\/|`~+=]"

In [7]:
#EXTRACT EXPERIENCE SECTION
def extract_experience_section(pdf_file):
    """Return the text of the work-experience section of a PDF resume.

    The PDF is walked page by page and text container by text container.
    Collection starts at the first non-empty line that matches
    ``experience_keywords_regex`` (that heading line is included) and stops at
    the first later line that starts with one of the end keywords (that line
    is excluded).

    Args:
        pdf_file: Path or file object accepted by pdfminer's ``extract_pages``.

    Returns:
        The collected lines joined with newlines, or the string
        "No experience section found." when no start heading was seen.
        If reading the PDF raises, the error is printed and whatever was
        collected up to that point is returned (best effort).
    """
    end_keywords = r"^(personal|Academic|Hobbies|education|skills|certifications|achievements|summary|references|DEGREE)"
    collected_lines = []
    inside_experience_section = False
    try:
        for page_layout in extract_pages(pdf_file):
            for element in page_layout:
                if not isinstance(element, LTTextContainer):
                    continue
                for text_line in element:
                    line_text = text_line.get_text().strip()
                    if not line_text:
                        continue  # Skip empty lines
                    if inside_experience_section:
                        if re.search(end_keywords, line_text, re.I):
                            # Return everything gathered so far, including the
                            # lines of the container we are currently in; they
                            # used to be dropped because they had not yet been
                            # flushed into the accumulated section text.
                            return "\n".join(collected_lines)
                        collected_lines.append(line_text)
                    elif re.search(experience_keywords_regex, line_text, re.I):
                        # Heading of the experience section: start collecting.
                        inside_experience_section = True
                        collected_lines.append(line_text)
    except Exception as e:
        print(f"Error while extracting experience section: {e}")
    return "\n".join(collected_lines) if collected_lines else "No experience section found."

In [8]:
#EXTRACT EXPERIENCE DATES
# Matches a date range of the form "<start> <separator> <end>".
# The pattern is compiled with re.VERBOSE, so unescaped whitespace outside a
# character class is ignored: literal spaces must be written as \s (or "\ ").
date_pattern = re.compile(r"""
    (?P<start_date>                                      # Named capture group for the start date
        ([A-Za-z]+[-\s]?\d{4}|                           # Month name followed by a year, e.g. "April 2010" or "April-2010"
        \d{1,2}[/-]\d{1,2}[/-]\d{2,4}|                   # DD/MM/YYYY, MM/DD/YYYY, etc., e.g. "12/11/1992" or "11-12-1992"
        \d{2,4}|                                         # Bare number of 2 to 4 digits, e.g. a year such as "2010"
        \d{4}[/-]\d{1,2}[/-]\d{1,2})                     # YYYY/MM/DD, e.g. "1992/11/12" or "1992-11-12"
    )
    \s?([tT]o|–|-|–|—)\s?                                  # Separator between start and end: 'to' or a dash variant
    (?P<end_date>                                        # Named capture group for the end date
        ([A-Za-z]+[-\s]?\d{4}|                           # Month name followed by a year, e.g. "April-2011" or "April 2011"
        \d{1,2}[/-]\d{1,2}[/-]\d{2,4}|                   # DD/MM/YYYY or MM/DD/YYYY
        \d{4}[/-]\d{1,2}[/-]\d{1,2}|                     # YYYY/MM/DD
        ([pP]resent|[tT]oDate|[tT]ill\s*now))            # Open-ended markers: "Present", "ToDate", "Till now"
    )
""", re.VERBOSE)
# extract date from experience section
def extract_experience_date(text):
    """Collect the first date-range match of every line of ``text`` that has one.

    Args:
        text: Multi-line string; it is split on newline characters.

    Returns:
        List of matched date-range substrings, in line order.
    """
    hits = (re.search(date_pattern, raw_line.strip()) for raw_line in text.split('\n'))
    return [hit.group() for hit in hits if hit]

In [9]:
def get_full_month_name(month_str_year):
    """Expand the month in a "<month> <year>" string to its full English name.

    Only the first three letters of the month token are looked at
    (case-insensitively). Strings that do not consist of exactly two
    whitespace-separated tokens, or whose first token does not start with a
    month abbreviation, are returned unchanged.
    """
    full_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    tokens = month_str_year.split()
    if len(tokens) == 2:
        month_token, year_token = tokens
        prefix = month_token[:3].capitalize()
        for name in full_names:
            if name[:3] == prefix:
                return f"{name} {year_token}"
    # Not a recognizable "<month> <year>" pair: hand the input back as is.
    return month_str_year

def convert_to_full_date(date_str):
    """Normalize a single date token to the form "<Full month name> <year>".

    Handled inputs:
      * "<month> <year>" with an abbreviated or full month name;
      * open-ended markers ("ToDate", "Present", "Till now", compared
        case-insensitively), which resolve to the current month and year;
      * month and year written without a separator, e.g. "Apr2010";
      * month and year joined by a hyphen, e.g. "Apr-2010".

    Args:
        date_str: The date token to normalize.

    Returns:
        The normalized string, or the (month-expanded) input unchanged when
        it matches none of the formats above (e.g. a bare year).
    """
    date_str = get_full_month_name(date_str)
    if date_str.strip().lower() in ("todate", "present", "till now"):
        return datetime.now().strftime("%B %Y")
    glued = re.match(r"([A-Za-z]+)(\d{4})", date_str)
    if glued:
        # Re-run the month expansion on the separated form so that "Apr2010"
        # yields a full month name, like every other branch does.
        return get_full_month_name(f"{glued.group(1)} {glued.group(2)}")
    separator = "-" if "-" in date_str else " "
    # strptime's %b only accepts abbreviated month names and %B only full
    # ones, so try both before giving up.
    for month_code in ("%b", "%B"):
        try:
            return datetime.strptime(date_str, f"{month_code}{separator}%Y").strftime("%B %Y")
        except ValueError:
            continue
    return date_str

def process_date_range(date_range):
    """Format a textual date range as "[MM/YYYY - MM/YYYY]".

    Dashes and underscores are first rewritten to " to ", then the string is
    split once on the word "to". The split ignores a "to" that is adjacent to
    a letter, so month names such as "October" are not cut in half, and it is
    case-insensitive so "To" is recognised as well. A side that consists of a
    single token (e.g. a bare year) is treated as January of that year.

    Args:
        date_range: Text such as "Jan 2010 - Mar 2012" or "2015 to May 2016".

    Returns:
        The formatted range. When the text contains no range separator, or
        either side cannot be parsed as "<month name> <year>", the input text
        is returned unchanged instead.
    """
    normalized = re.sub(r"[-–_—]", " to ", date_range)
    parts = re.split(r"(?<![A-Za-z])to(?![A-Za-z])", normalized, maxsplit=1, flags=re.IGNORECASE)
    if len(parts) != 2:
        # No separator found; in this case nothing was substituted above.
        return normalized
    start_date_full = convert_to_full_date(parts[0].strip())
    end_date_full = convert_to_full_date(parts[1].strip())
    if len(start_date_full.split()) == 1:
        start_date_full = f"January {start_date_full}"
    if len(end_date_full.split()) == 1:
        end_date_full = f"January {end_date_full}"
    try:
        start = datetime.strptime(start_date_full, '%B %Y')
        end = datetime.strptime(end_date_full, '%B %Y')
    except ValueError:
        # Unparsable side (e.g. a numeric DD/MM/YYYY date): fall back to the
        # raw text rather than aborting the caller.
        return date_range
    return f"[{start.strftime('%m/%Y')} - {end.strftime('%m/%Y')}]"

In [10]:
def clean_lines(lines):
    """Tidy a sequence of resume lines and return them as a new list.

    For every line, in this order: characters in the range U+F000-U+FFFF are
    removed and the line is stripped; a leading label word ("company",
    "designation", "role", "position" or "title", any case) followed by
    whitespace is removed; runs of two or more whitespace characters are
    collapsed to one space and the line is stripped again; trailing colons
    are removed.
    """
    words_to_remove = ["company", "designation", "role", "position", "title"]
    label_prefix = re.compile(
        r"^\s*(?:" + "|".join(words_to_remove) + r")\s+",
        re.IGNORECASE,
    )
    formatted_lines = []
    for raw in lines:
        text = re.sub(r"[\uf000-\uffff]", "", raw).strip()
        text = label_prefix.sub("", text)
        text = re.sub(r"\s{2,}", " ", text).strip()
        formatted_lines.append(text.rstrip(':'))
    return formatted_lines

In [None]:
# Function to extract company name from a line
def extract_company_name(text):
    """Return the text of the first spaCy entity that looks like a company.

    Entities are examined in document order. An entity qualifies when it is
    labelled "ORG" or when its text contains any entry of
    ``company_keywords`` (case-sensitive substring test).

    Returns:
        The entity text, or "" when no entity qualifies.
    """
    for entity in nlp(text).ents:
        if entity.label_ == "ORG":
            return entity.text
        if any(keyword in entity.text for keyword in company_keywords):
            return entity.text
    return ""  # Return empty string if no company name found

In [None]:
# Function to extract job position from a line
def extract_position_name(line_s):
    """Guess the job position mentioned in a line.

    If the line contains any entry of ``job_position_keywords``
    (case-insensitive substring test), the whole line is returned. Otherwise
    the line is tokenized with spaCy and, at the first trigger word ("as",
    "hired", "position", "served") that has at least one token after it, the
    text of the next (up to) two tokens is returned.

    Returns:
        The guessed position text, or "" when nothing was found.
    """
    lowered_line = line_s.lower()
    for keyword in job_position_keywords:
        if keyword.lower() in lowered_line:
            return line_s

    trigger_words = ('as', 'hired', 'position', 'served')
    doc = nlp(line_s)
    for token in doc:
        if token.text.lower() not in trigger_words:
            continue
        # Only usable when the trigger word is not the last token.
        if token.i + 2 <= len(doc):
            return doc[token.i + 1: token.i + 3].text
    return ""

In [11]:
def correct_dates_in_lines(lines):
    """Lower-case each line and normalize the way dates are written in it.

    Per line, in order:
      1. "<day> <month> <year>" loses the day ("10 march 2010" -> "march 2010").
      2. "<month>-<year>" / "<month>.<year>" become "<month> <year>".
      3. "<month><year>" written without a separator gets a space.
      4. "<year>-<letter>" gets spaces around the dash.
      5. "<year> present" (or "till ..." / "todate") gets a " - " separator.
      6. The open-ended markers "present", "till", "till now", "till date"
         and "todate" are replaced with the current month and year ("%b %Y").
         The multi-word forms are consumed as a whole so that no stray "now"
         or "date" is left behind.

    Args:
        lines: Iterable of strings. A single string is accepted too and is
            treated as one line rather than being iterated character by
            character.

    Returns:
        A new list with the normalized lines.
    """
    if isinstance(lines, str):
        lines = [lines]
    current_date = datetime.now().strftime("%b %Y")
    open_ended = r"(present|till(?:\s+(?:now|date))?|todate)"
    day_month_year_pattern = r'\b\d{1,2}\s+([A-Za-z]+)\s+(\d{4})\b'
    corrected_lines = []
    for line in lines:
        line = line.lower()
        cleaned_text = re.sub(day_month_year_pattern, r'\1 \2', line)
        corrected_date = re.sub(r"(\b[A-Za-z]+)[-–.](\d{4})", r"\1 \2", cleaned_text)
        corrected_line = re.sub(r'([a-zA-Z]+)(\d{4})', r'\1 \2', corrected_date)
        corrected_line = re.sub(r'(\d{4})(-)([a-zA-Z])', r'\1 - \3', corrected_line)
        # Group 3 is the whole marker, so "\3" re-inserts it after the dash.
        corrected_line = re.sub(r'(\d{4})(\s)(\b' + open_ended + r'\b)', r'\1 - \3', corrected_line)
        corrected_line = re.sub(r'\b' + open_ended + r'\b', current_date, corrected_line, flags=re.IGNORECASE)
        corrected_lines.append(corrected_line)
    return corrected_lines

In [12]:
# Function to process each line of the CV
def process_experience_section(text):
    """Turn the text of an experience section into a list of job entries.

    The text is split into lines and passed through
    ``correct_dates_in_lines``. Every line containing a date range that has
    not been seen before produces one entry. The company and position for
    that entry are looked up in a window of up to two lines before and two
    lines after the date line:

      1. first with the keyword regexes (a match already used for an earlier
         entry is skipped);
      2. then, for whatever is still missing, with ``extract_company_name``
         and ``extract_position_name``.

    Only ``date_pattern`` decides which lines yield an entry. The former
    spaCy DATE-entity pass re-submitted the same line to the same regex, so
    it could never add an entry and has been removed, together with the
    debug prints.

    Args:
        text: Experience-section text, one item per line.

    Returns:
        List of dicts with the keys 'dates', 'company' and 'position'.
        Missing companies/positions are reported as "Not found".
    """
    lines = correct_dates_in_lines(text.split('\n'))
    result = []
    processed_dates = set()
    processed_company = set()
    processed_position = set()

    for i, line in enumerate(lines):
        match = re.search(date_pattern, line.strip())
        if not match:
            continue
        date = match.group()
        if date in processed_dates:
            continue  # Identical date range already reported.
        processed_dates.add(date)

        try:
            output = process_date_range(date)
        except ValueError:
            # One unparsable date must not discard the whole resume;
            # report the raw matched text instead.
            output = date

        window_start = max(0, i - 2)
        window_end = min(len(lines), i + 3)
        surrounding_lines = clean_lines(lines[window_start:window_end])

        company_name = ""
        position_name = ""

        # Pass 1: keyword regexes.
        for candidate in surrounding_lines:
            pos_match = re.search(job_position_keywords_regex, candidate, re.IGNORECASE)
            company_match = re.search(company_keywords_regex, candidate, re.IGNORECASE)
            if pos_match and not position_name:
                position = pos_match.group(0).strip()
                if position not in processed_position:
                    processed_position.add(position)
                    position_name = position
            if company_match and not company_name:
                company = company_match.group(0).strip()
                if company not in processed_company:
                    processed_company.add(company)
                    company_name = company

        # Pass 2: fallback extractors for anything still missing.
        for candidate in surrounding_lines:
            if not company_name:
                company_name = extract_company_name(candidate)
            if not position_name:
                position_name = extract_position_name(candidate)

        result.append({
            'dates': output,
            'company': company_name if company_name else "Not found",
            'position': position_name if position_name else "Not found"
        })
    return result

In [13]:
def process_multiple_pdfs(pdf_dir, output_file):
    """Extract the experience entries of every PDF in a directory into a JSON file.

    Files whose name ends with '.pdf' are processed in sorted name order, so
    the key order of the output is reproducible (``os.listdir`` itself
    returns names in arbitrary order). A file that fails is reported on
    stdout and skipped; the remaining files are still processed.

    Args:
        pdf_dir: Directory containing the PDF resumes.
        output_file: Path of the JSON file to write. It maps each PDF file
            name to the list returned by ``process_experience_section``.
    """
    results = {}
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.endswith('.pdf'):
            continue
        try:
            pdf_path = os.path.join(pdf_dir, filename)
            # Extract experience section
            experience_text = extract_experience_section(pdf_path)
            # Store the result for this PDF
            results[filename] = process_experience_section(experience_text)
        except FileNotFoundError as e:
            print(f"Fehler beim Verarbeiten der Datei {filename}: {e}")
        except Exception as e:
            # Best effort: report and move on to the next PDF.
            print(f"Ein unerwarteter Fehler ist aufgetreten: {e}")
    # Save the results to a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    print(f"Ergebnisse wurden in {output_file} gespeichert.")

In [None]:
# Run the pipeline on the PDFs in the "cv_30" directory and write the
# extracted entries to label_pred.json.
process_multiple_pdfs("cv_30", 'label_pred.json')