In [None]:
import os
import re
import csv
from pdfminer.high_level import extract_text


# Regex alternations for the headings that open an education or a skills section
_EDUCATION_HEADINGS = (
    "Education|Education and Training|Educational Background|Specialized Training"
)
_SKILL_HEADINGS = "Skills|Skill Highlights|Computer Skills|Technical Skills"

# A section body stops at a form feed (\x0c) or at the next line that starts
# with one of these headings. Raw strings keep the regex escapes (\S, \n, \x0c)
# away from Python's own string-escape processing.
_SECTION_STOPS = (
    r"\x0c|Summary|Work History|Core Qualifications|Highlights|Experience"
    r"|Projects|Certifications|References|Interest|Languages|Awards"
    r"|Additional Information|Affiliations|Personal Information|"
    + _EDUCATION_HEADINGS
    + "|"
    + _SKILL_HEADINGS
)


def _section_regex(headings):
    """Compile the pattern that captures the body following any of *headings*."""
    return re.compile(
        r"(?:" + headings + r")\n+([\S\s]+?)\n+(?=" + _SECTION_STOPS + r")",
        re.IGNORECASE,
    )


_EDUCATION_RE = _section_regex(_EDUCATION_HEADINGS)
_HIGHLIGHTS_RE = _section_regex("Highlights")
_SKILLS_RE = _section_regex(_SKILL_HEADINGS)
_CITY_STATE_RE = re.compile(r"City\s*,\s*State")


def _first_nonblank_line(text):
    """Return the first line of *text* that is non-empty once stripped, else ""."""
    for line in text.split("\n"):
        stripped = line.strip()
        if stripped:
            return stripped
    return ""


def _unique_sorted_items(section_bodies):
    """Split section bodies on ',', ';' and newlines; return sorted unique items."""
    items = {
        item.strip()
        for body in section_bodies
        for item in re.split(r"[,;\n]", body)
        if item.strip()
    }
    return sorted(items)


def extract_sections_from_pdf(pdf_file_path, text_extractor=None):
    """Extract the leading line, education text and skills from one PDF.

    Args:
        pdf_file_path: Path of the PDF file to read.
        text_extractor: Optional callable taking the path and returning the
            document text. Defaults to pdfminer's ``extract_text``.

    Returns:
        A four-item list ``[file_name, first_line, education, skills]``:
        the base name of the path; the first non-blank line of the text
        after "City, State" placeholders are removed; the bodies of all
        education sections joined with newlines; and the unique items of
        the "Highlights" and skills sections, sorted and joined with ", ".
        Returns ``None`` (after printing the error) if anything fails.
    """
    try:
        extractor = extract_text if text_extractor is None else text_extractor
        text = extractor(pdf_file_path)

        # The placeholder is stripped only for the leading-line lookup; the
        # section patterns run on the untouched text.
        first_line = _first_nonblank_line(_CITY_STATE_RE.sub("", text))

        # Keep separate education sections apart instead of fusing them.
        education = "\n".join(_EDUCATION_RE.findall(text))

        # "Highlights" and skills sections feed one combined item list.
        skill_bodies = _HIGHLIGHTS_RE.findall(text) + _SKILLS_RE.findall(text)
        skills = ", ".join(_unique_sorted_items(skill_bodies))

        return [os.path.basename(pdf_file_path), first_line, education, skills]

    except Exception as e:
        # Best-effort per file: report the failure and let the caller skip it.
        print(f"Error extracting text from {pdf_file_path}: {e}")
        return None


# Collect the extracted row of every PDF found below a directory
def extract_text_from_pdfs(root_directory):
    """Run ``extract_sections_from_pdf`` on every PDF under *root_directory*.

    The tree is walked recursively. Files are matched on a case-insensitive
    ``.pdf`` extension, and directories and files are visited in sorted order
    so the result order is reproducible.

    Args:
        root_directory: Top of the directory tree to search.

    Returns:
        A list with one entry per PDF for which ``extract_sections_from_pdf``
        returned a truthy value; empty if no PDF was found.
    """
    extracted_sections = []

    for root, dirs, files in os.walk(root_directory):
        # Sorting in place makes os.walk descend in a stable order.
        dirs.sort()
        for file_name in sorted(files):
            # Lower-casing first so ".PDF" / ".Pdf" are not skipped.
            if not file_name.lower().endswith(".pdf"):
                continue

            pdf_file_path = os.path.join(root, file_name)
            print(f"Extracting from: {pdf_file_path}")

            row = extract_sections_from_pdf(pdf_file_path)
            # A falsy result means extraction failed; leave that file out.
            if row:
                extracted_sections.append(row)

    return extracted_sections


# Root folder holding the PDFs; point this at your own data directory
root_directory = r"C:\Users\srbbh\Desktop\CODE\data"

# Walk the folder tree and gather one row of extracted details per PDF
extracted_sections = extract_text_from_pdfs(root_directory)

# Destination CSV for the collected rows
csv_file_path = "CV_Details.csv"

# Write the header row followed by every collected row in a single call
with open(csv_file_path, "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(
        [
            ["File Name", "Job Role", "Education", "Skills and Highlights"],
            *extracted_sections,
        ]
    )

# Report how many PDFs produced a row and where the CSV was written
print(f"Extracted sections from {len(extracted_sections)} PDF files. Saved to {csv_file_path}")
