# Import the necessary libraries

In [13]:
"""This module provides a portable way of using operating system dependent functionality. If you
just want to read or write a file see open(), if you want to manipulate paths, see the os.path 
module, and if you want to read all the lines in all the files on the command line see the 
fileinput module"""
import os
"""
pdfplumber is a powerful library that allows for easy extraction of text and data from PDFs, 
making it a valuable tool for data analysis and automation tasks.
"""
import pdfplumber
"""The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language Processing"""
import nltk
from nltk.corpus import stopwords
"""scikit-learn is a Python module for machine learning built on top of SciPy and is distributed
under the 3-Clause BSD license."""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
"""
pandas provides the DataFrame structure used below to collect the extracted resume records
(category and resume text) and to write them out to a CSV file with DataFrame.to_csv.
"""
import pandas as pd

from pdfquery import PDFQuery
# Define the directory containing your PDF files
pdf_directory=r"C:\Users\kabir\Desktop\Projects\Resume_Screening\data\raw\TestData"

In [15]:

def _extract_resume_text(pdf_path):
    """Return the text of every horizontal text line in a PDF, space-joined.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The ``text`` of each ``LTTextLineHorizontal`` element selected by
        PDFQuery, joined with single spaces. Elements whose ``text`` is
        ``None`` are skipped.

    Raises:
        Exception: Whatever ``open`` or PDFQuery raises for an unreadable or
            malformed file; the caller decides how to report it.
    """
    # Open the file here so the handle is closed as soon as extraction
    # finishes, rather than staying open for every resume processed.
    with open(pdf_path, "rb") as pdf_stream:
        pdf = PDFQuery(pdf_stream)
        pdf.load()

        # Extract text elements from the PDF
        text_elements = pdf.pq("LTTextLineHorizontal")

        # str.join rejects None, so one text-less element would otherwise
        # discard the whole resume.
        return " ".join(t.text for t in text_elements if t.text is not None)


# Initialize an empty list to store the extracted resume data
resume_data = []

# Iterate through subdirectories (categories) in the specified directory.
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the resulting CSV reproducible between runs.
for category in sorted(os.listdir(pdf_directory)):
    category_path = os.path.join(pdf_directory, category)
    if not os.path.isdir(category_path):
        continue

    for pdf_file in sorted(os.listdir(category_path)):
        if not pdf_file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(category_path, pdf_file)
        try:
            resume_text = _extract_resume_text(pdf_path)
        except Exception as e:
            # Best-effort batch job: report the failing file and keep going.
            print(f"Error processing {pdf_file}: {e}")
        else:
            # Append the extracted data to the list
            resume_data.append({"Category": category, "Resume Text": resume_text})

# Create a DataFrame from the list of resume data. Naming the columns keeps
# the header row in the CSV even when no resume could be extracted.
resume_df = pd.DataFrame(resume_data, columns=["Category", "Resume Text"])

# Save the data to a CSV file named 'resumes.csv'
resume_df.to_csv("resumes.csv", index=False)

# Print a success message
print("Data extraction and CSV creation completed successfully!")


Data extraction and CSV creation completed successfully!
