In [9]:
import os
import pdfplumber
import pandas as pd
import nltk
from nltk.corpus import stopwords

In [2]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file_path):
    all_text = ""
    try:
        with pdfplumber.open(pdf_file_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:  # Ensure there's text extracted
                    all_text += text + "\n"
    except Exception as e:
        print(f"Error reading {pdf_file_path}: {e}")
    return all_text

In [4]:
# Function to process resumes from a given folder
def process_resumes(folder_path, job_title):
    resumes_data = []  # List to store resume data

    # Traverse the folder to find all PDF files
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_file_path = os.path.join(folder_path, filename)
            resume_text = extract_text_from_pdf(pdf_file_path)
            resumes_data.append({
                'job_title': job_title,
                'resume_text': resume_text,
                'filename': filename
            })

    return resumes_data

In [5]:
# Main function to process multiple job folders
def process_resumes_from_folders(root_folder):
    all_resumes = []
# Loop through each job category folder
    for job_title in os.listdir(root_folder):
        job_folder_path = os.path.join(root_folder, job_title)
        if os.path.isdir(job_folder_path):  # Check if it's a directory
            resumes_data = process_resumes(job_folder_path, job_title)
            all_resumes.extend(resumes_data)  # Add to the main list

    # Convert to DataFrame and save to CSV
    resumes_df = pd.DataFrame(all_resumes)
    resumes_df.to_csv('resumes_data.csv', index=False)
    print("Resumes extracted and saved to resumes_data.csv")

In [6]:
# Function to extract skills from job descriptions
def extract_skills(job_descriptions):
    text = ' '.join(job_descriptions)
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return filtered_tokens

In [7]:
# Function to process job description CSV
def process_job_descriptions(csv_file_path):
    job_desc_df = pd.read_csv(csv_file_path)  # Load the CSV file
    skills_list = extract_skills(job_desc_df['job_description'])
    
    # Create a DataFrame for frequency analysis
    skills_df = pd.DataFrame(skills_list, columns=['skill'])
    skills_df['count'] = 1
    skills_count = skills_df.groupby('skill').count().reset_index()
    skills_count = skills_count.sort_values(by='count', ascending=False)
    
    # Display top 10 skills
    print("Top 10 Skills from Job Descriptions:")
    print(skills_count.head(10))
    
    # Create job skills mapping
    job_skills_mapping = {}
    for _, row in job_desc_df.iterrows():
        job_title = row['job_title']
        job_description = row['job_description']
        skills = [skill for skill in skills_count['skill'].values if skill.lower() in job_description.lower()]
        job_skills_mapping[job_title] = skills

    # Convert the mapping into a DataFrame
    job_skills_df = pd.DataFrame(list(job_skills_mapping.items()), columns=['job_title', 'skills'])
    print("\nJob Skills Mapping:")
    print(job_skills_df.head())
    
    return job_skills_df

In [8]:
def main():
    # Process resumes
    root_folder = 'C://Users//santhoshs.s//jupyter//resumes//data//data'  # Replace with the path to your folders
    process_resumes_from_folders(root_folder)

    # Process job descriptions
    csv_file_path = 'C://Users//santhoshs.s//jupyter//combined_job_descriptions.csv'  # Replace with the path to your job descriptions CSV
    job_skills_df = process_job_descriptions(csv_file_path)

if __name__ == "__main__":
    main()

Resumes extracted and saved to resumes_data.csv


KeyError: 'job_description'