## Read Resumes from Folders

In [2]:
import os

In [3]:
folder_path = "Datasets/data" # Path of the resume

In [4]:
# def read_resumes_from_folder(folder_path):
#     resumes = []
#     for filename in os.listdir(folder_path):
#         if filename.endswith(".pdf") or filename.endswith(".docx"):
#             file_path = os.path.join(folder_path, filename)
#             print(f"Processing file: {file_path}")  # Debugging: Print the file being processed
#             text = extract_text_from_file(file_path)  # Use your resume parsing function
#             print(f"Extracted text (first 100 chars): {text[:100]}...")  # Debugging: Print a snippet of the text
#             resumes.append(text)
#     print(f"Total resumes processed: {len(resumes)}")  # Debugging: Print the total number of resumes
#     return resumes

### Validate the Number of Files

In [5]:
# def read_resumes_from_folder(folder_path):
#     resumes = []
#     total_files = 0
#     for filename in os.listdir(folder_path):
#         if filename.endswith(".pdf") or filename.endswith(".docx"):
#             total_files += 1
#             file_path = os.path.join(folder_path, filename)
#             print(f"Processing file: {file_path}")  # Debugging: Print the file being processed
#             text = extract_text_from_file(file_path)  # Use your resume parsing function
#             print(f"Extracted text (first 100 chars): {text[:100]}...")  # Debugging: Print a snippet of the text
#             resumes.append(text)
#     print(f"Total files found: {total_files}")  # Debugging: Print total PDF/DOCX files
#     print(f"Total resumes processed: {len(resumes)}")  # Debugging: Print the total number of resumes
#     return resumes

### Check for Errors in FIle Reading

In [6]:
# import os

# def read_resumes_from_folder(folder_path):
#     resumes = []
#     total_files = 0
#     for filename in os.listdir(folder_path):
#         if filename.endswith(".pdf") or filename.endswith(".docx"):
#             total_files += 1
#             file_path = os.path.join(folder_path, filename)
#             print(f"Processing file: {file_path}")  # Debugging: Print the file being processed
#             try:
#                 text = extract_text_from_file(file_path)  # Use your resume parsing function
#                 print(f"Extracted text (first 100 chars): {text[:100]}...")  # Debugging: Print a snippet of the text
#                 resumes.append(text)
#             except Exception as e:
#                 print(f"Error processing file {file_path}: {e}")  # Debugging: Print any errors
#     print(f"Total files found: {total_files}")  # Debugging: Print total PDF/DOCX files
#     print(f"Total resumes processed: {len(resumes)}")  # Debugging: Print the total number of resumes
#     return resumes

### Verify Folder Structure

In [7]:
# dataset_path = "Datasets/data/"
# job_roles = os.listdir(dataset_path)
# resumes_by_role = {}

# for role in job_roles:
#     role_path = os.path.join(dataset_path, role)
#     if os.path.isdir(role_path):  # Ensure it's a directory
#         print(f"Processing role: {role}")  # Debugging: Print the role being processed
#         resumes_by_role[role] = read_resumes_from_folder(role_path)

In [8]:
# # Test with a single folder
# test_folder = "Datasets/data/data/ACCOUNTANT"
# resumes = read_resumes_from_folder(test_folder)

##  **Updated Code to Handle Nested Folder Structure**

In [9]:
import os

def extract_text_from_file(file_path):
    """
    Extracts text from a PDF or DOCX file.
    """
    if file_path.endswith(".pdf"):
        from PyPDF2 import PdfReader
        reader = PdfReader(file_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    elif file_path.endswith(".docx"):
        from docx import Document
        doc = Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
    return text

def read_resumes_from_folder(root_folder):
    """
    Recursively reads all PDF and DOCX files from a root folder and its subfolders.
    """
    resumes = []
    total_files = 0
    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith(".pdf") or filename.endswith(".docx"):
                total_files += 1
                file_path = os.path.join(dirpath, filename)
                try:
                    text = extract_text_from_file(file_path)
                    resumes.append(text)  # Append the extracted text, not 1
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")
    print(f"Total files found: {total_files}")
    print(f"Total resumes processed: {len(resumes)}")
    return resumes

# Example usage
dataset_path = "Datasets/data/data"  # Root folder containing all resumes
resumes = read_resumes_from_folder(dataset_path)

Total files found: 2484
Total resumes processed: 2484


### **Store Resumes by Job Role**

In [10]:
job_roles = os.listdir(dataset_path)
resumes_by_role = {}

for role in job_roles:
    role_path = os.path.join(dataset_path, role)
    resumes_by_role[role] = read_resumes_from_folder(role_path)

Total files found: 118
Total resumes processed: 118
Total files found: 118
Total resumes processed: 118
Total files found: 63
Total resumes processed: 63
Total files found: 97
Total resumes processed: 97
Total files found: 103
Total resumes processed: 103
Total files found: 36
Total resumes processed: 36
Total files found: 117
Total resumes processed: 117
Total files found: 115
Total resumes processed: 115
Total files found: 22
Total resumes processed: 22
Total files found: 120
Total resumes processed: 120
Total files found: 118
Total resumes processed: 118
Total files found: 112
Total resumes processed: 112
Total files found: 115
Total resumes processed: 115
Total files found: 107
Total resumes processed: 107
Total files found: 96
Total resumes processed: 96
Total files found: 118
Total resumes processed: 118
Total files found: 118
Total resumes processed: 118
Total files found: 117
Total resumes processed: 117
Total files found: 115
Total resumes processed: 115
Total files found: 110

## Load All Resumes

In [11]:
# Combine all resumes into a single list
all_resumes = []
for role, resumes in resumes_by_role.items():
    all_resumes.extend(resumes)

print(f"Total resumes loaded: {len(all_resumes)}")

Total resumes loaded: 2484


## **2. TF-IDF Implementation**

Steps for TF-IDF

### 2.1 Preprocess Text
Tokenize, remove stopwords, and normalize text (e.g., lowercase).

In [12]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# def preprocess_text(texts):
#     vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
#     tfidf_matrix = vectorizer.fit_transform(texts)
#     return tfidf_matrix, vectorizer

### 2.2 Compare Resumes with Job Criteria
Use cosine similarity to compare resumes with job descriptions.



In [13]:
# from sklearn.metrics.pairwise import cosine_similarity

# def rank_resumes(job_description, resumes, vectorizer):
#     job_tfidf = vectorizer.transform([job_description])
#     resume_tfidf = vectorizer.transform(resumes)
#     similarities = cosine_similarity(job_tfidf, resume_tfidf)
#     ranked_indices = similarities.argsort()[0][::-1]
#     return ranked_indices

### 2.3 Rank Resumes 
Use the ranked indices to sort resumes

In [14]:
# def get_ranked_resumes(job_description, resumes):
#     tfidf_matrix, vectorizer = preprocess_text(resumes)
#     ranked_indices = rank_resumes(job_description, resumes, vectorizer)
#     ranked_resumes = [resumes[i] for i in ranked_indices]
#     return ranked_resumes

In [15]:
9

9

In [16]:
print("Hello Amanullah Shah.")

Hello Amanullah Shah.


In [17]:
# come back to coding world. 

## TF-IDF Code Implementation

In [18]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Define a function to compute TF-IDF and rank resumes
def rank_resumes_using_tfidf(job_description, resumes):
    """
    This function takes a job description and a list of resumes,
    computes TF-IDF vectors, and ranks the resumes based on cosine similarity.
    
    Parameters:
        job_description (str): The job description text.
        resumes (list): A list of resume texts.
    
    Returns:
        ranked_indices (list): Indices of resumes sorted by relevance.
        tfidf_matrix: The TF-IDF matrix for resumes.
        vectorizer: The TF-IDF vectorizer object.
    """
    
    # Step 2: Initialize the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
    
    # Step 3: Fit and transform the resumes into TF-IDF vectors
    tfidf_matrix = vectorizer.fit_transform(resumes)
    
    # Step 4: Transform the job description into a TF-IDF vector
    job_tfidf = vectorizer.transform([job_description])
    
    # Step 5: Compute cosine similarity between the job description and resumes
    similarities = cosine_similarity(job_tfidf, tfidf_matrix)
    
    # Step 6: Sort resumes by similarity scores (highest to lowest)
    ranked_indices = similarities.argsort()[0][::-1]
    
    return ranked_indices, tfidf_matrix, vectorizer

### **Define a Job Description**

In [19]:
# Create a job description that you want to use for ranking the resumes. 

job_description = """
We are looking for a software engineer with experience in Python, machine learning, and data analysis.
The ideal candidate should have strong problem-solving skills and a background in software development.
"""

### Use of the "rank_resumes_using_tfidf function"

In [20]:
# Rank resumes using TF-IDF
ranked_indices, tfidf_matrix, vectorizer = rank_resumes_using_tfidf(job_description, all_resumes)

# Print ranked resumes
# for idx in ranked_indices:
#     print(f"Rank {ranked_indices.tolist().index(idx) + 1}: Resume {idx + 1}")
#     print(all_resumes[idx][:500] + "...")  # Print the first 500 characters of each resume
#     print()

## **Save Ranked Resumes to a File**

In [21]:
# Save ranked resumes to a file with UTF-8 encoding
with open("ranked_resumes.txt", "w", encoding="utf-8") as file:
    for idx in ranked_indices:
        file.write(f"Rank {ranked_indices.tolist().index(idx) + 1}: Resume {idx + 1}\n")
        file.write(all_resumes[idx] + "\n\n")

## Save Results to a CSV File

In [22]:
import pandas as pd

# Create a DataFrame to store the ranked resumes
ranked_data = []
for idx in ranked_indices:
    rank = ranked_indices.tolist().index(idx) + 1
    resume_text = all_resumes[idx]
    ranked_data.append({
        "Rank": rank,
        "Resume Index": idx + 1,
        "Resume Text": resume_text[:500] + "..."  # Truncate to 500 characters
    })

# Convert to DataFrame
df = pd.DataFrame(ranked_data)

# Save to CSV
df.to_csv("ranked_resumes.csv", index=False, encoding="utf-8")

In [27]:
df.head()

Unnamed: 0,Rank,Resume Index,Resume Text,Feedback
0,1,1460,ENGINEERING OPERATIONS DIRECTOR\nExecutive Pro...,"Missing skills: analysis, data, problem, solvi..."
1,2,2112,INFORMATION TECHNOLOGY SPECIALIST\nExperience\...,"Missing skills: python, candidate, strong, ide..."
2,3,1465,ENGINEERING AND QUALITY TECHNICIAN\nCareer Ove...,"Missing skills: problem, solving, candidate, e..."
3,4,1935,HR REPRESENTATIVE\nSummary\nA motivated busine...,"Missing skills: analysis, python, software, en..."
4,5,513,DATA ANALYST\nProfessional Summary\nIndustrial...,"Missing skills: software, candidate, engineer,..."


## Add feedback to the CSV

In [23]:
import pandas as pd

# Function to generate feedback (from earlier code)
def generate_feedback(job_description, resume, vectorizer):
    job_tfidf = vectorizer.transform([job_description])
    resume_tfidf = vectorizer.transform([resume])
    feature_names = vectorizer.get_feature_names_out()
    job_keywords = set(feature_names[job_tfidf.indices])
    resume_keywords = set(feature_names[resume_tfidf.indices])
    missing_skills = job_keywords - resume_keywords
    feedback = f"Missing skills: {', '.join(missing_skills)}" if missing_skills else "Good match!"
    return feedback

# Create a DataFrame with feedback
ranked_data = []
for idx in ranked_indices:
    rank = ranked_indices.tolist().index(idx) + 1
    resume_text = all_resumes[idx]
    feedback = generate_feedback(job_description, resume_text, vectorizer)
    ranked_data.append({
        "Rank": rank,
        "Resume Index": idx + 1,
        "Resume Text": resume_text[:500] + "...",
        "Feedback": feedback
    })

# Save to CSV
df = pd.DataFrame(ranked_data)
df.to_csv("ranked_resumes_with_feedback.csv", index=False, encoding="utf-8")

### The output:

In [24]:
import pandas as pd

df = pd.read_csv("ranked_resumes_with_feedback.csv")
print(df.head())  # Display the top 5 rows

   Rank  Resume Index                                        Resume Text  \
0     1          1460  ENGINEERING OPERATIONS DIRECTOR\nExecutive Pro...   
1     2          2112  INFORMATION TECHNOLOGY SPECIALIST\nExperience\...   
2     3          1465  ENGINEERING AND QUALITY TECHNICIAN\nCareer Ove...   
3     4          1935  HR REPRESENTATIVE\nSummary\nA motivated busine...   
4     5           513  DATA ANALYST\nProfessional Summary\nIndustrial...   

                                            Feedback  
0  Missing skills: analysis, data, problem, solvi...  
1  Missing skills: python, candidate, strong, ide...  
2  Missing skills: problem, solving, candidate, e...  
3  Missing skills: analysis, python, software, en...  
4  Missing skills: software, candidate, engineer,...  


In [26]:
df.head()

Unnamed: 0,Rank,Resume Index,Resume Text,Feedback
0,1,1460,ENGINEERING OPERATIONS DIRECTOR\nExecutive Pro...,"Missing skills: analysis, data, problem, solvi..."
1,2,2112,INFORMATION TECHNOLOGY SPECIALIST\nExperience\...,"Missing skills: python, candidate, strong, ide..."
2,3,1465,ENGINEERING AND QUALITY TECHNICIAN\nCareer Ove...,"Missing skills: problem, solving, candidate, e..."
3,4,1935,HR REPRESENTATIVE\nSummary\nA motivated busine...,"Missing skills: analysis, python, software, en..."
4,5,513,DATA ANALYST\nProfessional Summary\nIndustrial...,"Missing skills: software, candidate, engineer,..."


In [28]:
df['Resume Text']

0       ENGINEERING OPERATIONS DIRECTOR\nExecutive Pro...
1       INFORMATION TECHNOLOGY SPECIALIST\nExperience\...
2       ENGINEERING AND QUALITY TECHNICIAN\nCareer Ove...
3       HR REPRESENTATIVE\nSummary\nA motivated busine...
4       DATA ANALYST\nProfessional Summary\nIndustrial...
                              ...                        
2479    PH INPATIENT/FINANCIAL ADVOCATE SUPERVISOR\nEd...
2480    INSTRUCTOR/WRITER\nExperience\nInstructor/Writ...
2481    SUPERVISOR\nSummary\nI was a supervisor for tw...
2482    EXECUTIVE DIRECTOR\nProfessional Experience\nE...
2483                                                  ...
Name: Resume Text, Length: 2484, dtype: object

In [29]:
df.iloc[3,2]

'HR REPRESENTATIVE\nSummary\nA motivated business partner who communicates and collaborates effectively with all levels of personnel by relying on outstanding interpersonal\nand customer service skills, excellent sense of urgency and time management skills and taking pride in delivering high quality work.\nHighlights\nprovide a high level of service to both internal and\nexternal clients and candidates\npossess excellent communication skills\nan organizational guru communication skills,\nhave the ability ...'