# 📌 Cell 1: List All PDF Files in the Dataset

In [1]:
import os

# Walk through the Kaggle input directory and list only PDF files.
pdf_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename.lower().endswith(".pdf"):
            file_path = os.path.join(dirname, filename)
            pdf_files.append(file_path)
            #print(file_path)

print(f"\nTotal PDF files found: {len(pdf_files)}")



Total PDF files found: 2484


In [2]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading 

# 📌 Cell 2: Improved Text Extraction from PDFs

In [3]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using pdfplumber.
    
    Parameters:
        pdf_path (str): Path to the PDF file.
    
    Returns:
        text (str): Extracted text.
    """
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text.strip()

# Example: Test the function on the first PDF file (uncomment to test)
# print(extract_text_from_pdf(pdf_files[0])[:500])


# 📌 Cell 3: Load and Process All PDF Resumes

In [4]:
def read_resumes_from_files(file_list):
    """
    Reads all PDF resumes from a list of file paths and extracts text.
    
    Parameters:
        file_list (list): List of PDF file paths.
    
    Returns:
        resumes (list): List of extracted resume texts.
    """
    resumes = []
    for file_path in file_list:
        text = extract_text_from_pdf(file_path)
        if text:  # Only add if text extraction was successful
            resumes.append(text)
    print(f"Total resumes processed: {len(resumes)}")
    return resumes

# Load all resumes from the collected PDF file paths.
all_resumes = read_resumes_from_files(pdf_files)


Total resumes processed: 2483


# 📌 Cell 4: Preprocess the Extracted Text

In [5]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words("english"))

def clean_text(text):
    """
    Cleans resume text by converting to lowercase, removing special characters,
    extra spaces, and stopwords.
    
    Parameters:
        text (str): Original text.
    
    Returns:
        cleaned_text (str): Cleaned text.
    """
    text = text.lower()  # Lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces/newlines
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    cleaned_text = " ".join(tokens)
    return cleaned_text

# Preprocess all resumes.
cleaned_resumes = [clean_text(resume) for resume in all_resumes]

print("✅ Text preprocessing complete!")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
✅ Text preprocessing complete!


# 📌 Cell 5: Train a Word2Vec Model on the Resumes

In [6]:
from gensim.models import Word2Vec

def train_word2vec(resume_texts):
    """
    Trains a Word2Vec model on a list of resume texts.
    
    Parameters:
        resume_texts (list): List of cleaned resume texts.
    
    Returns:
        word2vec_model: Trained Word2Vec model.
    """
    # Tokenize each resume into words.
    tokenized_resumes = [word_tokenize(text) for text in resume_texts]
    # Train Word2Vec model with specified parameters.
    word2vec_model = Word2Vec(sentences=tokenized_resumes, vector_size=100, window=5, min_count=2, workers=4)
    return word2vec_model

# Train the model on the cleaned resumes.
w2v_model = train_word2vec(cleaned_resumes)
print("✅ Word2Vec model trained successfully!")


✅ Word2Vec model trained successfully!


# 📌 Cell 6: Bias Mitigation – Remove Demographic Indicators

In [7]:
import spacy

# Load spaCy's small English model for NER (make sure it's available on Kaggle)
nlp = spacy.load("en_core_web_sm")

def remove_demographic_indicators(text):
    """
    Removes demographic indicators (e.g., names, locations) using spaCy's NER.
    
    Parameters:
        text (str): Input text.
    
    Returns:
        cleaned_text (str): Text with demographic entities removed.
    """
    doc = nlp(text)
    # Keep tokens that are not tagged as PERSON or GPE (geopolitical entity)
    tokens = [token.text for token in doc if token.ent_type_ not in ["PERSON", "GPE"]]
    return " ".join(tokens)

# Apply bias mitigation on the cleaned resumes.
debiased_resumes = [remove_demographic_indicators(text) for text in cleaned_resumes]

print("✅ Bias mitigation applied on resume texts!")


✅ Bias mitigation applied on resume texts!


# 📌 Cell 7: Enhanced Feedback Generation

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define a sample job description for ranking (modify as needed)
job_description = "We are seeking a skilled designer with strong experience in graphic design, UI/UX, and creative problem solving."

# Create a TF-IDF vectorizer and fit on the debiased resumes.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(debiased_resumes)

# Transform the job description into a TF-IDF vector.
job_tfidf = tfidf_vectorizer.transform([job_description])

# Compute cosine similarity between the job description and each resume.
similarities = cosine_similarity(job_tfidf, tfidf_matrix)[0]

# Get ranked indices (highest similarity first).
ranked_indices = similarities.argsort()[::-1]

print("Ranking complete. Top 5 similarity scores:")
for i in range(min(5, len(similarities))):
    print(f"Rank {i+1}: File Index {ranked_indices[i]} with similarity {similarities[ranked_indices[i]]:.4f}")


Ranking complete. Top 5 similarity scores:
Rank 1: File Index 93 with similarity 0.4327
Rank 2: File Index 69 with similarity 0.4123
Rank 3: File Index 45 with similarity 0.3064
Rank 4: File Index 803 with similarity 0.2972
Rank 5: File Index 104 with similarity 0.2938


# 📌 Cell 8: Main Execution – Process, Generate Feedback, and Save Results

In [9]:
def recruiter_feedback(resume_text):
    """
    Generates feedback for recruiters based on resume quality.
    
    Parameters:
        resume_text (str): Processed resume text.
    
    Returns:
        feedback (str): Feedback message.
    """
    if len(resume_text.split()) < 50:
        return "This resume may lack sufficient details."
    elif "experience" not in resume_text:
        return "Consider looking for resumes with clear experience details."
    else:
        return "Resume appears well-detailed."

def job_seeker_feedback(resume_text):
    """
    Provides feedback for job seekers on how to improve their resumes.
    
    Parameters:
        resume_text (str): Processed resume text.
    
    Returns:
        feedback (str): Feedback message.
    """
    missing_keywords = []
    essential_keywords = ["experience", "skills", "education", "projects"]
    
    for keyword in essential_keywords:
        if keyword not in resume_text:
            missing_keywords.append(keyword)
    
    if missing_keywords:
        return f"Consider adding: {', '.join(missing_keywords)}."
    else:
        return "Your resume appears comprehensive!"

import pandas as pd

results = []
# Use the ranking from TF-IDF; ranked_indices gives the order (best first).
for rank, idx in enumerate(ranked_indices, start=1):
    resume_text = debiased_resumes[idx]
    rec_feedback = recruiter_feedback(resume_text)
    cand_feedback = job_seeker_feedback(resume_text)
    
    results.append({
        "Rank": rank,
        "File Index": idx + 1,  # converting 0-index to 1-index for display
        "Similarity Score": similarities[idx],
        "Recruiter Feedback": rec_feedback,
        "Job Seeker Feedback": cand_feedback,
        "Resume Snippet": resume_text[:500] + "..."
    })

# Convert the results to a DataFrame.
results_df = pd.DataFrame(results)

# Save the results to a CSV file in the working directory.
output_path = "/kaggle/working/resume_feedback_results_with_ranking.csv"
results_df.to_csv(output_path, index=False)
print(f"✅ Results saved to {output_path}")

# Display the first few rows of the results.
results_df.head()


✅ Results saved to /kaggle/working/resume_feedback_results_with_ranking.csv


Unnamed: 0,Rank,File Index,Similarity Score,Recruiter Feedback,Job Seeker Feedback,Resume Snippet
0,1,94,0.432674,Resume appears well-detailed.,Your resume appears comprehensive!,freelance ux ui interaction designer summary c...
1,2,70,0.412347,Resume appears well-detailed.,Your resume appears comprehensive!,lead ux ui designer executive profile insightf...
2,3,46,0.306386,Resume appears well-detailed.,Your resume appears comprehensive!,graphic designer summary highly creative multi...
3,4,804,0.297165,Consider looking for resumes with clear experi...,Consider adding: experience.,owner senior graphic designer ux designer app ...
4,5,105,0.293776,Resume appears well-detailed.,Consider adding: projects.,multimedia designer graphic designer portfolio...


In [2]:
results_df.head()

NameError: name 'results_df' is not defined

# 📌 Cell 9: View a Specific CV (e.g., Top-Ranked Resume)

In [1]:
from IPython.display import IFrame

# To view the top-ranked resume, use the first element from ranked_indices.
top_resume_index = ranked_indices[0]
top_resume_file = pdf_files[top_resume_index]

print(f"Displaying the top-ranked resume from file: {top_resume_file}")

# Display the PDF inline using IFrame (adjust width and height as desired).
IFrame(top_resume_file, width=800, height=600)


NameError: name 'ranked_indices' is not defined

In [12]:
!lscpu


Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   4
  On-line CPU(s) list:    0-3
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.20GHz
    CPU family:           6
    Model:                79
    Thread(s) per core:   2
    Core(s) per socket:   2
    Socket(s):            1
    Stepping:             0
    BogoMIPS:             4400.45
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht sysc
                          all nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xt
                          opology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq
                           ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
                           aes xsave avx f16c rdrand hypervisor 