# 📌 Cell 1: List All PDF Files in the Dataset

In [1]:
import os

# Walk through the Kaggle input directory and list only PDF files.
pdf_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename.lower().endswith(".pdf"):
            file_path = os.path.join(dirname, filename)
            pdf_files.append(file_path)
            #print(file_path)

print(f"\nTotal PDF files found: {len(pdf_files)}")



Total PDF files found: 2484


In [2]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading pypdfium

# 📌 Cell 2: Improved Text Extraction from PDFs

In [3]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using pdfplumber.
    
    Parameters:
        pdf_path (str): Path to the PDF file.
    
    Returns:
        text (str): Extracted text.
    """
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text.strip()

# Test the function on the first PDF file if needed:
# print(extract_text_from_pdf(pdf_files[0])[:500])


# 📌 Cell 3: Load and Process All PDF Resumes

In [4]:
def read_resumes_from_files(file_list):
    """
    Reads all PDF resumes from a list of file paths and extracts text.
    
    Parameters:
        file_list (list): List of PDF file paths.
    
    Returns:
        resumes (list): List of extracted resume texts.
    """
    resumes = []
    for file_path in file_list:
        text = extract_text_from_pdf(file_path)
        if text:  # Only add if text extraction was successful
            resumes.append(text)
    print(f"Total resumes processed: {len(resumes)}")
    return resumes

# Load all resumes from the collected PDF file paths.
all_resumes = read_resumes_from_files(pdf_files)


Total resumes processed: 2483


# Cell 4: Enhanced Preprocessing with spaCy (Lemmatization & Stopword Removal)

In [5]:
import re
import spacy

# Load spaCy's English model (make sure this model is available on Kaggle)
nlp = spacy.load("en_core_web_sm")

def preprocess_text_spacy(text):
    """
    Preprocesses resume text by removing extra spaces, lowercasing, lemmatizing, 
    and removing stopwords and punctuation using spaCy.
    
    Parameters:
        text (str): Original text.
    
    Returns:
        processed_text (str): Preprocessed text.
    """
    # Clean up spaces/newlines
    text = re.sub(r'\s+', ' ', text)
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Preprocess all resumes.
processed_resumes = [preprocess_text_spacy(resume) for resume in all_resumes]
print("✅ Text preprocessing complete with spaCy!")


✅ Text preprocessing complete with spaCy!


# Cell 5: Compute Sentence‑BERT Embeddings (Replace Word2Vec/TF‑IDF)

In [6]:
!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer
import torch

# Load a pre-trained Sentence-BERT model.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for each processed resume.
resume_embeddings = sbert_model.encode(processed_resumes, convert_to_tensor=True)
print("✅ Sentence-BERT embeddings computed!")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

✅ Sentence-BERT embeddings computed!


# 📌 Cell 6: Bias Mitigation – Remove Demographic Indicators

In [7]:
def remove_demographic_indicators(text):
    """
    Removes demographic indicators (e.g., names, locations) using spaCy's NER.
    
    Parameters:
        text (str): Input text.
    
    Returns:
        cleaned_text (str): Text with demographic entities removed.
    """
    doc = nlp(text)
    tokens = [token.text for token in doc if token.ent_type_ not in ["PERSON", "GPE"]]
    return " ".join(tokens)

# Apply bias mitigation on the processed resumes.
debiased_resumes = [remove_demographic_indicators(text) for text in processed_resumes]
print("✅ Bias mitigation applied on resume texts!")

# (Optional) Recompute embeddings on debiased resumes for ranking:
debiased_embeddings = sbert_model.encode(debiased_resumes, convert_to_tensor=True)


✅ Bias mitigation applied on resume texts!


Batches:   0%|          | 0/78 [00:00<?, ?it/s]

# 📌 Cell 7: Enhanced Feedback Generation Ranking with Sentence‑BERT

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Define your job description for ranking.
job_description = "We are seeking a skilled designer with strong experience in graphic design, UI/UX, and creative problem solving."

# Compute the job description embedding.
job_embedding = sbert_model.encode(job_description, convert_to_tensor=True)

# Convert tensors to NumPy arrays (if needed for cosine similarity)
job_embedding_np = job_embedding.cpu().numpy()
debiased_embeddings_np = debiased_embeddings.cpu().numpy()

# Compute cosine similarity between the job description and each resume.
similarities = cosine_similarity([job_embedding_np], debiased_embeddings_np)[0]

# Get ranked indices (highest similarity first).
ranked_indices = np.argsort(similarities)[::-1]

print("Ranking complete. Top 5 similarity scores:")
for i in range(min(5, len(similarities))):
    print(f"Rank {i+1}: Resume Index {ranked_indices[i]} with similarity {similarities[ranked_indices[i]]:.4f}")

# Feedback functions (you can later extend these with more advanced interpretable methods).
def recruiter_feedback(resume_text):
    if len(resume_text.split()) < 50:
        return "This resume may lack sufficient details."
    elif "experience" not in resume_text:
        return "Consider looking for resumes with clear experience details."
    else:
        return "Resume appears well-detailed."

def job_seeker_feedback(resume_text):
    missing_keywords = []
    essential_keywords = ["experience", "skills", "education", "projects"]
    for keyword in essential_keywords:
        if keyword not in resume_text:
            missing_keywords.append(keyword)
    if missing_keywords:
        return f"Consider adding: {', '.join(missing_keywords)}."
    else:
        return "Your resume appears comprehensive!"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Ranking complete. Top 5 similarity scores:
Rank 1: Resume Index 104 with similarity 0.6209
Rank 2: Resume Index 91 with similarity 0.5814
Rank 3: Resume Index 52 with similarity 0.5730
Rank 4: Resume Index 77 with similarity 0.5720
Rank 5: Resume Index 35 with similarity 0.5714


# 📌 Cell 8: Main Execution – Process, Generate Feedback, and Save Results

In [9]:
import pandas as pd

results = []
# Use the ranking from the cosine similarity computed on debiased embeddings.
for rank, idx in enumerate(ranked_indices, start=1):
    resume_text = debiased_resumes[idx]
    rec_feedback = recruiter_feedback(resume_text)
    cand_feedback = job_seeker_feedback(resume_text)
    
    results.append({
        "Rank": rank,
        "Resume Index": idx + 1,  # converting 0-index to 1-index for display
        "Similarity Score": similarities[idx],
        "Recruiter Feedback": rec_feedback,
        "Job Seeker Feedback": cand_feedback,
        "Resume Snippet": resume_text[:500] + "..."
    })

# Convert the results to a DataFrame.
results_df = pd.DataFrame(results)

# Save the results to a CSV file.
output_path = "/kaggle/working/resume_feedback_results_with_ranking.csv"
results_df.to_csv(output_path, index=False)
print(f"✅ Results saved to {output_path}")

# Display the first few rows of the results.
results_df.head()


✅ Results saved to /kaggle/working/resume_feedback_results_with_ranking.csv


Unnamed: 0,Rank,Resume Index,Similarity Score,Recruiter Feedback,Job Seeker Feedback,Resume Snippet
0,1,105,0.620872,Resume appears well-detailed.,Consider adding: projects.,multimedia designer graphic designer portfolio...
1,2,92,0.58139,Resume appears well-detailed.,Consider adding: projects.,creative graphic designer summary review post ...
2,3,53,0.572957,Resume appears well-detailed.,Consider adding: projects.,product web designer summary career 34 year in...
3,4,78,0.572001,Resume appears well-detailed.,Consider adding: projects.,freelance graphic designer highlights web prin...
4,5,36,0.571389,Resume appears well-detailed.,Consider adding: projects.,graphic designer summary driven graphic artist...


In [10]:
results_df.head()

Unnamed: 0,Rank,Resume Index,Similarity Score,Recruiter Feedback,Job Seeker Feedback,Resume Snippet
0,1,105,0.620872,Resume appears well-detailed.,Consider adding: projects.,multimedia designer graphic designer portfolio...
1,2,92,0.58139,Resume appears well-detailed.,Consider adding: projects.,creative graphic designer summary review post ...
2,3,53,0.572957,Resume appears well-detailed.,Consider adding: projects.,product web designer summary career 34 year in...
3,4,78,0.572001,Resume appears well-detailed.,Consider adding: projects.,freelance graphic designer highlights web prin...
4,5,36,0.571389,Resume appears well-detailed.,Consider adding: projects.,graphic designer summary driven graphic artist...


# 📌 Cell 9: View a Specific CV (e.g., Top-Ranked Resume)

In [11]:
from IPython.display import IFrame

# To view the top-ranked resume, use the first element from ranked_indices.
top_resume_index = ranked_indices[0]
top_resume_file = pdf_files[top_resume_index]

print(f"Displaying the top-ranked resume from file: {top_resume_file}")
IFrame(top_resume_file, width=800, height=600)


Displaying the top-ranked resume from file: /kaggle/input/data/data/DESIGNER/29147100.pdf


In [12]:
!lscpu


Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   4
  On-line CPU(s) list:    0-3
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.20GHz
    CPU family:           6
    Model:                79
    Thread(s) per core:   2
    Core(s) per socket:   2
    Socket(s):            1
    Stepping:             0
    BogoMIPS:             4399.99
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht sysc
                          all nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xt
                          opology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq
                           ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
                           aes xsave avx f16c rdrand hypervisor 