In [None]:
pip install pandas numpy scikit-learn nltk transformers sentence-transformers pdfminer.six faiss-cpu matplotlib seaborn tqdm

Collecting pdfminer.six
  Downloading pdfminer_six-20250416-py3-none-any.whl.metadata (4.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0

In [None]:
import pandas as pd
import numpy as np
import re
import os
import pickle
from pdfminer.high_level import extract_text
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import time
import glob

# Fetch the NLTK resources this notebook relies on (sentence tokenizer data and stopword lists)
for _nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_nltk_resource)


def clean_text(text):
    """Normalise raw text: lowercase it, strip tags and punctuation, squeeze whitespace.

    Anything that is not a string yields an empty string.
    """
    if isinstance(text, str):
        normalized = text.lower()
        # Replace HTML-like tags first, then every non-word / non-space character, with a space.
        for pattern in (r'<.*?>', r'[^\w\s]'):
            normalized = re.sub(pattern, ' ', normalized)
        # Collapse whitespace runs into single spaces and trim both ends.
        return re.sub(r'\s+', ' ', normalized).strip()
    return ""

def preprocess_text(text):
    """Clean ``text`` and drop English stopwords and tokens of two characters or fewer.

    Returns the surviving tokens joined by single spaces, or "" when cleaning
    leaves nothing.
    """
    cleaned = clean_text(text)
    if cleaned:
        english_stops = set(stopwords.words('english'))
        kept_tokens = []
        for token in cleaned.split():
            # Keep only tokens longer than two characters that are not stopwords.
            if len(token) > 2 and token not in english_stops:
                kept_tokens.append(token)
        return ' '.join(kept_tokens)
    return ""

def tokenize_into_sentences(text):
    """Split text into cleaned, stopword-free sentences.

    Sentence boundaries are detected on the raw text, before any cleaning.
    ``clean_text`` replaces every punctuation character with a space, so
    cleaning first leaves the tokenizer no sentence terminators to split on
    and the whole document comes back as a single "sentence".

    Args:
        text: Raw document text. Non-string, empty or whitespace-only input
            yields an empty list.

    Returns:
        list[str]: One preprocessed string per sentence, in document order.
        Raw sentences of 10 characters or fewer are skipped, as are sentences
        that become empty after preprocessing.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    sentences = []
    for raw_sentence in sent_tokenize(text):
        # Skip fragments too short to be meaningful (same length threshold as before).
        if len(raw_sentence) <= 10:
            continue
        processed = preprocess_text(raw_sentence)
        # Preprocessing can remove everything, e.g. a sentence made only of stopwords.
        if processed:
            sentences.append(processed)
    return sentences

def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path``, or "" if extraction fails.

    Any exception raised during extraction is reported on stdout instead of
    being propagated to the caller.
    """
    try:
        pdf_text = extract_text(pdf_path)
    except Exception as e:
        print(f"Error extracting text from PDF ({pdf_path}): {e}")
        pdf_text = ""
    return pdf_text

def extract_resume_sections(text):
    """Extract the education, experience, skills and projects sections of a resume.

    A section starts at the first occurrence of its heading keyword
    (case-insensitive, optionally followed by ``:`` or ``-``) and runs until
    the next occurrence of any other known heading keyword, or the end of the
    text. Keywords are matched as whole words only, so words such as
    "experienced" or "educational" neither open nor terminate a section.

    Args:
        text: Full resume text.

    Returns:
        dict: Maps each section name that was found to its stripped body text.
        When no section is found, or every found section is empty, the key
        ``"full_text"`` is added holding the unmodified input text.
    """
    # Headings whose bodies are extracted.
    extracted_sections = ("education", "experience", "skills", "projects")
    # Every heading that can terminate a section (superset of the above).
    known_headings = extracted_sections + ("certifications", "achievements", "publications")

    result = {}
    for section in extracted_sections:
        next_sections_pattern = "|".join(
            heading for heading in known_headings if heading != section
        )
        # \b on both sides keeps keywords embedded in longer words from matching.
        pattern = (
            fr"(?i)\b(?:{section})\b\s*[:\-]*\s*(.*?)"
            fr"(?:\b(?:{next_sections_pattern})\b|$)"
        )
        match = re.search(pattern, text, re.DOTALL)
        if match:
            result[section] = match.group(1).strip()

    # If we couldn't find structured sections, use the whole text
    if all(not value for value in result.values()):
        result["full_text"] = text

    return result

def get_document_embedding(sentences, model):
    """Get a document embedding by averaging its sentence embeddings.

    Args:
        sentences: List of sentence strings to embed.
        model: Encoder whose ``encode(sentences)`` returns one vector per
            sentence (a ``SentenceTransformer`` in this notebook).

    Returns:
        numpy.ndarray: Mean of the sentence embeddings along axis 0. For an
        empty ``sentences`` list, a zero vector whose length is the embedding
        dimension reported by the model, or 384 when the model reports none.
    """
    if not sentences:
        # Size the zero vector from the model so that swapping in a model with a
        # different embedding width keeps every returned vector the same length.
        get_dimension = getattr(model, "get_sentence_embedding_dimension", None)
        dimension = get_dimension() if callable(get_dimension) else None
        return np.zeros(dimension or 384)  # 384: MiniLM-L6 width, the previous fixed default

    # Get embeddings for each sentence, then average them
    embeddings = model.encode(sentences)
    return np.mean(embeddings, axis=0)

def extract_skills(text):
    """Pull a list of skill strings out of resume text.

    Finds the first skills-style heading, takes the text that follows it up to
    the next blank line (or the end of the text), and splits that text on
    semicolons, commas, newlines and bullet characters. Returns an empty list
    when no such heading is present.
    """
    heading_re = re.compile(
        r"(?i)(?:skills|technical skills|programming languages|technologies)\s*[:\-]*\s*(.+?)(?:\n\s*\n|$)",
        re.DOTALL
    )
    found = heading_re.search(text)
    if found is None:
        return []

    # Split on the common delimiters, then discard blank fragments.
    fragments = (piece.strip() for piece in re.split(r'[;,\n•]', found.group(1).strip()))
    return [skill for skill in fragments if skill]

def process_resume_file(file_path, model):
    """Turn one resume PDF into a record of text, sections, skills and embeddings.

    Returns ``None`` when no text can be extracted from the file or when the
    extracted text yields no sentences.
    """
    raw_text = extract_text_from_pdf(file_path)
    if not raw_text:
        return None

    section_texts = extract_resume_sections(raw_text)

    # One averaged embedding per non-empty section, keyed "<section>_embedding".
    embeddings_by_section = {}
    for section_name, section_body in section_texts.items():
        if not section_body:
            continue
        section_sentences = tokenize_into_sentences(section_body)
        if not section_sentences:
            continue
        embeddings_by_section[f"{section_name}_embedding"] = get_document_embedding(section_sentences, model)

    # Whole-document sentences; without any there is nothing to embed.
    document_sentences = tokenize_into_sentences(raw_text)
    if not document_sentences:
        return None

    record = {
        "filename": os.path.basename(file_path),
        "path": file_path,
        "raw_text": raw_text,
        "sections": section_texts,
        "skills": extract_skills(raw_text),
        "full_embedding": get_document_embedding(document_sentences, model),
    }
    # Section-specific embeddings follow the fixed fields.
    record.update(embeddings_by_section)
    return record

def create_resume_embeddings(resume_files, model):
    """Process every resume path and collect the records that succeeded.

    Files for which ``process_resume_file`` returns a falsy value are left out
    of the returned list.
    """
    processed = (
        process_resume_file(path, model)
        for path in tqdm(resume_files, desc="Processing resumes")
    )
    return list(filter(None, processed))

def main():
    """Embed every resume PDF and write the results to disk.

    Loads the sentence-transformer model, collects ``*.pdf`` files from the
    ``resumes`` directory (falling back to a fixed list of file names when none
    are found), builds the per-resume records, pickles them and writes a
    one-row-per-resume summary CSV. Progress and total run time are printed.
    """
    # Start timing
    start_time = time.time()

    # Set up the model
    print("Loading sentence transformer model...")
    model_name = 'paraphrase-MiniLM-L6-v2'  # You can change this to a different model
    model = SentenceTransformer(model_name)

    # Output locations, named once so the files written and the paths reported
    # at the end cannot drift apart.
    embeddings_path = 'resume_embeddings.pkl'
    summary_path = 'resume_summary.csv'

    # Get resume files
    resume_dir = "resumes"  # Change this to your resume directory
    resume_files = []

    # Look for PDF files
    print(f"Looking for resume PDFs in {resume_dir}...")
    if os.path.exists(resume_dir):
        resume_files = glob.glob(os.path.join(resume_dir, "*.pdf"))

    # If no files found in the directory, use specific file list
    if not resume_files:
        print("No PDFs found in directory, using specific file list...")
        resume_files = [
            'nlp_1.pdf',
            'nlp_2.pdf',
            'nlp_3.pdf',
            'nlp_4.pdf',
            'nlp_5.pdf',
            'nlp_6.pdf',
            'nlp_7.pdf',
            'nlp_8.pdf',
            'nlp_9.pdf',
            'nlp_10.pdf'
            # Add more files as needed
        ]

    print(f"Found {len(resume_files)} resume files")

    # Process resumes
    print("Creating resume embeddings...")
    resume_data = create_resume_embeddings(resume_files, model)

    # Save the embeddings
    print(f"Saving {len(resume_data)} resume embeddings...")
    with open(embeddings_path, 'wb') as f:
        pickle.dump(resume_data, f)

    # Save a summary CSV with basic info
    summary_data = [{
        'filename': res['filename'],
        # "full_text" is the fallback entry stored when no section heading was
        # detected, so it must not be counted as a found section.
        'sections_found': sum(1 for name in res['sections'] if name != 'full_text'),
        'skills_count': len(res.get('skills', [])),
        'text_length': len(res['raw_text'])
    } for res in resume_data]

    summary_df = pd.DataFrame(summary_data)
    summary_df.to_csv(summary_path, index=False)

    # Calculate and print execution time
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Processing completed in {execution_time:.2f} seconds")
    print(f"Generated embeddings for {len(resume_data)} resumes")
    print(f"Saved embeddings to {embeddings_path}")
    print(f"Saved summary to {summary_path}")

# Run the pipeline when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Loading sentence transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Looking for resume PDFs in resumes...
No PDFs found in directory, using specific file list...
Found 10 resume files
Creating resume embeddings...


Processing resumes: 100%|██████████| 10/10 [00:09<00:00,  1.07it/s]

Saving 10 resume embeddings...
Processing completed in 16.92 seconds
Generated embeddings for 10 resumes
Saved embeddings to resume_embeddings.pkl
Saved summary to resume_summary.csv



