In [None]:
pip install pandas numpy scikit-learn nltk transformers sentence-transformers pdfminer.six faiss-cpu matplotlib seaborn tqdm

Collecting pdfminer.six
  Downloading pdfminer_six-20250416-py3-none-any.whl.metadata (4.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0

In [None]:
pip install gradio

Collecting gradio
  Downloading gradio-5.28.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [None]:
import gradio as gr
import numpy as np
import pandas as pd
import pickle
import os
import re
import traceback
from pdfminer.high_level import extract_text
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import faiss
import time

# Download required NLTK data.
# nltk.download() reports a failed download through its boolean return value
# (it does not raise by default), so each result is checked explicitly instead
# of assuming success whenever no exception was thrown.
try:
    _nltk_failed = [
        resource
        for resource in ('punkt', 'stopwords', 'punkt_tab')
        if not nltk.download(resource, quiet=True)
    ]
    if _nltk_failed:
        print(f"Warning: NLTK download issue: could not download {', '.join(_nltk_failed)}")
    else:
        print("NLTK data downloaded successfully")
except Exception as nltk_err:
    print(f"Warning: NLTK download issue: {nltk_err}")

# Load pre-computed job embeddings
def load_job_embeddings(job_file="job_embeddings.pkl"):
    """Read the pickled job-embedding collection stored at ``job_file``.

    Args:
        job_file: Path of the pickle file to read.

    Returns:
        ``(job_data, None)`` when the file was loaded, otherwise
        ``(None, error_message)``. Every outcome is also printed.
    """
    job_data = None
    error_msg = None
    try:
        if os.path.exists(job_file):
            # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
            with open(job_file, 'rb') as handle:
                job_data = pickle.load(handle)
            print(f"Successfully loaded {len(job_data)} job embeddings from {job_file}")
        else:
            error_msg = f"Error: Job embeddings file '{job_file}' not found in {os.getcwd()}"
            print(error_msg)
    except Exception as e:
        # Any failure (unreadable file, bad pickle, unsized payload) invalidates the result.
        job_data = None
        error_msg = f"Error loading job embeddings: {str(e)}"
        print(error_msg)
    return job_data, error_msg

# Text preprocessing functions
def clean_text(text):
    """Normalise raw text for downstream matching.

    Lowercases the input, replaces HTML-like tags and punctuation with spaces,
    then collapses whitespace runs. Non-string input yields an empty string.
    """
    if isinstance(text, str):
        cleaned = text.lower()
        # Tags must be stripped before punctuation, while '<' and '>' still delimit them.
        for pattern in (r'<.*?>', r'[^\w\s]'):
            cleaned = re.sub(pattern, ' ', cleaned)
        return re.sub(r'\s+', ' ', cleaned).strip()
    return ""

def preprocess_text(text):
    """Clean ``text`` and drop English stopwords and very short tokens.

    Returns the space-joined remaining tokens. If stopword filtering fails
    (for example because the NLTK corpus is unavailable) the cleaned text is
    returned unfiltered.
    """
    cleaned = clean_text(text)
    if not cleaned:
        return ""

    try:
        stop_words = set(stopwords.words('english'))
        kept = []
        for token in cleaned.split():
            # Keep only tokens longer than two characters that are not stopwords.
            if token not in stop_words and len(token) > 2:
                kept.append(token)
        return ' '.join(kept)
    except Exception as e:
        print(f"Error in preprocess_text: {e}")
        return cleaned

def tokenize_into_sentences(text):
    """Split text into preprocessed sentences.

    Sentence boundaries are detected on the RAW text. Running clean_text()
    first (as was done previously) strips every '.', '!' and '?' and collapses
    newlines, which leaves sent_tokenize nothing to split on, so the whole
    document came back as a single "sentence".

    Args:
        text: Raw document text.

    Returns:
        A list of non-empty, preprocessed sentence strings; an empty list for
        non-string/blank input or when tokenization fails.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    try:
        sentences = []
        for raw_sentence in sent_tokenize(text):
            # Skip fragments too short to carry meaning (same threshold as before).
            if len(raw_sentence) <= 10:
                continue
            processed = preprocess_text(raw_sentence)
            # Preprocessing can reduce a sentence to nothing (all stopwords/short tokens).
            if processed:
                sentences.append(processed)
        # NOTE(review): if the precomputed job embeddings were built with the old
        # single-chunk behaviour, confirm both sides use the same pipeline.
        return sentences
    except Exception as e:
        print(f"Error in tokenize_into_sentences: {e}")
        return []

def extract_resume_sections(text):
    """Extract the education/experience/skills/projects sections of a resume.

    For each wanted section, the text following the first occurrence of its
    header word is captured up to the next known header word (or the end of
    the text). Header words are matched case-insensitively and only as whole
    words, so e.g. "experienced" no longer counts as an "experience" header.

    Args:
        text: Raw resume text.

    Returns:
        A dict mapping each section name that was found to its stripped
        content. When no section has any content, the dict also carries the
        whole input under ``"full_text"``. On any error ``{"full_text": text}``
        is returned.
    """
    try:
        wanted_sections = ("education", "experience", "skills", "projects")
        # Headers that terminate a section: every known header except the section itself.
        all_headers = wanted_sections + ("certifications", "achievements", "publications")

        result = {}
        for section in wanted_sections:
            next_sections_pattern = "|".join(h for h in all_headers if h != section)
            pattern = (
                fr"\b{section}\b\s*[:\-]*\s*(.*?)"
                fr"(?:\b(?:{next_sections_pattern})\b|$)"
            )
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                result[section] = match.group(1).strip()

        # If we couldn't find any section content, fall back to the whole text
        if not any(result.values()):
            result["full_text"] = text

        return result
    except Exception as e:
        print(f"Error in extract_resume_sections: {e}")
        return {"full_text": text}

def extract_skills(text):
    """Extract skills from resume text.

    Two sources are combined, in this order:
      1. Known technical skills found anywhere in the text (case-insensitive,
         whole-token matches).
      2. Items listed after a "skills"-style header, split on ; , newline or
         bullet characters, up to the first blank line.

    Args:
        text: Raw resume text.

    Returns:
        A list of lowercase skill strings without duplicates; an empty list
        if extraction fails (e.g. non-string input).
    """
    try:
        # Common technical skills
        tech_skills = [
            "python", "java", "javascript", "c++", "c#", "ruby", "php", "html", "css",
            "sql", "nosql", "mongodb", "mysql", "postgresql", "oracle", "react", "angular",
            "vue", "node.js", "express", "django", "flask", "spring", "tensorflow",
            "pytorch", "keras", "scikit-learn", "pandas", "numpy", "excel", "tableau",
            "power bi", "aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "git",
            "machine learning", "deep learning", "nlp", "computer vision", "data science",
            "data analysis", "data visualization", "big data", "hadoop", "spark", "r"
        ]

        text_lower = text.lower()

        # Use lookarounds rather than \b: \b needs a word character on one side,
        # so a trailing \b after "c++" or "c#" fails whenever a space or
        # punctuation follows, and those skills were never detected.
        found_skills = [
            skill for skill in tech_skills
            if re.search(r'(?<!\w)' + re.escape(skill) + r'(?!\w)', text_lower)
        ]
        seen = set(found_skills)

        # Look for skills section
        skills_section_pattern = re.compile(
            r"(?:skills|technical skills|programming languages|technologies)\s*[:\-]*\s*(.+?)(?:\n\s*\n|$)",
            re.DOTALL | re.IGNORECASE
        )
        match = skills_section_pattern.search(text)
        if match:
            # Split by common delimiters, normalise, and add each new item once.
            for candidate in re.split(r'[;,\n•]', match.group(1)):
                candidate = candidate.strip().lower()
                if len(candidate) > 2 and candidate not in seen:
                    seen.add(candidate)
                    found_skills.append(candidate)

        return found_skills
    except Exception as e:
        print(f"Error in extract_skills: {e}")
        return []

def get_document_embedding(sentences, model):
    """Get a document embedding by averaging sentence embeddings.

    Args:
        sentences: List of sentence strings to encode.
        model: Object exposing ``encode(sentences)``. If it also exposes
            ``get_sentence_embedding_dimension()``, that value sizes the
            fallback vector.

    Returns:
        The mean of the sentence embeddings along axis 0. When ``sentences``
        is empty or encoding fails, a zero vector is returned instead. Its
        length is the model's reported dimension, or 384 (MiniLM-L6) when the
        model cannot report one.
    """
    # Derive the fallback size from the model so other models are not handed
    # a wrongly sized zero vector.
    dimension = 384  # Default embedding dimension for MiniLM-L6
    try:
        get_dimension = getattr(model, "get_sentence_embedding_dimension", None)
        if callable(get_dimension):
            dimension = int(get_dimension() or dimension)
    except Exception as dim_err:
        print(f"Error in get_document_embedding: {dim_err}")

    try:
        if not sentences:
            return np.zeros(dimension)

        # Get embeddings for each sentence, then average them
        embeddings = model.encode(sentences)
        return np.mean(embeddings, axis=0)
    except Exception as e:
        print(f"Error in get_document_embedding: {e}")
        return np.zeros(dimension)  # Return zero vector on error

def process_resume(resume_file, model):
    """Turn a resume PDF into text, sections, skills and an embedding.

    Args:
        resume_file: Path to the PDF on disk.
        model: Sentence-embedding model handed to get_document_embedding().

    Returns:
        A 5-tuple ``(text, sections_or_message, skills, embedding, seconds)``.
        On success the second item is the sections dict. On failure it is an
        error/warning string, ``embedding`` is ``None`` and ``seconds`` is 0.
    """
    try:
        started_at = time.time()

        # The file must exist before pdfminer is asked to read it.
        if not os.path.exists(resume_file):
            return None, f"Error: Resume file not found at {resume_file}", [], None, 0

        # Pull the raw text out of the PDF; an empty result is treated as a failure.
        try:
            text = extract_text(resume_file)
            if not (text and text.strip()):
                return None, "Error: Failed to extract text from PDF. The file may be empty, corrupted, or not a valid PDF.", [], None, 0
        except Exception as pdf_err:
            return None, f"Error extracting text from PDF: {str(pdf_err)}", [], None, 0

        sections = extract_resume_sections(text)
        skills = extract_skills(text)

        # Sentences feed the embedding; without any there is nothing to embed.
        all_sentences = tokenize_into_sentences(text)
        if all_sentences:
            embedding = get_document_embedding(all_sentences, model)
            elapsed = time.time() - started_at
            return text, sections, skills, embedding, elapsed

        return text, "Warning: Failed to extract meaningful sentences from resume text.", skills, None, 0
    except Exception as e:
        error_trace = traceback.format_exc()
        error_msg = f"Error processing resume: {str(e)}\n\n{error_trace}"
        print(error_msg)
        return None, error_msg, [], None, 0

def build_faiss_index(job_data):
    """Build a FAISS index for fast similarity search."""
    try:
        # Extract embeddings
        job_embeddings = np.array([job["embedding"] for job in job_data]).astype('float32')

        # Create index
        dimension = job_embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(job_embeddings)

        return index, None
    except Exception as e:
        error_msg = f"Error building FAISS index: {str(e)}"
        print(error_msg)
        return None, error_msg

def match_resume_to_jobs_faiss(resume_embedding, job_data, job_embeddings_index, k=10):
    """Match a resume to jobs using a FAISS index.

    Args:
        resume_embedding: Embedding vector of the resume.
        job_data: Sequence of job dicts, aligned with the rows of the index.
        job_embeddings_index: Object exposing ``search(query, k)`` that returns
            ``(distances, indices)``.
        k: Number of neighbours to request.

    Returns:
        ``(matches, None)`` on success, where each match is a dict describing
        one job plus a ``similarity_score`` of ``1 / (1 + distance)``;
        ``([], error_message)`` on failure.
    """
    try:
        # Convert embedding to a (1, dim) float32 query
        query_vector = np.asarray(resume_embedding, dtype='float32').reshape(1, -1)

        # Search the index
        distances, indices = job_embeddings_index.search(query_vector, k)

        matches = []
        for distance, idx in zip(distances[0], indices[0]):
            idx = int(idx)
            # FAISS pads the result with label -1 when fewer than k neighbours
            # exist; without the lower bound, job_data[-1] would be reported
            # as a bogus match.
            if not 0 <= idx < len(job_data):
                continue
            job = job_data[idx]
            matches.append({
                "job_id": job["job_id"],
                "job_title": job["job_title"],
                "similarity_score": 1 / (1 + float(distance)),  # Convert distance to similarity score
                "experience_level": job.get("formatted_experience_level", ""),
                "location": job.get("location", ""),
                "remote_allowed": job.get("remote_allowed", False),
                "work_type": job.get("work_type", "")
            })

        return matches, None
    except Exception as e:
        error_msg = f"Error in FAISS matching: {str(e)}"
        print(error_msg)
        return [], error_msg

def match_resume_to_jobs_cosine(resume_embedding, job_data, k=10):
    """Rank jobs by cosine similarity to the resume embedding.

    Returns:
        ``(matches, None)`` with the ``k`` highest-scoring jobs in descending
        score order, or ``([], error_message)`` on failure.
    """
    try:
        # sklearn expects 2-D inputs: one query row against the job matrix.
        query_vector = resume_embedding.reshape(1, -1)
        job_matrix = np.array([job["embedding"] for job in job_data])
        similarities = cosine_similarity(query_vector, job_matrix)[0]

        # Ascending argsort, reversed, then truncated to the best k.
        ranked = np.argsort(similarities)[::-1][:k]

        matches = [
            {
                "job_id": job_data[idx]["job_id"],
                "job_title": job_data[idx]["job_title"],
                "similarity_score": similarities[idx],  # Cosine similarity score
                "experience_level": job_data[idx].get("formatted_experience_level", ""),
                "location": job_data[idx].get("location", ""),
                "remote_allowed": job_data[idx].get("remote_allowed", False),
                "work_type": job_data[idx].get("work_type", "")
            }
            for idx in ranked
        ]
        return matches, None
    except Exception as e:
        error_msg = f"Error in cosine similarity matching: {str(e)}"
        print(error_msg)
        return [], error_msg

def format_matches_for_display(matches):
    """Format matches as a table for display in the UI.

    The result feeds a ``gr.Dataframe`` component, so a DataFrame is returned
    on every path. A plain string (as returned previously for "no matches"
    and on errors) is not tabular data for that component.

    Args:
        matches: Iterable of match dicts with at least ``job_title`` and
            ``similarity_score``; ``None`` or empty means "no matches".

    Returns:
        A DataFrame with columns Rank, Job Title, Similarity Score, Location
        and Remote?. It has zero rows when there are no matches or when
        formatting fails (the error is printed).
    """
    columns = ["Rank", "Job Title", "Similarity Score", "Location", "Remote?"]
    try:
        table_rows = [
            [
                rank,
                match["job_title"],
                f"{match['similarity_score']:.4f}",
                match.get("location", ""),
                "Yes" if match.get("remote_allowed", False) else "No",
            ]
            for rank, match in enumerate(matches or [], 1)
        ]
        return pd.DataFrame(table_rows, columns=columns)
    except Exception as e:
        error_msg = f"Error formatting matches: {str(e)}"
        print(error_msg)
        return pd.DataFrame(columns=columns)

def recommend_jobs(resume_file):
    """Main function to process a resume and recommend jobs.

    The sentence-transformer model, the job embeddings and the FAISS index are
    (re)loaded on every call.

    Args:
        resume_file: Path of the uploaded resume PDF, or an object exposing
            the path as ``.name``. ``None``/empty means nothing was uploaded.

    Returns:
        A 6-tuple matching the UI outputs, in order:
        ``(resume_info, skills_text, faiss_table, cosine_table,
        performance_metrics, resume_text)``. On failure the first item holds
        the error message and the items that could not be produced are
        ``None``.
    """
    try:
        print(f"\n--- Processing resume: {resume_file} ---")

        # Clicking the button without an upload delivers no file. Bail out
        # before the expensive model load instead of failing with a TypeError
        # traceback further down.
        if not resume_file:
            error_msg = "Error: No resume file provided. Please upload a PDF resume."
            print(error_msg)
            return error_msg, None, None, None, None, None

        # Tolerate upload objects that expose their path as `.name`; plain
        # path strings pass through unchanged.
        resume_path = getattr(resume_file, "name", resume_file)

        # Load model
        try:
            model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
            print("Sentence transformer model loaded successfully")
        except Exception as model_err:
            error_msg = f"Error loading sentence transformer model: {str(model_err)}"
            print(error_msg)
            return error_msg, None, None, None, None, None

        # Load job embeddings
        job_data, job_load_error = load_job_embeddings()
        if job_data is None:
            return job_load_error, None, None, None, None, None

        # Process resume
        text, sections_or_error, skills, embedding, processing_time = process_resume(resume_path, model)

        # Without an embedding nothing can be matched; the second item is the message
        if embedding is None:
            return sections_or_error, None, None, None, None, text

        # Format skills once so every later return hands the Textbox a string
        skills_text = ", ".join(skills) if skills else "No skills extracted"

        # Build FAISS index
        job_embeddings_index, faiss_error = build_faiss_index(job_data)
        if job_embeddings_index is None:
            return faiss_error, None, None, None, None, text

        # Match using FAISS
        faiss_start = time.time()
        faiss_matches, faiss_error = match_resume_to_jobs_faiss(embedding, job_data, job_embeddings_index, k=10)
        faiss_time = time.time() - faiss_start

        if faiss_error:
            return f"FAISS matching error: {faiss_error}", skills_text, None, None, None, text

        # Match using cosine similarity
        cosine_start = time.time()
        cosine_matches, cosine_error = match_resume_to_jobs_cosine(embedding, job_data, k=10)
        cosine_time = time.time() - cosine_start

        # Format FAISS results up front: they are shown even if cosine matching failed
        faiss_df = format_matches_for_display(faiss_matches)

        if cosine_error:
            return f"Cosine similarity matching error: {cosine_error}", skills_text, faiss_df, None, None, text

        # Format results for display
        if isinstance(sections_or_error, dict):
            sections = sections_or_error
            resume_info = f"**Extracted Sections:**\n"
            for section, content in sections.items():
                # Keep the overview readable: long sections are truncated
                if len(content) > 300:
                    content = content[:300] + "..."
                resume_info += f"- **{section.title()}**: {content}\n\n"

            resume_info += f"**Processing time:** {processing_time:.2f} seconds\n"
        else:
            resume_info = sections_or_error  # It's an error message

        cosine_df = format_matches_for_display(cosine_matches)

        performance_metrics = f"""
        **Performance Metrics:**
        - FAISS matching time: {faiss_time:.4f} seconds
        - Cosine similarity matching time: {cosine_time:.4f} seconds
        - Resume processing time: {processing_time:.2f} seconds
        - Total job embeddings: {len(job_data)}
        """

        print("Processing complete.")
        return resume_info, skills_text, faiss_df, cosine_df, performance_metrics, text

    except Exception as e:
        error_trace = traceback.format_exc()
        error_msg = f"Error in recommend_jobs: {str(e)}\n\n{error_trace}"
        print(error_msg)
        return error_msg, None, None, None, None, None

# Create the Gradio interface
def create_ui():
    """Build the Gradio Blocks app for the resume-job matcher.

    Layout: a narrow left column with the upload widget, the submit button and
    a collapsed accordion holding the extracted resume text; a wider right
    column with tabs for resume information, FAISS matches, cosine-similarity
    matches and performance metrics. The button runs recommend_jobs() and
    routes its six return values to the six output components.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` instance.
    """
    with gr.Blocks(title="Resume-Job Matcher Demo") as demo:
        gr.Markdown("# Resume-Job Matcher Demo")
        gr.Markdown("Upload a resume PDF to find matching jobs using both FAISS and cosine similarity methods.")

        with gr.Row():
            with gr.Column(scale=1):
                # Input components. Only PDFs can be processed downstream, so
                # restrict the upload widget to that extension.
                resume_upload = gr.File(label="Upload Resume (PDF)", file_types=[".pdf"])
                submit_btn = gr.Button("Find Matching Jobs", variant="primary")

                # Raw extracted text, collapsed by default
                with gr.Accordion("Resume Text", open=False):
                    resume_text_output = gr.Textbox(label="Extracted Resume Text", show_label=False)

            with gr.Column(scale=2):
                # Output components
                with gr.Tab("Resume Information"):
                    resume_info = gr.Markdown(label="Resume Information")
                    skills_output = gr.Textbox(label="Extracted Skills")

                with gr.Tab("FAISS Matches"):
                    faiss_matches = gr.Dataframe(label="Job Matches (FAISS)")

                with gr.Tab("Cosine Similarity Matches"):
                    cosine_matches = gr.Dataframe(label="Job Matches (Cosine Similarity)")

                with gr.Tab("Performance Metrics"):
                    metrics_output = gr.Markdown(label="Performance Metrics")

        # Set up the submit action; the output order must match recommend_jobs()'s return tuple
        submit_btn.click(
            fn=recommend_jobs,
            inputs=[resume_upload],
            outputs=[resume_info, skills_output, faiss_matches, cosine_matches, metrics_output, resume_text_output],
            api_name="process_resume"
        )

        # Examples section removed as it depends on specific files

    return demo

# Launch the app
if __name__ == "__main__":
    # Debugging aid: show where the app runs and what files sit next to it
    print(f"Current working directory: {os.getcwd()}")
    print(f"Files in directory: {os.listdir('.')}")

    # Report up front whether the embeddings file the app depends on is present
    job_file = "job_embeddings.pkl"
    if not os.path.exists(job_file):
        print(f"WARNING: Job embeddings file not found: {job_file}")
    else:
        print(f"Job embeddings file found: {job_file}")
        file_size = os.path.getsize(job_file) / (1024 * 1024)  # bytes -> MB
        print(f"File size: {file_size:.2f} MB")

    # Build the interface and start serving it
    demo = create_ui()
    demo.launch()

NLTK data downloaded successfully
Current working directory: /content
Files in directory: ['.config', 'job_embeddings.pkl', 'sample_data']
Job embeddings file found: job_embeddings.pkl
File size: 31.14 MB
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://24c2b300d0c7aad58c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
