In [None]:
pip install pandas numpy scikit-learn nltk transformers sentence-transformers pdfminer.six faiss-cpu matplotlib seaborn tqdm

Collecting pdfminer.six
  Downloading pdfminer_six-20250416-py3-none-any.whl.metadata (4.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0

In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

def load_embeddings(resume_file="resume_embeddings.pkl", job_file="job_embeddings.pkl"):
    """Read the pickled resume and job embedding collections from disk.

    Args:
        resume_file: Path of the pickle holding the resume embeddings.
        job_file: Path of the pickle holding the job embeddings.

    Returns:
        A ``(resume_data, job_data)`` tuple with the unpickled objects, or
        ``(None, None)`` when either file does not exist (an error line is
        printed for the first missing file).
    """
    def _unpickle(path):
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at embedding files you produced yourself.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # Bail out early if either input is missing (resume file is checked first)
    resume_present = os.path.exists(resume_file)
    if not resume_present:
        print(f"Error: Resume embeddings file {resume_file} not found.")
        return None, None

    job_present = os.path.exists(job_file)
    if not job_present:
        print(f"Error: Job embeddings file {job_file} not found.")
        return None, None

    # Both files exist: load resumes first, then jobs
    resumes = _unpickle(resume_file)
    jobs = _unpickle(job_file)

    print(f"Loaded {len(resumes)} resume embeddings and {len(jobs)} job embeddings")
    return resumes, jobs

def load_test_dataset(test_file="test_dataset.xlsx"):
    """Read the labelled evaluation spreadsheet.

    Args:
        test_file: Path of the Excel file to read.

    Returns:
        The DataFrame produced by ``pd.read_excel``, or ``None`` (after
        printing a notice) when the file does not exist.
    """
    if os.path.exists(test_file):
        labelled_pairs = pd.read_excel(test_file)
        print(f"Loaded test dataset with {len(labelled_pairs)} resume-job title pairs")
        return labelled_pairs

    # Nothing to load
    print(f"Test dataset file {test_file} not found.")
    return None

def match_resume_to_jobs_cosine(resume_embedding, job_data, k=10):
    """Rank jobs against one resume embedding by cosine similarity.

    Args:
        resume_embedding: Embedding vector of the resume (anything
            ``np.asarray`` accepts); it is reshaped to a single row.
        job_data: Sequence of job dicts. Each must provide ``"embedding"``,
            ``"job_id"`` and ``"job_title"``; ``"formatted_experience_level"``,
            ``"location"``, ``"remote_allowed"`` and ``"work_type"`` are
            optional.
        k: Maximum number of matches to return.

    Returns:
        A list of at most ``k`` dicts ordered from most to least similar, each
        carrying the job metadata and its ``"similarity_score"``. An empty
        list is returned when there are no jobs or ``k`` is not positive.
    """
    # Nothing to rank: return the empty result callers already check for
    # instead of letting cosine_similarity raise on an empty matrix.
    if job_data is None or len(job_data) == 0 or k <= 0:
        return []

    # sklearn's cosine_similarity expects 2-D inputs: one row for the query
    query_vector = np.asarray(resume_embedding).reshape(1, -1)

    # Stack all job embeddings into a single (n_jobs, dim) matrix
    job_embeddings = np.array([job["embedding"] for job in job_data])

    # One similarity value per job
    similarities = cosine_similarity(query_vector, job_embeddings)[0]

    # Indices of the k highest similarities, best first
    top_indices = np.argsort(similarities)[::-1][:k]

    matches = []
    for idx in top_indices:
        job = job_data[idx]
        matches.append({
            "job_id": job["job_id"],
            "job_title": job["job_title"],
            "similarity_score": similarities[idx],  # Cosine similarity score
            "experience_level": job.get("formatted_experience_level", ""),
            "location": job.get("location", ""),
            "remote_allowed": job.get("remote_allowed", False),
            "work_type": job.get("work_type", "")
        })

    return matches

def print_resume_filenames(resume_data):
    """List every resume's stored filename, numbered from 1, as a debugging aid.

    Entries without a ``'filename'`` key are shown as ``No filename found``.
    """
    print("\nActual resume filenames in embeddings:")
    for position, entry in enumerate(resume_data, start=1):
        shown_name = entry.get('filename', 'No filename found')
        print(f"{position}. {shown_name}")

def map_test_names_to_actual_names(resume_data, test_data):
    """Build a mapping from test-dataset resume names to stored resume filenames.

    A mapping is proposed automatically and shown to the user, who may accept
    it or type a replacement. For each test name the proposal uses, in order:

    1. a resume whose filename (with or without its extension) equals the test
       name, compared case-insensitively;
    2. otherwise the resume at the same position in ``resume_data``.

    Matching by name first keeps the proposal correct when the two sources are
    ordered differently (e.g. ``nlp_10`` sorting before ``nlp_2``).

    Args:
        resume_data: Sequence of resume dicts; the ``'filename'`` key is read.
        test_data: DataFrame with a ``'Resume_title'`` column.

    Returns:
        Dict mapping test names to filenames. The manually entered mapping is
        returned when the user supplies at least one entry; otherwise the
        proposed mapping is returned. When no interactive input is available
        the proposed mapping is returned without prompting further.
    """
    # First, print the filenames to help with debugging
    print_resume_filenames(resume_data)

    all_filenames = [resume.get('filename', '') for resume in resume_data]
    test_names = test_data['Resume_title'].tolist()

    # Index filenames by full name and by extension-less stem (first one wins)
    filename_by_key = {}
    for filename in all_filenames:
        if not filename:
            continue
        full_name = str(filename)
        stem = os.path.splitext(os.path.basename(full_name))[0]
        for key in (full_name, stem):
            filename_by_key.setdefault(key.strip().lower(), filename)

    proposed_mapping = {}
    for i, test_name in enumerate(test_names):
        by_name = filename_by_key.get(str(test_name).strip().lower())
        if by_name is not None:
            proposed_mapping[test_name] = by_name
        elif i < len(all_filenames):
            # Fallback only: assume both sources share the same order
            proposed_mapping[test_name] = all_filenames[i]

    print("\nProposed mapping from test names to actual filenames:")
    for test_name, actual_name in proposed_mapping.items():
        print(f"{test_name} → {actual_name}")

    # Ask user to confirm or modify the mapping
    print("\nIs this mapping correct? (y/n)")
    try:
        response = input().strip().lower()
    except (EOFError, NotImplementedError):
        # Headless execution: stdin is closed (EOFError) or disabled by the
        # kernel (IPython raises a NotImplementedError subclass).
        print("No interactive input available; using the proposed mapping.")
        return proposed_mapping

    if response in ('y', 'yes'):
        return proposed_mapping

    # If not correct, allow manual mapping
    print("\nPlease enter the correct mapping in format 'test_name:actual_filename' (one per line)")
    print("Press Enter twice when done")

    # Typed keys are strings; translate them back to the original test-name
    # objects so lookups still work when the spreadsheet holds non-strings.
    original_name = {str(name).strip(): name for name in test_names}
    manual_mapping = {}

    while True:
        try:
            line = input().strip()
        except (EOFError, NotImplementedError):
            break
        if not line:
            break

        if ':' in line:
            typed_name, actual_name = line.split(':', 1)
            typed_name = typed_name.strip()
            manual_mapping[original_name.get(typed_name, typed_name)] = actual_name.strip()

    return manual_mapping if manual_mapping else proposed_mapping

def evaluate_job_title_matches_cosine(resume_data, job_data, test_data, output_dir="test_results_cosine"):
    """Measure how often the expected job title appears among a resume's top matches.

    For every row of the test dataset the resume is matched against all jobs
    with cosine similarity, and the expected title is searched (as a
    case-insensitive substring) in the top 1, 5 and 10 matched job titles.
    Per-resume charts, a CSV of detailed results, an accuracy chart and a
    confusion matrix are written to ``output_dir``.

    Args:
        resume_data: Sequence of resume dicts; ``"filename"`` and
            ``"full_embedding"`` are read.
        job_data: Sequence of job dicts as accepted by
            ``match_resume_to_jobs_cosine``.
        test_data: DataFrame with ``'Resume_title'`` and ``'Job Title'`` columns.
        output_dir: Directory that receives all generated artefacts.

    Returns:
        Dict with the hit counts (``top_1``, ``top_5``, ``top_10``),
        ``total_tested``, the matching ``*_accuracy`` ratios and the list of
        per-resume ``detailed_results``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Create a mapping from test dataset names to actual resume filenames
    filename_mapping = map_test_names_to_actual_names(resume_data, test_data)

    # Convert test_data to a dictionary for easier lookup
    test_dict = dict(zip(test_data['Resume_title'], test_data['Job Title']))

    results = {
        "top_1": 0,
        "top_5": 0,
        "top_10": 0,
        "total_tested": 0,
        "detailed_results": []
    }

    # Lookup by filename; entries lacking a filename cannot be addressed, so
    # they are left out rather than raising KeyError.
    resume_dict = {resume["filename"]: resume for resume in resume_data if resume.get("filename")}

    for test_resume_title, raw_expected_title in tqdm(test_dict.items(), desc="Evaluating resume-job matches (cosine)"):
        actual_filename = filename_mapping.get(test_resume_title)
        if not actual_filename:
            print(f"Warning: No mapping found for {test_resume_title}")
            continue

        resume = resume_dict.get(actual_filename)
        if not resume:
            print(f"Warning: Resume {actual_filename} not found in embeddings")
            continue

        # A missing cell arrives as NaN/None, and an empty string would be a
        # substring of every title (counting as a hit everywhere), so such
        # rows are excluded from the evaluation.
        if pd.isna(raw_expected_title) or not str(raw_expected_title).strip():
            print(f"Warning: No expected job title for resume {test_resume_title}")
            continue
        expected_job_title = str(raw_expected_title).strip()
        expected_key = expected_job_title.lower()

        matches = match_resume_to_jobs_cosine(resume["full_embedding"], job_data, k=10)
        if not matches:
            print(f"Warning: No job matches found for resume {test_resume_title}")
            continue

        matched_job_titles = [match["job_title"] for match in matches]

        # 1-based rank of the first title containing the expected title.
        # Partial matching tolerates variations in title naming.
        first_hit_rank = next(
            (rank for rank, title in enumerate(matched_job_titles, start=1)
             if expected_key in str(title).lower()),
            None,
        )
        in_top_1 = first_hit_rank is not None and first_hit_rank <= 1
        in_top_5 = first_hit_rank is not None and first_hit_rank <= 5
        in_top_10 = first_hit_rank is not None and first_hit_rank <= 10

        results["total_tested"] += 1
        if in_top_1:
            results["top_1"] += 1
        if in_top_5:
            results["top_5"] += 1
        if in_top_10:
            results["top_10"] += 1

        results["detailed_results"].append({
            "resume_title": test_resume_title,
            "actual_filename": actual_filename,
            "expected_job_title": expected_job_title,
            "top_match": matched_job_titles[0],
            "in_top_1": in_top_1,
            "in_top_5": in_top_5,
            "in_top_10": in_top_10,
            "similarity_score": matches[0]["similarity_score"],
            "all_matches": matched_job_titles[:10]
        })

        # Create visualization for this match
        visualize_match(test_resume_title, expected_job_title, matches[:10], output_dir)

    # Accuracies default to 0 when nothing could be evaluated
    tested = results["total_tested"]
    for metric in ("top_1", "top_5", "top_10"):
        results[f"{metric}_accuracy"] = results[metric] / tested if tested > 0 else 0

    # Save detailed results to CSV
    detailed_df = pd.DataFrame(results["detailed_results"])
    detailed_df.to_csv(os.path.join(output_dir, "detailed_results.csv"), index=False)

    # Create summary visualization
    visualize_accuracy(results, output_dir)

    # Create a confusion matrix
    create_confusion_matrix(results["detailed_results"], output_dir)

    return results

def visualize_match(resume_title, expected_job_title, matches, output_dir):
    """Save a horizontal bar chart of one resume's matched jobs and their scores.

    Bars whose job title contains the expected title (case-insensitive) are
    drawn in green, all others in blue. The check uses the full job title, so
    it agrees with the evaluation even when the axis label is truncated.

    Args:
        resume_title: Name of the resume; used in the chart title and file name.
        expected_job_title: Title the resume is expected to match.
        matches: Sequence of match dicts with ``'job_title'`` and
            ``'similarity_score'``.
        output_dir: Existing directory where ``match_<resume_title>.png`` is
            written (path separators in the title are replaced by ``_``).
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    full_titles = [str(match['job_title']) for match in matches]
    # Axis labels are shortened to 30 characters to keep the chart readable
    job_titles = [title[:30] + '...' if len(title) > 30 else title for title in full_titles]
    scores = [match['similarity_score'] for match in matches]

    # Colour by the untruncated title: a match located past the 30th character
    # would otherwise be missed.
    expected_key = str(expected_job_title).lower()
    colors = ['#2ecc71' if expected_key in title.lower() else '#3498db'
              for title in full_titles]

    # Create horizontal bar chart
    positions = range(len(job_titles))
    bars = ax.barh(positions, scores, color=colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(job_titles)
    ax.set_title(f"Job Matches for {resume_title} (Cosine Similarity)\nExpected: {expected_job_title}")
    ax.set_xlabel('Similarity Score')
    fig.tight_layout()

    # Add score labels just right of each bar
    for bar, score in zip(bars, scores):
        ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                f'{score:.3f}', va='center')

    # A path separator inside the title would point outside output_dir
    safe_title = str(resume_title).replace('/', '_').replace('\\', '_')
    fig.savefig(os.path.join(output_dir, f"match_{safe_title}.png"))
    plt.close(fig)

def visualize_accuracy(results, output_dir):
    """Write the overall accuracy bar chart and a plain-text summary.

    Args:
        results: Dict holding ``top_1``/``top_5``/``top_10`` counts, the
            matching ``*_accuracy`` ratios and ``total_tested``.
        output_dir: Directory receiving ``accuracy_metrics_cos.png`` and
            ``accuracy_summary_cos.txt``.
    """
    labels = ["Top-1", "Top-5", "Top-10"]
    accuracies = [results["top_1_accuracy"], results["top_5_accuracy"], results["top_10_accuracy"]]

    # --- bar chart -----------------------------------------------------
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(labels, accuracies, color=['#3498db', '#2ecc71', '#f39c12'])
    ax.set_title("Resume-Job Matching Accuracy (Cosine Similarity)")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0, 1.0)

    # Percentage printed slightly above each bar
    for position, accuracy in enumerate(accuracies):
        ax.text(position, accuracy + 0.01, f"{accuracy:.1%}", ha='center')

    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, "accuracy_metrics_cos.png"))
    plt.close(fig)

    # --- text summary --------------------------------------------------
    summary_lines = [
        "Resume-Job Matching Accuracy Summary (Cosine Similarity)\n",
        "==================================================\n\n",
        f"Total resumes tested: {results['total_tested']}\n\n",
        f"Top-1 Accuracy: {results['top_1_accuracy']:.2%} ({results['top_1']}/{results['total_tested']})\n",
        f"Top-5 Accuracy: {results['top_5_accuracy']:.2%} ({results['top_5']}/{results['total_tested']})\n",
        f"Top-10 Accuracy: {results['top_10_accuracy']:.2%} ({results['top_10']}/{results['total_tested']})\n",
    ]
    summary_path = os.path.join(output_dir, "accuracy_summary_cos.txt")
    with open(summary_path, 'w') as summary_file:
        summary_file.writelines(summary_lines)

def create_confusion_matrix(detailed_results, output_dir):
    """Save a heatmap of expected job titles versus top-1 matched job titles.

    Args:
        detailed_results: Sequence of per-resume dicts with
            ``"expected_job_title"`` and ``"top_match"``.
        output_dir: Directory receiving ``confusion_matrix_cos.png``.

    Returns:
        None. When ``detailed_results`` is empty a notice is printed and no
        file is written.
    """
    if not detailed_results:
        print("No detailed results to create confusion matrix")
        return

    # Extract expected and actual job titles
    y_true = [result["expected_job_title"] for result in detailed_results]
    y_pred = [result["top_match"] for result in detailed_results]

    # Rows and columns share one sorted label set
    all_titles = sorted(set(y_true + y_pred))
    title_index = {title: position for position, title in enumerate(all_titles)}

    # Single pass over the results instead of scanning them once per cell
    cm = np.zeros((len(all_titles), len(all_titles)))
    for true_title, pred_title in zip(y_true, y_pred):
        cm[title_index[true_title], title_index[pred_title]] += 1

    # Visualize confusion matrix
    plt.figure(figsize=(12, 10))
    # cm holds floats, hence the ".0f" annotation format
    sns.heatmap(cm, annot=True, fmt=".0f", cmap="Blues",
                xticklabels=all_titles, yticklabels=all_titles)
    plt.title("Confusion Matrix: Expected vs. Actual Job Titles (Cosine Similarity)")
    plt.ylabel("Expected Job Title")
    plt.xlabel("Actual Job Title (Top Match)")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "confusion_matrix_cos.png"))
    plt.close()

def generate_test_report(results, output_dir="test_results_cosine"):
    """Write a markdown report summarising the evaluation.

    The report contains the accuracy summary, one row per evaluated resume,
    the top-1 success rate grouped by expected job title, pointers to the
    generated image files and the test-name to filename mapping.

    Args:
        results: Dict as returned by ``evaluate_job_title_matches_cosine``
            (counts, ``*_accuracy`` ratios, ``total_tested`` and
            ``detailed_results``).
        output_dir: Directory receiving ``test_report_cos.md``; created if
            missing.

    Returns:
        None. When there are no detailed results a notice is printed and no
        report is written.
    """
    os.makedirs(output_dir, exist_ok=True)

    if not results["detailed_results"]:
        print("No detailed results to generate report")
        return

    def _cell(value):
        # A literal pipe or newline inside a value would break the table row
        return str(value).replace("|", "\\|").replace("\n", " ")

    def _mark(flag):
        return '✓' if flag else '✗'

    report_path = os.path.join(output_dir, "test_report_cos.md")

    # The report contains non-ASCII marks, so the encoding is fixed rather
    # than left to the platform default.
    with open(report_path, 'w', encoding="utf-8") as f:
        f.write("# Resume-Job Matcher Test Report (Cosine Similarity)\n\n")

        f.write("## Summary\n\n")
        f.write(f"Total resumes tested: {results['total_tested']}\n\n")
        f.write("| Metric | Score | Count |\n")
        f.write("|--------|-------|-------|\n")
        f.write(f"| Top-1 Accuracy | {results['top_1_accuracy']:.2%} | {results['top_1']}/{results['total_tested']} |\n")
        f.write(f"| Top-5 Accuracy | {results['top_5_accuracy']:.2%} | {results['top_5']}/{results['total_tested']} |\n")
        f.write(f"| Top-10 Accuracy | {results['top_10_accuracy']:.2%} | {results['top_10']}/{results['total_tested']} |\n\n")

        f.write("## Detailed Results\n\n")
        f.write("| Resume | Expected Job Title | Top Match | In Top-1 | In Top-5 | In Top-10 |\n")
        f.write("|--------|-------------------|-----------|----------|----------|----------|\n")

        for result in results["detailed_results"]:
            f.write(f"| {_cell(result['resume_title'])} | {_cell(result['expected_job_title'])} | {_cell(result['top_match'])} | ")
            f.write(f"{_mark(result['in_top_1'])} | {_mark(result['in_top_5'])} | {_mark(result['in_top_10'])} |\n")

        f.write("\n## Analysis\n\n")

        # Top-1 success rate grouped by expected job title
        resume_types = {}
        for result in results["detailed_results"]:
            stats = resume_types.setdefault(result["expected_job_title"], {"total": 0, "correct": 0})
            stats["total"] += 1
            if result["in_top_1"]:
                stats["correct"] += 1

        f.write("### Success Rate by Expected Job Title\n\n")
        f.write("| Job Title | Success Rate |\n")
        f.write("|-----------|-------------|\n")

        for job_title, stats in resume_types.items():
            success_rate = stats["correct"] / stats["total"] if stats["total"] > 0 else 0
            f.write(f"| {_cell(job_title)} | {success_rate:.2%} ({stats['correct']}/{stats['total']}) |\n")

        # File names below must match what the visualisation helpers write
        f.write("\n### Visualization\n\n")
        f.write(f"See the following files in the {output_dir} directory:\n\n")
        f.write("- accuracy_metrics_cos.png: Bar chart showing Top-1, Top-5, and Top-10 accuracy\n")
        f.write("- confusion_matrix_cos.png: Heatmap showing expected vs. actual job titles\n")
        f.write("- match_*.png: Individual visualizations for each resume match\n")

        f.write("\n## Mapping Information\n\n")
        f.write("| Test Name | Actual Filename |\n")
        f.write("|-----------|----------------|\n")

        for result in results["detailed_results"]:
            f.write(f"| {_cell(result['resume_title'])} | {_cell(result['actual_filename'])} |\n")

    print(f"Generated test report at {report_path}")

def main():
    """Run the cosine-similarity evaluation end to end and print a summary.

    Loads the embeddings and the test dataset with their default paths,
    evaluates the matching, writes the report into ``test_results_cosine``
    and prints the accuracies together with the elapsed wall-clock time.
    Returns early when either input cannot be loaded.
    """
    started_at = time.time()

    # All artefacts go into this directory
    output_dir = "test_results_cosine"
    os.makedirs(output_dir, exist_ok=True)

    # Inputs: embeddings first, then the labelled test set
    embeddings = load_embeddings()
    if any(part is None for part in embeddings):
        return
    resume_data, job_data = embeddings

    test_data = load_test_dataset()
    if test_data is None:
        print("Cannot run evaluation without test dataset.")
        return

    # Evaluation
    print("Evaluating resume-job matching using cosine similarity...")
    results = evaluate_job_title_matches_cosine(resume_data, job_data, test_data, output_dir)

    # Report
    print("Generating test report...")
    generate_test_report(results, output_dir)

    # Console summary
    print("\nTest Results (Cosine Similarity):")
    print(f"Top-1 Accuracy: {results['top_1_accuracy']:.2%} ({results['top_1']}/{results['total_tested']})")
    print(f"Top-5 Accuracy: {results['top_5_accuracy']:.2%} ({results['top_5']}/{results['total_tested']})")
    print(f"Top-10 Accuracy: {results['top_10_accuracy']:.2%} ({results['top_10']}/{results['total_tested']})")

    # Wall-clock duration of the whole run
    elapsed = time.time() - started_at
    print(f"\nExecution completed in {elapsed:.2f} seconds")
    print(f"Results saved to {output_dir} directory")


# Run the evaluation only when executed as a script / notebook cell
if __name__ == "__main__":
    main()

Loaded 10 resume embeddings and 19824 job embeddings
Loaded test dataset with 10 resume-job title pairs
Evaluating resume-job matching using cosine similarity...

Actual resume filenames in embeddings:
1. nlp_1.pdf
2. nlp_2.pdf
3. nlp_3.pdf
4. nlp_4.pdf
5. nlp_5.pdf
6. nlp_6.pdf
7. nlp_7.pdf
8. nlp_8.pdf
9. nlp_9.pdf
10. nlp_10.pdf

Proposed mapping from test names to actual filenames:
nlp_1 → nlp_1.pdf
nlp_2 → nlp_2.pdf
nlp_3 → nlp_3.pdf
nlp_4 → nlp_4.pdf
nlp_5 → nlp_5.pdf
nlp_6 → nlp_6.pdf
nlp_7 → nlp_7.pdf
nlp_8 → nlp_8.pdf
nlp_9 → nlp_9.pdf
nlp_10 → nlp_10.pdf

Is this mapping correct? (y/n)
y


Evaluating resume-job matches (cosine): 100%|██████████| 10/10 [00:04<00:00,  2.22it/s]


Generating test report...
Generated test report at test_results_cosine/test_report_cos.md

Test Results (Cosine Similarity):
Top-1 Accuracy: 10.00% (1/10)
Top-5 Accuracy: 50.00% (5/10)
Top-10 Accuracy: 60.00% (6/10)

Execution completed in 7.97 seconds
Results saved to test_results_cosine directory
