In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict
import random

# job description
JOB_DESCRIPTION = {
    "text": "We are looking for a Data Scientist with strong Python skills...",
    "required_skills": ["Python", "Machine Learning", "Tableau", "SQL", "Pandas", "Scikit-learn"],
    "education": {
        "min_degree": "Master",
        "fields": ["Computer Science", "Data Science", "Statistics"]
    }
}

# Mock resume generator
def generate_test_resumes(num_samples=100) -> List[Dict]:
    """Generate realistic test resumes with ground truth labels"""
    test_resumes = []
    degree_tiers = {
        "PhD": 1.0,
        "Master": 0.8,
        "Bachelor": 0.6,
        "None": 0.3
    }

    for i in range(num_samples):

        skills = random.sample(JOB_DESCRIPTION["required_skills"],
                             k=random.randint(2, len(JOB_DESCRIPTION["required_skills"])))


        transferable_map = {
            "Tableau": ["PowerBI", "Looker"],
            "Python": ["R"],
            "Pandas": ["dplyr"]
        }
        for skill, alts in transferable_map.items():
            if skill in skills and random.random() > 0.7:
                skills.append(random.choice(alts))


        degree = random.choice(list(degree_tiers.keys()))
        education_score = degree_tiers[degree]


        experience = random.randint(0, 10)
        experience_score = min(experience / 5, 1.0)


        true_scores = {
            "skills": len(skills) / len(JOB_DESCRIPTION["required_skills"]),
            "education": education_score,
            "experience": experience_score,
            "total": 0.5*(len(skills)/len(JOB_DESCRIPTION["required_skills"])) +
                    0.2*education_score +
                    0.3*experience_score
        }

        should_accept = (
            true_scores["skills"] >= 0.6 and
            degree in ["Master", "PhD"] and
            experience >= 2
        )

        test_resumes.append({
            "text": f"""
                Name: Candidate_{i}
                Education: {degree} in {random.choice(JOB_DESCRIPTION['education']['fields'])}
                Experience: {experience} years
                Skills: {', '.join(skills)}
                Projects: Worked with {random.choice(skills)} on data analysis
            """,
            "true_scores": true_scores,
            "should_accept": should_accept,
            "skills": skills,
            "education": degree,
            "experience": experience
        })

    return test_resumes

class ModelEvaluator:
    def __init__(self, processor, scorer, job_desc, threshold=0.3):
        self.processor = processor
        self.scorer = scorer
        self.job_desc = job_desc
        self.threshold = threshold

    def evaluate(self, test_resumes: List[Dict]) -> Dict:
        """Full evaluation pipeline"""
        results = {"resumes": [], "metrics": {}}

        for resume in test_resumes:
            try:
                features = self.processor.extract_features(resume["text"])
                scores = self.scorer.calculate_scores(features, resume["text"])

                results["resumes"].append({
                    **resume,
                    "pred_scores": scores,
                    "pred_accept": scores["total_score"] >= self.threshold,
                    "matched_skills": list(set(features["skills"]) &
                                         set(self.job_desc["required_skills"]))
                })
            except Exception as e:
                print(f"Error processing resume: {str(e)}")


        self._compute_metrics(results)


        self._generate_plots(results)

        return results

    def _compute_metrics(self, results: Dict):
        """Calculate accuracy metrics"""
        df = pd.DataFrame(results["resumes"])


        results["metrics"] = {
            "skill_mae": mean_absolute_error(
                df["true_scores"].apply(lambda x: x["skills"]),
                df["pred_scores"].apply(lambda x: x["skill_score"])
            ),
            "education_mae": mean_absolute_error(
                df["true_scores"].apply(lambda x: x["education"]),
                df["pred_scores"].apply(lambda x: x["edu_score"])
            ),
            "experience_mae": mean_absolute_error(
                df["true_scores"].apply(lambda x: x["experience"]),
                df["pred_scores"].apply(lambda x: x["exp_score"])
            )
        }

        y_true = df["should_accept"]
        y_pred = df["pred_accept"]

        results["metrics"].update({
            "accuracy": np.mean(y_true == y_pred),
            "confusion_matrix": confusion_matrix(y_true, y_pred),
            "classification_report": classification_report(y_true, y_pred,
                                                         target_names=["Rejected", "Accepted"])
        })


In [5]:
if __name__ == "__main__":
    from HCAI_Project_latest import ResumeProcessor, ResumeScorer
    processor = ResumeProcessor()
    scorer = ResumeScorer(JOB_DESCRIPTION)


    test_resumes = generate_test_resumes(100)


    evaluator = ModelEvaluator(processor, scorer, JOB_DESCRIPTION)
    results = evaluator.evaluate(test_resumes)


    print("=== Evaluation Metrics ===")
    print(f"Skill MAE: {results['metrics']['skill_mae']:.3f}")
    print(f"Education MAE: {results['metrics']['education_mae']:.3f}")
    print(f"Experience MAE: {results['metrics']['experience_mae']:.3f}")
    print(f"\nAccuracy: {results['metrics']['accuracy']:.1%}")
    print("\nConfusion Matrix:")
    print(results["metrics"]["confusion_matrix"])
    print("\nClassification Report:")
    print(results["metrics"]["classification_report"])


=== Evaluation Metrics ===
Skill MAE: 0.172
Education MAE: 0.118
Experience MAE: 0.215

Accuracy: 77.0%

Confusion Matrix:
[[44, 12], [11, 33]]

Classification Report:

              precision    recall  f1-score   support

    Rejected       0.80      0.79      0.79        56
    Accepted       0.73      0.75      0.74        44

    accuracy                           0.77       100
   macro avg       0.77      0.77      0.77       100
            
