# 03 - Skill Extraction Testing

This notebook prototypes and tests NLP-based skill extraction.

## Objectives
- Test skill extraction accuracy
- Tune confidence thresholds
- Explore spaCy NLP features
- Evaluate taxonomy coverage

In [None]:
# Setup
import sys
from pathlib import Path

# Make the parent directory importable so the `src.*` imports used by later
# cells resolve. The path is resolved to an absolute location once, and the
# insert is guarded so that re-running this cell does not stack duplicate
# entries on sys.path.
PROJECT_ROOT = str(Path('..').resolve())
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load spaCy
import spacy

# `nlp` is the shared pipeline object used by the exploration cells below.
nlp = spacy.load('en_core_web_md')
print(f"Loaded spaCy model: {nlp.meta['name']} v{nlp.meta['version']}")
print(f"Vector size: {nlp.vocab.vectors_length}")

## 1. Skill Extraction from Sample Text

In [None]:
from src.feature_engineering.skill_extractor import extract_skills, load_skills_taxonomy

# Load taxonomy
taxonomy = load_skills_taxonomy()

# Flatten every group of skills in the taxonomy into a single list.
all_skills = []
for category_skills in taxonomy.values():
    all_skills.extend(category_skills)

print(f"Total skills in taxonomy: {len(all_skills)}")

In [None]:
# Test samples
test_texts = [
    "Senior Python developer with experience in Django, Flask, and machine learning using TensorFlow.",
    "Full-stack engineer skilled in React, Node.js, and AWS. Strong leadership and communication skills.",
    "Data scientist proficient in pandas, numpy, scikit-learn. Experience in healthcare and finance domains.",
    "DevOps engineer with expertise in Docker, Kubernetes, and CI/CD pipelines using Jenkins."
]

# Run the extractor on each sample and print the hits grouped by category.
for i, text in enumerate(test_texts, 1):
    print(f"\n{'='*60}")
    print(f"Test {i}: {text[:60]}...")
    print('='*60)

    skills = extract_skills(text, taxonomy)

    # Group by category. The three expected buckets are pre-seeded so they
    # print in a stable order; setdefault keeps the cell from raising KeyError
    # if the extractor returns a category outside this list, and the extra
    # bucket is still printed so the mismatch is visible.
    by_category = {'technical': [], 'soft': [], 'domain': []}
    for skill in skills:
        label = f"{skill['skill_name']} ({skill['confidence']:.0%})"
        by_category.setdefault(skill['category'], []).append(label)

    # Only categories with at least one hit are shown.
    for category, skill_list in by_category.items():
        if skill_list:
            print(f"  {category.upper()}: {', '.join(skill_list)}")

## 2. Confidence Threshold Analysis

In [None]:
# Sweep the min_confidence cut-off and record what survives at each level
sample_text = """
Experienced software engineer with Python, JavaScript, and Java skills.
Proficient in React and Django. Some knowledge of AWS and Docker.
Good communication and teamwork abilities.
"""

thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
results = []

for threshold in thresholds:
    skills = extract_skills(sample_text, taxonomy, min_confidence=threshold)
    # Build the name list once; it is both stored and printed.
    kept_names = [hit['skill_name'] for hit in skills]
    n_kept = len(skills)
    results.append({'threshold': threshold, 'count': n_kept, 'skills': kept_names})
    print(f"Threshold {threshold}: {n_kept} skills - {kept_names}")

In [None]:
# Visualize threshold impact
bar_positions = [r['threshold'] for r in results]
bar_heights = [r['count'] for r in results]

# matplotlib's default bar width is 0.8 in data units, which is far wider than
# the 0.1 spacing between thresholds and makes the bars overlap. Size the bars
# relative to the smallest gap between consecutive thresholds instead; fall
# back to the default when there is only one bar.
min_gap = min(
    (right - left for left, right in zip(bar_positions, bar_positions[1:]) if right > left),
    default=None,
)
bar_width = 0.8 * min_gap if min_gap else 0.8

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(bar_positions, bar_heights, width=bar_width, color='steelblue')
ax.set_xlabel('Confidence Threshold')
ax.set_ylabel('Number of Skills Extracted')
ax.set_title('Impact of Confidence Threshold on Skill Extraction')
ax.set_xticks(bar_positions)

# Annotate each bar with its count, slightly above the bar top.
for position, height in zip(bar_positions, bar_heights):
    ax.text(position, height + 0.2, str(height), ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

## 3. spaCy NLP Exploration

In [None]:
# Run the sample text through the spaCy pipeline and inspect its annotations
doc = nlp(sample_text)

# Entities recognised by the pipeline, with their labels
print("Named Entities:")
entity_lines = [f"  {entity.text} ({entity.label_})" for entity in doc.ents]
for line in entity_lines:
    print(line)

# Noun chunks produced by the parser
print("\nNoun Phrases:")
phrase_lines = [f"  {phrase.text}" for phrase in doc.noun_chunks]
for line in phrase_lines:
    print(line)

In [None]:
# Word vectors similarity
skills_to_test = ['Python', 'JavaScript', 'programming', 'coding']

# Parse each term once up front. Calling nlp() inside the nested loop would
# re-run the pipeline 2 * n^2 times on the same handful of strings.
skill_docs = {term: nlp(term) for term in skills_to_test}

print("Word Similarity Matrix:")
for skill1 in skills_to_test:
    # One row of the matrix: similarity of skill1 against every term, in order.
    sims = [
        f"{skill_docs[skill1].similarity(skill_docs[skill2]):.2f}"
        for skill2 in skills_to_test
    ]
    print(f"  {skill1:12}: {', '.join(sims)}")

## 4. Accuracy Metrics

Based on v0.1 requirements:
- Precision ≥70%
- Recall ≥60%
- Categorization accuracy ≥80%

In [None]:
# Test with labeled ground truth
test_case = {
    'text': 'Python developer with React experience and strong communication skills.',
    'expected_skills': ['Python', 'React', 'Communication']
}

extracted = extract_skills(test_case['text'], taxonomy)
extracted_names = [s['skill_name'] for s in extracted]

# Calculate metrics
# Compare as sets on both sides of each ratio. True positives are counted on
# unique names, so dividing by the raw list lengths would understate precision
# (or recall) whenever the extractor or the labels contain a duplicate.
# NOTE(review): matching is exact and case-sensitive; confirm the extractor's
# skill_name casing matches the labels in expected_skills.
expected_set = set(test_case['expected_skills'])
extracted_set = set(extracted_names)
true_positives = len(extracted_set & expected_set)
precision = true_positives / len(extracted_set) if extracted_set else 0
recall = true_positives / len(expected_set) if expected_set else 0

print(f"Expected: {test_case['expected_skills']}")
print(f"Extracted: {extracted_names}")
print(f"\nPrecision: {precision:.0%}")
print(f"Recall: {recall:.0%}")