# 01 - Data Exploration

This notebook explores the resume and job description data to understand patterns and prepare for feature engineering.

## Objectives
- Load and examine sample resumes
- Analyze text structure and patterns
- Explore skill distributions
- Understand data quality issues

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance for every figure in this notebook.
# 'seaborn-v0_8-whitegrid' is the matplotlib-bundled seaborn whitegrid sheet
# (the 'seaborn-v0_8-*' names are the ones shipped since matplotlib 3.6).
PLOT_STYLE = 'seaborn-v0_8-whitegrid'
COLOR_PALETTE = 'husl'

plt.style.use(PLOT_STYLE)
# Applied after the style sheet so the palette is not overridden by it.
sns.set_palette(COLOR_PALETTE)

## 1. Load Skills Taxonomy

In [None]:
# Load the skills taxonomy: a JSON object whose keys are category names and
# whose values are sized collections of skills (only len() is used on them here).
# The path is relative to the notebook's working directory.
taxonomy_path = Path('../data/datasets/skills_taxonomy.json')

# JSON text is UTF-8 by specification; state the encoding explicitly so the
# read does not depend on the platform's locale default (e.g. cp1252 on Windows).
with open(taxonomy_path, 'r', encoding='utf-8') as f:
    skills_taxonomy = json.load(f)

# Quick summary: category names, then the number of skills in each.
print(f"Skill categories: {list(skills_taxonomy.keys())}")
for category, skills in skills_taxonomy.items():
    print(f"  {category}: {len(skills)} skills")

In [None]:
# Visualize skill distribution by category: one bar per taxonomy category,
# bar height = number of skills listed under that category.
categories = list(skills_taxonomy.keys())
counts = [len(skills_taxonomy[cat]) for cat in categories]

# Explicit Figure/Axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(categories, counts, color=['#3498db', '#2ecc71', '#e74c3c'])
ax.set_title('Skills Taxonomy Distribution')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Skills')

# Annotate each bar with its count. bar_label offsets the text in points, so
# the gap above the bar looks the same whether counts are in the single digits
# or the thousands (a fixed "+2" data-unit offset does not scale).
ax.bar_label(bars, padding=3, fontweight='bold')
# Leave headroom so the label above the tallest bar is not clipped by the axes.
ax.margins(y=0.1)

fig.tight_layout()
plt.show()

## 2. Sample Resume Analysis

In [None]:
# Hand-written resume used as the text fixture for the cells below.
# The literal starts and ends with a newline because of the triple-quote layout.
sample_resume = """
John Doe
john.doe@email.com | (555) 123-4567 | linkedin.com/in/johndoe

SUMMARY
Senior Software Engineer with 5+ years of experience in Python and JavaScript.
Expert in web development using React and Django frameworks.
Strong background in machine learning and data science.

EXPERIENCE
Senior Developer, Tech Corp (2020-Present)
- Led development of ML pipeline using Python and TensorFlow
- Built REST APIs with Django and Flask
- Deployed applications on AWS using Docker and Kubernetes

Software Engineer, StartupXYZ (2017-2020)
- Full-stack development with React and Node.js
- Implemented machine learning features using scikit-learn

EDUCATION
Master of Science in Computer Science
University of Technology, 2017

SKILLS
Python, JavaScript, React, Django, TensorFlow, AWS, Docker, Git, Machine Learning
"""

# Size of the raw text, including the leading/trailing newline of the literal.
resume_char_count = len(sample_resume)
# Lines in the stripped text: number of newline separators plus one.
resume_line_count = sample_resume.strip().count("\n") + 1

print(f"Resume length: {resume_char_count} characters")
print(f"Number of lines: {resume_line_count}")

In [None]:
# Run the project's resume-parsing helpers over the sample text and show
# what each one returns.
from src.preprocessing.parser import (
    parse_resume_info,
    extract_years_of_experience,
    extract_education_level,
)

parsed = parse_resume_info(sample_resume)

# One line per field returned by the parser.
print("Parsed Information:")
for field_name, field_value in parsed.items():
    print(f"  {field_name}: {field_value}")

# The two standalone extractors work on the raw text, not on `parsed`.
years_of_experience = extract_years_of_experience(sample_resume)
print(f"\nYears of Experience: {years_of_experience}")
education_level = extract_education_level(sample_resume)
print(f"Education Level: {education_level}")

## 3. Text Statistics

In [None]:
# Word-frequency statistics for the sample resume.
import re
from collections import Counter

# Lower-case first so counting is case-insensitive. The pattern keeps runs of
# ASCII letters only, so digits are dropped and tokens such as "scikit-learn"
# are split at the hyphen.
lowercased_resume = sample_resume.lower()
words = re.findall(r'\b[a-zA-Z]+\b', lowercased_resume)
word_freq = Counter(words)

# Report the most frequent tokens, highest count first.
top_words = word_freq.most_common(15)
print("Top 15 most common words:")
for token, occurrences in top_words:
    print(f"  {token}: {occurrences}")

In [None]:
# Split the sample resume into sections with the project's text cleaner and
# print a short preview of each one.
from src.preprocessing.text_cleaner import extract_sections

# Maximum number of characters of a section shown before truncating.
PREVIEW_CHARS = 200

sections = extract_sections(sample_resume)

print("Detected Sections:")
for section_name, section_text in sections.items():
    print(f"\n--- {section_name.upper()} ---")
    if len(section_text) > PREVIEW_CHARS:
        # Long section: show the head and mark the cut with an ellipsis.
        print(section_text[:PREVIEW_CHARS] + "...")
    else:
        print(section_text)

## 4. Next Steps

- Test with real PDF resumes
- Analyze multi-column layouts
- Build training dataset for skill extraction
- Evaluate extraction accuracy