In [None]:
import re
import json
import pandas as pd
import requests
from docx import Document
import PyPDF2

# ============================================================
# 📄 Resume & JD Extraction
# ============================================================
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    return " ".join([para.text for para in doc.paragraphs])

def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text


# ============================================================
# 🧩 Skill Extraction
# ============================================================
import re

import re

def extract_skills(text, skill_list):
    text_l = text.lower()
    
    # Pre-clean text (keep key chars and spaces)
    clean_text = re.sub(r'[^a-z0-9+#.\s]', ' ', text_l)
    clean_text = re.sub(r'\s+', ' ', clean_text)

    found = set()

    for skill in skill_list:
        s = skill.lower().strip()
        if not s:
            continue

        # Skip overly short skills (like "R") unless they are surrounded by word boundaries
        if len(s) <= 2:
            pattern = rf'\b{re.escape(s)}\b'
            if re.search(pattern, clean_text):
                found.add(skill)
            continue

        # For multi-word or special-char skills
        if re.search(r'[\s+.#]', s):
            pattern = re.escape(s)
            if re.search(pattern, clean_text):
                found.add(skill)
        else:
            pattern = rf'\b{re.escape(s)}\b'
            if re.search(pattern, clean_text):
                found.add(skill)

    return list(found)




# ============================================================
# ⚖️ JD Skill Weighting (Context Aware)
# ============================================================
def weight_jd_skills(jd_text, all_skills):
    jd_text = jd_text.lower()
    jd_skills_with_weights = {}
    priority_patterns = [
        (3, r"(must have|required)(.*?)(?=good to have|preferred|nice to have|optional|$)"),
        (2, r"(good to have|preferred|nice to have)(.*?)(?=must have|required|optional|$)"),
        (1, r"(optional)(.*?)(?=must have|required|good to have|preferred|nice to have|$)")
    ]

    for weight, pattern in priority_patterns:
        for match in re.finditer(pattern, jd_text, flags=re.DOTALL):
            section = match.group(0)
            for skill in all_skills:
                if skill.lower() in section:
                    jd_skills_with_weights[skill] = max(jd_skills_with_weights.get(skill, 0), weight)

    # Fallback for general mentions
    for skill in all_skills:
        if skill.lower() in jd_text and skill not in jd_skills_with_weights:
            jd_skills_with_weights[skill] = 1

    return jd_skills_with_weights


# ============================================================
# 📊 Weighted Skill Match Calculator
# ============================================================
def calculate_weighted_match(user_skills, jd_skills_with_weights):
    total_weight = sum(jd_skills_with_weights.values())
    matched_weight, matched, missing = 0, [], []

    for skill, weight in jd_skills_with_weights.items():
        if skill in user_skills:
            matched.append(skill)
            matched_weight += weight
        else:
            missing.append(skill)

    match_percentage = round((matched_weight / total_weight) * 100, 2) if total_weight else 0
    return {
        "match_percentage": match_percentage,
        "matched_skills": matched,
        "missing_skills": missing
    }


# ============================================================
# 🌐 Dynamic Roadmap (Real-time Resource Fetching)
# ============================================================

def fetch_youtube_links(skill, max_results=3):
    """Fetch top YouTube tutorial links dynamically."""
    try:
        response = requests.get(
            f"https://www.googleapis.com/youtube/v3/search",
            params={
                "part": "snippet",
                "maxResults": max_results,
                "q": f"{skill} tutorial for beginners",
                "key": "YOUR_YOUTUBE_API_KEY",  # replace with valid key
                "type": "video"
            }
        )
        data = response.json()
        return [
            f"https://www.youtube.com/watch?v={item['id']['videoId']}"
            for item in data.get("items", [])
        ]
    except Exception as e:
        return [f"Error fetching YouTube data: {e}"]


def fetch_github_projects(skill, max_results=3):
    """Fetch trending GitHub projects dynamically."""
    try:
        response = requests.get(
            f"https://api.github.com/search/repositories",
            params={"q": f"{skill} projects", "sort": "stars", "per_page": max_results}
        )
        data = response.json()
        return [repo["html_url"] for repo in data.get("items", [])]
    except Exception as e:
        return [f"Error fetching GitHub data: {e}"]


def generate_practical_roadmap(unmatched_skills):
    """Generate roadmap with live learning & project resources."""
    roadmap = []
    for skill in unmatched_skills:
        youtube_links = fetch_youtube_links(skill)
        github_links = fetch_github_projects(skill)

        roadmap.append({
            "skill": skill,
            "steps": [
                f"1️⃣ Learn fundamentals of {skill} (Crash course 5–10 hrs)",
                f"2️⃣ Build 1–2 hands-on projects with {skill}",
                f"3️⃣ Contribute to open-source repos using {skill}",
                f"4️⃣ Apply for internships requiring {skill}"
            ],
            "resources": {
                "YouTube Tutorials": youtube_links,
                "GitHub Projects": github_links,
                "Internship Portal": f"https://internshala.com/internships/keywords-{skill}"
            }
        })
    return roadmap


# ============================================================
# 🚀 Main Pipeline
# ============================================================
def skill_match_pipeline(resume_path, jd_text, all_skills):
    # Extract resume text
    if resume_path.endswith(".docx"):
        resume_text = extract_text_from_docx(resume_path)
    elif resume_path.endswith(".pdf"):
        resume_text = extract_text_from_pdf(resume_path)
    else:
        raise ValueError("Unsupported file format")

    # Extract and weight
    user_skills = extract_skills(resume_text, all_skills)
    jd_skills = weight_jd_skills(jd_text, all_skills)
    match_result = calculate_weighted_match(user_skills, jd_skills)

    # Add roadmap
    match_result["roadmap"] = generate_practical_roadmap(match_result["missing_skills"])
    return match_result


# ============================================================
# 🧪 Example Run
# ============================================================
if __name__ == "__main__":
    all_skills = pd.read_csv("skills.csv")["skill"].dropna().tolist()

    jd_text = """
    We are hiring a Data Science Intern.
    Must have: Python, SQL, C,C++,Machine Learning.
    Good to have: Tableau, Power BI, Git.
    """

    result = skill_match_pipeline("Bhoomika_agrawal.resume.pdf", jd_text, all_skills)
    print(json.dumps(result, indent=4))


In [6]:
import re
import json
import pandas as pd
import requests
from docx import Document
import PyPDF2

# ============================================================
# 📄 Resume & JD Extraction
# ============================================================
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    return " ".join([para.text for para in doc.paragraphs])

def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

# ============================================================
# 🧩 Strict Skill Extraction
# ============================================================
def extract_skills(text, skill_list):
    text_l = text.lower()
    clean_text = re.sub(r'[^a-z0-9+#.\s]', ' ', text_l)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    found = set()

    for skill in skill_list:
        s = skill.lower().strip()
        if not s:
            continue
        # Short skills like "R"
        if len(s) <= 2:
            pattern = rf'\b{re.escape(s)}\b'
            if re.search(pattern, clean_text):
                found.add(skill)
            continue
        # Multi-word or special skills
        if re.search(r'[\s+.#]', s):
            if re.search(re.escape(s), clean_text):
                found.add(skill)
        else:
            pattern = rf'\b{re.escape(s)}\b'
            if re.search(pattern, clean_text):
                found.add(skill)
    return list(found)

# ============================================================
# ⚖️ JD Skill Weighting (Context-Aware)
# ============================================================
def weight_jd_skills(jd_text, all_skills):
    jd_text_l = jd_text.lower()
    jd_skills = extract_skills(jd_text, all_skills)
    weights = {}

    sections = [
        (3, r"(must have|required)(.*?)(?=good to have|preferred|nice to have|optional|$)"),
        (2, r"(good to have|preferred|nice to have)(.*?)(?=must have|required|optional|$)"),
        (1, r"(optional)(.*?)(?=must have|required|good to have|preferred|nice to have|$)")
    ]

    for weight, pattern in sections:
        for match in re.finditer(pattern, jd_text_l, flags=re.DOTALL):
            window = match.group(0)
            for skill in jd_skills:
                if re.search(rf'\b{re.escape(skill.lower())}\b', window):
                    weights[skill] = max(weights.get(skill, 0), weight)

    # Default weight if mentioned but not in a section
    for skill in jd_skills:
        if skill not in weights:
            weights[skill] = 2

    return weights

# ============================================================
# 📊 Weighted Skill Match Calculator
# ============================================================
def calculate_weighted_match(user_skills, jd_skills_with_weights):
    total_weight = sum(jd_skills_with_weights.values())
    matched_weight, matched, missing = 0, [], []

    for skill, weight in jd_skills_with_weights.items():
        if skill in user_skills:
            matched.append(skill)
            matched_weight += weight
        else:
            missing.append(skill)

    match_percentage = round((matched_weight / total_weight) * 100, 2) if total_weight else 0
    return {
        "match_percentage": match_percentage,
        "matched_skills": matched,
        "missing_skills": missing
    }

# ============================================================
# 🌐 Dynamic Roadmap Generator
# ============================================================
def fetch_youtube_links(skill, max_results=3):
    try:
        response = requests.get(
            "https://www.googleapis.com/youtube/v3/search",
            params={
                "part": "snippet",
                "q": f"{skill} tutorial for beginners",
                "type": "video",
                "maxResults": max_results,
                "key": "YOUR_YOUTUBE_API_KEY"
            }
        )
        data = response.json()
        return [f"https://www.youtube.com/watch?v={item['id']['videoId']}" for item in data.get("items", [])]
    except Exception as e:
        return [f"Error fetching YouTube: {e}"]

def fetch_github_projects(skill, max_results=3):
    try:
        response = requests.get(
            "https://api.github.com/search/repositories",
            params={"q": f"{skill} projects", "sort": "stars", "per_page": max_results}
        )
        data = response.json()
        return [repo["html_url"] for repo in data.get("items", [])]
    except Exception as e:
        return [f"Error fetching GitHub: {e}"]

def generate_practical_roadmap(unmatched_skills):
    roadmap = []
    for skill in unmatched_skills:
        youtube_links = fetch_youtube_links(skill)
        github_links = fetch_github_projects(skill)
        roadmap.append({
            "skill": skill,
            "steps": [
                f"1️⃣ Learn fundamentals of {skill} (Crash course 5–10 hrs)",
                f"2️⃣ Build 1–2 hands-on projects with {skill}",
                f"3️⃣ Contribute to open-source repos using {skill}",
                f"4️⃣ Apply for internships requiring {skill}"
            ],
            "resources": {
                "YouTube Tutorials": youtube_links,
                "GitHub Projects": github_links,
                "Internship Portal": f"https://internshala.com/internships/keywords-{skill}"
            }
        })
    return roadmap

# ============================================================
# 🚀 Main Pipeline
# ============================================================
def skill_match_pipeline(resume_path, jd_text, all_skills):
    # Extract resume text
    if resume_path.endswith(".docx"):
        resume_text = extract_text_from_docx(resume_path)
    elif resume_path.endswith(".pdf"):
        resume_text = extract_text_from_pdf(resume_path)
    else:
        raise ValueError("Unsupported file format")

    user_skills = extract_skills(resume_text, all_skills)
    jd_skills_with_weights = weight_jd_skills(jd_text, all_skills)
    match_result = calculate_weighted_match(user_skills, jd_skills_with_weights)

    # Add roadmap
    match_result["roadmap"] = generate_practical_roadmap(match_result["missing_skills"])

    # Debug Printout for Evaluation
    print("\n========== DEBUG OUTPUT ==========")
    print("📘 JD Extracted Skills:", list(jd_skills_with_weights.keys()))
    print("👩‍💻 Resume Extracted Skills:", user_skills)
    print("✅ Matched Skills:", match_result["matched_skills"])
    print("❌ Missing Skills:", match_result["missing_skills"])
    print("==================================\n")

    return match_result

# ============================================================
# 🧪 Example Run
# ============================================================
if __name__ == "__main__":
    all_skills = pd.read_csv("skills.csv")["skill"].dropna().tolist()

    jd_text = """
    Sample data analyst job description
At [Company X], we’re proud to stand at the forefront of the Big Data revolution. Using the latest analytics tools and processes, we’re able to maximize our offerings and deliver unparalleled service and support. To help carry us even further, we’re searching for an experienced data analyst to join our team. The ideal candidate will be highly skilled in all aspects of data analytics, including mining, generation, and visualization. Additionally, this person should be committed to transforming data into readable, goal-oriented reports that drive innovation and growth.

Objectives of this role
Develop, implement, and maintain leading-edge analytics systems, taking complicated problems and building simple frameworks
Identify trends and opportunities for growth through analysis of complex datasets
Evaluate organizational methods and provide source-to-target mappings and information-model specification documents for datasets
Create best-practice reports based on data mining, analysis, and visualization
Evaluate internal systems for efficiency, problems, and inaccuracies, and develop and maintain protocols for handling, processing, and cleaning data
Work directly with managers and users to gather requirements, provide status updates, and build relationships
Responsibilities
Work closely with project managers to understand and maintain focus on their analytics needs, including critical metrics and KPIs, and deliver actionable insights to relevant decision-makers
Proactively analyze data to answer key questions for stakeholders or yourself, with an eye on what drives business performance, and investigate and communicate which areas need improvement in efficiency and productivity
Create and maintain rich interactive visualizations through data interpretation and analysis, with reporting components from multiple data sources
Define and implement data acquisition and integration logic, selecting an appropriate combination of methods and tools within the defined technology stack to ensure optimal scalability and performance of the solution
Develop and maintain databases by acquiring data from primary and secondary sources, and build scripts that will make our data evaluation process more flexible or scalable across datasets
Required skills and qualifications
Three or more years of experience mining data as a data analyst
Proven analytics skills, including mining, evaluation, and visualization
Technical writing experience in relevant areas, including queries, reports, and presentations
Strong SQL or Excel skills, with aptitude for learning other analytics tools
Preferred skills and qualifications
Bachelor’s degree (or equivalent) in mathematics, computer science, economics, or statistics
Experience with database and model design and segmentation techniques
Strong programming experience with frameworks, including XML, JavaScript, and ETL
Practical experience in statistical analysis through the use of statistical packages, including Excel, SPSS, and SAS
Proven success in a collaborative, team-oriented environment


Post your data analyst job now.

Share your open role with qualified data analysts using the world’s largest professional network.

Learn more
Post a free
    """

    result = skill_match_pipeline("Bhoomika_agrawal.resume.pdf", jd_text, all_skills)
    print(json.dumps(result, indent=4))



📘 JD Extracted Skills: ['SQL', 'Excel', 'Statistics', 'JavaScript']
👩‍💻 Resume Extracted Skills: ['CSS', 'Statistics', 'MySQL', 'C', 'Natural Language Processing', 'HTML', 'Leadership', 'GitHub', 'SQL', 'JavaScript', 'Python', 'Java']
✅ Matched Skills: ['SQL', 'Statistics', 'JavaScript']
❌ Missing Skills: ['Excel']

{
    "match_percentage": 70.0,
    "matched_skills": [
        "SQL",
        "Statistics",
        "JavaScript"
    ],
    "missing_skills": [
        "Excel"
    ],
    "roadmap": [
        {
            "skill": "Excel",
            "steps": [
                "1\ufe0f\u20e3 Learn fundamentals of Excel (Crash course 5\u201310 hrs)",
                "2\ufe0f\u20e3 Build 1\u20132 hands-on projects with Excel",
                "3\ufe0f\u20e3 Contribute to open-source repos using Excel",
                "4\ufe0f\u20e3 Apply for internships requiring Excel"
            ],
            "resources": {
                "YouTube Tutorials": [],
                "GitHub Projects": [

In [31]:
import re
import json
import pandas as pd
from docx import Document
import PyPDF2

# ============================================================
# 📄 Resume & JD Extraction
# ============================================================
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    return " ".join([para.text for para in doc.paragraphs])

def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

# ============================================================
# 🧩 Skill Extraction (only CSV-based)
# ============================================================
def extract_skills(text, skill_list):
    text_l = text.lower()
    clean_text = re.sub(r'[^a-z0-9+#.\s]', ' ', text_l)
    clean_text = re.sub(r'\s+', ' ', clean_text)

    found = set()

    for skill in skill_list:
        s = skill.lower().strip()
        if not s:
            continue
        if len(s) <= 2:
            pattern = rf'\b{re.escape(s)}\b'
            if re.search(pattern, clean_text):
                found.add(skill)
            continue
        if re.search(r'[\s+.#]', s):
            if re.search(re.escape(s), clean_text):
                found.add(skill)
        else:
            pattern = rf'\b{re.escape(s)}\b'
            if re.search(pattern, clean_text):
                found.add(skill)

    return list(found)

# ============================================================
# ⚖️ JD Skill Weighting
# ============================================================
def weight_jd_skills(jd_text, all_skills):
    jd_skills = extract_skills(jd_text, all_skills)
    jd_text_l = jd_text.lower()
    weights = {}

    sections = [
        (3, r"(must have|required)(.*?)(?=good to have|preferred|nice to have|optional|$)"),
        (2, r"(good to have|preferred|nice to have)(.*?)(?=must have|required|optional|$)"),
        (1, r"(optional)(.*?)(?=must have|required|good to have|preferred|nice to have|$)")
    ]

    for weight, pattern in sections:
        for match in re.finditer(pattern, jd_text_l, flags=re.DOTALL):
            window = match.group(0)
            for skill in jd_skills:
                if re.search(rf'\b{re.escape(skill.lower())}\b', window):
                    weights[skill] = max(weights.get(skill, 0), weight)

    for skill in jd_skills:
        if skill not in weights:
            weights[skill] = 2

    return weights

# ============================================================
# 📊 Weighted Match Calculator
# ============================================================
def calculate_weighted_match(user_skills, jd_skills_with_weights):
    total_weight = sum(jd_skills_with_weights.values())
    matched_weight, matched, missing = 0, [], []

    for skill, weight in jd_skills_with_weights.items():
        if skill in user_skills:
            matched.append(skill)
            matched_weight += weight
        else:
            missing.append(skill)

    match_percentage = round((matched_weight / total_weight) * 100, 2) if total_weight else 0
    return {
        "match_percentage": match_percentage,
        "matched_skills": matched,
        "missing_skills": missing
    }

# ============================================================
# 🚀 Main Pipeline
# ============================================================
def skill_match_pipeline(resume_path, jd_text, all_skills):
    if resume_path.endswith(".docx"):
        resume_text = extract_text_from_docx(resume_path)
    elif resume_path.endswith(".pdf"):
        resume_text = extract_text_from_pdf(resume_path)
    else:
        raise ValueError("Unsupported file format")

    user_skills = extract_skills(resume_text, all_skills)
    jd_skills_with_weights = weight_jd_skills(jd_text, all_skills)
    match_result = calculate_weighted_match(user_skills, jd_skills_with_weights)

    # Debug prints
    print("\n📘 JD Skills:", list(jd_skills_with_weights.keys()))
    print("👩‍💻 Resume Skills:", user_skills)
    print("✅ Matched Skills:", match_result["matched_skills"])
    print("❌ Missing Skills:", match_result["missing_skills"])

    return match_result

# ============================================================
# 🧪 Example Run
# ============================================================
if __name__ == "__main__":
    all_skills = pd.read_csv("skills.csv")["skill"].dropna().tolist()

    jd_text = """
    Electrify Services is one of the fastest growing engineering services start-ups and is looking for a Software development leader to play a key role in helping our client improve their product and take their business to the next level. This is an individual contributor role with a direct impact on company growth. We would like to see this position grow as the company evolves.

If you are looking for a high-impact and challenging role, this job is a perfect destination for you!

Major Job Duties

Involvement in the full software lifecycle: requirements, design, development, test, release, and maintenance.

Ø Design, build, and maintain efficient, reusable, and reliable C++ & C# code

Ø Programming experience using C# with .NET Core / .NET Framework

Ø Experience within scrum teams and familiarity with Agile methodologies

Ø Relational database experience with data modeling, reporting, query optimization, indexing a SQL Datastore

Ø Hands-on experience in development and testing phases of the development life cycle

Ø Understanding basics of cloud-based architecture (AWS preferred, but Microsoft Azure or Google Cloud experience is good too)

Ø Thoroughly review users' requirements and assist with integration

Ø Device optimizations and solutions for performance bottlenecks and bugs.

Ø Help maintain code quality, revision control, and organization.

Ø Analyze customer issues and identify and implement solutions.

Ø Participating in and implementing in analysis and development of test plans.

Ø Custom API development and its integration.

Ø Develop and debug source code which includes:

o Write Multi-threaded applications

o Interfaces with various peripheral devices.

o Requirements collection from internal, and external clients and documentation

o Design, development, optimize, and performance tuning of custom applications

o Unit testing and debugging of applications in various stages of the development life cycle

o Work collaboratively with peers in all the stages of the development life cycle

o Estimate and scope development work

o Produce documentation of code functionality and features consistently throughout the development cycle.

o Participate proactively in requirements analysis and design reviews

o Perform peer code reviews to discover issues and recommend changes that improve software quality

o VFD application programming and support

o Keeping a positive attitude and contributing to team effort

o Perform additional software development duties as required

Ø Produce quality software documentation:

o Design

o Unit test plans

o Release notes

Ø Assist in testing the developed software and provide support for field problems

Ø Provides guidance and mentoring to less experienced staff members.

These responsibilities are just the start! At Electrify, we encourage you to contribute wherever your interests take you — and shape your role accordingly. And this isn't just a philosophical bent: we give you 4 hours a week (10% of the work week) to pursue passion projects outside of your role responsibilities.

We are Looking for People Who Have

Ø Highly Skilled in C++ ( MFC ) and C# is a MUST

Ø 1+ years of technical experience in software architecture and development.

Ø Experience leading development efforts through all phases of SDLC.

Ø Strong communication skills: able to communicate with technical and non-technical stakeholders

Ø Strong problem-solving capabilities

Ø Experienced in an agile software development environment

Ø Capabilities to understand and troubleshoot C# code

Ø Strong Software Engineering skills including design, programming (C, C++, script), and debugging

Ø Highly motivated and self-directed

Ø Excellent debugging skills

Ø Thrive in a collaborative environment and can communicate while confidently driving multiple projects across many teams.

Ø Obsessively passionate and inquisitive and seeks to solve everyday problems in innovative ways.

Ø Laser-focused on the smallest details that are meaningful to our customers.

Job Types: Part-time, Freelance
Part-time hours: 15 per week

Salary: ₹300.00 - ₹1,000.00 per hour

    """

    result = skill_match_pipeline("Bhoomika_agrawal.resume.pdf", jd_text, all_skills)
    print(json.dumps(result, indent=4))



📘 JD Skills: ['Communication', 'C', 'Agile', 'Mentoring', 'Design', 'Google Cloud', 'Azure', 'AWS', 'C++', 'Problem Solving', 'SQL']
👩‍💻 Resume Skills: ['CSS', 'Education', 'Statistics', 'MySQL', 'C', 'Natural Language Processing', 'HTML', 'Leadership', 'GitHub', 'SQL', 'JavaScript', 'Python', 'Java']
✅ Matched Skills: ['C', 'SQL']
❌ Missing Skills: ['Communication', 'Agile', 'Mentoring', 'Design', 'Google Cloud', 'Azure', 'AWS', 'C++', 'Problem Solving']
{
    "match_percentage": 18.52,
    "matched_skills": [
        "C",
        "SQL"
    ],
    "missing_skills": [
        "Communication",
        "Agile",
        "Mentoring",
        "Design",
        "Google Cloud",
        "Azure",
        "AWS",
        "C++",
        "Problem Solving"
    ]
}


In [33]:


import re
import json
import pandas as pd
from docx import Document
import PyPDF2

# ============================================================
# 📄 Resume & JD Extraction
# ============================================================
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    return " ".join([para.text for para in doc.paragraphs])

def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

# ============================================================
# 🧩 Skill Extraction (CSV only)
# ============================================================
def extract_skills(text, skill_list):
    text_l = text.lower()
    clean_text = re.sub(r'[^a-z0-9+#.\s]', ' ', text_l)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    
    found = set()
    for skill in skill_list:
        s = skill.lower().strip()
        if not s:
            continue
        # exact match for short skills
        if len(s) <= 2:
            if re.search(rf'\b{re.escape(s)}\b', clean_text):
                found.add(skill)
            continue
        # match multi-word skills or special chars
        if re.search(r'[\s+.#]', s):
            if re.search(re.escape(s), clean_text):
                found.add(skill)
        else:
            if re.search(rf'\b{re.escape(s)}\b', clean_text):
                found.add(skill)
    return list(found)

# ============================================================
# ⚖️ JD Skill Weighting
# ============================================================
def weight_jd_skills(jd_text, all_skills):
    jd_skills = extract_skills(jd_text, all_skills)
    jd_text_l = jd_text.lower()
    weights = {}
    
    sections = [
        (3, r"(must have|required)(.*?)(?=good to have|preferred|nice to have|optional|$)"),
        (2, r"(good to have|preferred|nice to have)(.*?)(?=must have|required|optional|$)"),
        (1, r"(optional)(.*?)(?=must have|required|good to have|preferred|nice to have|$)")
    ]
    
    for weight, pattern in sections:
        for match in re.finditer(pattern, jd_text_l, flags=re.DOTALL):
            window = match.group(0)
            for skill in jd_skills:
                if re.search(rf'\b{re.escape(skill.lower())}\b', window):
                    weights[skill] = max(weights.get(skill, 0), weight)
    
    for skill in jd_skills:
        if skill not in weights:
            weights[skill] = 2  # default weight
    
    return weights

# ============================================================
# 📊 Weighted Match Calculator
# ============================================================
def calculate_weighted_match(user_skills, jd_skills_with_weights):
    total_weight = sum(jd_skills_with_weights.values())
    matched_weight, matched, missing = 0, [], []
    
    for skill, weight in jd_skills_with_weights.items():
        if skill in user_skills:
            matched.append(skill)
            matched_weight += weight
        else:
            missing.append(skill)
    
    match_percentage = round((matched_weight / total_weight) * 100, 2) if total_weight else 0
    return {
        "match_percentage": match_percentage,
        "matched_skills": matched,
        "missing_skills": missing
    }

# ============================================================
# 🚀 Skill-Roadmap Generator
# ============================================================
def generate_skill_roadmap(missing_skills):
    """
    Generates realistic roadmap for missing skills.
    Each skill has subtopics with curated learning resources.
    Progress can be tracked per subtopic.
    """
    
    # Example skill -> subtopic -> resource mapping
    curated_resources = {
        "Python": [
            {"name": "Basics & Syntax", "resource": "https://www.geeksforgeeks.org/python/", "done": False},
            {"name": "OOP in Python", "resource": "https://www.tutorialspoint.com/python/python_classes_objects.htm", "done": False},
            {"name": "Data Analysis with Pandas", "resource": "https://pandas.pydata.org/pandas-docs/stable/getting_started/index.html", "done": False},
            {"name": "Visualization with Matplotlib & Seaborn", "resource": "https://matplotlib.org/stable/tutorials/index.html", "done": False}
        ],
        "SQL": [
            {"name": "SELECT Queries", "resource": "https://www.w3schools.com/sql/sql_select.asp", "done": False},
            {"name": "JOIN Operations", "resource": "https://www.geeksforgeeks.org/sql-joins/", "done": False},
            {"name": "Indexing & Optimization", "resource": "https://www.sqlshack.com/sql-indexing-best-practices/", "done": False},
            {"name": "Stored Procedures", "resource": "https://www.tutorialspoint.com/sql/sql-stored-procedures.htm", "done": False}
        ],
        "Excel": [
            {"name": "Formulas & Functions", "resource": "https://support.microsoft.com/en-us/excel", "done": False},
            {"name": "Pivot Tables", "resource": "https://www.gcflearnfree.org/excel2016/pivottables/", "done": False},
            {"name": "Charts & Graphs", "resource": "https://www.excel-easy.com/examples/charts.html", "done": False},
            {"name": "Macros", "resource": "https://www.tutorialspoint.com/excel_vba/index.htm", "done": False}
        ],
        "JavaScript": [
            {"name": "Basics & DOM Manipulation", "resource": "https://www.javascript.com/", "done": False},
            {"name": "ES6 & Beyond", "resource": "https://www.tutorialspoint.com/es6/index.htm", "done": False},
            {"name": "Asynchronous JS & APIs", "resource": "https://developer.mozilla.org/en-US/docs/Learn/JavaScript/Asynchronous", "done": False},
            {"name": "Frameworks Overview (React/Vue/Angular)", "resource": "https://reactjs.org/tutorial/tutorial.html", "done": False}
        ],
        "ETL": [
            {"name": "ETL Concepts", "resource": "https://www.guru99.com/etl-testing.html", "done": False},
            {"name": "ETL Tools Overview", "resource": "https://www.talend.com/resources/what-is-etl/", "done": False},
            {"name": "Data Pipeline Design", "resource": "https://www.datacamp.com/community/tutorials/data-engineering-pipelines-python", "done": False},
            {"name": "Hands-on ETL Project", "resource": "https://www.kaggle.com/", "done": False}
        ]
    }
    
    roadmap = []
    
    for skill in missing_skills:
        roadmap.append({
            "skill": skill,
            "subtopics": curated_resources.get(skill, [{"name": f"{skill} Basics", "resource": "https://www.google.com/search?q="+skill, "done": False}]),
            "progress_percentage": 0
        })
        
    return roadmap

# ============================================================
# 🚀 Main Pipeline (Resume → JD → Roadmap)
# ============================================================
def skill_match_and_roadmap(resume_path, jd_text, all_skills):
    # Extract resume text
    if resume_path.endswith(".docx"):
        resume_text = extract_text_from_docx(resume_path)
    elif resume_path.endswith(".pdf"):
        resume_text = extract_text_from_pdf(resume_path)
    else:
        raise ValueError("Unsupported file format")
    
    # Extract skills
    user_skills = extract_skills(resume_text, all_skills)
    
    # Weight JD skills
    jd_skills_with_weights = weight_jd_skills(jd_text, all_skills)
    
    # Match calculation
    match_result = calculate_weighted_match(user_skills, jd_skills_with_weights)
    
    # Generate roadmap for missing skills
    roadmap = generate_skill_roadmap(match_result["missing_skills"])
    
    # Final structured output
    output = {
        "jd_skills": list(jd_skills_with_weights.keys()),
        "user_skills": user_skills,
        "matched_skills": match_result["matched_skills"],
        "missing_skills": match_result["missing_skills"],
        "match_percentage": match_result["match_percentage"],
        "roadmap": roadmap
    }
    
    return output

# ============================================================
# 🧪 Example Run
# ============================================================
if __name__ == "__main__":
    all_skills = pd.read_csv("skills.csv")["skill"].dropna().tolist()
    
    jd_text = """
    Required skills: SQL, Deep Learning, Statistics
    Good to have: JavaScript, XML, ETL, SAS, SPSS
    """
    
    result = skill_match_and_roadmap("Bhoomika_agrawal.resume.pdf", jd_text, all_skills)
    
    # Pretty JSON
    print(json.dumps(result, indent=4))


{
    "jd_skills": [
        "SQL",
        "Deep Learning",
        "Statistics",
        "JavaScript"
    ],
    "user_skills": [
        "CSS",
        "Education",
        "Statistics",
        "MySQL",
        "C",
        "Natural Language Processing",
        "HTML",
        "Leadership",
        "GitHub",
        "SQL",
        "JavaScript",
        "Python",
        "Java"
    ],
    "matched_skills": [
        "SQL",
        "Statistics",
        "JavaScript"
    ],
    "missing_skills": [
        "Deep Learning"
    ],
    "match_percentage": 72.73,
    "roadmap": [
        {
            "skill": "Deep Learning",
            "subtopics": [
                {
                    "name": "Deep Learning Basics",
                    "resource": "https://www.google.com/search?q=Deep Learning",
                    "done": false
                }
            ],
            "progress_percentage": 0
        }
    ]
}


In [2]:
import re
import json
import pandas as pd
from docx import Document
import PyPDF2

# ============================================================
# 📄 Resume & JD Extraction
# ============================================================
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    return " ".join([para.text for para in doc.paragraphs])

def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

# ============================================================
# 🧩 Skill Extraction (CSV-based)
# ============================================================
def extract_skills(text, skill_list):
    text_l = text.lower()
    clean_text = re.sub(r'[^a-z0-9+#.\s]', ' ', text_l)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    
    found = set()
    for skill in skill_list:
        s = skill.lower().strip()
        if not s:
            continue
        if len(s) <= 2:
            if re.search(rf'\b{re.escape(s)}\b', clean_text):
                found.add(skill)
            continue
        if re.search(r'[\s+.#]', s):
            if re.search(re.escape(s), clean_text):
                found.add(skill)
        else:
            if re.search(rf'\b{re.escape(s)}\b', clean_text):
                found.add(skill)
    return list(found)

# ============================================================
# ⚖️ JD Skill Weighting
# ============================================================
def weight_jd_skills(jd_text, all_skills):
    jd_skills = extract_skills(jd_text, all_skills)
    jd_text_l = jd_text.lower()
    weights = {}
    
    sections = [
        (3, r"(must have|required)(.*?)(?=good to have|preferred|nice to have|optional|$)"),
        (2, r"(good to have|preferred|nice to have)(.*?)(?=must have|required|optional|$)"),
        (1, r"(optional)(.*?)(?=must have|required|good to have|preferred|nice to have|$)")
    ]
    
    for weight, pattern in sections:
        for match in re.finditer(pattern, jd_text_l, flags=re.DOTALL):
            window = match.group(0)
            for skill in jd_skills:
                if re.search(rf'\b{re.escape(skill.lower())}\b', window):
                    weights[skill] = max(weights.get(skill, 0), weight)
    
    for skill in jd_skills:
        if skill not in weights:
            weights[skill] = 2  # default weight
    
    return weights

# ============================================================
# 📊 Weighted Match Calculator
# ============================================================
def calculate_weighted_match(user_skills, jd_skills_with_weights):
    total_weight = sum(jd_skills_with_weights.values())
    matched_weight, matched, missing = 0, [], []
    
    for skill, weight in jd_skills_with_weights.items():
        if skill in user_skills:
            matched.append(skill)
            matched_weight += weight
        else:
            missing.append(skill)
    
    match_percentage = round((matched_weight / total_weight) * 100, 2) if total_weight else 0
    return {
        "match_percentage": match_percentage,
        "matched_skills": matched,
        "missing_skills": missing
    }

# ============================================================
# 🚀 Phase 1: Roadmap Generator (Curated Resources)
# ============================================================
def generate_skill_roadmap(missing_skills):
    curated_resources = {
        "Python": [
            {"name": "Basics & Syntax", "resource": "https://www.geeksforgeeks.org/python/", "done": False},
            {"name": "OOP in Python", "resource": "https://www.tutorialspoint.com/python/python_classes_objects.htm", "done": False},
            {"name": "Data Analysis with Pandas", "resource": "https://pandas.pydata.org/docs/getting_started/index.html", "done": False},
            {"name": "Visualization with Matplotlib & Seaborn", "resource": "https://matplotlib.org/stable/tutorials/index.html", "done": False}
        ],
        "SQL": [
            {"name": "SELECT Queries", "resource": "https://www.w3schools.com/sql/sql_select.asp", "done": False},
            {"name": "JOIN Operations", "resource": "https://www.geeksforgeeks.org/sql-joins/", "done": False},
            {"name": "Indexing & Optimization", "resource": "https://www.sqlshack.com/sql-indexing-best-practices/", "done": False},
            {"name": "Stored Procedures", "resource": "https://www.tutorialspoint.com/sql/sql-stored-procedures.htm", "done": False}
        ],
        "Excel": [
            {"name": "Formulas & Functions", "resource": "https://support.microsoft.com/en-us/excel", "done": False},
            {"name": "Pivot Tables", "resource": "https://www.gcflearnfree.org/excel2016/pivottables/", "done": False},
            {"name": "Charts & Graphs", "resource": "https://www.excel-easy.com/examples/charts.html", "done": False},
            {"name": "Macros", "resource": "https://www.tutorialspoint.com/excel_vba/index.htm", "done": False}
        ]
    }
    
    roadmap = []
    for skill in missing_skills:
        roadmap.append({
            "skill": skill,
            "subtopics": curated_resources.get(skill, [{"name": f"{skill} Basics", "resource": "https://www.google.com/search?q="+skill, "done": False}]),
            "progress_percentage": 0
        })
    return roadmap

# ============================================================
# 🚀 Main Pipeline (Resume → JD → Roadmap)
# ============================================================
def skill_match_and_roadmap(resume_path, jd_text, all_skills):
    # Extract resume text
    if resume_path.endswith(".docx"):
        resume_text = extract_text_from_docx(resume_path)
    elif resume_path.endswith(".pdf"):
        resume_text = extract_text_from_pdf(resume_path)
    else:
        raise ValueError("Unsupported file format")
    
    # Extract skills
    user_skills = extract_skills(resume_text, all_skills)
    
    # Weight JD skills
    jd_skills_with_weights = weight_jd_skills(jd_text, all_skills)
    
    # Match calculation
    match_result = calculate_weighted_match(user_skills, jd_skills_with_weights)
    
    # Generate roadmap for missing skills
    roadmap = generate_skill_roadmap(match_result["missing_skills"])
    
    # Final structured output
    output = {
        "jd_skills": list(jd_skills_with_weights.keys()),
        "user_skills": user_skills,
        "matched_skills": match_result["matched_skills"],
        "missing_skills": match_result["missing_skills"],
        "match_percentage": match_result["match_percentage"],
        "roadmap": roadmap
    }
    
    return output

# ============================================================
# 🧪 Example Run
# ============================================================
if __name__ == "__main__":
    all_skills = pd.read_csv("skills.csv")["skill"].dropna().tolist()
    
    jd_text = """
    Required skills: SQL, Python, Excel
    Good to have: JavaScript, ETL
    """
    
    result = skill_match_and_roadmap("Bhoomika_agrawal.resume.pdf", jd_text, all_skills)
    print(json.dumps(result, indent=4))


{
    "jd_skills": [
        "Python",
        "Excel",
        "SQL",
        "JavaScript"
    ],
    "user_skills": [
        "Education",
        "Java",
        "HTML",
        "Python",
        "Leadership",
        "SQL",
        "Natural Language Processing",
        "MySQL",
        "Statistics",
        "C",
        "JavaScript",
        "CSS",
        "GitHub"
    ],
    "matched_skills": [
        "Python",
        "SQL",
        "JavaScript"
    ],
    "missing_skills": [
        "Excel"
    ],
    "match_percentage": 72.73,
    "roadmap": [
        {
            "skill": "Excel",
            "subtopics": [
                {
                    "name": "Formulas & Functions",
                    "resource": "https://support.microsoft.com/en-us/excel",
                    "done": false
                },
                {
                    "name": "Pivot Tables",
                    "resource": "https://www.gcflearnfree.org/excel2016/pivottables/",
                    "don