This is a smart and interactive Job Description (JD) parser tool built using Gradio and LLMs (LLaMA3 via Groq). It extracts, cleans, and rewrites job descriptions from two types of inputs:

1. Job-portal URLs

2. A single PDF/TXT file or multiple PDF/TXT files

## Installing All Libraries

In [None]:
# Install the third-party packages used by the cells below.
# NOTE(review): `pypdf2` is installed here, but the visible cells import only `pypdf` — confirm whether it is still needed.
!pip install requests beautifulsoup4 gradio PyMuPDF pypdf groq pypdf2

**Importing all libraries**

In [None]:
import os         # Handles operating system environment variables and file paths
import re         # Used for regular expressions (text pattern matching and substitution)
import unicodedata  # Used for normalizing and cleaning text (e.g., removing accents)
import gradio as gr  # Gradio library to build web UI for your JD parser

import pymupdf       # Library for extracting text from PDF files (also known as fitz)
from pypdf import PdfReader  # Fallback PDF reader in case PyMuPDF fails
from groq import Groq  # Groq client to interact with LLaMA3-based models via Groq API

import requests            # Used for making HTTP requests (scraping webpages, calling APIs)
from bs4 import BeautifulSoup  # For parsing and extracting text from HTML (job pages)
import json                # To format and handle JSON data (API payloads, outputs)

from google.colab import userdata  # Special Colab module to securely access API keys
import zipfile       # Used to compress and bundle multiple files into a ZIP archive


## PART 1: Scraping Job Content from Job-Portal URLs

In [None]:
# === PART 1: URL Scraper + JD Extractor ===

# Copy the Groq API key from Colab's userdata store into the process environment
# (avoids hardcoding the secret in the notebook), then read it back into a
# module-level constant used for the raw HTTP call in clean_with_llama3.
os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")   # Store the secret in the environment
GROQ_API_KEY = os.getenv("GROQ_API_KEY")                    #  Same value, read back from the environment
# NOTE(review): verify that this model ID is still served by Groq before relying on it.
GROQ_MODEL = "llama3-8b-8192"                               #  Model ID sent with every LLM request in this notebook

#  Download a job page and return the text found inside its <body>
def scrape_job_page_generic(url):
    """Fetch ``url`` and return the text of the page body.

    Args:
        url: Address of the job posting to download.

    Returns:
        dict: ``{"content": <text>}`` when the page was fetched (the text is
        ``"No body text found"`` if the document has no ``<body>``), or
        ``{"error": <message>}`` when the request raised or the server
        answered with a 4xx/5xx status.
    """
    try:
        # Present a browser-like User-Agent and give up after 15 seconds.
        page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
        page.raise_for_status()
    except Exception as exc:
        return {"error": f"Failed to fetch page: {str(exc)}"}

    document = BeautifulSoup(page.content, "html.parser")
    body = document.body
    if not body:
        return {"content": "No body text found"}

    # One text fragment per line, each fragment stripped of surrounding whitespace.
    return {"content": body.get_text(separator="\n", strip=True)}

#  Use LLaMA3 to clean and extract structured job fields from raw scraped text
def clean_with_llama3(raw_data):
    """Ask the Groq chat-completions endpoint to pull labelled job fields out of scraped text.

    Args:
        raw_data: JSON-serialisable object holding the scraped page; it is
            embedded in the prompt via ``json.dumps``. The caller in this file
            passes the ``{"content": ...}`` dict from ``scrape_job_page_generic``.

    Returns:
        str: The model's reply with surrounding whitespace removed, or a string
        starting with ``"Error during LLM call: "`` if the request, the HTTP
        status check, or the response parsing raised.
    """
    # ensure_ascii=False keeps accented letters, dashes, bullets, etc. as real
    # characters instead of \uXXXX escapes, which are harder for the model to
    # read and make the prompt longer.
    prompt = f"""
You are a smart job information extractor.

From the below raw text scraped from a job detail page, extract the following fields clearly:
- "Role"
- "Job Description"
- "Qualification"
- "Locations"
- "Additional Information"
- "About"
- "Important Notice"

Output the result in this format (line-by-line):

"Role": ...
"Job Description": ...
"Qualification": ...
"Locations": ...
"Additional Information": ...
"About": ...
"Important Notice": ...

If some data is not available, just write "Not found".

Raw Scraped Content:
{json.dumps(raw_data, indent=2, ensure_ascii=False)}
    """  # Prompt tells the LLM which fields to extract and how to lay them out

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",  # Bearer-token auth with the key loaded above
        "Content-Type": "application/json"
    }

    payload = {
        "model": GROQ_MODEL,
        "messages": [         # Chat-style messages (system + user prompt)
            {"role": "system", "content": "You are a helpful assistant that extracts job content into labeled fields."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.3    # Low temperature for more stable output
    }

    try:
        # requests never times out by default; without an explicit timeout a
        # stalled connection would block the caller indefinitely. A timeout
        # raises an exception that the handler below turns into an error string.
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=60,
        )
        response.raise_for_status()  # Turn 4xx/5xx answers into exceptions
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"Error during LLM call: {str(e)}"  # Failures are reported in-band as a string

# Scrape a job page and, if that worked, hand the result to the LLM extractor
def extract_job_details_from_url(url):
    """Run the scrape step followed by the LLM extraction step for one URL.

    Args:
        url: Address of the job posting.

    Returns:
        The scraper's error message when scraping failed; otherwise whatever
        ``clean_with_llama3`` returns for the scraped dict.
    """
    page = scrape_job_page_generic(url)
    if "error" not in page:
        # Scrape succeeded: pass the whole dict on for field extraction.
        return clean_with_llama3(page)
    return page["error"]

# ✅ Example Run
# Executes when the cell runs: downloads the page below, sends the scraped text
# to the Groq API, and prints either the extracted fields or an error message.
print(extract_job_details_from_url("https://www.accenture.com/in-en/careers/jobdetails?id=ATCI-4995350-S1864578_en&title=Software+Development+Lead"))  # Sample job-posting URL


## PART 2: PDF/TXT Extractor + Cleaner + Rewriter


Single & Multi-File Handling

Single JD Mode: Processes one file and returns extracted, cleaned, and rewritten versions.

Multiple JD Mode: Loops over multiple files, rewrites each one, and saves them into a downloadable ZIP archive.

In [None]:
# Initialize the Groq SDK client with the API key from the environment.
# os.environ[...] raises KeyError if GROQ_API_KEY is unset; the Part 1 cell sets it.
client = Groq(api_key=os.environ["GROQ_API_KEY"])

#  Extract text from PDF or TXT file
def extract_text_from_path(file_path):
    """Return the text content of a ``.pdf`` or ``.txt`` file.

    The extension check is case-insensitive, so ``JD.PDF`` and ``notes.TXT``
    are handled the same way as their lower-case spellings.

    Args:
        file_path: Path of the file to read.

    Returns:
        str: The stripped text of the file. Failures are reported in-band:
        ``"Error: <details>"`` if reading raised, and ``"Unsupported file"``
        for any other extension.
    """
    lowered = str(file_path).lower()
    try:
        if lowered.endswith(".pdf"):
            # First attempt: PyMuPDF, concatenating the text of every page.
            with pymupdf.open(file_path) as doc:
                text = "".join(page.get_text() for page in doc).strip()
            if text:
                return text

            # PyMuPDF produced no text: try pypdf before giving up.
            reader = PdfReader(file_path)
            return "\n".join(page.extract_text() or "" for page in reader.pages).strip()

        if lowered.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read().strip()
    except Exception as e:
        return f"Error: {e}"  # Callers receive the failure as text, not as an exception

    return "Unsupported file"  # Neither .pdf nor .txt

#  Reduce extracted text to plain ASCII with single spaces between words
def sanitize_text(text):
    """Normalise ``text`` to ASCII and collapse all whitespace runs.

    Args:
        text: String to clean.

    Returns:
        str: The NFKD-normalised text with every non-ASCII character dropped,
        each run of whitespace replaced by one space, and no leading or
        trailing whitespace.
    """
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then discards the marks and anything else non-ASCII.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    # Splitting on whitespace and re-joining collapses runs and trims the ends.
    return ' '.join(ascii_only.split())

#  Ask the Groq-hosted model to rewrite a job description
def rewrite_jd_with_llm(jd_text):
    """Send ``jd_text`` to the LLM with rewrite instructions and return its reply.

    Args:
        jd_text: Job-description text to rewrite.

    Returns:
        str: The model's reply with surrounding whitespace removed, or
        ``"LLM Error: <details>"`` if the call or the response handling raised.
    """
    prompt = f"""
You are a skilled HR content writer.

Your job is to rewrite the JD below:
- Professional & clear
- ATS-friendly
- Structured using markdown (**bold**, *italic*, - bullet points, etc.)

--- JD START ---
{jd_text}
--- JD END ---

Rewrite now:
"""
    # Single user message; no system prompt is sent for the rewrite step.
    messages = [{"role": "user", "content": prompt}]
    try:
        completion = client.chat.completions.create(model=GROQ_MODEL, messages=messages)
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()
    except Exception as exc:
        return f"LLM Error: {exc}"  # Failures are reported in-band as a string

# Handle a single uploaded JD file
def handle_single(file):
    """Extract, clean and rewrite one uploaded JD, saving the rewrite as Markdown.

    Args:
        file: The upload to process — either an object exposing the file path
            as ``.name`` or a plain path string — or ``None`` when nothing was
            uploaded.

    Returns:
        tuple: ``(raw_preview, cleaned_preview, rewritten_preview, out_path)``.
        Previews are truncated to 1000 / 1000 / 1500 characters. ``out_path``
        is the path of the written ``*_rewritten.md`` file, or ``None`` when no
        file was given or no usable text could be extracted.
    """
    if file is None:
        return "No file", "No cleaned", "No rewrite", None

    src_path = getattr(file, "name", file)

    extracted = extract_text_from_path(src_path)
    cleaned = sanitize_text(extracted)

    # extract_text_from_path reports failures as ordinary strings. Sending
    # those (or empty text) to the LLM would produce a made-up JD, so stop here.
    if not cleaned or extracted == "Unsupported file" or extracted.startswith("Error:"):
        return (extracted[:1000] or "No text extracted"), "No cleaned", "No rewrite", None

    rewritten = rewrite_jd_with_llm(cleaned)

    # Swap only the final extension. Chained str.replace would also hit
    # ".pdf"/".txt" elsewhere in the path and, when nothing matched (e.g.
    # ".PDF"), would leave the path unchanged and overwrite the upload itself.
    stem, _ext = os.path.splitext(src_path)
    out_path = f"{stem}_rewritten.md"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(rewritten)

    return extracted[:1000], cleaned[:1000], rewritten[:1500], out_path

#  Handle multiple uploaded JD files
def handle_multiple(files):
    """Rewrite every uploaded JD and bundle the results into one ZIP archive.

    Each rewrite is saved as ``rewritten_<original filename>.md`` in the
    current working directory and added to ``All_Rewritten_JDs.zip``.

    Args:
        files: Iterable of uploads (objects exposing the path as ``.name`` or
            plain path strings). ``None`` is treated as "no files".

    Returns:
        tuple: ``(results, zip_path)`` where ``results`` is a list of
        ``(filename, raw_preview, cleaned_preview, rewritten_preview,
        rewritten_path)`` tuples, with previews truncated to 600 / 600 / 1000
        characters, and ``zip_path`` is ``"All_Rewritten_JDs.zip"``. Files whose
        text could not be extracted get placeholder previews and an empty path,
        and are not added to the archive.
    """
    results = []
    zip_name = "All_Rewritten_JDs.zip"

    # The context manager finalises the archive even if a file fails midway.
    with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as zipf:
        for file in files or []:
            src_path = getattr(file, "name", file)
            fname = os.path.basename(src_path)
            raw = extract_text_from_path(src_path)
            cleaned = sanitize_text(raw)

            # extract_text_from_path reports failures as ordinary strings;
            # don't ask the LLM to "rewrite" an error message or empty text.
            if not cleaned or raw == "Unsupported file" or raw.startswith("Error:"):
                results.append((fname, raw[:600] or "No text extracted", "No cleaned", "No rewrite", ""))
                continue

            rewritten = rewrite_jd_with_llm(cleaned)

            rewritten_path = f"rewritten_{fname}.md"
            with open(rewritten_path, "w", encoding="utf-8") as f:
                f.write(rewritten)

            zipf.write(rewritten_path)
            results.append((fname, raw[:600], cleaned[:600], rewritten[:1000], rewritten_path))

    return results, zip_name


# Gradio provides an interactive interface for JD processing



Supports multi-file upload, file previews, formatted output, and downloads




In [None]:
import gradio as gr    # Import Gradio for building the web-based interface

# Custom CSS passed to gr.Blocks(css=...) to style the whole app
# (title box, animated background, tabs, buttons, upload boxes, textboxes, footer).
#
# The heading rule uses `animation: fadeIn`, so the stylesheet defines
# `@keyframes fadeIn`; without it the animation name resolves to nothing and
# the headings never fade in.
#
# NOTE(review): two CSS comments do not match their rules — "Purple Tab
# Borders" sits on a green (#22c55e) border, and "Don't touch Multiple JD tab"
# precedes a `textarea, .gr-textbox` rule that is not scoped to any tab.

custom_css = """

@keyframes dash-light {
  0% {
    border-color: #b6d0e2;
    box-shadow: 0 0 3px #b6d0e2;
  }
  50% {
    border-color: #ffb6c1;
    box-shadow: 0 0 6px #ffb6c1;
  }
  100% {
    border-color: #b6d0e2;
    box-shadow: 0 0 3px #b6d0e2;
  }
}

#title-box {
    border: 2px dashed #b6d0e2;
    border-radius: 4px;
    padding: 10px 15px;
    background-color: #fff7fb;
    text-align: center;
    font-weight: 600;
    font-size: 18px;
    color: #ff1493;  /* Pink text */
    margin: 20px auto;
    width: 70%;
    animation: dash-light 6s infinite ease-in-out;
}





/* Optional entire background */
body {
    background-color: #b6d0e2 !important;
}



@keyframes animatedGradient {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}
body {
    background: linear-gradient(-45deg, #f8e1ff, #fdf4ff, #ffe6f0, #f3f0ff);
    background-size: 400% 400%;
    animation: animatedGradient 15s ease infinite;
}
@keyframes fadeIn {
    from { opacity: 0; }
    to { opacity: 1; }
}
h1, h2, h3, .prose h2 {
    color: #ff69b4;
    font-weight: bold;
    text-align: center;
    animation: fadeIn 1s ease;
}
/* Purple Tab Borders Only */
button[role="tab"] {
    background-color: transparent; /* No background fill */
    color: #22c55e;               /* Optional: Text matches border */
    border: 2px solid #22c55e;    /* Border in purple */
    font-weight: bold;
    border-radius: 10px;
    margin: 5px;
    padding: 6px 12px;            /* Optional: Adds some space inside button */
}
button[role="tab"]:hover {
    transform: scale(1.05);
}
button.process-btn {
    background-color: #3b82f6;
    color: white;
    font-weight: bold;
    border-radius: 10px;
    padding: 10px 20px;
    transition: all 0.3s ease-in-out;
}
button.process-btn:hover {
    background-color: #4ade80;
    box-shadow: 0 0 10px #4ade80;
    transform: scale(1.05);
}
.upload-box {
    border: 2px solid #8a2be2;
    border-radius: 12px;
    padding: 15px;
    background-color: #f9f7ff;
    margin-bottom: 15px;
    transition: box-shadow 0.3s ease;
}
.upload-box:hover {
    box-shadow: 0 0 15px #a78bfa;
}

/* ✅ GREEN LABELS ONLY for Single JD tab */
.single-jd label, .single-jd span {
    color: #22c55e !important;
    font-weight: bold;
}

/* Don't touch Multiple JD tab */
textarea, .gr-textbox {
    border: 2px solid #8a2be2 !important;
    border-radius: 12px;
    background-color: #fdfdff;
}
footer, #footer {
    background-color: #ffe6f0;
    color: #ff1493;
    font-weight: bold;
    text-align: center;
    border-radius: 12px;
}
"""

# Build the Gradio UI: one tab for a single JD, one tab for a batch of JDs.
# Relies on custom_css, handle_single and handle_multiple defined in earlier cells.

with gr.Blocks(css=custom_css) as app:
    # NOTE(review): "app-border" has no matching rule in custom_css as shown.
    with gr.Column(elem_classes=["app-border"]):  # Main column wrapper
        gr.HTML('<div id="title-box">JD PARSER FROM PDFS </div>')

        with gr.Tabs():
            # --- Single JD: upload one file, preview each stage, download the rewrite ---
            with gr.Tab("📄 Single JD"):
                with gr.Column(elem_classes=["upload-box", "single-jd"]):
                    file_in = gr.File(label="Upload JD", file_types=[".pdf", ".txt"])
                    go_btn = gr.Button("✨ Process JD", elem_classes=["process-btn"])

                # All three previews sit inside the "single-jd" column so the
                # `.single-jd label` CSS rule styles every label, not just the first.
                with gr.Column(elem_classes=["single-jd"]):
                    raw = gr.Textbox(label="Raw Extracted", lines=6, elem_id="raw")    # Text as extracted from the file
                    clean = gr.Textbox(label="Cleaned", lines=6, elem_id="clean")      # Text after sanitize_text
                    final = gr.Textbox(label=" Rewritten JD", lines=10, elem_id="final")  # LLM rewrite

                download = gr.File(label="⬇ Download Final JD")

                # handle_single returns (raw, cleaned, rewritten, path) in this order.
                go_btn.click(handle_single, inputs=file_in, outputs=[raw, clean, final, download])

            # --- Multiple JDs: upload several files, show a summary table, download a ZIP ---
            with gr.Tab("📁 Multiple JDs"):
                with gr.Column(elem_classes=["upload-box"]):
                    multi_in = gr.File(label="Upload JDs", file_types=[".pdf", ".txt"], file_count="multiple")
                    multi_btn = gr.Button("Process All JDs", elem_classes=["process-btn"])

                output_df = gr.Dataframe(headers=["File", "Raw", "Cleaned", "Rewritten", "Download"], wrap=True)
                zip_download = gr.File(label="⬇ Download All in ZIP")

                def run_multi(files):
                    """Run handle_multiple and convert its result tuples to table rows."""
                    data, zip_path = handle_multiple(files)
                    # Each result tuple has five fields matching the Dataframe headers.
                    return [list(row) for row in data], zip_path

                multi_btn.click(run_multi, inputs=multi_in, outputs=[output_df, zip_download])


# ✅ Launch app
app.launch()
