In [8]:
from pypdf import PdfReader
import re

In [3]:
# Open the sample resume from the notebook's working directory and keep a
# handle on its first page only; the text extraction below uses that page.
pdf = PdfReader("./PawanKumar_Resume.pdf")
page = pdf.pages[0]

In [4]:
# Raw extracted text of `page` (the first page of the PDF), not yet whitespace-normalised.
texts = page.extract_text()

In [6]:
# atext = []
# for text in texts:
#     atext.append(text)

# print("\n".join(atext))

In [10]:
# Normalise the extracted page text: every run of whitespace (including line
# breaks) becomes a single space and both ends are trimmed.
text = re.compile(r"\s+").sub(" ", texts).strip()

In [12]:
# Split wherever whitespace follows sentence-ending punctuation (. ! ?),
# then keep the trimmed, non-empty pieces.
sentences = re.split(r"(?<=[\.\!\?])\s+", text)
textList = list(filter(None, map(str.strip, sentences)))

In [20]:
# lines = sentences.splitlines()
# bullets = []
# for line in lines:
#     stripped = line.strip()
#     if stripped.startswith(("-", "•", "*")):
#         bullets.append(stripped.lstrip("-•* ").strip())
#     print(bullets)


In [21]:
# app.py
import io
import re
from typing import List, Dict, Tuple

import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from PyPDF2 import PdfReader
import docx


# ==============
# CONFIG / CONSTANTS
# ==============

# Page-level Streamlit settings: browser-tab title, tab icon and wide layout.
st.set_page_config(
    page_title="ML ATS Resume Analyzer",
    # U+1F4C4 "page facing up". Written as an escape because the previous
    # literal was a mis-encoded (mojibake) byte sequence, not an emoji.
    page_icon="\U0001F4C4",
    layout="wide",
)

# Vague or cliched wording searched for by detect_weak_phrases().
# Entries are lowercase because they are matched against lowercased text.
WEAK_PHRASES = [
    "responsible for", "worked on", "helped with", "assisted with",
    "participated in", "various tasks", "duties included",
    "hard-working", "team player", "result-oriented",
    "fast learner", "self-motivated", "detail-oriented",
]

# Verbs a strong bullet point is expected to start with.
# Order matters: generate_suggestions() quotes the first eight as examples.
ACTION_VERBS = [
    "achieved", "analyzed", "built", "created",
    "designed", "developed", "implemented", "led",
    "managed", "optimized", "reduced", "improved",
    "increased", "delivered", "launched", "owned",
    "resolved", "conducted", "orchestrated", "shipped",
    "enhanced", "automated", "deployed",
]

# Section headings looked for in a resume; the order is also the order in
# which the sections are listed in the UI.
EXPECTED_SECTIONS = [
    "summary",
    "objective",
    "experience",
    "work experience",
    "professional experience",
    "education",
    "skills",
    "projects",
    "certifications",
    "achievements",
]


# ==============
# UTILS: FILE READING
# ==============

def read_pdf(file_bytes: bytes) -> str:
    """Extract the text of every page of an in-memory PDF.

    Args:
        file_bytes: Raw content of the PDF file.

    Returns:
        The per-page texts joined with newlines. Pages for which the reader
        yields no text (``None`` or an empty string) are skipped.
    """
    pages = PdfReader(io.BytesIO(file_bytes)).pages
    extracted = (pg.extract_text() for pg in pages)
    return "\n".join(chunk for chunk in extracted if chunk)


def read_docx(file_bytes: bytes) -> str:
    """Return the text of an in-memory .docx document, one paragraph per line.

    Only the entries of ``document.paragraphs`` are read; empty paragraphs
    are kept and therefore show up as blank lines.
    """
    document = docx.Document(io.BytesIO(file_bytes))
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


def read_txt(file_bytes: bytes) -> str:
    """Decode raw bytes as text, preferring UTF-8.

    When the bytes are not valid UTF-8 they are decoded as Latin-1 instead.
    """
    try:
        decoded = file_bytes.decode("utf-8")
    except UnicodeDecodeError:
        decoded = file_bytes.decode("latin-1", errors="ignore")
    return decoded


def extract_text(uploaded_file) -> str:
    """Read an uploaded file and return its text, chosen by file extension.

    Args:
        uploaded_file: Object exposing ``read()`` and ``name``, or ``None``.

    Returns:
        ``""`` when no file is given. Otherwise the text produced by
        ``read_pdf`` for ``.pdf`` names, ``read_docx`` for ``.docx`` names and
        ``read_txt`` for everything else. The extension check is
        case-insensitive.
    """
    if uploaded_file is None:
        return ""

    payload = uploaded_file.read()
    filename = uploaded_file.name.lower()

    if filename.endswith(".pdf"):
        return read_pdf(payload)
    if filename.endswith(".docx"):
        return read_docx(payload)
    # ".txt" and every unrecognised extension are both treated as plain text.
    return read_txt(payload)


# ==============
# NLP / ANALYSIS HELPERS
# ==============

def clean_text(text: str) -> str:
    """Collapse each whitespace run (spaces, tabs, newlines) to one space and trim the ends."""
    return re.sub(r"\s+", " ", text).strip()


def split_into_sentences(text: str) -> List[str]:
    """Split text into sentences with a punctuation heuristic.

    A boundary is any whitespace run that directly follows ".", "!" or "?".
    Each piece is trimmed and empty pieces are dropped. Abbreviations such as
    "e.g." are therefore treated as sentence ends.
    """
    pieces = re.split(r"(?<=[\.\!\?])\s+", text)
    return list(filter(None, map(str.strip, pieces)))


def extract_bullets(text: str) -> List[str]:
    """Collect the bullet-point lines of a text, without their markers.

    A line counts as a bullet when, after trimming, it starts with "-", "*"
    or one of the Unicode bullet glyphs listed below. The leading markers and
    spaces are removed from the returned item.

    Args:
        text: Multi-line text. Line breaks must still be present, because
            bullets are recognised per line.

    Returns:
        The bullet texts in document order. Lines that consist only of
        markers (for example a "---" separator) are skipped.
    """
    # Glyphs are written as escapes so the code stays correct whatever the
    # file's encoding; the previous literal was a mis-encoded "bullet" that
    # never matched a real U+2022 character.
    #   U+2022 bullet, U+25CF black circle, U+25E6 white bullet,
    #   U+25AA black small square, U+2023 triangular bullet
    markers = "-*\u2022\u25cf\u25e6\u25aa\u2023"
    marker_prefixes = tuple(markers)

    bullets = []
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped.startswith(marker_prefixes):
            continue
        item = stripped.lstrip(markers + " ").strip()
        if item:
            bullets.append(item)
    return bullets


def contains_metric(text: str) -> bool:
    """Report whether the text contains a number, percentage or currency amount.

    Any run of digits satisfies the pattern, so the "%" and "$" parts never
    change the outcome: the result is true exactly when a digit is present.
    """
    hit = re.search(r"(\d+[%]?)|(\$\d+)", text)
    return hit is not None


def starts_with_action_verb(text: str) -> bool:
    """Tell whether the first word of the text is one of ACTION_VERBS.

    The comparison is case-insensitive and ignores punctuation attached to
    the first word (e.g. "Led:" or "(Built").

    Args:
        text: A bullet point or sentence; may be empty or whitespace-only.

    Returns:
        True when the first word is a known action verb, otherwise False.
    """
    words = text.split() if text else []
    if not words:
        # Whitespace-only input is truthy but has no words; indexing the
        # empty split result used to raise IndexError here.
        return False
    first_word = words[0].lower().strip(".,;:!?()[]\"'")
    return first_word in ACTION_VERBS


def section_coverage_score(text: str) -> Tuple[float, Dict[str, bool]]:
    """Measure how many of EXPECTED_SECTIONS are mentioned in the text.

    Detection is a case-insensitive plain substring test, so a heading such
    as "work experience" also counts as a hit for "experience".

    Returns:
        A pair ``(score, found)``: ``score`` is the fraction of expected
        sections that were found (0 when the list of expected sections is
        empty) and ``found`` maps each section name to whether it was found.
    """
    haystack = text.lower()
    found = {heading: heading in haystack for heading in EXPECTED_SECTIONS}
    if not EXPECTED_SECTIONS:
        return 0, found
    return sum(found.values()) / len(EXPECTED_SECTIONS), found


def keyword_match_score(resume_text: str, jd_text: str) -> float:
    """Cosine similarity between the TF-IDF vectors of a resume and a job description.

    Both texts are whitespace-normalised first and English stop words are
    ignored by the vectoriser.

    Args:
        resume_text: Text of the resume.
        jd_text: Text of the job description.

    Returns:
        The similarity as a float. ``0.0`` is returned when the resume is
        empty, when the job description is empty or shorter than five words,
        or when neither text contains a usable token.
    """
    resume_text = clean_text(resume_text)
    jd_text = clean_text(jd_text)

    if not resume_text or not jd_text or len(jd_text.split()) < 5:
        return 0.0

    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        tfidf = vectorizer.fit_transform([resume_text, jd_text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenisation and stop-word removal; treat as "no match"
        # instead of crashing the analysis.
        return 0.0

    # Row 0 is the resume, row 1 the job description.
    sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
    return float(sim)


def detect_weak_phrases(text: str) -> List[Dict]:
    """Locate every occurrence of the WEAK_PHRASES in a text, ignoring case.

    Args:
        text: The text to scan.

    Returns:
        One dict per occurrence with the keys "phrase" (the entry from
        WEAK_PHRASES), "start" and "end". The offsets index into ``text``
        itself, so ``text[start:end]`` is the matched wording. Results are
        grouped by phrase in WEAK_PHRASES order, then by position; matches of
        the same phrase never overlap.
    """
    findings = []
    for phrase in WEAK_PHRASES:
        if not phrase:
            # An empty phrase would match at every position.
            continue
        # Match case-insensitively on the ORIGINAL text rather than on
        # text.lower(): lowercasing can change the length of a string
        # (e.g. U+0130), which would shift every later offset away from the
        # text that the highlighter slices.
        pattern = re.compile(re.escape(phrase), re.IGNORECASE)
        for match in pattern.finditer(text):
            findings.append({
                "phrase": phrase,
                "start": match.start(),
                "end": match.end(),
            })
    return findings


def compute_ats_scores(resume_text: str, jd_text: str = "") -> Dict:
    """Score a resume on five heuristics and combine them into one 0-100 figure.

    The components and their weights are: section coverage (0.2), keyword
    match against the job description (0.3), bullets starting with an action
    verb (0.2), bullets containing a number (0.15) and overall length (0.15).

    Args:
        resume_text: Raw resume text, with its line breaks intact.
        jd_text: Optional job description used for the keyword component.

    Returns:
        A dict with "final_score" and the five component scores (all on a
        0-100 scale, rounded to one decimal), "word_count", "bullets_count",
        "section_found" (section name -> bool), "sentences" and "bullets".
    """
    # Bullets are recognised line by line, so they must be collected BEFORE
    # the whitespace clean-up: clean_text() turns every newline into a space,
    # after which there is only a single line left to inspect.
    bullets = extract_bullets(resume_text)

    resume_text = clean_text(resume_text)
    sentences = split_into_sentences(resume_text)

    # 1. Structure / sections
    section_score, section_found = section_coverage_score(resume_text)

    # 2. Keyword match vs JD (ML-ish part), in the range 0-1.
    # NOTE(review): this is 0.0 when no usable JD is given, which caps the
    # final score at 70 for JD-less runs; confirm whether the keyword weight
    # should be redistributed in that case.
    kw_score = keyword_match_score(resume_text, jd_text)

    # 3. Action verbs
    if bullets:
        action_starts = sum(starts_with_action_verb(b) for b in bullets)
        action_score = action_starts / len(bullets)
    else:
        action_score = 0.3  # neutral-ish default

    # 4. Metrics in bullets
    if bullets:
        with_metrics = sum(contains_metric(b) for b in bullets)
        metric_score = with_metrics / len(bullets)
    else:
        metric_score = 0.2

    # 5. Length / readability
    word_count = len(resume_text.split())
    if word_count < 200:
        length_score = 0.3
    elif word_count <= 800:
        length_score = 1.0
    elif word_count <= 1200:
        length_score = 0.7
    else:
        length_score = 0.4

    # Weighted ATS score (0-100)
    final_score = (
        section_score * 0.2 +
        kw_score * 0.3 +
        action_score * 0.2 +
        metric_score * 0.15 +
        length_score * 0.15
    ) * 100

    return {
        "final_score": round(final_score, 1),
        "section_score": round(section_score * 100, 1),
        "keyword_score": round(kw_score * 100, 1),
        "action_score": round(action_score * 100, 1),
        "metric_score": round(metric_score * 100, 1),
        "length_score": round(length_score * 100, 1),
        "word_count": word_count,
        "bullets_count": len(bullets),
        "section_found": section_found,
        "sentences": sentences,
        "bullets": bullets,
    }


def generate_suggestions(analysis: Dict, weak_phrases: List[Dict], has_jd: bool) -> List[str]:
    """Turn the analysis figures into a list of Markdown-formatted tips.

    Args:
        analysis: Result dictionary. The keys read here are "section_found",
            "keyword_score", "action_score", "metric_score", "length_score"
            and "word_count"; scores are compared on a 0-100 scale.
        weak_phrases: Findings whose "phrase" values are listed,
            de-duplicated and sorted, in the final tip.
        has_jd: Whether a job description was supplied; the keyword tip is
            produced only when this is true.

    Returns:
        The tips in a fixed order: sections, keywords, action verbs, metrics,
        length, weak phrases. When no tip applies, a single "good shape"
        message is returned instead.
    """
    tips: List[str] = []

    # Sections that were not found in the resume.
    absent = [name for name, present in analysis["section_found"].items() if not present]
    if absent:
        tips.append(
            f"Add or strengthen sections: **{', '.join(absent)}** "
            f"to match standard ATS expectations."
        )

    # Keyword alignment: always one of two messages when a JD is available.
    if has_jd:
        misaligned_msg = (
            "Your resume is not well-aligned with the job description. "
            "Include more role-specific keywords and mirror terminology used in the JD."
        )
        aligned_msg = (
            "Good keyword alignment with the job description. Consider tailoring a few more bullet points "
            "to emphasise the most important responsibilities."
        )
        tips.append(misaligned_msg if analysis["keyword_score"] < 50 else aligned_msg)

    # Too few bullets open with an action verb.
    if analysis["action_score"] < 60:
        examples = ", ".join(ACTION_VERBS[:8])
        tips.append(
            "More bullet points should start with strong action verbs "
            f"(e.g., {examples}, etc.)."
        )

    # Too few bullets carry a number.
    if analysis["metric_score"] < 40:
        tips.append(
            "Quantify your impact: add numbers (%, $ or counts) to show scale (e.g., "
            '"Improved response time by 30%", "Managed a team of 5").'
        )

    # A low length score means the resume is either too short or too long.
    if analysis["length_score"] < 60:
        words = analysis["word_count"]
        if words < 200:
            tips.append(
                "The resume is quite short. Add more detail to your experience, skills, and projects."
            )
        elif words > 1200:
            tips.append(
                "The resume may be too long. Try to trim older or less relevant experience and keep it concise."
            )

    # Each distinct weak phrase is named once, alphabetically.
    if weak_phrases:
        distinct = sorted({finding["phrase"] for finding in weak_phrases})
        listed = ", ".join(f"**{p}**" for p in distinct)
        tips.append(
            "Replace generic phrases like " + listed + " with specific, impact-focused statements."
        )

    if tips:
        return tips
    return ["Your resume is in good shape. Consider minor polishing for clarity and consistency."]


def highlight_text(text: str, weak_phrases: List[Dict]) -> str:
    """Render text as HTML with the given spans wrapped in ``<mark>``.

    All text is HTML-escaped, whether or not any span is given, because the
    result is meant to be embedded as raw HTML. Newlines become ``<br>``.

    Args:
        text: The plain text to render.
        weak_phrases: Items with integer "start" and "end" offsets into
            ``text``; other keys are ignored. The list may be unsorted.

    Returns:
        The HTML fragment. A span that begins inside an already highlighted
        span is skipped, so no part of the text is emitted twice.
    """
    if not weak_phrases:
        # Escape here as well: this path previously returned the raw text,
        # letting "<" and "&" from the resume through as markup.
        return escape_html(text).replace("\n", "<br>")

    pieces = []
    cursor = 0
    for wp in sorted(weak_phrases, key=lambda item: item["start"]):
        start, end = wp["start"], wp["end"]
        if start < cursor:
            # Overlaps the previous highlight; emitting it would duplicate text.
            continue
        # Plain text before the span, then the highlighted span itself.
        pieces.append(escape_html(text[cursor:start]))
        pieces.append(f"<mark>{escape_html(text[start:end])}</mark>")
        cursor = end

    # Whatever follows the last span.
    pieces.append(escape_html(text[cursor:]))

    return "".join(pieces).replace("\n", "<br>")


def escape_html(text: str) -> str:
    """Escape ``&``, ``<`` and ``>`` for use in HTML element content.

    ``&`` is replaced first so the entities produced for ``<`` and ``>`` are
    not escaped a second time.
    """
    return (
        text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
    )


# ==============
# UI LAYOUT
# ==============

def main():
    """Build the Streamlit page: inputs in the sidebar, results in the body.

    Nothing is analysed until the button is pressed and a resume is uploaded.
    The page then shows the overall score, a per-component breakdown, the
    weak phrases found, the improvement suggestions and the resume text with
    the weak phrases highlighted.
    """
    # Emoji are written as escapes; the previous literals were mis-encoded
    # (mojibake) byte sequences that rendered as garbage.
    page_emoji = "\U0001F4C4"         # page facing up
    analyze_label = "Analyze Resume \U0001F680"  # rocket
    ok_mark = "\u2705"               # white heavy check mark
    warn_mark = "\u26A0\uFE0F"       # warning sign

    st.title(f"{page_emoji} ML-based ATS Resume Analyzer")
    st.write(
        "Upload your resume and (optionally) a job description. "
        "This tool simulates an ATS-style screening and highlights weak phrases."
    )

    with st.sidebar:
        st.header("1. Upload Resume")
        uploaded_file = st.file_uploader(
            "Resume file (.pdf, .docx, .txt)",
            type=["pdf", "docx", "txt"],
        )

        st.header("2. Job Description (Optional)")
        jd_text = st.text_area(
            "Paste job description here to check keyword match",
            height=200,
            placeholder="Paste the JD here for better ATS keyword analysis..."
        )

        analyze_button = st.button(analyze_label)

    if not analyze_button:
        st.info(f"Upload your resume and click **{analyze_label}** in the sidebar to start.")
        return

    if not uploaded_file:
        st.error("Please upload a resume file first.")
        return

    resume_text = extract_text(uploaded_file)
    if not resume_text or len(resume_text.strip()) < 20:
        st.error("Could not extract enough text from the resume. Try another file or format.")
        return

    # A job description only counts when it has more than five words.
    has_jd = bool(jd_text and len(jd_text.strip().split()) > 5)

    # ---- Run analysis ----
    analysis = compute_ats_scores(resume_text, jd_text if has_jd else "")
    # Weak phrases are located in the raw text, the same string that is
    # highlighted at the bottom of the page, so the offsets line up.
    weak_phrases = detect_weak_phrases(resume_text)
    suggestions = generate_suggestions(analysis, weak_phrases, has_jd)

    # ========== TOP SUMMARY ==========
    st.subheader("Overall ATS Score")

    col1, col2, col3 = st.columns([2, 1, 1])

    with col1:
        st.metric("ATS Score", f"{analysis['final_score']}/100")
        st.write(
            f"**Word count:** {analysis['word_count']} | "
            f"**Bullets:** {analysis['bullets_count']}"
        )

        st.progress(min(analysis["final_score"] / 100, 1.0))

    with col2:
        st.metric("Keyword Match" + (" (JD)" if has_jd else ""), f"{analysis['keyword_score']}%")
        st.metric("Sections Quality", f"{analysis['section_score']}%")

    with col3:
        st.metric("Action Verbs", f"{analysis['action_score']}%")
        st.metric("Quantification", f"{analysis['metric_score']}%")

    # ========== DETAILED BREAKDOWN ==========
    st.markdown("---")
    st.subheader("Score Breakdown")

    colA, colB = st.columns(2)

    with colA:
        st.markdown("#### Sections Detected")
        for sec, present in analysis["section_found"].items():
            emoji = ok_mark if present else warn_mark
            st.write(f"{emoji} {sec.title()}")

    with colB:
        st.markdown("#### Length & Structure")
        st.write(f"- **Word count:** {analysis['word_count']}")
        st.write(f"- **Length score:** {analysis['length_score']}%")
        if analysis["bullets_count"]:
            st.write(f"- **Bullet points:** {analysis['bullets_count']}")
        else:
            st.write("- No bullet points detected (consider using bullets for experience).")

    # ========== WEAK PHRASES ==========
    st.markdown("---")
    st.subheader("Weak / Generic Phrases Detected")

    if not weak_phrases:
        st.success(f"No common weak phrases detected {ok_mark}")
    else:
        unique_weak = sorted(set(w["phrase"] for w in weak_phrases))
        st.warning(
            "These phrases are often considered vague or generic. "
            "Try to replace them with specific, action-oriented achievements."
        )
        st.write(", ".join(f"**{p}**" for p in unique_weak))

    # ========== SUGGESTIONS ==========
    st.markdown("---")
    st.subheader("Improvement Suggestions")

    for s in suggestions:
        st.markdown(f"- {s}")

    # ========== RESUME TEXT WITH HIGHLIGHTS ==========
    st.markdown("---")
    st.subheader("Resume Text (Highlighted)")

    highlighted_html = highlight_text(resume_text, weak_phrases)
    st.markdown(
        f"<div style='white-space: pre-wrap; font-family: monospace; font-size: 13px;'>{highlighted_html}</div>",
        unsafe_allow_html=True,
    )

    st.caption("Yellow highlights mark weak or generic phrases detected by the analyzer.")


# Entry point: build the page only when this code runs as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'streamlit'