In [1]:
# Resume PDFs available for parsing: one hand-named sample followed by the
# numbered uploads "resume (1).pdf" through "resume (7).pdf".
docs = [
    "resume/Aman_Kumar_Resume_entry_lvl_mld.pdf",
    *(f"resume/resume ({idx}).pdf" for idx in range(1, 8)),
]

In [None]:
import os
import re
import fitz
import spacy
import requests
import unicodedata
from spacy.matcher import PhraseMatcher
from concurrent.futures import ThreadPoolExecutor

# Load the spaCy model once at module level so it is not reloaded per resume.
# ResumeParsingPipeline reads this global `nlp` both to build its skill
# PhraseMatcher and to run entity extraction.
nlp = spacy.load("en_core_web_lg")

class ResumeParsingPipeline:
    """Turn a resume PDF into a structured candidate record.

    The pipeline has three stages:

    1. ``extract_from_pdf`` pulls the raw text and hyperlink URIs from the PDF.
    2. ``extract_nlp_entities`` runs the module-level spaCy pipeline ``nlp`` to
       find a name, locations, skills and education lines.
    3. ``parse_github_data`` enriches the record via the GitHub REST API when
       one of the links points at a GitHub account.

    ``run`` chains the stages and returns a JSON-serialisable dict.
    """

    # Captures the account segment that directly follows "github.com/", so
    # both profile links (github.com/user) and repository links
    # (github.com/user/repo) resolve to the same username.
    _GITHUB_USER_RE = re.compile(r"github\.com/([A-Za-z0-9][A-Za-z0-9-]*)", re.IGNORECASE)

    def __init__(self, github_token=None):
        """Create a pipeline.

        Args:
            github_token: Optional GitHub API token. When omitted, the
                ``GITHUB_TOKEN`` environment variable is used if set;
                otherwise API calls are made unauthenticated.
        """
        self.github_token = github_token or os.environ.get("GITHUB_TOKEN")
        self.skills_list = [
            "python", "java", "php", "laravel", "sql", "mysql", "javascript",
            "typescript", "flask", "django", "fastapi", "node.js", "react",
            "docker", "kubernetes", "aws", "ml", "machine learning", "nlp"
        ]
        self.matcher = self._build_matcher()

    def _build_matcher(self):
        """Build a case-insensitive PhraseMatcher over ``self.skills_list``."""
        matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        patterns = [nlp.make_doc(text) for text in self.skills_list]
        matcher.add("SKILLS", patterns)
        return matcher

    # --- Utility Methods ---
    def normalize_text(self, text):
        """Return a lower-cased, ASCII-oriented, whitespace-collapsed copy of ``text``.

        Dash variants become ``-``, bullet glyphs become spaces, accents are
        removed, and any character other than ``a-z``, ``0-9``, ``.``, ``-``,
        ``+`` or whitespace is replaced by a space.

        Args:
            text: Input string; ``None`` or empty yields ``""``.

        Returns:
            The normalised string.
        """
        if not text:
            return ""
        text = unicodedata.normalize("NFKD", text).lower()
        # NFKD splits accented letters into base letter + combining mark; drop
        # the marks so "résumé" becomes "resume" instead of being split apart.
        text = "".join(ch for ch in text if not unicodedata.combining(ch))
        # Explicit escapes keep these patterns immune to source-encoding
        # damage: en dash, em dash, minus sign.
        text = re.sub(r"[\u2013\u2014\u2212]", "-", text)
        # Bullet glyphs: bullet, black circle, small square, pointer,
        # black square, middle dot.
        text = re.sub(r"[\u2022\u25cf\u25aa\u25ba\u25a0\u00b7]", " ", text)
        text = re.sub(r"[^a-z0-9\.\-\+\s]", " ", text)
        return re.sub(r"\s+", " ", text).strip()

    def call_github_api(self, url):
        """GET ``url`` from the GitHub API and return the decoded JSON body.

        This is best-effort: network errors, non-2xx responses and undecodable
        bodies are reported on stdout and ``None`` is returned instead of
        raising.

        Args:
            url: Full API URL to request.

        Returns:
            The parsed JSON payload, or ``None`` on failure.
        """
        headers = {"Accept": "application/vnd.github+json"}
        if self.github_token:
            headers["Authorization"] = f"Bearer {self.github_token}"
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on every requests version.
            print(f"Error calling API {url}: {e}")
            return None

    # --- Extraction Stages ---
    def extract_from_pdf(self, pdf_path):
        """Read all page text and hyperlink URIs from a PDF.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            A ``(text, links)`` tuple: the concatenated text of every page and
            the list of link URIs in page order.
        """
        page_texts, links = [], []
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_texts.append(page.get_text())
                links.extend(link["uri"] for link in page.get_links() if "uri" in link)
        return "".join(page_texts), links

    def parse_github_data(self, links):
        """Look up the GitHub account referenced by the first GitHub link.

        Args:
            links: Iterable of URL strings taken from the resume (may be empty
                or ``None``).

        Returns:
            ``None`` when no link names a GitHub account or the profile lookup
            fails. Otherwise a dict with ``username``, ``profile_url`` (the
            matching link), ``followers`` and ``repos`` (up to the first five
            repositories the API returns, each as ``{"name", "lang"}``;
            empty when the repository lookup fails).
        """
        github_url = username = None
        for link in links or []:
            match = self._GITHUB_USER_RE.search(link)
            if match:
                github_url, username = link, match.group(1)
                break
        if username is None:
            return None

        profile = self.call_github_api(f"https://api.github.com/users/{username}")
        if not isinstance(profile, dict):
            return None

        repos_url = profile.get("repos_url") or f"https://api.github.com/users/{username}/repos"
        repos_data = self.call_github_api(repos_url)
        # The repos request can fail independently of the profile request;
        # degrade to an empty list rather than crashing on None.
        if not isinstance(repos_data, list):
            repos_data = []

        return {
            "username": username,
            "profile_url": github_url,
            "followers": profile.get("followers"),
            # First five repositories in the order the API returned them.
            "repos": [
                {"name": repo.get("name"), "lang": repo.get("language")}
                for repo in repos_data[:5]
            ],
        }

    def extract_nlp_entities(self, text):
        """Extract name, skills, education lines and locations from resume text.

        Args:
            text: Raw resume text.

        Returns:
            Dict with ``name`` (first PERSON entity or ``"Unknown"``),
            ``skills`` (sorted, de-duplicated, lower-cased matches from the
            skill list), ``education`` (lines containing a degree keyword) and
            ``address`` (sorted, de-duplicated GPE/LOC entity texts).
        """
        doc = nlp(text)

        # Extract Name
        name = next((ent.text for ent in doc.ents if ent.label_ == "PERSON"), "Unknown")

        # Extract Skills using Matcher; sorted so output is stable across runs.
        matches = self.matcher(doc)
        skills = sorted({doc[start:end].text.lower() for _, start, end in matches})

        # Extract Education (plain keyword substring match, line by line)
        edu_keywords = ["bachelor", "b.tech", "master", "m.tech", "phd", "bsc"]
        education = [
            line for line in text.split('\n')
            if any(keyword in line.lower() for keyword in edu_keywords)
        ]

        return {
            "name": name,
            "skills": skills,
            "education": education,
            "address": sorted({ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")})
        }

    # --- The Pipeline Runner ---
    def run(self, pdf_path):
        """Run every stage on one PDF and return the assembled candidate record.

        Args:
            pdf_path: Path of the resume PDF.

        Returns:
            Dict with ``candidate_name``, ``contact``, ``qualifications``,
            ``external_profiles`` and ``metadata`` sections.
        """
        print(f"\U0001F680 Processing: {pdf_path}")

        # 1. Extraction
        raw_text, links = self.extract_from_pdf(pdf_path)

        # 2. NLP Analysis
        nlp_data = self.extract_nlp_entities(raw_text)

        # 3. Enrichment
        github_info = self.parse_github_data(links)

        # 4. Final Object
        return {
            "candidate_name": nlp_data["name"],
            "contact": {"links": links, "address": nlp_data["address"]},
            "qualifications": {
                "skills": nlp_data["skills"],
                "education": nlp_data["education"]
            },
            "external_profiles": {"github": github_info},
            "metadata": {"raw_text_length": len(raw_text)}
        }

# --- Execution ---
if __name__ == "__main__":
    import json

    # Parse a single sample resume and pretty-print the structured record.
    pipeline = ResumeParsingPipeline()
    result = pipeline.run("resume/Aman_Kumar_Resume_entry_lvl_mld.pdf")

    print(json.dumps(obj=result, indent=2))