In [1]:
docs = [
    "resume/Aman_Kumar_Resume_entry_lvl_mld.pdf",
    "resume/resume (1).pdf",
    "resume/resume (2).pdf",
    "resume/resume (3).pdf",
    "resume/resume (4).pdf",
    "resume/resume (5).pdf",
    "resume/resume (6).pdf",
    "resume/resume (7).pdf",
]

In [4]:
import os
import re
import fitz
import spacy
import requests
import unicodedata
from concurrent.futures import ThreadPoolExecutor
from spacy.matcher import PhraseMatcher

class ResumePipeline:
    def __init__(self, github_token=None):
        self.nlp = spacy.load("en_core_web_lg")
        self.github_token = github_token or os.environ.get("GITHUB_TOKEN")
        self.skills_list = [
            "python", "java", "php", "laravel", "sql", "mysql", "javascript", "typescript",
            "flask", "django", "fastapi", "node.js", "react", "vue", "ml", "machine learning",
            "nlp", "deep learning", "pytorch", "tensorflow", "docker", "aws", "rest api"
        ]
        
    def _normalize_text(self, text):
        if not text: return ""
        text = unicodedata.normalize("NFKD", text).lower()
        text = re.sub(r"[–—−]", "-", text)
        text = re.sub(r"[•●▪►■·]", " ", text)
        text = re.sub(r"[^a-z0-9\.\-\+\s]", " ", text)
        return re.sub(r"\s+", " ", text).strip()

    def _call_github_api(self, url):
        headers = {"Accept": "application/vnd.github+json", "User-Agent": "Resume-Pipeline"}
        if self.github_token:
            headers["Authorization"] = f"Bearer {self.github_token}"
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp.json()
        except Exception as e:
            print(f"API Error: {e}")
            return None

    def extract_from_pdf(self, pdf_path):
        doc = fitz.open(pdf_path)
        text, links = "", []
        for page in doc:
            text += page.get_text()
            links.extend([l['uri'] for l in page.get_links() if 'uri' in l])
        return text, list(set(links))

    def get_github_data(self, links):
        github_url = next((l for l in links if "github.com" in l), None)
        if not github_url: return None

        username = github_url.rstrip("/").split("/")[-1]
        profile = self._call_github_api(f"https://api.github.com/users/{username}")
        
        if not profile: return None

        # Parallelize sub-calls (repos, followers, following)
        with ThreadPoolExecutor(max_workers=3) as executor:
            future_repos = executor.submit(self._call_github_api, profile['repos_url'])
            future_followers = executor.submit(self._call_github_api, profile['followers_url'])
            
            repos = future_repos.result() or []
            
            # Extract detailed repo info with nested language calls
            repo_details = []
            for r in repos[:10]: # Limiting to 10 for performance
                repo_details.append({
                    "id": r["id"],
                    "node_id": r["node_id"],
                    "name": r["name"],
                    "full_name": r["full_name"],
                    "private": r["private"],
                    "owner": r["owner"],
                    "html_url": r["html_url"],
                    "description": r["description"],
                    "branches_url": r["branches_url"],
                    "tech_stack": self._call_github_api(r["languages_url"]),
                    "contributors_url": r["contributors_url"],
                    "created_at": r["created_at"],
                    "updated_at": r["updated_at"],
                    "pushed_at": r["pushed_at"],
                    "git_url": r["git_url"],
                    "ssh_url": r["ssh_url"],
                    "clone_url": r["clone_url"],
                    "size": r["size"],
                    "stargazers_count": r["stargazers_count"],
                    "watchers_count": r["watchers_count"],
                    "language": r["language"],
                    "forks_count": r["forks_count"],
                    "open_issues_count": r["open_issues_count"],
                    "topics": r["topics"],
                    "visibility": r["visibility"],
                    "forks": r["forks"],
                    "default_branch": r["default_branch"],
                })

        return {
            "username": username,
            "profile_name": profile.get("name"),
            "bio": profile.get("bio"),
            "repositories": repo_details,
            "follower_count": profile.get("followers")
        }

    def process_resume(self, pdf_path):
        raw_text, links = self.extract_from_pdf(pdf_path)
        norm_text = self._normalize_text(raw_text)
        doc = self.nlp(raw_text) # Use raw for NER, norm for skills

        # NLP Extractions
        name = next((ent.text for ent in doc.ents if ent.label_ == "PERSON"), "Unknown")
        address = list(set([ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]))
        
        # Skill Matching
        matcher = PhraseMatcher(self.nlp.vocab, attr="LOWER")
        patterns = [self.nlp.make_doc(s) for s in self.skills_list]
        matcher.add("SKILLS", patterns)
        skills = list(set([doc[start:end].text.lower() for _, start, end in matcher(self.nlp(norm_text))]))

        return {
            "name": name,
            "address": address,
            "skills": skills,
            "github_intelligence": self.get_github_data(links),
            "education": self._extract_list_by_keyword(raw_text, ["bachelor", "master", "tech", "phd"]),
            "experience": self._extract_by_regex(raw_text, r"\d{4}\s?[-–]\s?(present|\d{4})")
        }

    def _extract_list_by_keyword(self, text, keywords):
        return [line.strip() for line in text.split('\n') if any(k in line.lower() for k in keywords)]

    def _extract_by_regex(self, text, pattern):
        return [line.strip() for line in text.split('\n') if re.search(pattern, line.lower())]


In [13]:
# --- Execution ---
if __name__ == "__main__":
    pipeline = ResumePipeline()
    # Ensure the path exists
    result = pipeline.process_resume("resume/Aman_Kumar_Resume_entry_lvl_mld.pdf")
    import json
    print(json.dumps(result, indent=4))

{
    "name": "AMAN KUMAR",
    "address": [
        "India",
        "Raipur",
        "Linux",
        "Bangalore"
    ],
    "skills": [
        ".",
        "tasks",
        "nlp",
        "\n",
        ",",
        "workflows.",
        "on",
        "\nmachine",
        "hr",
        "a",
        "document",
        ": model",
        "experience",
        "applied",
        "scikit",
        "heavy",
        "processing",
        "basic",
        "building scalable",
        "with growing",
        "-",
        "\u2022",
        "administrative"
    ],
    "github_intelligence": {
        "username": "aman-k-codes",
        "profile_name": "Aman Kumar Sahu",
        "bio": "Backend Engineer | Laravel & Python | Building REST APIs and ML-powered tools",
        "repositories": [
            {
                "id": 1136502016,
                "node_id": "R_kgDOQ72lAA",
                "name": "AI_Talent_Intelligence",
                "full_name": "aman-k-codes/AI_Talent_Intelligen