In [61]:
docs = [
    "resume/Aman_Kumar_Resume_entry_lvl_mld.pdf",
    "resume/resume (1).pdf",
    "resume/resume (2).pdf",
    "resume/resume (3).pdf",
    "resume/resume (4).pdf",
    "resume/resume (5).pdf",
    "resume/resume (6).pdf",
    "resume/resume (7).pdf",
]

In [62]:
import requests
import os
import re
import fitz
import spacy
import unicodedata
from spacy.matcher import PhraseMatcher
from concurrent.futures import ThreadPoolExecutor, as_completed



def get_refined_skills(text):
    doc = nlp(text)
    matcher = PhraseMatcher(nlp.vocab)
    
    # List of skills you want to ensure are caught
    skill_list = ["Python", "Machine Learning", "REST API", "Docker", "Flask", "Laravel", "MySQL"]
    patterns = [nlp.make_doc(text) for text in skill_list]
    matcher.add("SKILLS", patterns)
    
    matches = matcher(doc)
    found_skills = set([doc[start:end].text for match_id, start, end in matches])
    
    return found_skills

nlp = spacy.load("en_core_web_lg")

candidateRawData = {
    "name": None,
    "skills": [],
    "resume_text": "",
    "links": [],
    "github": None,
}
github_details = {}

In [63]:
def normalize_text(text: str) -> str:
    if not text:
        return ""

    text = unicodedata.normalize("NFKD", text)
    text = text.lower()
    text = re.sub(r"[–—−]", "-", text)
    text = re.sub(r"[•●▪►■·]", " ", text)
    text = re.sub(r"[\x00-\x1f\x7f-\x9f]", " ", text)
    text = re.sub(r"[^a-z0-9\.\-\+\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [64]:
def extract_text_from_pdf(path):
    pages = fitz.open(path)
    text = ''
    links = []

    for page in pages:
        text += page.get_text()
        for url in page.get_links():
            if 'uri' in url:
                links.append(url['uri'])

    return text, links

In [65]:
def extract_github_links(links):
    github_links = set()

    for link in links:
        if link and "github.com" in link:
            github_links.add(link.split("?")[0])

    return list(github_links)

In [66]:
def get_github_username(url):
    return url.rstrip("/").split("github.com/")[-1].split("/")[0]

In [67]:
def get_github_profile(username, timeout=10):
    """
    Fetch GitHub user profile. Token can be passed or provided via
    GITHUB_TOKEN or GH_TOKEN env var. Returns JSON on success or raises an HTTPError.
    """
    gittoken = os.environ.get("GITHUB_TOKEN")
    print(gittoken)
    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "my-script-or-app",  # set a descriptive user agent
    }
    if gittoken:
        headers["Authorization"] = f"Bearer {gittoken}"  # or "token {token}"

    url = f"https://api.github.com/users/{username}"
    resp = requests.get(url, headers=headers, timeout=timeout)

    # Raise if HTTP status indicates an error (e.g., 404, 401, 403)
    resp.raise_for_status()
    return resp.json()

In [68]:
def get_resume_name(text):
    tokens = nlp(text)
    for token in tokens.ents:
        if token.label_ == 'PERSON':
            return token
        else:
            return null
            

In [69]:
SKILLS = {
    # Languages & Frameworks
    "python", "java", "php", "laravel", "sql", "mysql", "javascript", "typescript",
    "flask", "django", "fastapi", "node.js", "react", "vue",
    
    # Machine Learning & AI
    "ml", "machine learning", "nlp", "natural language processing", "deep learning", 
    "computer vision", "pytorch", "tensorflow", "keras", "scikit-learn", "pandas", 
    "numpy", "opencv", "huggingface", "llm", "bert", "transformers",
    
    # Backend & Infrastructure
    "backend", "rest", "api", "rest api", "graphql", "microservices", "docker", 
    "kubernetes", "aws", "gcp", "azure", "linux", "git", "postman", "redis", 
    "rabbitmq", "celery", "nginx", "ci/cd",
    
    # Data & Database
    "postgresql", "mongodb", "sqlite", "elasticsearch", "data engineering", 
    "data science", "web scraping", "beautifulsoup", "selenium"
}

def get_resume_skills(text):
    doc = nlp(text.lower())
    found_skills = set()

    for token in doc:
        if token.text in SKILLS:
            found_skills.add(token.text)

    return list(found_skills)


In [70]:
rawText, rawLinks = extract_text_from_pdf("resume/Aman_Kumar_Resume_entry_lvl_mld.pdf")
candidateRawData["resume_text"] = rawText
candidateRawData["links"] = rawLinks

In [71]:
github_url = extract_github_links(rawLinks)
github_username = get_github_username(github_url[0])

In [None]:
github_profile = get_github_profile(github_username)
candidateRawData["github"] = github_profile
print(candidateRawData['github'])

None
{'login': 'aman-k-codes', 'id': 76843794, 'node_id': 'MDQ6VXNlcjc2ODQzNzk0', 'avatar_url': 'https://avatars.githubusercontent.com/u/76843794?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/aman-k-codes', 'html_url': 'https://github.com/aman-k-codes', 'followers_url': 'https://api.github.com/users/aman-k-codes/followers', 'following_url': 'https://api.github.com/users/aman-k-codes/following{/other_user}', 'gists_url': 'https://api.github.com/users/aman-k-codes/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/aman-k-codes/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/aman-k-codes/subscriptions', 'organizations_url': 'https://api.github.com/users/aman-k-codes/orgs', 'repos_url': 'https://api.github.com/users/aman-k-codes/repos', 'events_url': 'https://api.github.com/users/aman-k-codes/events{/privacy}', 'received_events_url': 'https://api.github.com/users/aman-k-codes/received_events', 'type': 'User', 'user_view_type': 'public

In [73]:
# keys you want to fetch
keys = [
    "followers_url", "following_url", "gists_url", "starred_url",
    "subscriptions_url", "organizations_url", "repos_url",
    "events_url", "received_events_url",
]

def normalize_url(url):
    # Many GitHub template URLs include {...} templates; strip them for GET
    return re.sub(r"\{.*\}", "", url) if url else url

def fetch(session, key, url, timeout=10, headers=None):
    try:
        url = normalize_url(url)
        if not url:
            return key, None
        resp = session.get(url, timeout=timeout, headers=headers)
        resp.raise_for_status()
        return key, resp.json()
    except Exception as e:
        # choose how to handle errors: return None or the exception info
        return key, {"error": str(e)}

def fetch_github_sections_concurrent(candidateRawData, token=None, max_workers=5):
    github = candidateRawData.get("github", {})
    # capture original URLs so they don't get overwritten mid-loop
    url_map = {k: github.get(k) for k in keys if github.get(k)}

    headers = {"Accept": "application/vnd.github+json", "User-Agent": "my-app"}
    if token is None:
        token = os.environ.get("GITHUB_TOKEN") or os.environ.get("GH_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    with requests.Session() as session:
        # optional: session.headers.update(headers) and avoid passing headers per-request
        session.headers.update(headers)

        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futures = {ex.submit(fetch, session, k, url): k for k, url in url_map.items()}
            for fut in as_completed(futures):
                key, data = fut.result()
                candidateRawData["github"][key] = data

    return candidateRawData

In [74]:
candidateRawData['name'] = get_resume_name(candidateRawData['resume_text'])
candidateRawData['resume_text'] = normalize_text(candidateRawData['resume_text'])
candidateRawData['skills'] = get_resume_skills(candidateRawData['resume_text'])

In [75]:
print(candidateRawData['skills'])

['flask', 'sql', 'javascript', 'rest', 'pytorch', 'git', 'ml', 'nlp', 'docker', 'postman', 'mysql', 'laravel', 'python', 'php', 'backend', 'fastapi', 'api', 'linux']


In [76]:
print(candidateRawData['github'])

{'login': 'aman-k-codes', 'id': 76843794, 'node_id': 'MDQ6VXNlcjc2ODQzNzk0', 'avatar_url': 'https://avatars.githubusercontent.com/u/76843794?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/aman-k-codes', 'html_url': 'https://github.com/aman-k-codes', 'followers_url': 'https://api.github.com/users/aman-k-codes/followers', 'following_url': 'https://api.github.com/users/aman-k-codes/following{/other_user}', 'gists_url': 'https://api.github.com/users/aman-k-codes/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/aman-k-codes/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/aman-k-codes/subscriptions', 'organizations_url': 'https://api.github.com/users/aman-k-codes/orgs', 'repos_url': 'https://api.github.com/users/aman-k-codes/repos', 'events_url': 'https://api.github.com/users/aman-k-codes/events{/privacy}', 'received_events_url': 'https://api.github.com/users/aman-k-codes/received_events', 'type': 'User', 'user_view_type': 'public', 's