In [90]:
docs = [
    "resume/Aman_Kumar_Resume_entry_lvl_mld.pdf",
    "resume/resume (1).pdf",
    "resume/resume (2).pdf",
    "resume/resume (3).pdf",
    "resume/resume (4).pdf",
    "resume/resume (5).pdf",
    "resume/resume (6).pdf",
    "resume/resume (7).pdf",
]

In [91]:
import requests
import os
import re
import fitz
import spacy
import unicodedata
from spacy.matcher import PhraseMatcher
from concurrent.futures import ThreadPoolExecutor, as_completed



def get_refined_skills(text):
    doc = nlp(text)
    matcher = PhraseMatcher(nlp.vocab)
    
    # List of skills you want to ensure are caught
    skill_list = ["Python", "Machine Learning", "REST API", "Docker", "Flask", "Laravel", "MySQL"]
    patterns = [nlp.make_doc(text) for text in skill_list]
    matcher.add("SKILLS", patterns)
    
    matches = matcher(doc)
    found_skills = set([doc[start:end].text for match_id, start, end in matches])
    
    return found_skills

nlp = spacy.load("en_core_web_lg")

candidateRawData = {
    "name": None,
    "skills": [],
    "resume_text": "",
    "links": [],
    "github": None,
}
github_details = {
    "name": None,
    "followers": None,
    "followings": None,
    "repositories": {
        "name":None,
        "tech":[],
        "url":None
    },
}

In [92]:
def normalize_text(text: str) -> str:
    if not text:
        return ""

    text = unicodedata.normalize("NFKD", text)
    text = text.lower()
    text = re.sub(r"[–—−]", "-", text)
    text = re.sub(r"[•●▪►■·]", " ", text)
    text = re.sub(r"[\x00-\x1f\x7f-\x9f]", " ", text)
    text = re.sub(r"[^a-z0-9\.\-\+\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [93]:
def extract_text_from_pdf(path):
    pages = fitz.open(path)
    text = ''
    links = []

    for page in pages:
        text += page.get_text()
        for url in page.get_links():
            if 'uri' in url:
                links.append(url['uri'])

    return text, links

In [94]:
def extract_github_links(links):
    github_links = set()

    for link in links:
        if link and "github.com" in link:
            github_links.add(link.split("?")[0])

    return list(github_links)

In [95]:
def get_github_username(url):
    return url.rstrip("/").split("github.com/")[-1].split("/")[0]

In [96]:
def call_api(url,timeout=10):
    """
    Fetch GitHub user profile. Token can be passed or provided via
    GITHUB_TOKEN or GH_TOKEN env var. Returns JSON on success or raises an HTTPError.
    """
    gittoken = os.environ.get("GITHUB_TOKEN")
    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "my-script-or-app",  # set a descriptive user agent
    }
    if gittoken:
        headers["Authorization"] = f"Bearer {gittoken}"
    resp = requests.get(url, headers=headers, timeout=timeout)

    resp.raise_for_status()
    return resp.json()


In [97]:
def get_github_profile(username, timeout=10):
    url = f"https://api.github.com/users/{username}"
    resp = call_api(url)
    return resp

In [98]:
def get_resume_name(text):
    tokens = nlp(text)
    for token in tokens.ents:
        if token.label_ == 'PERSON':
            return token
        else:
            return null
            

In [99]:
SKILLS = {
    # Languages & Frameworks
    "python", "java", "php", "laravel", "sql", "mysql", "javascript", "typescript",
    "flask", "django", "fastapi", "node.js", "react", "vue",
    
    # Machine Learning & AI
    "ml", "machine learning", "nlp", "natural language processing", "deep learning", 
    "computer vision", "pytorch", "tensorflow", "keras", "scikit-learn", "pandas", 
    "numpy", "opencv", "huggingface", "llm", "bert", "transformers",
    
    # Backend & Infrastructure
    "backend", "rest", "api", "rest api", "graphql", "microservices", "docker", 
    "kubernetes", "aws", "gcp", "azure", "linux", "git", "postman", "redis", 
    "rabbitmq", "celery", "nginx", "ci/cd",
    
    # Data & Database
    "postgresql", "mongodb", "sqlite", "elasticsearch", "data engineering", 
    "data science", "web scraping", "beautifulsoup", "selenium"
}

def get_resume_skills(text):
    doc = nlp(text.lower())
    found_skills = set()

    for token in doc:
        if token.text in SKILLS:
            found_skills.add(token.text)

    return list(found_skills)


In [100]:
rawText, rawLinks = extract_text_from_pdf("resume/Aman_Kumar_Resume_entry_lvl_mld.pdf")
candidateRawData["resume_text"] = rawText
candidateRawData["links"] = rawLinks

In [101]:
github_url = extract_github_links(rawLinks)
github_username = get_github_username(github_url[0])

In [102]:
github_profile = get_github_profile(github_username)
candidateRawData["github"] = github_profile

In [115]:
required_keys = [
    'id',
    'node_id',
    'name',
    'full_name',
    'private',
    'owner',
    'html_url',
    'description',
    'branches_url',
    'languages_url',
    'contributors_url',
    'created_at',
    'updated_at',
    'pushed_at',
    'git_url',
    'ssh_url',
    'clone_url',
    'size',
    'stargazers_count',
    'watchers_count',
    'language',
    'forks_count',
    'open_issues_count',
    'topics',
    'visibility',
    'forks',
    'default_branch',
]
# print(candidateRawData['github']["followers_url"])
# print(candidateRawData['github']["following_url"])
# print(candidateRawData['github']["repos_url"])

# print(candidateRawData['github']["gists_url"])
# print(candidateRawData['github']["starred_url"])
# print(candidateRawData['github']["subscriptions_url"])
# print(candidateRawData['github']["organizations_url"])
# print(candidateRawData['github']["events_url"])
# print(candidateRawData['github']["received_events_url"])

# github_details['followers'] = call_api(candidateRawData['github']["followers_url"])
# github_details['followings'] = call_api(candidateRawData['github']["following_url"])
repos = call_api(candidateRawData['github']["repos_url"])
repositories = []
for repo in repos:
    repositories.append([
        repo['id'],
        repo['node_id'],
        repo['name'],
        repo['full_name'],
        repo['private'],
        repo['owner'],
        repo['html_url'],
        repo['description'],
        repo['branches_url'],
        repo['languages_url'],
        repo['contributors_url'],
        repo['created_at'],
        repo['updated_at'],
        repo['pushed_at'],
        repo['git_url'],
        repo['ssh_url'],
        repo['clone_url'],
        repo['size'],
        repo['stargazers_count'],
        repo['watchers_count'],
        repo['language'],
        repo['forks_count'],
        repo['open_issues_count'],
        repo['topics'],
        repo['visibility'],
        repo['forks'],
        repo['default_branch'],
    ])
    print(repositories)


[[1136502016, 'R_kgDOQ72lAA', 'AI_Talent_Intelligence', 'aman-k-codes/AI_Talent_Intelligence', False, {'login': 'aman-k-codes', 'id': 76843794, 'node_id': 'MDQ6VXNlcjc2ODQzNzk0', 'avatar_url': 'https://avatars.githubusercontent.com/u/76843794?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/aman-k-codes', 'html_url': 'https://github.com/aman-k-codes', 'followers_url': 'https://api.github.com/users/aman-k-codes/followers', 'following_url': 'https://api.github.com/users/aman-k-codes/following{/other_user}', 'gists_url': 'https://api.github.com/users/aman-k-codes/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/aman-k-codes/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/aman-k-codes/subscriptions', 'organizations_url': 'https://api.github.com/users/aman-k-codes/orgs', 'repos_url': 'https://api.github.com/users/aman-k-codes/repos', 'events_url': 'https://api.github.com/users/aman-k-codes/events{/privacy}', 'received_events_url': 'http

In [104]:
candidateRawData['name'] = get_resume_name(candidateRawData['resume_text'])
candidateRawData['resume_text'] = normalize_text(candidateRawData['resume_text'])
candidateRawData['skills'] = get_resume_skills(candidateRawData['resume_text'])

In [105]:
print(candidateRawData['skills'])

['flask', 'python', 'mysql', 'sql', 'git', 'nlp', 'php', 'api', 'linux', 'docker', 'backend', 'pytorch', 'laravel', 'postman', 'rest', 'ml', 'fastapi', 'javascript']


In [106]:
print(candidateRawData['github'])

{'login': 'aman-k-codes', 'id': 76843794, 'node_id': 'MDQ6VXNlcjc2ODQzNzk0', 'avatar_url': 'https://avatars.githubusercontent.com/u/76843794?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/aman-k-codes', 'html_url': 'https://github.com/aman-k-codes', 'followers_url': 'https://api.github.com/users/aman-k-codes/followers', 'following_url': 'https://api.github.com/users/aman-k-codes/following{/other_user}', 'gists_url': 'https://api.github.com/users/aman-k-codes/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/aman-k-codes/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/aman-k-codes/subscriptions', 'organizations_url': 'https://api.github.com/users/aman-k-codes/orgs', 'repos_url': 'https://api.github.com/users/aman-k-codes/repos', 'events_url': 'https://api.github.com/users/aman-k-codes/events{/privacy}', 'received_events_url': 'https://api.github.com/users/aman-k-codes/received_events', 'type': 'User', 'user_view_type': 'public', 's