In [10]:
import pandas as pd
import pdfplumber
import re
import nltk
from datetime import datetime as dt
from nltk import ngrams
from difflib import get_close_matches as gcm

In [34]:
def extract_cv_skill(filename):
    """Extract the skills mentioned in a PDF CV.

    The known skill names come from the 'Skill' column of
    'skills_db2/skill.csv'. 'skills/Other Skills.xlsx' supplies the redundant
    skills (default sheet, 'Skill' column) and the duplicate aliases
    ('Duplicates' sheet, 'Skill' -> 'Parent').

    Parameters
    ----------
    filename : path of the PDF file, passed to ``pdfplumber.open``.

    Returns
    -------
    (keep_skills, ignore_skills) : two sorted lists, as produced by
        ``extract_ignore`` from the skills found on any page.
    """
    # Read skills
    df_skills = pd.read_csv('skills_db2/skill.csv')
    SKILLS = df_skills['Skill'].unique().tolist()
    # Redundant skills
    df_redskills = pd.read_excel('skills/Other Skills.xlsx')
    RED_SKILLS = df_redskills['Skill'].unique().tolist()
    # Duplicate skills
    df_dupskills = pd.read_excel('skills/Other Skills.xlsx', sheet_name='Duplicates')
    DUP_SKILLS = df_dupskills.set_index('Skill').to_dict()['Parent']
    # Aliases must be searchable too, but an alias that is already a known
    # skill would only be matched twice for the same result.
    known = set(SKILLS)
    SKILLS.extend(alias for alias in DUP_SKILLS if alias not in known)
    # Collect hits in a set: the same skill usually shows up on several pages.
    found = set()
    # Read pdf
    with pdfplumber.open(filename) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # NOTE(review): extract_text() appears to return None (older
            # pdfplumber) or '' for a page without text -- nothing to match.
            if not page_text:
                continue
            found.update(extract_skills(page_text, SKILLS))
    keep_skills, ignore_skills = extract_ignore(list(found), RED_SKILLS, DUP_SKILLS)
    keep_skills.sort()
    ignore_skills.sort()
    return keep_skills, ignore_skills

def get_domains(skills):
    """Count how many of the given skills belong to each domain.

    Skills are looked up in 'skills_db2/skill.csv' ('Skill', 'DomainId'
    columns) and joined to 'skills_db2/domain.csv' on its 'Id' column.

    Parameters
    ----------
    skills : list-like of skill names to look up.

    Returns
    -------
    pandas.Series of counts indexed by the 'Domain' value, as returned by
    ``value_counts()``. Skills absent from the CSV are not counted.
    """
    df_skills = pd.read_csv('skills_db2/skill.csv')
    df_domains = pd.read_csv('skills_db2/domain.csv')
    df_skills = df_skills.merge(df_domains, left_on='DomainId', right_on='Id')
    # Filter on the argument, not on a notebook-level global, so the function
    # works on a fresh kernel and for any skill list.
    selected = df_skills['Skill'].isin(skills)
    return df_skills.loc[selected, 'Domain'].value_counts()

def extract_skills(info, skills, threshold=0.9):
    """Return the skills from ``skills`` that appear (fuzzily) in ``info``.

    Parameters
    ----------
    info : raw text to search; it is tokenised by ``clean_info``.
    skills : iterable of skill names. A name may carry a parenthesised
        abbreviation, e.g. 'Natural Language Processing (NLP)'.
    threshold : similarity cutoff handed to ``difflib.get_close_matches``.

    Returns
    -------
    list of the matching skill names, unchanged, in input order.

    A skill matches when its abbreviation occurs verbatim (case-sensitive)
    among the words of the text, or when its lower-cased name without the
    parenthesised part is close enough to an n-gram of the same word count.
    """
    words, unigrams, bigrams, trigrams = clean_info(info)
    word_set = set(words)
    # n-gram lists keyed by word count; longer ones are built on demand.
    grams_by_len = {1: unigrams, 2: bigrams, 3: trigrams}
    results = []
    for skill in skills:
        name = skill
        paren = re.search(r"\((.*?)\)", name)
        if paren:
            if paren.group(1) in word_set:
                results.append(skill)
                continue
            name = re.sub(r"\(.*?\)", "", name)
        tokens = name.lower().split()
        if not tokens:
            # Nothing left to compare (empty name or abbreviation only).
            continue
        n = len(tokens)
        if n not in grams_by_len:
            # Compare an n-word skill with n-word windows of the text; a
            # trigram can hardly reach the cutoff against a longer phrase.
            grams_by_len[n] = [' '.join(unigrams[i:i + n])
                               for i in range(len(unigrams) - n + 1)]
        # Re-joining the tokens drops the stray whitespace left by removing
        # the abbreviation, which would otherwise lower the similarity ratio.
        if gcm(' '.join(tokens), grams_by_len[n], cutoff=threshold):
            results.append(skill)
    return results

def extract_ignore(skills, redundant_skills, duplicate_skills):
    """Split the found skills into the ones to keep and the ones to ignore.

    A skill is ignored when it is listed in ``redundant_skills`` or when it
    occurs as a whole word inside another entry of ``skills``. Every
    remaining skill is kept; a kept skill that is a key of
    ``duplicate_skills`` is replaced by the mapped value.

    Returns a tuple ``(kept, ignored)``: ``kept`` is a de-duplicated list in
    no particular order, ``ignored`` follows the order of ``skills``.
    """
    dropped = []
    for idx, candidate in enumerate(skills):
        if candidate in redundant_skills:
            dropped.append(candidate)
            continue
        # Every other entry of the list, i.e. all skills except this one.
        rest = skills[:idx] + skills[idx + 1:]
        # Cheap substring test first, whole-word regex check only on a hit.
        if any(candidate in longer and find_whole_word(candidate, longer)
               for longer in rest):
            dropped.append(candidate)
    kept = {
        duplicate_skills[name] if name in duplicate_skills.keys() else name
        for name in skills
        if name not in dropped
    }
    return list(kept), dropped

def clean_info(info):
    """Tokenise raw text into words and lower-cased 1-, 2- and 3-grams.

    Parameters
    ----------
    info : text to tokenise; ``None`` or an empty string is accepted.

    Returns
    -------
    (words, unigrams, bigrams, trigrams) :
        words    -- whitespace-separated tokens with their original case,
        unigrams -- the same tokens lower-cased,
        bigrams / trigrams -- space-joined runs of 2 / 3 consecutive unigrams.
        All four lists are empty when there is no text.
    """
    # NOTE(review): callers pass page.extract_text() straight in, which
    # appears to be None for text-less pages in some pdfplumber versions.
    if not info:
        return [], [], [], []
    # Remove ordered list with alphabets: a), b), c),...
    words = re.sub(r'[\s\t\n|.|\(]+[a-zA-Z\s*][.|\)]+', ' ', info)
    # Turn separators and brackets into spaces. Raw string: same pattern as
    # before, without the invalid-escape warnings for '\-', '\(' etc.
    words = re.sub(r'[\n|,|.|:|;|\-|/|\(|\)|\[|\]]', ' ', words)
    unigrams = words.lower().split()
    bigrams = [' '.join(g) for g in ngrams(unigrams, 2)]
    trigrams = [' '.join(g) for g in ngrams(unigrams, 3)]
    return words.split(), unigrams, bigrams, trigrams

def find_whole_word(search_string, input_string):
    """Tell whether ``search_string`` occurs in ``input_string`` as a whole word.

    The search string is matched literally and must not be directly preceded
    or followed by a word character, so 'Java' is found in 'Java Programming'
    but not in 'JavaScript'.

    Returns True on a match, False otherwise.
    """
    # re.escape: names such as 'C++' or 'Network (GAN)' contain regex
    # metacharacters and must not be interpreted as a pattern.
    # Lookarounds instead of \b: \b needs a word character on one side, so it
    # never matches next to the '#' of 'C#' or a closing parenthesis.
    pattern = r"(?<!\w)" + re.escape(search_string) + r"(?!\w)"
    return re.search(pattern, input_string) is not None

In [31]:
# Keep the raw text of every page of the sample CV for inspection below.
with pdfplumber.open("resource/binxuankong.pdf") as pdf:
    pdf_text = [page.extract_text() for page in pdf.pages]

In [32]:
# Raw extracted text of the first page of the sample CV.
pdf_text[0]

'BIN XUAN KONG\n(+60)126112151 (cid:5) binxuankong@gmail.com (cid:5) github.com/binxuankong\nSkills\nTechnical Deep Learning, Computer Vision, Natural Language Processing,\nComputational Optimization, Data Visualization, Software Engineering\nProgramming Python, Java, C#, SQL, Prolog, MATLAB, LaTeX\nTools NumPy, Pandas, SciKit, Theano, PyTorch, Tableau\nSoft Problem Solving, Time Management, Teamwork, Adaptability, Creativity\nWork Experience\nThe Center of Applied Data Science (CADS) Mar 2020 - Present\nData Science Specialist Kuala Lumpur, Malaysia\n· Currently in part of The CADS Graduate Talent Program\n· Work on internal projects to migrate data from several diﬀerent databases into a clean integrated database to\nallow the ease of analysis\n· Work on internal projects to analyze data and produce dashboard which provide meaningful insights and\nsolutions to the company\n· Work as a Technical/Teaching Assistant (TA), creating and reviewing course materials, and assisting primary\nle

In [33]:
# Raw extracted text of the second page of the sample CV.
pdf_text[1]

'Stendhal Game Sep 2016 - Dec 2016\n· Stendhal is a multi-player online adventure open source game\n· Tested, debugged, built, developed and deployed a multi-user, multi-threaded, client-server open source game\n· Automation of builds and tests done using Eclipse, Git, Apache Ant, Jenkins, JUnit and SonarQube\nStudy Buddy Feb 2016 - Apr 2016\n· Web application to assist users in studying and help them be aware of procrastination\n· Users can input quizzes, which would pop-up periodically for them to complete\n· Created by a team of six using HTML, CSS, PHP, MySQL and JavaScript\nExternal Curriculum\nInternational Council of Malaysians Scholars and Associates (ICMS) Nov 2016 - Mar 2018\nSecretarial Associate International\n· ICMS is a non-proﬁt professional network of driven and passionate individuals designed to operate like a\nmultinational organization that provides opportunity to experience working in a multi-national company\n· Developed transferable skills through internal program

In [35]:
# Run the whole pipeline on the sample CV: kept skills first, then a blank
# line, then the skills that were filtered out.
keep_skills, ignore_skills = extract_cv_skill('resource/binxuankong.pdf')
print(keep_skills, ignore_skills, sep='\n\n')

['Ad Design', 'Adaptability', 'Analysis', 'Apache Ant', 'Application Programming Interface (API)', 'Automation', 'Budget', 'Business Intelligence (BI)', 'C#', 'Cascading Style Sheet (CSS)', 'Chemistry', 'Coding', 'Collaboration', 'Computational Optimization', 'Computer Vision', 'Computing', 'Construction', 'Convolutional Neural Network (CNN)', 'Creativity', 'Dashboard', 'Data Visualization', 'Database', 'Dataset', 'Decision Making', 'Deep Learning', 'Eclipse', 'Finance', 'Gated Recurrent Unit (GRU)', 'Generative Adversarial Network (GAN)', 'Git', 'Github', 'HyperText Markup Language (HTML)', 'Image', 'International Law', 'Java', 'JavaScript', 'Jenkins', 'LaTeX', 'Leadership', 'MATLAB', 'Mathematics', 'MySQL', 'Natural Language Processing (NLP)', 'Nonverbal Communication', 'NumPy', 'Oracle', 'Organizational Skill', 'PHP', 'Pandas', 'Physics', 'Problem Solving', 'Prolog', 'PyTorch', 'Python', 'Qlik Sense', 'Recurrent Neural Network (RNN)', 'Saving', 'Software Engineering', 'Structured Qu

In [36]:
# Per-domain count of the skills kept for the sample CV.
get_domains(keep_skills)

Computer Science               26
Data Science                   16
Business                       13
Arts and Humanities             3
Mathematics and Logic           2
Social Science                  2
Natural Science                 2
Information Technology (IT)     1
Name: Domain, dtype: int64