In [1]:
import os
import pandas as pd
import pdfplumber
import re
from datetime import datetime as dt
from skill_api import extract_skills, extract_ignore

In [2]:
def extract_cv_skill(filename):
    """Extract the skills mentioned in a PDF CV.

    Every page of the PDF is read with pdfplumber, its text is passed to
    ``extract_skills``, and the per-page results are merged and de-duplicated.
    The merged list is then split by ``extract_ignore`` into two lists.

    Parameters
    ----------
    filename : str
        Path of the PDF file to read.

    Returns
    -------
    tuple
        ``(keep_skills, ignore_skills)`` as returned by ``extract_ignore``,
        each sorted in place in ascending order.
    """
    all_skills = []
    # Read pdf
    with pdfplumber.open(filename) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # A page without a text layer (e.g. a scanned image) can yield
            # None instead of a string; skip it rather than hand None to
            # extract_skills.
            if page_text is None:
                continue
            all_skills.extend(extract_skills(page_text))
    # De-duplicate skills that appear on more than one page.
    all_skills = list(set(all_skills))
    keep_skills, ignore_skills = extract_ignore(all_skills)
    keep_skills.sort()
    ignore_skills.sort()
    return keep_skills, ignore_skills

def get_domains(skills):
    """Count how many of the given skills belong to each domain.

    Loads ``skills_db2/skill.csv`` and ``skills_db2/domain.csv``, joins them
    on ``skill.DomainId == domain.Id``, keeps the rows whose ``Skill`` value
    is contained in ``skills`` and tallies the ``Domain`` column.

    Parameters
    ----------
    skills : list-like of str
        Skill names to look up.

    Returns
    -------
    pandas.Series
        Result of ``value_counts()`` on the ``Domain`` column of the matching
        rows (domain name -> number of matching skills).
    """
    df_skills = pd.read_csv('skills_db2/skill.csv')
    df_domains = pd.read_csv('skills_db2/domain.csv')
    df_skills = df_skills.merge(df_domains, left_on='DomainId', right_on='Id')
    # Filter on the argument, not on a notebook-level global, so the function
    # works for any skill list and on a fresh kernel.
    matched = df_skills['Skill'].isin(skills)
    return df_skills.loc[matched, 'Domain'].value_counts()

In [3]:
# Collect the extracted text of every page of the sample CV, one entry per page.
with pdfplumber.open("resource/binxuankong.pdf") as pdf:
    pdf_text = [page.extract_text() for page in pdf.pages]

In [4]:
# Show the raw text extracted from the first page of the sample CV.
pdf_text[0]

'BIN XUAN KONG\n(+60)126112151 (cid:5) binxuankong@gmail.com (cid:5) github.com/binxuankong\nSkills\nTechnical Deep Learning, Computer Vision, Natural Language Processing,\nComputational Optimization, Data Visualization, Software Engineering\nProgramming Python, Java, C#, SQL, Prolog, MATLAB, LaTeX\nTools NumPy, Pandas, SciKit, Theano, PyTorch, Tableau\nSoft Problem Solving, Time Management, Teamwork, Adaptability, Creativity\nWork Experience\nThe Center of Applied Data Science (CADS) Mar 2020 - Present\nData Science Specialist Kuala Lumpur, Malaysia\n· Currently in part of The CADS Graduate Talent Program\n· Work on internal projects to migrate data from several diﬀerent databases into a clean integrated database to\nallow the ease of analysis\n· Work on internal projects to analyze data and produce dashboard which provide meaningful insights and\nsolutions to the company\n· Work as a Technical/Teaching Assistant (TA), creating and reviewing course materials, and assisting primary\nle

In [5]:
# Show the raw text extracted from the second page of the sample CV.
pdf_text[1]

'Stendhal Game Sep 2016 - Dec 2016\n· Stendhal is a multi-player online adventure open source game\n· Tested, debugged, built, developed and deployed a multi-user, multi-threaded, client-server open source game\n· Automation of builds and tests done using Eclipse, Git, Apache Ant, Jenkins, JUnit and SonarQube\nStudy Buddy Feb 2016 - Apr 2016\n· Web application to assist users in studying and help them be aware of procrastination\n· Users can input quizzes, which would pop-up periodically for them to complete\n· Created by a team of six using HTML, CSS, PHP, MySQL and JavaScript\nExternal Curriculum\nInternational Council of Malaysians Scholars and Associates (ICMS) Nov 2016 - Mar 2018\nSecretarial Associate International\n· ICMS is a non-proﬁt professional network of driven and passionate individuals designed to operate like a\nmultinational organization that provides opportunity to experience working in a multi-national company\n· Developed transferable skills through internal program

In [6]:
# Run the extraction on the sample CV and print the kept list and the ignored
# list with one blank line between them.
keep_skills, ignore_skills = extract_cv_skill('resource/binxuankong.pdf')
print(keep_skills, ignore_skills, sep='\n\n')

['Ad Design', 'Adaptability', 'Analysis', 'Apache Ant', 'Application Programming Interface (API)', 'Automation', 'Budget', 'Business Intelligence (BI)', 'C#', 'Cascading Style Sheet (CSS)', 'Chemistry', 'Coding', 'Collaboration', 'Computational Optimization', 'Computer Science', 'Computer Vision', 'Computing', 'Construction', 'Convolutional Neural Network (CNN)', 'Creativity', 'Dashboard', 'Data Science', 'Data Visualization', 'Database', 'Dataset', 'Decision Making', 'Deep Learning', 'Dual Generative Adversarial Network (DualGAN)', 'Eclipse', 'Electroencephalogram (EEG)', 'Finance', 'Functional Magnetic Resonance Imaging (fMRI)', 'Gated Recurrent Unit (GRU)', 'Generative Adversarial Network (GAN)', 'Git', 'Github', 'HyperText Markup Language (HTML)', 'Image', 'International Law', 'JUnit', 'Java', 'JavaScript', 'Jenkins', 'LaTeX', 'Leadership', 'MATLAB', 'Mathematics', 'MySQL', 'Natural Language Processing (NLP)', 'Nonverbal Communication', 'NumPy', 'Oracle', 'Organizational Skill', 'P

In [7]:
# Tally the domains of the skills kept for the sample CV.
get_domains(keep_skills)

Computer Science               32
Data Science                   19
Business                       13
Natural Science                 4
Arts and Humanities             3
Mathematics and Logic           2
Social Science                  2
Information Technology (IT)     1
Name: Domain, dtype: int64

In [8]:
# Gather the path of every PDF in the CV folder, in lexicographic order.
folder = 'resource/CVs'
directory = os.fsencode(folder)

filenames = sorted(
    folder + '/' + entry
    for entry in map(os.fsdecode, os.listdir(directory))
    if entry.endswith('.pdf')
)

print(len(filenames))
print(filenames)

18
['resource/CVs/10) Nur Ain.pdf', 'resource/CVs/11) Nur Ul Atikah.pdf', 'resource/CVs/12) Nurul Hajar.pdf', 'resource/CVs/13) Razif.pdf', 'resource/CVs/14) Siti Atikah.pdf', 'resource/CVs/15) Siti Hajar Saharudin.pdf', 'resource/CVs/16) Suhaila Sulaiman.pdf', 'resource/CVs/17) Syaza Izyanni.pdf', 'resource/CVs/18) Tan Weng Chun.pdf', 'resource/CVs/19) Tengku.pdf', 'resource/CVs/2) A Hanif.pdf', 'resource/CVs/20) Wan Rosalina Wan Rosli.pdf', 'resource/CVs/21) Yasmin.pdf', 'resource/CVs/5) Hamza.pdf', 'resource/CVs/6) Kamarul Zaman.pdf', 'resource/CVs/7) Zaihafiz.pdf', 'resource/CVs/8) Nor Syaida.pdf', 'resource/CVs/9) B Norerama.pdf']


In [9]:
# Map each CV path to the (keep_skills, ignore_skills) pair extracted from it.
skills_dict = {cv_path: extract_cv_skill(cv_path) for cv_path in filenames}

In [10]:
def print_skills(index):
    """Print the extraction result for the CV at position ``index``.

    Looks up ``filenames[index]`` and prints that path followed by the first
    and second element of its entry in ``skills_dict`` (the kept skills and
    the ignored skills).
    """
    cv_path = filenames[index]
    print('Individual:', cv_path)
    cv_result = skills_dict[cv_path]
    print('Skills:', cv_result[0])
    print('Ignore Skills:', cv_result[1])

In [11]:
# Start at the first CV in `filenames` and print its extracted skills.
index = 0
print_skills(index)

Individual: resource/CVs/10) Nur Ain.pdf
Skills: ['Agriculture', 'Analysis', 'Anatomy', 'Aquaculture', 'Biochemistry', 'Bioinformatics', 'Biotechnology', 'Chemical', 'Chemistry', 'Communication', 'Consulting', 'Critical Thinking', 'Customer Service', 'Design', 'Division', 'Ecology', 'Email', 'English', 'Environmental Science', 'Expectation-Maximization (EM)', 'Forecasting', 'Human Resource (HR)', 'Information Technology (IT)', 'Insurance', 'Leadership', 'Life Science', 'Malay', 'Media', 'Medium', 'Methodology', 'Microbiology', 'Microsoft Excel', 'Microsoft PowerPoint', 'Microsoft Word', 'Molecular Biology', 'Morphology', 'Outreach', 'Physiology', 'Polymerase chain reaction (PCR)', 'Poster', 'Problem Solving', 'Procurement', 'Research and Development (R&D)', 'SAS', 'Sales', 'Service Management', 'Technology Management', 'Training', 'Tree', 'Writing']
Ignore Skills: ['BASIC', 'Biology', 'Culture', 'D', 'Depression', 'Development', 'Education', 'Executive', 'Goods', 'Interest', 'List', 'M

In [12]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[1]) instead of advancing a shared counter.
index = 1
print_skills(index)

Individual: resource/CVs/11) Nur Ul Atikah.pdf
Skills: ['Agricultural Engineering', 'Agriculture', 'Automation', 'Chemical', 'Classification', 'Combat Engineering', 'Cooking', 'Design', 'Dimensionality Reduction', 'Electronics', 'English', 'Environmental Science', 'Food Science', 'Innovation', 'Malay', 'Mechanical Engineering', 'Poster', 'Research', 'Systems Engineering', 'Translation']
Ignore Skills: ['Data', 'Development', 'Education', 'Engineering', 'Food', 'Language', 'List', 'Profiler', 'Unemployment']


In [13]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[2]) instead of advancing a shared counter.
index = 2
print_skills(index)

Individual: resource/CVs/12) Nurul Hajar.pdf
Skills: ['Analytical', 'C++', 'Computer Science', 'Electronics', 'Engineering', 'English', 'Internet', 'Java', 'MATLAB', 'Malay', 'Microsoft Access', 'Microsoft Excel', 'Microsoft Office', 'MySQL', 'Oracle', 'PHP', 'Philosophy of Mathematics', 'Physics', 'Problem Solving', 'SPSS', 'Time Management', 'Training']
Ignore Skills: ['Application', 'Education', 'Excel', 'Language', 'Make', 'Management', 'Mathematics', 'Microsoft', 'Philosophy', 'Preference', 'Profiler']


In [14]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[3]) instead of advancing a shared counter.
index = 3
print_skills(index)

Individual: resource/CVs/13) Razif.pdf
Skills: ['C', 'Computer Programming', 'Design', 'Electronics', 'Email', 'Engineering', 'Numerical Method', 'Physics', 'Quantum Mechanics', 'R', 'Research', 'Semiconductors']
Ignore Skills: ['BASIC', 'Education', 'Mechanics', 'Preference', 'Profiler', 'Programming', 'Unemployment']


In [15]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[4]) instead of advancing a shared counter.
index = 4
print_skills(index)

Individual: resource/CVs/14) Siti Atikah.pdf
Skills: ['Agriculture', 'Analysis', 'Chemical', 'Chemistry', 'Data Science', 'Derivative', 'Design', 'Email', 'English', 'Facebook', 'Instagram', 'Leadership', 'Malay', 'Mathematics', 'Methodology', 'Microsoft Excel', 'Microsoft Word', 'Objective-C', 'Social Media', 'Teamwork', 'Twitter', 'Video']
Ignore Skills: ['Data', 'Development', 'Education', 'Excel', 'Language', 'Media', 'Microsoft', 'Preference', 'Product']


In [16]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[5]) instead of advancing a shared counter.
index = 5
print_skills(index)

Individual: resource/CVs/15) Siti Hajar Saharudin.pdf
Skills: ['Analysis', 'Biotechnology', 'Chemical Engineering', 'Communication', 'English', 'Fabrication', 'German', 'Image', 'International Trade', 'Leadership', 'Malay', 'Mathematics', 'Microsoft Excel', 'Microsoft PowerPoint', 'Microsoft Word', 'Objective-C', 'Reporting', 'Research', 'Search Engine Marketing (SEM)', 'Shell', 'Teamwork']
Ignore Skills: ['Application', 'Chemical', 'Data', 'Development', 'Education', 'Engineering', 'Excel', 'Language', 'Microsoft', 'Preference', 'Product', 'Production', 'Trade']


In [17]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[6]) instead of advancing a shared counter.
index = 6
print_skills(index)

Individual: resource/CVs/16) Suhaila Sulaiman.pdf
Skills: ['Analytical', 'Automation', 'Bash', 'Big Data', 'Biochemistry', 'Bioinformatics Analysis', 'Brand Management', 'C++', 'Cascading Style Sheet (CSS)', 'Cloud Computing', 'Comparative Economics', 'Data Analysis', 'Data Analytics', 'Data Mining', 'Data Science', 'Data Visualization', 'Database System', 'Division', 'Email', 'Encoding', 'Formatting', 'Genomics', 'HyperText Markup Language (HTML)', 'Informatics', 'Innovation', 'JavaScript', 'Lean Software Development', 'LinkedIn', 'Linux', 'Machine Learning', 'Molecular Biology', 'MySQL', 'Network', 'Nutrition', 'PHP', 'Perl', 'Platform', 'Poster', 'Problem Solving', 'Programming', 'Project Management', 'Python', 'Research and Development (R&D)', 'Risk Analysis', 'Scripting', 'Technical Support', 'Training', 'Visualization']
Ignore Skills: ['Analysis', 'Analytics', 'Bioinformatics', 'Biology', 'Computing', 'D', 'Data', 'Database', 'Development', 'Education', 'Executive', 'IPython', 'M

In [18]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[7]) instead of advancing a shared counter.
index = 7
print_skills(index)

Individual: resource/CVs/17) Syaza Izyanni.pdf
Skills: ['Aspen HYSYS', 'Chemical Engineering', 'Competition', 'Delegation', 'Design', 'Economics', 'Energy Technology', 'English', 'Environmental Engineering', 'Indonesian', 'Innovation', 'Japanese', 'MATLAB', 'Malay', 'Management', 'Manufacturing', 'Mathematical Modelling', 'Medium', 'Microsft Publisher', 'Microsoft Excel', 'Microsoft PowerPoint', 'Microsoft Word', 'Outreach', 'Philosophy', 'Poster', 'Presentation', 'Process Model', 'Research and Development (R&D)', 'Root Cause Analysis', 'Router', 'Safety Engineering', 'Sales', 'Saving', 'Selection', 'Sustainability', 'Sustainable Development', 'Systems Engineering']
Ignore Skills: ['Analysis', 'BASIC', 'Chemical', 'Development', 'Engineering', 'Industry', 'Language', 'Microsoft', 'Microsoft Office', 'Modelling', 'Preference', 'Production', 'Profiler', 'Research', 'Service']


In [19]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[8]) instead of advancing a shared counter.
index = 8
print_skills(index)

Individual: resource/CVs/18) Tan Weng Chun.pdf
Skills: ['Artificial Intelligence (AI)', 'Assembly', 'AutoCAD', 'Automation', 'Benchmarking', 'C++', 'Classification', 'Competition', 'Construction', 'Data Science', 'Design', 'Documentation', 'Eclipse', 'Electrical Engineering', 'Electronic Engineering', 'Electronics', 'Extract Transform Load (ETL)', 'Feature Engineering', 'Google BigQuery', 'Google Cloud Platform (GCP)', 'Google Colaboratory', 'Hardware', 'Integrated Development Environment (IDE)', 'International Standards on Auditing (ISA)', 'Java', 'Jupyter Notebook', 'LabVIEW', 'MATLAB', 'Matplotlib', 'Mean Shift', 'Mechatronics', 'MongoDB', 'Multisim', 'NetBeans', 'Neural Network', 'NoSQL', 'NumPy', 'Objective-C', 'OpenCV', 'Pandas', 'Perl', 'Philosophy', 'Programming Language', 'Pspice', 'Python', 'Research', 'Robotics', 'Scikit-learn', 'Seaborn', 'Shell', 'Signal Processing', 'U-Net', 'VHDL', 'Verilog']
Ignore Skills: ['AI', 'Application', 'C', 'Cloud Platform', 'Data', 'Developmen

In [20]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[9]) instead of advancing a shared counter.
index = 9
print_skills(index)

Individual: resource/CVs/19) Tengku.pdf
Skills: ['Accounting', 'Analytical', 'Applied Statistics', 'Bookkeeping', 'C++', 'Data Mining', 'Database', 'Design', 'Microsoft Excel', 'Microsoft Office', 'Poster', 'Presentation', 'R', 'Research', 'SPSS']
Ignore Skills: ['B', 'C', 'D', 'Data', 'Excel', 'Microsoft', 'Mining', 'Programming', 'R', 'Set', 'Statistics', 'Support']


In [21]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[10]) instead of advancing a shared counter.
index = 10
print_skills(index)

Individual: resource/CVs/2) A Hanif.pdf
Skills: ['Accuracy', 'Artificial Neural Network (ANN)', 'Astronomy', 'Collaboration', 'Communication', 'Control Engineering', 'Correlation', 'Descriptive Statistics', 'Design', 'Documentation', 'Electrical Engineering', 'Electronic Engineering', 'Electronics', 'English', 'Evolutionary Algorithm', 'Gastronomy', 'German', 'Hardware', 'Hindi', 'ISO 9001', 'Indonesian', 'Industrial Automation', 'Informatics', 'Information Technology (IT)', 'Internet', 'Java', 'Landscape Architecture', 'Logic', 'MATLAB', 'Malay', 'Mathematics', 'Mechanics', 'Microsoft Excel', 'Microsoft PowerPoint', 'Modelling', 'Objective-C', 'Operating System', 'Photography', 'Physics', 'Physiology', 'Platform', 'Process Control System', 'Production Process', 'Project Management', 'R', 'Regression', 'Research and Development (R&D)', 'Robot', 'Robotics', 'SPSS', 'Simulink', 'Soft Computing', 'Software Engineering', 'Sports', 'Spring', 'Statistical Analysis', 'TestNG', 'Testing', 'Tra

In [22]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[11]) instead of advancing a shared counter.
index = 11
print_skills(index)

Individual: resource/CVs/20) Wan Rosalina Wan Rosli.pdf
Skills: ['Administration', 'Analytical', 'Apache Hadoop', 'Apache Hive', 'Apache Spark', 'Bioinformatics', 'Biotechnology', 'Brand Management', 'Cloudera Manager', 'Clustering', 'Data Cleansing', 'Data Engineering', 'Data Mining', 'Data Preparation', 'Data Science Tool', 'Data Streaming', 'Database', 'Design', 'Email', 'Entrepreneurship', 'Ethics', 'Exploratory Analysis', 'Github', 'Industrial Relations', 'Innovation', 'Iterative and Incremental Development (IID)', 'Leadership', 'LinkedIn', 'Machine Learning', 'Microsoft Access', 'Molecular Biology', 'Network', 'Networking', 'Objective-C', 'Pharmacy', 'Platform', 'Polymerase chain reaction (PCR)', 'PostgreSQL', 'Problem Solving', 'Python', 'Quality Assurance (QA)', 'R', 'Regression', 'Research', 'Sequence', 'Spark Streaming', 'Structured Query Language (SQL)', 'Training', 'Visualization']
Ignore Skills: ['Analysis', 'Apache', 'Application', 'B', 'Biology', 'Cloudera', 'Culture', '

In [23]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[12]) instead of advancing a shared counter.
index = 12
print_skills(index)

Individual: resource/CVs/21) Yasmin.pdf
Skills: ['Administration', 'Animation', 'Artificial Intelligence (AI)', 'Blogging', 'Business Analysis', 'C++', 'Cascading Style Sheet (CSS)', 'Communication', 'Computer Science', 'Database', 'Documentation', 'Dreamweaver', 'Economics', 'Electronics', 'Email', 'English', 'French', 'Hindi', 'HyperText Markup Language (HTML)', 'Information System', 'Information Technology (IT)', 'Java', 'MATLAB', 'Malay', 'Management', 'Mathematics', 'Microsoft Office', 'MySQL', 'Neural Network', 'Oracle', 'PHP', 'Perceptron', 'Programming Language', 'Programming Tool', 'Prolog', 'Quality Assurance (QA)', 'R', 'RStudio', 'Research', 'Software Engineering', 'Software Tool', 'Structured Query Language (SQL)', 'TestNG', 'Testing', 'Urdu', 'Virtual Reality (VR)', 'Visual Basic', 'Writing', 'eCommerce']
Ignore Skills: ['BASIC', 'C', 'Development', 'Education', 'Engineering', 'Function', 'Language', 'Microsoft', 'Money', 'Network', 'Operations', 'Product', 'Programming',

In [24]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[13]) instead of advancing a shared counter.
index = 13
print_skills(index)

Individual: resource/CVs/5) Hamza.pdf
Skills: ['Android', 'Artificial Intelligence (AI)', 'Business Analysis', 'C++', 'Computer Science', 'Data Mining', 'Data Science', 'Embedded System', 'English', 'Finance', 'Genetic Algorithm', 'Image', 'Information Technology (IT)', 'Java', 'LinkedIn', 'Malay', 'Microsoft Access', 'Microsoft Office', 'MySQL', 'Neural Network', 'Nutrition', 'Oracle', 'Outreach', 'PHP', 'Philosophy', 'Programming Language', 'Python', 'R', 'Recommendation System', 'Research', 'Router', 'Software Engineering', 'Structured Query Language (SQL)', 'Training', 'Windows']
Ignore Skills: ['Algorithm', 'Analysis', 'Application', 'B', 'BASIC', 'Data', 'Development', 'Education', 'Engineering', 'Food', 'IPython', 'Language', 'Microsoft', 'Mining', 'Network', 'Product', 'Profiler', 'Programming', 'R', 'SQL']


In [25]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[14]) instead of advancing a shared counter.
index = 14
print_skills(index)

Individual: resource/CVs/6) Kamarul Zaman.pdf
Skills: ['Ad Design', 'Adobe Photoshop', 'Analysis', 'Antivirus Software', 'Blogging', 'C++', 'Cascading Style Sheet (CSS)', 'Cisco Certified Network Associate (CCNA)', 'Cisco Router', 'Computer Engineering', 'Computer Science', 'Configuration', 'Customer Service', 'Documentation', 'Dreamweaver', 'Dropbox', 'Electronics', 'Email', 'Engineering Technology', 'English', 'Extreme Programming (XP)', 'Firewall', 'Hardware', 'History', 'HyperText Markup Language (HTML)', 'IP Address', 'IP Addressing', 'Incident Management', 'Informatics', 'Information Technology (IT)', 'Intellectual Property (IP)', 'Internet Protocol (IP)', 'Java', 'Linux', 'MATLAB', 'Microsoft Excel', 'Microsoft PowerPoint', 'Microsoft Word', 'MySQL', 'Network Management', 'Network Security', 'Network Service', 'Networking', 'Operating System', 'PHP', 'Platform', 'Problem Management', 'Programming Language', 'R', 'Reporting', 'Research', 'Rivest-Shamir-Adleman (RSA)', 'Routing Pr

In [26]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[15]) instead of advancing a shared counter.
index = 15
print_skills(index)

Individual: resource/CVs/7) Zaihafiz.pdf
Skills: ['A/B Testing', 'Architecture', 'Audit', 'AutoCAD', 'Building Science', 'Civil Engineering', 'Communication', 'Construction', 'Data Analysis', 'Database', 'Design', 'Documentation', 'Economics', 'Email', 'English', 'Financial Modelling', 'Human Resource (HR)', 'ISO 9001', 'Information System', 'Leadership', 'Market', 'Microsoft Excel', 'Microsoft PowerPoint', 'Microsoft Word', 'Music', 'Objective-C', 'Physics', 'Project Management', 'QA Testing', 'R', 'Research', 'SPSS', 'Sustainability', 'TestNG', 'Training', 'Transportation']
Ignore Skills: ['Analysis', 'Application', 'B', 'Commercial', 'Coordination', 'Data', 'Development', 'Education', 'Engineering', 'Function', 'Implementation', 'Industry', 'Interest', 'Language', 'List', 'Management', 'Microsoft', 'Microsoft Office', 'Modelling', 'Operations', 'Planning', 'Preference', 'Service', 'Testing']


In [27]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[16]) instead of advancing a shared counter.
index = 16
print_skills(index)

Individual: resource/CVs/8) Nor Syaida.pdf
Skills: ['Analytical Chemistry', 'Derivative', 'English', 'Environmental Law', 'Microsft Publisher', 'Microsoft Excel', 'Microsoft PowerPoint', 'Microsoft Word', 'Objective-C', 'Philosophy', 'Public Relations (PR)', 'Quality Control', 'R', 'SPSS', 'Statistics']
Ignore Skills: ['Addition', 'Analytical', 'BASIC', 'Chemistry', 'Excel', 'List', 'Make', 'Microsoft', 'PowerPoint', 'Product']


In [28]:
# Pin the position explicitly so re-running this cell always shows the same
# CV (filenames[17]) instead of advancing a shared counter.
index = 17
print_skills(index)

Individual: resource/CVs/9) B Norerama.pdf
Skills: ['Agriculture', 'Arts', 'AutoCAD', 'Brand Management', 'C', 'Coaching', 'Competition', 'Construction', 'Control System', 'Cooking', 'Customer Service', 'Data Analysis', 'Data Science', 'Database', 'Design', 'Documentation', 'Editor', 'Electronics', 'Email', 'English', 'Ethics', 'Fluid Mechanics', 'Google Docs', 'Hospitality', 'Human Resource (HR)', 'Innovation', 'Internet', 'Korean', 'Leadership', 'MATLAB', 'Malay', 'Manufacturing Engineering', 'Mathematics', 'Mechanical Engineering', 'Methodology', 'Microsft Publisher', 'Microsoft Excel', 'Microsoft Office', 'Microsoft PowerPoint', 'Microsoft Visio', 'Network', 'Nonverbal Communication', 'Office Management', 'Philosophy', 'Presentation', 'Public Speaking', 'RStudio', 'Research', 'Retail', 'Sales', 'Sports', 'Statistics', 'Subject Matter Expert (SME)', 'Sustainable Development', 'TestNG', 'Testing', 'Training', 'Velocity', 'Welding', 'Writing']
Ignore Skills: ['Analysis', 'Application'