In [1]:
import pandas as pd
import re
import random
from nltk import ngrams
from difflib import get_close_matches as gcm

In [2]:
# Load the data scientist job postings and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a saved
# index from an earlier to_csv() -- confirm whether it should be dropped.
df_ds = pd.read_csv('indeed-insights/data_scientist.csv')
df_ds.head()

Unnamed: 0,title,company,country,date_posted,description,title_keywords,indeed_skills,skills,no_skills
0,Data Scientist,Mission Consultancy Services Malaysia Sdn Bhd,Malaysia,2020-07-26,Identify valuable data sources and automate co...,,"'C', 'C++', 'Excel', 'Hadoop', 'Java', 'R', 'S...","['Business', 'Analytical', 'Communication', 'D...",27
1,Data Scientist,AirAsia,Malaysia,2020-08-15,Job DescriptionOverviewThis role will responsi...,,"'C', 'R'","['Business', 'Accuracy', 'Processing', 'Produc...",21
2,Data Scientist,CareerMaster Sdn Bhd,Malaysia,2020-07-26,Position to be based in Kuala Lumpur.We are re...,,"'C', 'GIS', 'Go', 'Python', 'R', 'SQL', 'Stati...","['Business', 'Computer Science', 'Microsoft', ...",30
3,Data Scientist Executive,GENO Management,Malaysia,2020-08-18,Position : Data Scientists ExecutiveLocation :...,,"'AWS', 'C', 'Go', 'Python', 'R', 'Regression',...","['Business', 'Data Science', 'Analytical', 'Bu...",39
4,Data Scientist,PLUS SOLAR SYSTEMS SDN BHD,Malaysia,2020-07-26,Work With Stakeholders Throughout The Organiza...,,'C',"['Business', 'Accuracy', 'Development', 'Marke...",9


In [3]:
# Load the skills table. SKILLS holds the distinct values of its 'skill'
# column; extract_skills() iterates over this list for every posting.
df_skills = pd.read_csv('skills_db2/skill.csv')
SKILLS = df_skills['skill'].unique().tolist()
len(SKILLS)

3101

In [4]:
# Load the 'Redundant Skills' sheet. RED_SKILLS holds the distinct values of
# its 'Skill' column; extract_ignore() always moves these to the ignore list.
df_redskills = pd.read_excel('skills/Redundant Skills.xlsx')
RED_SKILLS = df_redskills['Skill'].unique().tolist()
len(RED_SKILLS)

38

In [5]:
def print_job(df, i):
    """Print the title, description and Indeed skills of one posting.

    Args:
        df: DataFrame with 'title', 'description' and 'indeed_skills' columns.
        i: Index label of the row to show (looked up with ``df.loc``).
    """
    posting = df.loc[i]
    labelled_columns = (
        ('Title:', 'title'),
        ('Description:', 'description'),
        ('Indeed Skills:', 'indeed_skills'),
    )
    for label, column in labelled_columns:
        print(label, posting[column])
    
def test_extract(df, i):
    row = df.loc[i]
    info = row['title'] + ' ' + row['description']
    all_skills = extract_skills(info)
    job_skills, ignore_skills = extract_ignore(all_skills)
    job_skills.sort()
    ignore_skills.sort()
    print('Skills:', job_skills)
    print()
    print('Ignore:', ignore_skills)

def test_skill(df, i, skill, threshold=0.9):
    row = df.loc[i]
    info = row['title'] + ' ' + row['description']
    words, bigrams, trigrams = clean_info(info)
    results = []
    s = skill.lower()
    print('Unigram: {}'.format(gcm(s, words, cutoff=threshold)))
    print('Bigram: {}'.format(gcm(s, bigrams, cutoff=threshold)))
    print('Trigram: {}'.format(gcm(s, trigrams, cutoff=threshold)))
    
def extract_skills(info, threshold=0.9):
    """Return the entries of ``SKILLS`` that are found in the text ``info``.

    Each skill is tested in ``SKILLS`` order:

    * If the name carries a parenthesised abbreviation, e.g.
      ``'Artificial Intelligence (AI)'``, the upper-cased abbreviation is
      looked up as an exact, case-sensitive token of ``info``.
    * Otherwise, or when the abbreviation is not in the text, the lower-cased
      name without the parenthesised part is fuzzy-matched with
      ``difflib.get_close_matches`` against the unigrams, bigrams or trigrams
      returned by ``clean_info``, chosen by the number of words in the name.
      Names longer than three words are compared against the trigrams.

    Args:
        info: Text to scan.
        threshold: ``cutoff`` passed to ``get_close_matches``.

    Returns:
        list: Matching skill names, spelled as they appear in ``SKILLS``.
    """
    words, bigrams, trigrams = clean_info(info)
    # Only the existence of a close match matters, so drop repeated n-grams
    # once here instead of re-scoring them for every skill.
    candidates = {
        1: list(dict.fromkeys(words)),
        2: list(dict.fromkeys(bigrams)),
        3: list(dict.fromkeys(trigrams)),
    }
    # clean_info() lower-cases its tokens, so an upper-case abbreviation can
    # never be found among them. Keep a case-preserving token set, split on
    # the same separator characters, for the abbreviation lookup.
    raw_tokens = set(re.split(r'[\s,.|\-/()]+', info))
    raw_tokens.discard('')

    results = []
    for skill in SKILLS:
        s = skill.lower()
        bracketed = re.search(r'\((.*?)\)', s)
        if bracketed:
            abb = bracketed.group(1).upper()
            if abb in raw_tokens:
                results.append(skill)
                continue
            s = re.sub(r"[\(].*?[\)]", "", s)
        s2 = s.split()
        if not s2:
            # Nothing left to compare once the bracket has been removed.
            continue
        # Re-join so whitespace left behind by the removed bracket does not
        # lower the similarity score.
        s = ' '.join(s2)
        if gcm(s, candidates[min(len(s2), 3)], cutoff=threshold):
            results.append(skill)
    return results

def extract_ignore(skills):
    """Split extracted skills into the ones to keep and the ones to ignore.

    A skill is ignored when it is listed in ``RED_SKILLS``, or when it occurs
    as a whole word inside another skill of the same list (for example
    'Data' next to 'Big Data').

    Args:
        skills: List of skill names.

    Returns:
        tuple: ``(job_skills, ignore_skills)``, both lists in input order.
    """
    ignore_skills = []
    for position, candidate in enumerate(skills):
        if candidate in RED_SKILLS:
            ignore_skills.append(candidate)
            continue
        others = skills[:position] + skills[position + 1:]
        # The cheap substring test runs first; the regex check only runs on
        # names that actually contain the candidate.
        if any(candidate in other and find_whole_word(candidate, other)
               for other in others):
            ignore_skills.append(candidate)
    job_skills = [name for name in skills if name not in ignore_skills]
    return job_skills, ignore_skills

def clean_info(info):
    """Tokenise posting text into lower-case unigrams, bigrams and trigrams.

    Args:
        info: Text to tokenise.

    Returns:
        tuple: ``(words, bigrams, trigrams)``. ``words`` is a list of
        lower-cased tokens; the other two are lists of space-joined
        consecutive token pairs and triples.
    """
    # Remove ordered list with alphabets: a), b), c),...
    text = re.sub(r'[\s\t\n]+[a-zA-Z\s*]\)+', ' ', info)
    # Turn separators into spaces: newline, '|', ',', '.', '-', '/', '(', ')'.
    # Written as a raw string so the backslashes reach the regex engine
    # instead of being invalid string escapes.
    text = re.sub(r'[\n|,.\-/()]', ' ', text)
    words = text.lower().split()
    bigrams = [' '.join(g) for g in ngrams(words, 2)]
    trigrams = [' '.join(g) for g in ngrams(words, 3)]
    return words, bigrams, trigrams

def check_skill(skill, regex=False):
    """Return the rows of ``df_skills`` whose 'skill' value contains ``skill``.

    The match is a literal, case-sensitive substring test by default, so names
    with regex metacharacters such as 'C++' or 'Artificial Intelligence (AI)'
    can be looked up as written. Missing values in the column never match.

    Args:
        skill: Text to search for.
        regex: Pass True to have ``skill`` interpreted as a regular expression.

    Returns:
        DataFrame: The matching rows of ``df_skills``.
    """
    mask = df_skills['skill'].str.contains(skill, regex=regex, na=False)
    return df_skills.loc[mask]

def find_whole_word(search_string, input_string):
    """Tell whether ``search_string`` occurs in ``input_string`` as a whole word.

    The search text is matched literally, so names containing regex
    metacharacters (for example 'C++') are safe. An occurrence counts only
    when it is not directly preceded or followed by a word character.

    Args:
        search_string: Text to look for.
        input_string: Text to search in.

    Returns:
        bool: True if a whole-word occurrence exists, otherwise False.
    """
    # Lookarounds instead of \b: \b cannot anchor a term that itself starts
    # or ends with a non-word character such as '+' or ')'.
    pattern = r"(?<!\w)" + re.escape(search_string) + r"(?!\w)"
    return re.search(pattern, input_string) is not None

In [9]:
# Pick a random posting to inspect. randrange() excludes its upper bound, so
# the result is always a valid row label for the default 0..len-1 index
# (randint(0, len(df_ds)) could return len(df_ds), which df.loc rejects).
job_no = random.randrange(len(df_ds))
job_no

464

In [10]:
# Show the title, description and Indeed skills of the selected posting.
print_job(df_ds, job_no)

Title: Data Scientist
Description: Roles & ResponsibilitiesCare about climate change and want to build technology to solve it? Aspire to shape and contribute to a fast-growing technology start-up as an early employee? This is your opportunity.At Solar AI, we want to use technology to simplify the solar experience for everyone.Our solution: building intelligence for rooftop solar projects. By combining geospatial analysis of satellite imagery with big data and artificial intelligence, Solar AI empowers solar sales teams to discover qualified leads and prioritize their sales efforts for rooftop solar.Solar AI is a seed stage start-up funded by and incubated as part of ENGIE Factory, the dedicated venture arm for ENGIE Group in Asia-Pacific.Find out more at https://getsolar.ai.The opportunityAs a Data Scientist with Solar AI, you will shape our building intelligence platform by creating application pipelines that pull data from a universe of ingested data-sets and refine them to extract i

In [13]:
# Extract skills from the selected posting and print the kept and ignored lists.
test_extract(df_ds, job_no)

Skills: ['Ad Design', 'Agile Software Development', 'Applied Mathematics', 'Artificial Intelligence (AI)', 'Big Data', 'Classification', 'Cloud Storage', 'Creatio', 'Deep Learning', 'Energy Consumption', 'Feature Extraction', 'Feature Selection', 'IPython', 'Lean Software Development', 'Machine Learning Algorithm', 'Platform', 'PostgreSQL', 'Processing', 'Python', 'Regression', 'Relational Database', 'Research', 'Sales', 'Scrum', 'Semantics', 'Software Engineering', 'Testing', 'Text Analysis']

Ignore: ['Agile', 'Algorithm', 'Application', 'Data', 'Database', 'Design', 'Development', 'Engineering', 'Machine Learning', 'Mathematics', 'Selection', 'Service', 'Software Development', 'Unemployment']


In [11]:
# Show which unigrams, bigrams and trigrams of the posting match this skill.
test_skill(df_ds, job_no, 'Applied Mathematics')

Unigram: []
Bigram: ['applied mathematics']
Trigram: ['applied mathematics to']


In [12]:
# Look up the skill in the skills table.
check_skill('Applied Mathematics')

Unnamed: 0,skill_id,skill
2166,2167,Applied Mathematics
