In [28]:
import pandas as pd
import re
import random
import nltk
from nltk import ngrams
from difflib import get_close_matches as gcm

In [2]:
# Load the job postings from the data_scientist CSV and preview the first rows.
df_ds = pd.read_csv('indeed-insights/data_scientist.csv')
df_ds.head()

Unnamed: 0,title,company,country,date_posted,description,title_keywords,indeed_skills,skills,no_skills
0,Data Scientist,Mission Consultancy Services Malaysia Sdn Bhd,Malaysia,2020-07-26,Identify valuable data sources and automate co...,,"'C', 'C++', 'Excel', 'Hadoop', 'Java', 'R', 'S...","['Business', 'Analytical', 'Communication', 'D...",27
1,Data Scientist,AirAsia,Malaysia,2020-08-15,Job DescriptionOverviewThis role will responsi...,,"'C', 'R'","['Business', 'Accuracy', 'Processing', 'Produc...",21
2,Data Scientist,CareerMaster Sdn Bhd,Malaysia,2020-07-26,Position to be based in Kuala Lumpur.We are re...,,"'C', 'GIS', 'Go', 'Python', 'R', 'SQL', 'Stati...","['Business', 'Computer Science', 'Microsoft', ...",30
3,Data Scientist Executive,GENO Management,Malaysia,2020-08-18,Position : Data Scientists ExecutiveLocation :...,,"'AWS', 'C', 'Go', 'Python', 'R', 'Regression',...","['Business', 'Data Science', 'Analytical', 'Bu...",39
4,Data Scientist,PLUS SOLAR SYSTEMS SDN BHD,Malaysia,2020-07-26,Work With Stakeholders Throughout The Organiza...,,'C',"['Business', 'Accuracy', 'Development', 'Marke...",9


In [4]:
# Load the skills reference table and keep the distinct skill names;
# extract_skills() iterates over SKILLS when scanning a posting.
df_skills = pd.read_csv('skills_db2/skill.csv')
SKILLS = df_skills['Skill'].unique().tolist()
len(SKILLS)

3130

In [121]:
# Domain lookup table; check_skill() joins it to df_skills on DomainId == Id.
df_domains = pd.read_csv('skills_db2/domain.csv')

In [5]:
# Skill names listed in this workbook are always routed to the "ignore" list
# by extract_ignore().
df_redskills = pd.read_excel('skills/Redundant Skills.xlsx')
RED_SKILLS = df_redskills['Skill'].unique().tolist()
len(RED_SKILLS)

42

In [122]:
def print_job(df, i):
    """Print the title, description and Indeed skills of the posting labelled ``i``.

    Args:
        df: DataFrame with ``title``, ``description`` and ``indeed_skills`` columns.
        i: Index label of the row to display (looked up with ``df.loc``).
    """
    posting = df.loc[i]
    fields = (
        ('Title:', 'title'),
        ('Description:', 'description'),
        ('Indeed Skills:', 'indeed_skills'),
    )
    for label, column in fields:
        print(label, posting[column])
    
def test_extract(df, i):
    row = df.loc[i]
    info = row['title'] + ' ' + row['description']
    all_skills = extract_skills(info)
    job_skills, ignore_skills = extract_ignore(all_skills)
    job_skills.sort()
    ignore_skills.sort()
    print('Skills:', job_skills)
    print()
    print('Ignore:', ignore_skills)

def test_skill(df, i, skill, threshold=0.9):
    row = df.loc[i]
    info = row['title'] + ' ' + row['description']
    _, words, bigrams, trigrams = clean_info(info)
    results = []
    s = skill.lower()
    print('Unigram: {}'.format(gcm(s, words, cutoff=threshold)))
    print('Bigram: {}'.format(gcm(s, bigrams, cutoff=threshold)))
    print('Trigram: {}'.format(gcm(s, trigrams, cutoff=threshold)))
    
def extract_skills(info, threshold=0.9):
    """Return the entries of ``SKILLS`` that are found in the text ``info``.

    ``info`` is tokenised with ``clean_info``. For each skill:

    * If the name contains a parenthesised part, e.g. ``"Artificial
      Intelligence (AI)"``, the text inside the first parentheses is treated
      as an abbreviation and matched exactly (case-sensitively) against the
      original-case words. On a hit the skill is accepted immediately.
    * Otherwise the parenthesised part is removed, the name is lower-cased
      and whitespace-normalised, and it is fuzzy-matched with
      ``difflib.get_close_matches`` against the n-grams of the text that
      have the same number of words as the name.

    Args:
        info: Free text of a job posting.
        threshold: Similarity cutoff passed to ``get_close_matches``.

    Returns:
        List of matching skill names, in ``SKILLS`` order, spelled exactly as
        they appear in ``SKILLS``.
    """
    words, unigrams, bigrams, trigrams = clean_info(info)
    word_set = set(words)  # O(1) abbreviation lookups

    # Candidate phrases keyed by word count. Only "is there any match" matters,
    # so duplicates are dropped to avoid re-scoring repeated n-grams.
    candidates = {
        1: list(dict.fromkeys(unigrams)),
        2: list(dict.fromkeys(bigrams)),
        3: list(dict.fromkeys(trigrams)),
    }

    results = []
    for skill in SKILLS:
        name = skill
        if '(' in name:
            # A regex search (rather than find()/slicing) avoids silently
            # chopping the last character when ")" is missing.
            abbreviation = re.search(r"\((.*?)\)", name)
            if abbreviation is not None and abbreviation.group(1) in word_set:
                results.append(skill)
                continue
            name = re.sub(r"[\(].*?[\)]", "", name)

        tokens = name.lower().split()
        if not tokens:
            continue  # nothing left to match (e.g. the name was only "(...)")
        # Re-join so that whitespace left behind by the removed parentheses
        # does not lower the similarity score (e.g. "c " vs "c" scores 0.67).
        phrase = ' '.join(tokens)

        size = len(tokens)
        if size not in candidates:
            # Names longer than three words are compared against n-grams of
            # their own length; scoring them against trigrams either misses
            # them or matches on a truncated phrase.
            candidates[size] = list(dict.fromkeys(
                ' '.join(unigrams[start:start + size])
                for start in range(len(unigrams) - size + 1)
            ))

        # n=1: one match above the cutoff is enough to accept the skill.
        if gcm(phrase, candidates[size], n=1, cutoff=threshold):
            results.append(skill)
    return results

def extract_ignore(skills):
    """Split ``skills`` into the ones to keep and the ones to ignore.

    A skill is ignored when it is listed in ``RED_SKILLS``, or when it occurs
    as a whole word inside another entry of ``skills`` (checked with
    ``find_whole_word``), i.e. a more specific skill already covers it.

    Args:
        skills: List of extracted skill names.

    Returns:
        Tuple ``(job_skills, ignore_skills)``; both keep the input order.
    """
    def _is_redundant(position, name):
        # Explicitly black-listed skills are always ignored.
        if name in RED_SKILLS:
            return True
        # Otherwise ignore the skill if any *other* entry contains it as a whole word.
        return any(
            name in other and find_whole_word(name, other)
            for index, other in enumerate(skills)
            if index != position
        )

    ignore_skills = [
        name for position, name in enumerate(skills) if _is_redundant(position, name)
    ]
    job_skills = [name for name in skills if name not in ignore_skills]
    return job_skills, ignore_skills

def clean_info(info):
    """Tokenise posting text into words and lower-cased 1-, 2- and 3-grams.

    Args:
        info: Free text of a job posting.

    Returns:
        Tuple ``(words, unigrams, bigrams, trigrams)`` where ``words`` are the
        whitespace-separated tokens in their original case, ``unigrams`` are
        the same tokens lower-cased, and ``bigrams`` / ``trigrams`` are
        space-joined runs of two / three consecutive unigrams.
    """
    # Remove ordered list with alphabets: a), b), c),...
    # NOTE(review): the pattern also accepts "." as the terminator, so a single
    # letter followed by a period (e.g. " R." ending a sentence) is removed too.
    text = re.sub(r'[\s\t\n|.|\(]+[a-zA-Z\s*][.|\)]+', ' ', info)
    # Replace punctuation with spaces. Written as a raw string so that "\-",
    # "\[" etc. are not invalid string escapes. "|" is kept in the class
    # because the original pattern (unintentionally or not) replaced it too.
    text = re.sub(r'[\n|,.:;\-/()\[\]]', ' ', text)

    words = text.split()
    unigrams = text.lower().split()
    # zip() over shifted copies yields consecutive pairs / triples and is
    # empty when the text is shorter than the n-gram size.
    bigrams = [' '.join(gram) for gram in zip(unigrams, unigrams[1:])]
    trigrams = [' '.join(gram) for gram in zip(unigrams, unigrams[1:], unigrams[2:])]
    return words, unigrams, bigrams, trigrams

def check_skill(skill, regex=True):
    """Look up skills whose name contains ``skill``, joined with their domain.

    Args:
        skill: Text (or regular expression) to search for in
            ``df_skills['Skill']``.
        regex: When True (the default, as before) ``skill`` is interpreted as
            a regular expression. Pass False to search for names containing
            regex metacharacters literally, e.g. ``check_skill('C++', regex=False)``.

    Returns:
        DataFrame of the matching rows of ``df_skills`` merged with
        ``df_domains`` on ``DomainId == Id``.
    """
    # na=False: rows with a missing skill name count as "no match" instead of
    # producing NaN in the mask, which .loc cannot use for boolean indexing.
    mask = df_skills['Skill'].str.contains(skill, regex=regex, na=False)
    return df_skills.loc[mask].merge(df_domains, left_on='DomainId', right_on='Id')

def find_whole_word(search_string, input_string):
    """Return True if ``search_string`` occurs in ``input_string`` as a whole word.

    "Whole word" means the occurrence is not directly preceded or followed by
    a word character (letter, digit or underscore).

    Args:
        search_string: Literal text to look for. It is escaped, so names such
            as ``"C++"`` or ``"Node.js"`` are matched literally instead of
            being interpreted as a regular expression.
        input_string: Text to search in.

    Returns:
        bool: True when a whole-word occurrence exists, otherwise False.
    """
    # Lookarounds are used instead of \b because \b requires a word character
    # on one side and therefore never matches after a trailing "+" or "#".
    # For search strings that start and end with word characters the two
    # forms are equivalent.
    pattern = r"(?<!\w)" + re.escape(search_string) + r"(?!\w)"
    return re.search(pattern, input_string) is not None

In [143]:
# Pick a random posting to inspect. randrange() excludes its upper bound,
# whereas randint(0, len(df_ds)) could return len(df_ds), which is one past
# the last row label of df_ds and would make df.loc[job_no] fail.
job_no = random.randrange(len(df_ds))
job_no

991

In [144]:
# Tokenise the sampled posting's description: a = words, b = unigrams,
# c = bigrams, d = trigrams (the order returned by clean_info).
# NOTE(review): test_extract() tokenises title + description, so these
# n-grams do not include the title.
a, b, c, d = clean_info(df_ds['description'][job_no])

In [145]:
# Show the title, description and Indeed-provided skills of the sampled posting.
print_job(df_ds, job_no)

Title: Associate Director of Sales, Ad Sales (Singapore)
Description: Appier is a technology company which aims to provide artificial intelligence platforms to help enterprises solve their most challenging business problems. Appier was established in 2012 by a passionate team of computer scientists and engineers with expertise in AI, data analysis, distributed systems, and marketing.About the roleThe Sales Manager is responsible for selling and up-selling to new and existing clients and providing excellent service in support of team revenue goals in the local market.Task of the roleProactively contact new and existing clients and educate, propose and secure buy-in on Appier's ads solutionsIdentify, propose and sell programs to targeted organizations within the target markets/sectors set out by the companyDevelop an understanding of client's business, products, services, customer profile, marketing and business objectives, competitors and sales.Manage individual sales pipeline to maximi

In [146]:
# Extract skills from the sampled posting and print the kept vs. ignored lists.
test_extract(df_ds, job_no)

Skills: ['Artificial Intelligence (AI)', 'Big Data Analysis', 'Communication', 'Digital Marketing', 'Digital Media', 'Distributed System', 'Market', 'Mobile Marketing', 'Negotiation', 'Platform', 'Profiler', 'Sales']

Ignore: ['Data', 'Data Analysis', 'Language', 'Marketing', 'Media', 'Product', 'Service', 'Set', 'Support']


In [130]:
# Debug a single skill: list the posting's n-grams that match 'Redshift'
# at the default 0.9 cutoff.
test_skill(df_ds, job_no, 'Redshift')

Unigram: ['redshift']
Bigram: []
Trigram: []


In [142]:
# Look up every skill whose name contains 'Statistical', together with its domain.
check_skill('Statistical')

Unnamed: 0,Id_x,Skill,DomainId,Id_y,Domain
0,1519,SAS Statistical Analysis,4,4,Data Science
1,1570,Statistical AI,4,4,Data Science
2,1571,Statistical Analysis,4,4,Data Science
3,1572,Statistical Hypothesis Testing,4,4,Data Science
4,1573,Statistical Model,4,4,Data Science
5,1574,Statistical Modelling,4,4,Data Science
6,1575,Statistical Semantic,4,4,Data Science
7,1576,Statistical Theory,4,4,Data Science
8,2825,Statistical Mechanis,7,7,Natural Science
