In [69]:
import pandas as pd
import re
import random
import nltk
from datetime import datetime as dt
from nltk import ngrams
from difflib import get_close_matches as gcm

In [70]:
df_ds = pd.read_csv('indeed-insights/data_scientist.csv')
len(df_ds)

2284

In [71]:
df_skills = pd.read_csv('skills_db2/skill.csv')
SKILLS = df_skills['Skill'].unique().tolist()
len(SKILLS)

3149

In [72]:
df_domains = pd.read_csv('skills_db2/domain.csv')

In [73]:
df_redskills = pd.read_excel('skills/Other Skills.xlsx')
RED_SKILLS = df_redskills['Skill'].unique().tolist()
len(RED_SKILLS)

42

In [74]:
df_dupskills = pd.read_excel('skills/Other Skills.xlsx', sheet_name='Duplicates')
DUP_SKILLS = df_dupskills.set_index('Skill').to_dict()['Parent']
SKILLS.extend(list(DUP_SKILLS.keys()))
len(SKILLS)

3207

In [75]:
def print_job(df, i):
    row = df.loc[i]
    print('Title:', row['title'])
    print('Description:', row['description'])
    print('Indeed Skills:', row['indeed_skills'])
    
def test_extract(df, i):
    row = df.loc[i]
    info = row['title'] + ' ' + row['description']
    all_skills = extract_skills(info)
    job_skills, ignore_skills = extract_ignore(all_skills)
    job_skills.sort()
    ignore_skills.sort()
    print('Skills:', job_skills)
    print()
    print('Ignore:', ignore_skills)

def test_skill(df, i, skill, threshold=0.9):
    row = df.loc[i]
    info = row['title'] + ' ' + row['description']
    _, words, bigrams, trigrams = clean_info(info)
    results = []
    s = skill.lower()
    print('Unigram: {}'.format(gcm(s, words, cutoff=threshold)))
    print('Bigram: {}'.format(gcm(s, bigrams, cutoff=threshold)))
    print('Trigram: {}'.format(gcm(s, trigrams, cutoff=threshold)))
    
def extract_skills(info, threshold=0.9):
    words, unigrams, bigrams, trigrams = clean_info(info)
    results = []
    for skill in SKILLS:
        s = skill
        if '(' in s:
            abb = s[s.find("(")+1:s.find(")")]
            if abb in words:
                results.append(skill)
                continue
            s = re.sub(r"[\(].*?[\)]", "", s)
        s = s.lower()
        s2 = s.split()
        if len(s2) == 1:
            if len(gcm(s, unigrams, cutoff=threshold)) > 0:
                results.append(skill)
        elif len(s2) == 2:
            if len(gcm(s, bigrams, cutoff=threshold)) > 0:
                results.append(skill)
        elif len(s2) == 3:
            if len(gcm(s, trigrams, cutoff=threshold)) > 0:
                results.append(skill)
        else:
            if len(gcm(s, trigrams, cutoff=threshold)) > 0:
                results.append(skill)
    return results

def extract_ignore(skills):
    ignore_skills = []
    for j, skill in enumerate(skills):
        if skill in RED_SKILLS:
            ignore_skills.append(skill)
            continue
        for other in skills[:j] + skills[j+1:]:
            if skill in other:
                if find_whole_word(skill, other):
                    ignore_skills.append(skill)
                    break
    job_skills = []
    for skill in skills:
        if skill not in ignore_skills:
            if skill in DUP_SKILLS.keys():
                skill = DUP_SKILLS[skill]
            job_skills.append(skill)
    return list(set(list(job_skills))), ignore_skills

def clean_info(info):
    # Remove ordered list with alphabets: a), b), c),...
    words = re.sub(r'[\s\t\n|.|\(]+[a-zA-Z\s*][.|\)]+', ' ', info)
    words = re.sub('[\n|,|.|:|;|\-|/|\(|\)|\[|\]]', ' ', words)
    # words = [word.strip() for word in words]
    # words = nltk.word_tokenize(info)
    # unigrams = nltk.word_tokenize(info.lower())
    unigrams = words.lower().split()
    bigrams = [' '.join(g) for g in ngrams(unigrams, 2)]
    trigrams = [' '.join(g) for g in ngrams(unigrams, 3)]
    return words.split(), unigrams, bigrams, trigrams

def check_skill(skill):
    return df_skills.loc[df_skills['Skill'].str.contains(skill)].merge(df_domains, left_on='DomainId', right_on='Id')

def find_whole_word(search_string, input_string):
    raw_search_string = r"\b" + search_string + r"\b"
    match_output = re.search(raw_search_string, input_string)
    no_match_was_found = ( match_output is None )
    if no_match_was_found:
        return False
    else:
        return True

In [8]:
job_no = random.randint(0, len(df_ds))
job_no

2150

In [9]:
a, b, c, d = clean_info(df_ds['description'][job_no])

In [16]:
print_job(df_ds, job_no)

Title: Senior Data Scientist - Life & Health Regional Analytics Centre
Description: [CANDIDATES WHO REQUIRE WORK PASSES NEED NOT APPLY]Big data, artificial intelligence and advanced analytics are transforming the insurance industry across the value chain. Munich Re is at the forefront of this trend, having made a significant investment in start-of-the-art analytics infrastructure and software, central and regional analytics centres of competence and several successful analytics initiatives with its clients worldwide.Munich Re has experienced exponential growth in demand for analytics pilots from its clients in life, non-life and health. An exciting opportunity exists for a senior data scientist with advanced analytics skills to join Munich Re’s regional analytics team located in Singapore. This team supports Munich Re’s Asia-Pacific, Middle East and Africa business life and health business region. As such you will work in an agile and innovative area, gaining exposure to a wide variety

In [24]:
test_extract(df_ds, job_no)

Skills: ['Agile', 'Amazon Web Service (AWS)', 'Application Programming Interface (API)', 'Applied Mathematics', 'Artificial Intelligence (AI)', 'Automation', 'Big Data', 'Brand Management', 'Business Intelligence (BI)', 'Business Model', 'Coding', 'Communication', 'Customer Experience', 'D3.js', 'Dash', 'Data Analytics', 'Data Structure', 'Decision Tree', 'Deep Learning', 'Engineering', 'English', 'Exploratory Data Analysis', 'Extract Transform Load (ETL)', 'Git', 'Image', 'Innovation', 'Insurance', 'Interpreter', 'Investment', 'JavaScript', 'Jupyter Notebook', 'ML', 'Machine Learning', 'Microsoft Azure', 'Microsoft Power BI', 'Microsoft PowerPoint', 'Model Validation', 'Natural Language Processing (NLP)', 'Predictive Analytics', 'Predictive Modelling', 'Presentation', 'Python', 'R', 'RESTful API', 'Random Forest', 'Regression', 'Reporting', 'Research', 'Selection', 'Statistical Model', 'Statistics', 'Structured Query Language (SQL)', 'Translation', 'Underwriting', 'Virtual Machine', '

In [None]:
test_skill(df_ds, job_no, 'ISO')

In [37]:
check_skill('Digital')

Unnamed: 0,Id_x,Skill,DomainId,Id_y,Domain
0,123,Digital Communication,2,2,Business
1,124,Digital Literacy,2,2,Business
2,125,Digital Marketing,2,2,Business
3,126,Digital Photography,2,2,Business
4,1269,Digital Image Processing,4,4,Data Science
5,1761,Digital Library,5,5,Information Technology (IT)
6,1762,Digital Marketing System,5,5,Information Technology (IT)
7,1763,Digital Storage Systems Interconnect,5,5,Information Technology (IT)
8,1764,DigitalOcean,5,5,Information Technology (IT)
9,2221,Digital Geometry,6,6,Mathematics and Logic


## Test 100 jobs

In [26]:
N = 100
df_j = df_ds.iloc[:N]
initial = dt.now()
interval = dt.now()
print_every = 10

for i, row in df_j.iterrows():
    if (i+1) % print_every == 0:
        print("{} jobs processed. Time taken: {}".format(i, dt.now() - interval))
        interval = dt.now()
    info = row['title'] + ' ' + row['description']
    all_skills = extract_skills(info)
    job_skills, ignore_skills = extract_ignore(all_skills)
    job_skills.sort()
    ignore_skills.sort()
    df_j.loc[i, 'new_skills'] = '; '.join(job_skills)
    df_j.loc[i, 'ignore_skills'] = '; '.join(ignore_skills)

print("Time taken: {}".format(dt.now() - initial))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


9 jobs processed. Time taken: 0:00:11.651199
19 jobs processed. Time taken: 0:00:23.449458
29 jobs processed. Time taken: 0:00:26.523928
39 jobs processed. Time taken: 0:00:19.606277
49 jobs processed. Time taken: 0:00:23.551673
59 jobs processed. Time taken: 0:00:24.578194
69 jobs processed. Time taken: 0:00:31.234480
79 jobs processed. Time taken: 0:00:33.052710
89 jobs processed. Time taken: 0:00:30.154010
99 jobs processed. Time taken: 0:00:41.143772
Time taken: 0:04:26.929147


In [38]:
len(df_j)

100

In [28]:
i = 0

In [83]:
i += 1
print(i)
print()
print(df_j.loc[i, 'title'])
print(df_j.loc[i, 'description'])
print()
print(df_j.loc[i, 'indeed_skills'])
print()
print(df_j.loc[i, 'new_skills'])
print()
print(df_j.loc[i, 'ignore_skills'])

21

Head of Data, Customer Intelligence
Job DescriptionWhy AirAsia?Are you ready to take off and be part of the Allstar employee? Whether you’re applying for a developer, customer happiness or crew, at AirAsia we act as One AirAsia.AirAsia.com Data Team empowers the build of data-driven products through top-notch algorithms and the use of the latest technologies, to fulfill the expectation of becoming the leading all-in-one travel and lifestyle products for people in ASEAN.If you are hungry to innovate, transform, and disrupt travel and related lifestyle ecosystems, this is the job for you.What You'll DoWork with large, complex data sets. Solve difficult, non-routine analysis problems, applying advanced analytical methods as needed. Conduct end-to-end analysis that includes data gathering and requirements specification, processing, analysis, ongoing deliverables, and presentations.Prototype and build analysis pipelines iteratively to provide insights at scale. Develop comprehensive und

In [81]:
test_skill(df_ds, i, 'telecommunication')

Unigram: ['telecommunication']
Bigram: ['of telecommunication']
Trigram: []


In [98]:
check_skill('Data Cleansing')

Unnamed: 0,Id_x,Skill,DomainId,Id_y,Domain
0,1241,Data Cleansing,4,4,Data Science
