In [17]:
import pandas as pd
import re
import difflib
from nltk import ngrams
from langdetect import detect, DetectorFactory
from difflib import get_close_matches as gcm

In [123]:
# Load the job postings CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='myfuturejobs-insights/myfuturejobs_clean.csv')
df.head(5)

Unnamed: 0,job_title,job_details,job_type,education,language,has_skills,skills_required,no_skills_required
0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent,4 - STPM / A Level,English,True,"['Sales', 'Operations', 'Process', 'Service', ...",6
1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent,6 - Bachelor's,Others,True,"['Adobe', 'Advertising']",2
2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,5 - Diploma / DVM,English,True,"['Operations', 'Distribution']",2
3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent,6 - Bachelor's,Others,False,[],0
4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,3 - SPM / O Level / SKM,English,True,"['Operations', 'Distribution']",2


In [90]:
# Load the skill table (the output below shows `skill_id` and `skill` columns)
# and preview its first five rows.
df_skills = pd.read_csv(filepath_or_buffer='skills_db/skill.csv')
df_skills.head(5)

Unnamed: 0,skill_id,skill
0,150,Applied Science
1,163,Art
2,322,Business
3,548,Computer Science
4,714,Data Science


In [111]:
# Load the skill tree and swap every skill id for its readable name, using
# the skill_id -> skill mapping taken from `df_skills`.
skill_tree = pd.read_csv('skills_db/skill_tree.csv')
df_st = (
    skill_tree
    .replace(df_skills.set_index('skill_id')['skill'].to_dict())
    .rename(columns={'skill_1_id': 'skill', 'skill_2_id': 'parent'})
)
df_st.head()

Unnamed: 0,skill,parent
0,Life Science,Natural Science
1,Physical Science,Natural Science
2,Theoretical Computer Science,Computer Science
3,Theoretical Computer Science,Mathematics
4,Agronomy,Applied Science


In [107]:
# Plain Python list of every skill name, in table order.
sk_list = list(df_skills['skill'])

def extract_skills(info, skills=None):
    """Return the skills whose name or parenthesised abbreviation occurs in ``info``.

    The text is lower-cased and split on whitespace after newlines, pipes,
    commas, full stops, slashes and parentheses have been replaced by spaces.
    Every skill is then fuzzy-matched with ``difflib.get_close_matches``:

    * a skill written as ``"Name (ABBR)"`` matches when ``ABBR`` is close to a
      single word of the text (cutoff 0.95); otherwise the parenthesised part
      is removed and the remaining name is matched as described below;
    * one-word names are compared with the words (cutoff 0.9);
    * two-word names with the word pairs (cutoff 0.9);
    * three-word names with the word triples (cutoff 0.85);
    * longer names are also compared with the word triples (cutoff 0.8).

    Parameters
    ----------
    info : str
        Free text to search.
    skills : iterable of str, optional
        Skill names to look for. Defaults to the module-level ``sk_list``.

    Returns
    -------
    list of str
        The matching skill names, spelled as given, in the order of ``skills``.
    """
    if skills is None:
        skills = sk_list

    # Inside a character class `|` is a literal, so pipes act as separators
    # too. The raw string avoids the invalid `\(` / `\)` escape sequences.
    words = re.sub(r'[\n|,./()]', ' ', info).lower().split()
    # Consecutive word pairs / triples, joined back into phrases.
    bigrams = [' '.join(words[i:i + 2]) for i in range(len(words) - 1)]
    trigrams = [' '.join(words[i:i + 3]) for i in range(len(words) - 2)]

    results = []
    for skill in skills:
        s = skill.lower()
        abbreviation = re.search(r'\((.*?)\)', s)
        if abbreviation:
            abb = abbreviation.group(1).strip()
            if abb and gcm(abb, words, cutoff=0.95):
                results.append(skill)
                continue
            s = re.sub(r'\(.*?\)', '', s)

        s2 = s.split()
        if not s2:
            # Nothing left to compare once the parenthesised part is removed.
            continue
        # Re-join the tokens: removing "(abbr)" leaves stray whitespace, and a
        # trailing space alone drags a short name such as "java " below the
        # 0.9 cutoff against the word "java".
        s = ' '.join(s2)

        if len(s2) == 1:
            candidates, cutoff = words, 0.9
        elif len(s2) == 2:
            candidates, cutoff = bigrams, 0.9
        elif len(s2) == 3:
            candidates, cutoff = trigrams, 0.85
        else:
            candidates, cutoff = trigrams, 0.8

        if gcm(s, candidates, cutoff=cutoff):
            results.append(skill)
    return results

def test_skill(skill, info, c1, c2, c3):
    """Print the close matches of one skill against the n-grams of ``info``.

    ``info`` is tokenised in the same way as in ``extract_skills`` (separators
    replaced by spaces, lower-cased, split on whitespace). The lower-cased
    skill name is then compared, unchanged, with the single words, the word
    pairs and the word triples, so the effect of each cutoff can be inspected.

    Parameters
    ----------
    skill : str
        Skill name to look for.
    info : str
        Free text to search.
    c1, c2, c3 : float
        ``get_close_matches`` cutoffs used for the word, word-pair and
        word-triple comparison respectively.

    Returns
    -------
    None
        The three match lists are printed, not returned.
    """
    # Raw string: `\(` and `\)` are invalid escape sequences in a normal
    # string literal. Inside the character class `|` is a literal as well.
    words = re.sub(r'[\n|,./()]', ' ', info).lower().split()
    bigrams = [' '.join(g) for g in ngrams(words, 2)]
    trigrams = [' '.join(g) for g in ngrams(words, 3)]
    s = skill.lower()
    print('Unigram: {}'.format(gcm(s, words, cutoff=c1)))
    print('Bigram: {}'.format(gcm(s, bigrams, cutoff=c2)))
    print('Trigram: {}'.format(gcm(s, trigrams, cutoff=c3)))

In [128]:
# Extract skills for the first N_TRAIN postings and collect one record per
# posting in `dict_list`.
N_TRAIN = 20
dict_list = []

# Stop at the end of the frame: with a plain range(N_TRAIN), df.iloc[i]
# raises IndexError as soon as fewer than N_TRAIN postings are loaded.
for i in range(min(N_TRAIN, len(df))):
    row = df.iloc[i]
    # Leave out a missing title/description instead of failing on NaN + str.
    all_info = ' '.join(
        str(part)
        for part in (row['job_title'], row['job_details'])
        if pd.notna(part)
    )
    skills = extract_skills(all_info)
    # A skill whose name is contained in the name of another extracted skill
    # (e.g. "Adobe" inside "Adobe Photoshop") is set aside as ignored.
    ignore_skills = []
    for j, skill in enumerate(skills):
        if any(skill in other for k, other in enumerate(skills) if k != j):
            ignore_skills.append(skill)
    return_skills = [s for s in skills if s not in ignore_skills]
    dict_list.append({
        'job_title': row['job_title'],
        'job_details': row['job_details'],
        'skills_required': return_skills,
        'no_skills_required': len(return_skills),
        'ignored_skills': ignore_skills
    })

In [129]:
# One row per processed posting, built from the collected records.
df2 = pd.DataFrame(dict_list)
df2

Unnamed: 0,job_title,job_details,skills_required,no_skills_required,ignored_skills
0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,"[Sales, Operations, Process, Service, Selectio...",6,[]
1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,"[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[Adobe]
2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...","[Operations, Distribution]",2,[]
3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",[],0,[]
4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...","[Operations, Distribution]",2,[]
5,Vacancy For Technician,Engineering assistants ensure the administrati...,"[Engineering, Administration]",2,[]
6,CADET PLANTER,• Assists the Estate Management team in managi...,"[Management, Development, Communication and Se...",16,[Communication]
7,Perunding Setia SAR,"-Drafting,\n-Do office work such as admin work...",[Documentation],1,[]
8,Factory and Warehouse Workers,Factory hands assist machine operators and pro...,"[Make, Product]",2,[]
9,Junior Designer,Kelebihan diberi kepada yang mempunyai kemahir...,"[Adobe Illustrator, Adobe Photoshop]",2,[Adobe]


In [131]:
# Show the skill-tree rows whose skill name is exactly 'D'.
df_st[df_st['skill'].eq('D')]

Unnamed: 0,skill,parent
331,D,Programming Language


In [144]:
# Skill-tree rows whose skill name contains 'Optimization'.
# na=False keeps the mask strictly boolean when a value is missing or is not
# a string (otherwise .loc raises); regex=False searches for the literal text.
df_st.loc[df_st['skill'].str.contains('Optimization', na=False, regex=False)]

Unnamed: 0,skill,parent
280,Compiler Optimization,Compiler
313,Conversion Rate Optimization,Marketing
686,Search Engine Optimization,Marketing
687,Search Enging and Keyword Optimization,Marketing
1086,Search Engine Optimization (SEO),Digital Marketing
1087,Search Engine Optimization (SEO),Social Media Management
1088,Search Engine Optimization (SEO),Web Development
1233,Search Engine Optimization,Technical Writing
1269,Bayesian Optimization,Bayesian
2740,Computational Optimization,Computer Science
