In [2]:
import json
import string
import pandas as pd
import re
import difflib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from nltk import ngrams
from langdetect import detect, DetectorFactory
from difflib import get_close_matches as gcm

In [3]:
# Reference table of known skills; the preview below shows its `skill_id` and `skill` columns.
skills_csv_path = 'skills_db/skill.csv'
skills = pd.read_csv(skills_csv_path)
skills.head()

Unnamed: 0,skill_id,skill
0,152,Applied Science
1,165,Art
2,324,Business
3,550,Computer Science
4,717,Data Science


In [4]:
# Plain Python list of skill names; this is what extract_skills() iterates over.
sk_list = skills.loc[:, 'skill'].tolist()
len(sk_list)

3100

In [5]:
# Load the scraped job postings (a dict keyed by string ids such as '0').
# The encoding is given explicitly: the scraped text contains non-ASCII
# characters (e.g. "…"), and open() otherwise falls back to the platform
# default encoding, which is not UTF-8 on every system.
with open('resource/job_infos_scrape.json', encoding='utf-8') as f:
    jobs = json.load(f)

len(jobs)

19680

In [6]:
# Inspect one raw record (key '0') to see which fields a job posting carries.
jobs['0']

{'title': 'SALES ASSOCIATE',
 'job_details': '* Under supervision and perform duties to provide technical product assistance/knowledge to customers in order to generate new and repeat sales.\n* Assist customers in the selection of the best products that suited to their needs and desires and explain use, operation, care of the merchandise products and services to customers.\n* Explain the terms of sales, availability of the product and delivery dates and related information and process orders to customers.\n* Handle customers inquiries and complaint.\n* Keep display items clean and displayable and overall cleanliness of the branch.\n* To  handle  any  other  job  as  may be  assigned  by  the  superior.\n\nLocation : Tropicana Aman, Denai Alam, Wisma WCC, Puchong, Bandar Botanik, Sri Petaling, Pandan Indah, Rawang, Sungai Buloh, Kota Damansara, Kajang, Bangi , Balakong, USJ Taipan, Kota Kemuning, Shah Alam.',
 'requirement': 'Permanent| 3 Shift Time |STPM / A Level or Equivalent|Today',

In [7]:
# `requirement` is a '|'-separated string, e.g.
# 'Permanent| 3 Shift Time |STPM / A Level or Equivalent|Today';
# the third field (position 2) is the education level.
# Collect it for every posting and list the distinct values.
rs = [job['requirement'].split('|')[2] for job in jobs.values()]

set(rs)

{"Bachelor's or Equivalent",
 'Diploma / Advanced Diploma / Higher Graduate Diploma / DVM / DKM Level 4 / DLKM Level 5',
 'Diploma / Advanced Diploma / Higher Graduate Diploma / DVM /…',
 'Diploma / Advanced Diploma / Higher Graduate…',
 'Doctoral (PhD) or Equivalent',
 "Master's or Equivalent",
 'PMR / PT3 or Equivalent',
 'Primary Education or Below',
 'SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 or Equivalent',
 'SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 or…',
 'STPM / A Level or Equivalent'}

In [15]:
# `jobs` maps a record key (e.g. '0') to that posting's fields, so the frame
# built straight from the dict has one column per posting. Flip it so each
# posting is a row, then expose the record key as an `index` column.
jobs_by_column = pd.DataFrame.from_dict(jobs)
df_jobs = jobs_by_column.T.reset_index()
df_jobs.head()

Unnamed: 0,index,title,job_details,requirement,company,company_details
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent| 3 Shift Time |STPM / A Level or Equ...,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot..."
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent| Normal Hour |Bachelor's or Equivale...,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ..."
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| Flexible Hours |Diploma / Advanced ...,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot..."
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent| Normal Hour |Bachelor's or Equivale...,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo..."
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| 3 Shift Time |SPM / O Level / SKM L...,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot..."


In [16]:
# Keep only the first posting for each (title, job_details) pair so the
# skill extraction below runs once per distinct text.
dedup_keys = ['title', 'job_details']
is_repeat = df_jobs.duplicated(subset=dedup_keys)
df_unique = df_jobs[~is_repeat]
df_unique.head()

Unnamed: 0,index,title,job_details,requirement,company,company_details
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent| 3 Shift Time |STPM / A Level or Equ...,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot..."
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent| Normal Hour |Bachelor's or Equivale...,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ..."
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| Flexible Hours |Diploma / Advanced ...,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot..."
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent| Normal Hour |Bachelor's or Equivale...,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo..."
5,5,Vacancy For Technician,Engineering assistants ensure the administrati...,Permanent| 3 Shift Time |SPM / O Level / SKM L...,HLK (Chain-Store) Sdn. Bhd\n|\nShah Alam,"Civil engineering, Construction of buildings\n..."


In [17]:
# Number of distinct postings left after de-duplication.
df_unique.shape[0]

8184

In [31]:
def extract_skills(info, skill_list=None):
    """Return the skills from a reference list that are mentioned in a text.

    The text is lower-cased and split into words after replacing newlines and
    the characters ``| , . / ( )`` with spaces. Each skill is then fuzzy-matched
    (``difflib.get_close_matches``) against the text's words, bigrams or
    trigrams, depending on how many words the skill name has:

    * 1 word   -> words,    cutoff 0.9
    * 2 words  -> bigrams,  cutoff 0.9
    * 3 words  -> trigrams, cutoff 0.85
    * 4+ words -> trigrams, cutoff 0.8

    A skill written as ``"Name (ABBR)"`` is also accepted when ``ABBR``
    appears in the text as a whole word (case-insensitive); otherwise the
    parenthesised part is dropped and the remaining name is fuzzy-matched.

    Parameters
    ----------
    info : str
        Free text to scan (e.g. job title plus job details).
    skill_list : iterable of str, optional
        Skill names to look for. Defaults to the notebook-level ``sk_list``.

    Returns
    -------
    list of str
        Matching skills, spelled as in ``skill_list`` and in its order.
        Empty when ``info`` is not a non-blank string.
    """
    if skill_list is None:
        # Backward-compatible default: the global list built earlier in the notebook.
        skill_list = sk_list
    if not isinstance(info, str) or not info.strip():
        return []

    # Raw string avoids invalid-escape warnings; inside a character class the
    # punctuation is literal, so this is the same set of separators as before.
    words = re.sub(r'[\n|,./()]', ' ', info).lower().split()
    word_set = set(words)

    # Only the existence of a close match matters, so repeated candidates are
    # pure overhead for get_close_matches; drop them (order kept for determinism).
    unigrams = list(dict.fromkeys(words))
    bigrams = list(dict.fromkeys(' '.join(g) for g in zip(words, words[1:])))
    trigrams = list(dict.fromkeys(' '.join(g) for g in zip(words, words[1:], words[2:])))

    results = []
    for skill in skill_list:
        s = skill.lower()
        parenthesised = re.search(r'\(([^)]*)\)', s)
        if parenthesised:
            abb = parenthesised.group(1).strip()
            # Whole-word match so that e.g. "aws" does not fire on "laws".
            # A hit means the skill IS present, so record it.
            if abb and abb in word_set:
                results.append(skill)
                continue
            s = re.sub(r'\(.*?\)', '', s)

        tokens = s.split()
        if not tokens:
            # Nothing left to match once the parenthesised part is removed.
            continue
        # Re-join so leftover double/trailing spaces do not lower the ratio.
        phrase = ' '.join(tokens)

        if len(tokens) == 1:
            candidates, cutoff = unigrams, 0.9
        elif len(tokens) == 2:
            candidates, cutoff = bigrams, 0.9
        elif len(tokens) == 3:
            candidates, cutoff = trigrams, 0.85
        else:
            # Longer names are still compared against trigrams, with a looser cutoff.
            candidates, cutoff = trigrams, 0.8

        if gcm(phrase, candidates, n=1, cutoff=cutoff):
            results.append(skill)
    return results

In [37]:
# Run skill extraction once per distinct posting and collect one record per job.
job_info_list = []
DetectorFactory.seed = 42  # fixed seed for langdetect; detection itself is disabled below
initial = dt.now()
interval = dt.now()
print_every = 200

# Language detection (langdetect.detect on `job_details`, skipped for texts
# shorter than 3 characters) used to run inside this loop and is currently
# disabled; a later cell loads the `language` column from a CSV instead.

# df_unique keeps the original, non-contiguous index labels after
# drop_duplicates, so progress is counted with enumerate rather than
# derived from the index label.
for n_done, (_, job) in enumerate(df_unique.iterrows(), start=1):
    all_info = job['title'] + ' ' + job['job_details']

    # Named `found_skills` (not `skills`) so the skills DataFrame loaded at
    # the top of the notebook is not overwritten by a list.
    found_skills = extract_skills(all_info)

    # A skill is ignored when it appears as a whole word inside another
    # matched skill (e.g. "Adobe" alongside "Adobe Photoshop").
    ignore_skills = []
    for j, skill in enumerate(found_skills):
        others = found_skills[:j] + found_skills[j+1:]
        if any(skill in re.sub(r"[\(\)]", "", s).split() for s in others):
            ignore_skills.append(skill)
    return_skills = [s for s in found_skills if s not in ignore_skills]

    job_info_list.append({
        'title': job['title'],
        'job_details': job['job_details'],
        'skills': return_skills,
        'no_skills': len(return_skills),
        'ignored_skills': ignore_skills
    })

    if n_done % print_every == 0:
        print("{} jobs processed. Time taken: {}".format(n_done, dt.now() - interval))
        interval = dt.now()

print("Total time taken: {}".format(dt.now() - initial))

200 jobs processed. Time taken: 0:01:45.513545
400 jobs processed. Time taken: 0:01:51.341809
600 jobs processed. Time taken: 0:01:49.256458
800 jobs processed. Time taken: 0:01:37.822817
1000 jobs processed. Time taken: 0:01:57.524150
1200 jobs processed. Time taken: 0:02:03.085132
1400 jobs processed. Time taken: 0:01:46.487197
1600 jobs processed. Time taken: 0:01:38.410018
1800 jobs processed. Time taken: 0:01:28.536579
2000 jobs processed. Time taken: 0:02:09.218185
2200 jobs processed. Time taken: 0:02:40.923683
2400 jobs processed. Time taken: 0:01:32.806100
2600 jobs processed. Time taken: 0:01:12.865237
2800 jobs processed. Time taken: 0:00:55.921004
3000 jobs processed. Time taken: 0:00:59.751837
3200 jobs processed. Time taken: 0:01:52.606759
3400 jobs processed. Time taken: 0:01:30.299799
3600 jobs processed. Time taken: 0:01:41.519325
3800 jobs processed. Time taken: 0:02:16.528157
4000 jobs processed. Time taken: 0:01:57.149467
4200 jobs processed. Time taken: 0:01:13.939

In [38]:
# One row per distinct posting: title, job_details and the extracted skill columns.
df = pd.DataFrame(job_info_list)
df.head()

Unnamed: 0,title,job_details,skills,no_skills,ignored_skills
0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,"[Operations, Sales, Service, Process, Selectio...",6,[]
1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,"[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[Adobe]
2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...","[Operations, Distribution]",2,[]
3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",[],0,[]
4,Vacancy For Technician,Engineering assistants ensure the administrati...,"[Engineering, Administration]",2,[]


In [40]:
# Propagate the extracted skills back to every posting, duplicates included.
# `df` was built from postings de-duplicated on exactly these keys, so each
# key pair should occur once on the right-hand side; `validate` makes pandas
# raise instead of silently multiplying rows if that ever stops being true.
df2 = df_jobs.merge(
    df,
    on=['title', 'job_details'],
    how='left',
    validate='many_to_one',
)
df2.head()

Unnamed: 0,index,title,job_details,requirement,company,company_details,skills,no_skills,ignored_skills
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent| 3 Shift Time |STPM / A Level or Equ...,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot...","[Operations, Sales, Service, Process, Selectio...",6,[]
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent| Normal Hour |Bachelor's or Equivale...,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ...","[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[Adobe]
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| Flexible Hours |Diploma / Advanced ...,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot...","[Operations, Distribution]",2,[]
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent| Normal Hour |Bachelor's or Equivale...,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo...",[],0,[]
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| 3 Shift Time |SPM / O Level / SKM L...,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot...","[Operations, Distribution]",2,[]


In [45]:
# Missing values per column; all zeros means every posting found its skills row.
df2.isna().sum()

index              0
title              0
job_details        0
requirement        0
company            0
company_details    0
skills             0
no_skills          0
ignored_skills     0
dtype: int64

In [68]:
# Attach a `language` column read from a previously exported CSV.
# NOTE(review): the assignment aligns on the row index, so it presumably relies
# on that file having the same row order and length as df2 — confirm. The file
# is not written by any cell shown in this notebook.
df4 = pd.read_csv('myfuturejobs-insights/myfuturejobs_skills2.csv')
df2['language'] = df4['language']
df2.head()

Unnamed: 0,index,title,job_details,requirement,company,company_details,skills,no_skills,ignored_skills,language
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent| 3 Shift Time |STPM / A Level or Equ...,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot...","[Operations, Sales, Service, Process, Selectio...",6,[],en
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent| Normal Hour |Bachelor's or Equivale...,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ...","[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[Adobe],id
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| Flexible Hours |Diploma / Advanced ...,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot...","[Operations, Distribution]",2,[],en
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent| Normal Hour |Bachelor's or Equivale...,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo...",[],0,[],id
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| 3 Shift Time |SPM / O Level / SKM L...,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot...","[Operations, Distribution]",2,[],en


In [81]:
# Show the last two entries of `cols`.
# NOTE(review): `cols` is only assigned in the following cell (execution count
# 85 vs. 81 here), so this cell appears to depend on kernel state from an
# earlier run — confirm it survives Restart & Run All.
cols[-2:]

['job_type', 'education_requirement']

In [85]:
# Split `requirement` (e.g. 'Permanent| 3 Shift Time |STPM / A Level or
# Equivalent|Today') on '|' and keep field 0 as the job type and field 2 as
# the education requirement.
requirement_parts = df2['requirement'].str.split('|', expand=True)
df3 = df2.assign(
    job_type=requirement_parts[0],
    education_requirement=requirement_parts[2],
)

# Order columns by name rather than by position, so the layout does not
# silently change if df2 gains, loses or reorders a column. The raw
# `requirement` column is dropped now that its parts are separate columns.
leading_cols = ['index', 'title', 'job_details', 'job_type', 'education_requirement', 'language']
cols = leading_cols + [
    c for c in df3.columns
    if c not in leading_cols and c != 'requirement'
]
df3 = df3[cols]
df3.head()

Unnamed: 0,index,title,job_details,job_type,education_requirement,language,company,company_details,skills,no_skills,ignored_skills
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent,STPM / A Level or Equivalent,en,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot...","[Operations, Sales, Service, Process, Selectio...",6,[]
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent,Bachelor's or Equivalent,id,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ...","[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[Adobe]
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,Diploma / Advanced Diploma / Higher Graduate…,en,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot...","[Operations, Distribution]",2,[]
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent,Bachelor's or Equivalent,id,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo...",[],0,[]
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,SPM / O Level / SKM Level 1 / SKM Level 2 / SK...,en,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot...","[Operations, Distribution]",2,[]


In [86]:
# Persist the final table without the row index.
output_csv = 'myfuturejobs-insights/myfuturejobs_skills3.csv'
df3.to_csv(output_csv, index=False)