In [1]:
import json
import string
import pandas as pd
import re
import difflib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from nltk import ngrams
from langdetect import detect, DetectorFactory
from difflib import get_close_matches as gcm
from skill_api import extract_skills, extract_ignore

In [2]:
with open('resource/job_infos_scrape.json') as f:
    jobs = json.load(f)

len(jobs)

19680

In [3]:
jobs['0']

{'title': 'SALES ASSOCIATE',
 'job_details': '* Under supervision and perform duties to provide technical product assistance/knowledge to customers in order to generate new and repeat sales.\n* Assist customers in the selection of the best products that suited to their needs and desires and explain use, operation, care of the merchandise products and services to customers.\n* Explain the terms of sales, availability of the product and delivery dates and related information and process orders to customers.\n* Handle customers inquiries and complaint.\n* Keep display items clean and displayable and overall cleanliness of the branch.\n* To  handle  any  other  job  as  may be  assigned  by  the  superior.\n\nLocation : Tropicana Aman, Denai Alam, Wisma WCC, Puchong, Bandar Botanik, Sri Petaling, Pandan Indah, Rawang, Sungai Buloh, Kota Damansara, Kajang, Bangi , Balakong, USJ Taipan, Kota Kemuning, Shah Alam.',
 'requirement': 'Permanent| 3 Shift Time |STPM / A Level or Equivalent|Today',

In [4]:
rs = []
i = 0

for j in jobs:
    rs.append(jobs[j]['requirement'].split('|')[2])

set(rs)

{"Bachelor's or Equivalent",
 'Diploma / Advanced Diploma / Higher Graduate Diploma / DVM / DKM Level 4 / DLKM Level 5',
 'Diploma / Advanced Diploma / Higher Graduate Diploma / DVM /…',
 'Diploma / Advanced Diploma / Higher Graduate…',
 'Doctoral (PhD) or Equivalent',
 "Master's or Equivalent",
 'PMR / PT3 or Equivalent',
 'Primary Education or Below',
 'SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 or Equivalent',
 'SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 or…',
 'STPM / A Level or Equivalent'}

In [5]:
df_jobs = pd.DataFrame.from_dict(jobs).transpose().reset_index()
df_jobs.head()

Unnamed: 0,index,title,job_details,requirement,company,company_details
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent| 3 Shift Time |STPM / A Level or Equ...,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot..."
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent| Normal Hour |Bachelor's or Equivale...,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ..."
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| Flexible Hours |Diploma / Advanced ...,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot..."
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent| Normal Hour |Bachelor's or Equivale...,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo..."
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| 3 Shift Time |SPM / O Level / SKM L...,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot..."


In [6]:
df_unique = df_jobs.drop_duplicates(subset=['title', 'job_details'])
df_unique.head()

Unnamed: 0,index,title,job_details,requirement,company,company_details
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent| 3 Shift Time |STPM / A Level or Equ...,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot..."
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent| Normal Hour |Bachelor's or Equivale...,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ..."
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| Flexible Hours |Diploma / Advanced ...,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot..."
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent| Normal Hour |Bachelor's or Equivale...,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo..."
5,5,Vacancy For Technician,Engineering assistants ensure the administrati...,Permanent| 3 Shift Time |SPM / O Level / SKM L...,HLK (Chain-Store) Sdn. Bhd\n|\nShah Alam,"Civil engineering, Construction of buildings\n..."


In [7]:
len(df_unique)

8184

In [8]:
i = 0
job_info_list = []
DetectorFactory.seed = 42
initial = dt.now()
interval = dt.now()
print_every = 200

for _, job in df_unique.iterrows():
    i += 1
    if i % print_every == 0:
        print("{} jobs processed. Time taken: {}".format(i, dt.now() - interval))
        interval = dt.now()
    all_info = job['title'] + ' ' + job['job_details']
    """
    if len(job['job_details']) < 3:
        language = None
    else:
        language = detect(job['job_details'])
    """
    all_skills = extract_skills(all_info)
    job_skills, ignore_skills = extract_ignore(all_skills)
    job_skills.sort()
    ignore_skills.sort()
    job_info_list.append({
        'title': job['title'],
        'job_details': job['job_details'],
        # 'language': language,
        'job_skills': job_skills,
        'no_skills': len(job_skills),
        'ignore_skills': ignore_skills
    })

print("Total time taken: {}".format(dt.now() - initial))

200 jobs processed. Time taken: 0:01:52.761418
400 jobs processed. Time taken: 0:01:54.207851
600 jobs processed. Time taken: 0:01:45.019723
800 jobs processed. Time taken: 0:01:52.950845
1000 jobs processed. Time taken: 0:02:01.904921
1200 jobs processed. Time taken: 0:01:41.871773
1400 jobs processed. Time taken: 0:01:50.273024
1600 jobs processed. Time taken: 0:01:28.543371
1800 jobs processed. Time taken: 0:02:16.767328
2000 jobs processed. Time taken: 0:02:35.030536
2200 jobs processed. Time taken: 0:01:46.494139
2400 jobs processed. Time taken: 0:01:08.413005
2600 jobs processed. Time taken: 0:00:59.382147
2800 jobs processed. Time taken: 0:01:43.637495
3000 jobs processed. Time taken: 0:01:26.976165
3200 jobs processed. Time taken: 0:01:52.641836
3400 jobs processed. Time taken: 0:02:14.345837
3600 jobs processed. Time taken: 0:02:04.326366
3800 jobs processed. Time taken: 0:01:44.683463
4000 jobs processed. Time taken: 0:01:48.394955
4200 jobs processed. Time taken: 0:01:37.383

In [9]:
df = pd.DataFrame.from_dict(job_info_list)
df.head()

Unnamed: 0,title,job_details,job_skills,no_skills,ignore_skills
0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,"[Sales, Selection]",2,"[Operations, Product, Service]"
1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,"[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[]
2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",[],0,"[Distribution, Operations]"
3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",[],0,[]
4,Vacancy For Technician,Engineering assistants ensure the administrati...,"[Administration, Engineering]",2,[]


In [10]:
df2 = df_jobs.merge(df, on=['title', 'job_details'], how='left')
df2.head()

Unnamed: 0,index,title,job_details,requirement,company,company_details,job_skills,no_skills,ignore_skills
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent| 3 Shift Time |STPM / A Level or Equ...,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot...","[Sales, Selection]",2,"[Operations, Product, Service]"
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent| Normal Hour |Bachelor's or Equivale...,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ...","[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[]
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| Flexible Hours |Diploma / Advanced ...,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]"
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent| Normal Hour |Bachelor's or Equivale...,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo...",[],0,[]
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| 3 Shift Time |SPM / O Level / SKM L...,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]"


In [11]:
df2.isnull().sum()

index              0
title              0
job_details        0
requirement        0
company            0
company_details    0
job_skills         0
no_skills          0
ignore_skills      0
dtype: int64

In [12]:
df_lan = pd.read_csv('myfuturejobs-insights/v0/myfuturejobs_skills2.csv')
df2['language'] = df_lan['language']
df2.head()

Unnamed: 0,index,title,job_details,requirement,company,company_details,job_skills,no_skills,ignore_skills,language
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent| 3 Shift Time |STPM / A Level or Equ...,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot...","[Sales, Selection]",2,"[Operations, Product, Service]",en
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent| Normal Hour |Bachelor's or Equivale...,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ...","[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[],id
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| Flexible Hours |Diploma / Advanced ...,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]",en
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent| Normal Hour |Bachelor's or Equivale...,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo...",[],0,[],id
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent| 3 Shift Time |SPM / O Level / SKM L...,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]",en


## Data Cleaning

In [13]:
df3 = df2.copy()
df3[['job_type', 'education_requirement']] = df2['requirement'].str.split('|', expand=True)[[0, 2]]
cols = df3.columns.tolist()
cols = cols[:3] + cols[-2:] + [cols[-3]] + cols[4:-3]
df3 = df3[cols]
df3.head()

Unnamed: 0,index,title,job_details,job_type,education_requirement,language,company,company_details,job_skills,no_skills,ignore_skills
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent,STPM / A Level or Equivalent,en,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot...","[Sales, Selection]",2,"[Operations, Product, Service]"
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent,Bachelor's or Equivalent,id,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ...","[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[]
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,Diploma / Advanced Diploma / Higher Graduate…,en,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]"
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent,Bachelor's or Equivalent,id,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo...",[],0,[]
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,SPM / O Level / SKM Level 1 / SKM Level 2 / SK...,en,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]"


In [14]:
df3['education_requirement'].unique().tolist()

['STPM / A Level or Equivalent',
 "Bachelor's or Equivalent",
 'Diploma / Advanced Diploma / Higher Graduate…',
 'SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 or…',
 'SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 or Equivalent',
 'Diploma / Advanced Diploma / Higher Graduate Diploma / DVM /…',
 'Primary Education or Below',
 'PMR / PT3 or Equivalent',
 'Diploma / Advanced Diploma / Higher Graduate Diploma / DVM / DKM Level 4 / DLKM Level 5',
 "Master's or Equivalent",
 'Doctoral (PhD) or Equivalent']

In [15]:
df3['education'] = df3['education_requirement'].replace({
    'Primary Education or Below': '1 - Primary Education',
    'PMR / PT3 or Equivalent': '2 - PMR / PT3',
    'SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 or…': '3 - SPM / O Level / SKM',
    'SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 or Equivalent': '3 - SPM / O Level / SKM',
    'STPM / A Level or Equivalent': '4 - STPM / A Level',
    'Diploma / Advanced Diploma / Higher Graduate…': '5 - Diploma / DVM',
    'Diploma / Advanced Diploma / Higher Graduate Diploma / DVM /…': '5 - Diploma / DVM',
    'Diploma / Advanced Diploma / Higher Graduate Diploma / DVM / DKM Level 4 / DLKM Level 5': '5 - Diploma / DVM',
    "Bachelor's or Equivalent": "6 - Bachelor's",
    "Master's or Equivalent": "7 - Master's",
    'Doctoral (PhD) or Equivalent': '8 - Doctoral (PhD)'
})
df3.head()

Unnamed: 0,index,title,job_details,job_type,education_requirement,language,company,company_details,job_skills,no_skills,ignore_skills,education
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent,STPM / A Level or Equivalent,en,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot...","[Sales, Selection]",2,"[Operations, Product, Service]",4 - STPM / A Level
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent,Bachelor's or Equivalent,id,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ...","[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[],6 - Bachelor's
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,Diploma / Advanced Diploma / Higher Graduate…,en,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]",5 - Diploma / DVM
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent,Bachelor's or Equivalent,id,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo...",[],0,[],6 - Bachelor's
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,SPM / O Level / SKM Level 1 / SKM Level 2 / SK...,en,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]",3 - SPM / O Level / SKM


In [16]:
df3['language'].unique()

array(['en', 'id', 'fr', nan, 'de', 'so', 'sv', 'sw', 'nl', 'tl', 'ca',
       'ro', 'it', 'cy', 'af', 'da', 'et', 'es', 'vi', 'fi', 'no', 'tr',
       'ko', 'zh-cn', 'sl', 'pt', 'hr'], dtype=object)

In [17]:
languages = ['English', 'Others']
df3.loc[df3['language'] == 'en', 'job_language'] = 'English'
df3.loc[df3['language'] != 'en', 'job_language'] = 'Others'
df3.head()

Unnamed: 0,index,title,job_details,job_type,education_requirement,language,company,company_details,job_skills,no_skills,ignore_skills,education,job_language
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent,STPM / A Level or Equivalent,en,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot...","[Sales, Selection]",2,"[Operations, Product, Service]",4 - STPM / A Level,English
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent,Bachelor's or Equivalent,id,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ...","[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[],6 - Bachelor's,Others
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,Diploma / Advanced Diploma / Higher Graduate…,en,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]",5 - Diploma / DVM,English
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent,Bachelor's or Equivalent,id,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo...",[],0,[],6 - Bachelor's,Others
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,SPM / O Level / SKM Level 1 / SKM Level 2 / SK...,en,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]",3 - SPM / O Level / SKM,English


In [18]:
df4 = df3.copy()
cols = df4.columns.tolist()
cols = cols[:4] + cols[-2:] + cols[6:]
df4 = df4[cols]
df4.head()

Unnamed: 0,index,title,job_details,job_type,education,job_language,company,company_details,job_skills,no_skills,ignore_skills,education.1,job_language.1
0,0,SALES ASSOCIATE,* Under supervision and perform duties to prov...,Permanent,4 - STPM / A Level,English,GLOBAL PSYTECH SDN BHD\n|\nCyberjaya,"Retail trade, except of motor vehicles and mot...","[Sales, Selection]",2,"[Operations, Product, Service]",4 - STPM / A Level,English
1,1,SIGN HOUSE ADVERTISING,Kelebihan diberi kepada yang mempunyai kemahir...,Permanent,6 - Bachelor's,Others,MR DIY (M) SDN BHD\n|\nSeremban,"Advertising and market research, Printing and ...","[Adobe Illustrator, Adobe Photoshop, Advertising]",3,[],6 - Bachelor's,Others
2,2,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,5 - Diploma / DVM,English,INTER-EXCEL ADVISORY SDN BHD\n|\nKuala Lumpur,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]",5 - Diploma / DVM,English
3,3,Petugas PDK,"1. Bertanggungjawab kepada penyelia PDK, jawat...",Permanent,6 - Bachelor's,Others,IOI PLANTATION SERVICES SDN BHD\n|\nPutrajaya,"Education, Office administrative, office suppo...",[],0,[],6 - Bachelor's,Others
4,4,MR DIY (M) SDN BHD,"Receive stock, checking & distribution item re...",Permanent,3 - SPM / O Level / SKM,English,QL Ansan Poultry Farm Sdn Bhd\n|\nPadang Serai,"Retail trade, except of motor vehicles and mot...",[],0,"[Distribution, Operations]",3 - SPM / O Level / SKM,English


In [19]:
df4.to_csv('myfuturejobs-insights/myfuturejobs_skills2.csv', index=False)

## For Acceltic

In [22]:
skill_dict = {}

for _, row in df4.iterrows():
    skills = row['job_skills']
    if len(skills) > 0:
        for skill in skills:
            if skill in skill_dict.keys():
                skill_dict[skill] += 1
            else:
                skill_dict[skill] = 1

df_sf = pd.DataFrame.from_dict(skill_dict, orient='index', columns=['count']).reset_index()
df_sf.head()

Unnamed: 0,index,count
0,Sales,2580
1,Selection,190
2,Adobe Illustrator,63
3,Adobe Photoshop,94
4,Advertising,286


In [29]:
df_final = df_sf.rename(columns={'index': 'Skill', 'count': 'Count'})
now = dt.now()
# df_final['JobPostedMonth'] = dt(now.year, now.month, 1)
df_final['JobPostedMonth'] = dt(now.year, 1, 1)
df_final['Source'] = 'MYFutureJobs'
df_final.head()

Unnamed: 0,Skill,Count,JobPostedMonth,Source
0,Sales,2580,2021-01-01,MYFutureJobs
1,Selection,190,2021-01-01,MYFutureJobs
2,Adobe Illustrator,63,2021-01-01,MYFutureJobs
3,Adobe Photoshop,94,2021-01-01,MYFutureJobs
4,Advertising,286,2021-01-01,MYFutureJobs


In [30]:
df_final.to_csv('skills/ForAcceltic.csv', index=False)