In [2]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt 
import seaborn as sns
from sqlalchemy import create_engine
from datetime import datetime as dt
from nltk import ngrams
from difflib import get_close_matches as gcm
from secrets import secrets

In [9]:
skills = pd.read_csv('skills_db2/skill.csv')
skills.head()

Unnamed: 0,skill_id,skill
0,1,Applied Science
1,2,Arts and Humanities
2,3,Business
3,4,Computer Science
4,5,Data Science


In [10]:
sk_list = skills['skill'].tolist()
len(sk_list)

3031

In [29]:
query = """
select Job_ID, Job_Title, Company, Country, Date_Posted, Description, Keywords_Present, Title_Keywords 
from {}
where Date_Posted > '2020-06-01'
and Description != 'No Description'
"""

def extract_df(job):
    df = pd.read_sql(query.format(job), engine)
    df.drop_duplicates(inplace=True)
    print('Number of rows', len(df))
    return df

def extract_skills_all(df):
    i = 0
    job_info_list = []
    initial = dt.now()
    interval = dt.now()
    print_every = 200
    for _, job in df.iterrows():
        i += 1
        if i % print_every == 0:
            print("{} jobs processed. Time taken: {}".format(i, dt.now() - interval))
            interval = dt.now()
        all_info = job['Job_Title'] + ' ' + job['Description']
        skills = extract_skills(all_info)
        job_info_list.append({
            'title': job['Job_Title'],
            'company': job['Company'],
            'country': job['Country'],
            'date_posted': job['Date_Posted'],
            'description': job['Description'],
            'title_keywords': job['Title_Keywords'],
            'indeed_skills': job['Keywords_Present'],
            'skills': skills,
            'no_skills': len(skills)
        })
    print("Total time taken: {}".format(dt.now() - initial))
    return pd.DataFrame.from_dict(job_info_list)

def extract_skills(info):
    # Remove ordered list with alphabets: a), b), c),...
    words = re.sub(r'[\s\t\n]+[a-zA-Z\s*]\)+', ' ', info)
    words = re.sub('[\n|,|.|/|\(|\)]', ' ', words).lower().split()
    bigrams = [' '.join(g) for g in ngrams(words, 2)]
    trigrams = [' '.join(g) for g in ngrams(words, 3)]
    results = []
    for skill in sk_list:
        s = skill.lower()
        if '(' in s:
            abb = s[s.find("(")+1:s.find(")")]
            if abb in info:
                continue
            s = re.sub(r"[\(].*?[\)]", "", s)
        s2 = s.split()
        if len(s2) == 1:
            if len(gcm(s, words, cutoff=0.9)) > 0:
                results.append(skill)
        elif len(s2) == 2:
            if len(gcm(s, bigrams, cutoff=0.9)) > 0:
                results.append(skill)
        elif len(s2) == 3:
            if len(gcm(s, trigrams, cutoff=0.85)) > 0:
                results.append(skill)
        else:
            if len(gcm(s, trigrams, cutoff=0.8)) > 0:
                results.append(skill)
    return results

In [8]:
engine = create_engine(secrets['indeed_db'])
df_ds = extract_df('Data_Scientist')
df_ds.head()

Number of rows 2284


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords
0,dce076954b6def5d,Data Scientist,Mission Consultancy Services Malaysia Sdn Bhd,Malaysia,2020-07-26,Identify valuable data sources and automate co...,"'C', 'C++', 'Excel', 'Hadoop', 'Java', 'R', 'S...",
1,0af114638b83542d,Data Scientist,AirAsia,Malaysia,2020-08-15,Job DescriptionOverviewThis role will responsi...,"'C', 'R'",
2,482f223f74269b38,Data Scientist,CareerMaster Sdn Bhd,Malaysia,2020-07-26,Position to be based in Kuala Lumpur.We are re...,"'C', 'GIS', 'Go', 'Python', 'R', 'SQL', 'Stati...",
3,23570b7ea0a5ecbf,Data Scientist Executive,GENO Management,Malaysia,2020-08-18,Position : Data Scientists ExecutiveLocation :...,"'AWS', 'C', 'Go', 'Python', 'R', 'Regression',...",
4,caacc69df1a0c0e0,Data Scientist,PLUS SOLAR SYSTEMS SDN BHD,Malaysia,2020-07-26,Work With Stakeholders Throughout The Organiza...,'C',


In [17]:
df_dsj = extract_skills_all(df_ds)
df_dsj.head()

200 jobs processed. Time taken: 0:08:31.487232
400 jobs processed. Time taken: 0:07:25.305195
600 jobs processed. Time taken: 0:06:36.810810
800 jobs processed. Time taken: 0:07:56.447118
1000 jobs processed. Time taken: 0:07:52.935527
1200 jobs processed. Time taken: 0:07:40.275196
1400 jobs processed. Time taken: 0:07:16.074752
1600 jobs processed. Time taken: 0:07:18.611516
1800 jobs processed. Time taken: 0:08:08.216351
2000 jobs processed. Time taken: 0:08:19.897735
2200 jobs processed. Time taken: 0:08:00.242933
Total time taken: 1:28:24.364609


Unnamed: 0,title,company,country,date_posted,description,title_keywords,indeed_skills,skills,no_skills
0,Data Scientist,Mission Consultancy Services Malaysia Sdn Bhd,Malaysia,2020-07-26,Identify valuable data sources and automate co...,,"'C', 'C++', 'Excel', 'Hadoop', 'Java', 'R', 'S...","[Business, Analytical, Communication, Developm...",27
1,Data Scientist,AirAsia,Malaysia,2020-08-15,Job DescriptionOverviewThis role will responsi...,,"'C', 'R'","[Business, Accuracy, Processing, Product, A/B ...",21
2,Data Scientist,CareerMaster Sdn Bhd,Malaysia,2020-07-26,Position to be based in Kuala Lumpur.We are re...,,"'C', 'GIS', 'Go', 'Python', 'R', 'SQL', 'Stati...","[Business, Computer Science, Microsoft, Proces...",30
3,Data Scientist Executive,GENO Management,Malaysia,2020-08-18,Position : Data Scientists ExecutiveLocation :...,,"'AWS', 'C', 'Go', 'Python', 'R', 'Regression',...","[Business, Data Science, Analytical, Business ...",39
4,Data Scientist,PLUS SOLAR SYSTEMS SDN BHD,Malaysia,2020-07-26,Work With Stakeholders Throughout The Organiza...,,'C',"[Business, Accuracy, Development, Marketing, P...",9


In [18]:
df_dsj.to_csv('indeed-insights.csv', index=False)

In [19]:
df_da = extract_df('Data_Analyst')
df_da.head()

Number of rows 8069


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords
0,dce076954b6def5d,Data Scientist,Mission Consultancy Services Malaysia Sdn Bhd,Malaysia,2020-07-26,Identify valuable data sources and automate co...,"'C', 'C++', 'Excel', 'Hadoop', 'Java', 'R', 'S...",
1,66201e0720bcd228,Customer Data Analyst,Senheng Electric (KL) Sdn Bhd,Malaysia,2020-07-26,Manages the company’s customer loyalty program...,"'C', 'Excel', 'Go', 'R', 'Statistics'",
2,edad2206e0edd9c4,Data Management Analyst,Samsung SDS Asia Pacific Pte Ltd,Malaysia,2020-08-17,Responsibility:Handling whole process related ...,"'C', 'R'",
3,2ac6b0ad8a608de0,Data Analyst,Asia Online Publishing Group,Malaysia,2020-08-22,Job Description: Ability to analyze and accura...,"'C', 'Excel', 'Go', 'R'",
4,8a57fc4f1e969cf3,"Data Analyst, SQL",Grab Taxi,Malaysia,2020-08-26,Get to know the Role :Collaborate with product...,"'C', 'CG', 'Excel', 'R', 'SQL', 'Statistics', ...",


In [20]:
df_dsa = extract_skills_all(df_da)
df_dsa.head()

200 jobs processed. Time taken: 0:07:38.574058
400 jobs processed. Time taken: 0:08:03.659577
600 jobs processed. Time taken: 0:07:57.090188
800 jobs processed. Time taken: 0:07:50.273669
1000 jobs processed. Time taken: 0:06:23.421924
1200 jobs processed. Time taken: 0:07:18.990292
1400 jobs processed. Time taken: 0:07:23.920834
1600 jobs processed. Time taken: 0:08:01.485652
1800 jobs processed. Time taken: 0:06:33.304374
2000 jobs processed. Time taken: 0:07:21.665184
2200 jobs processed. Time taken: 0:07:42.693433
2400 jobs processed. Time taken: 0:05:43.516511
2600 jobs processed. Time taken: 0:06:39.831240
2800 jobs processed. Time taken: 0:07:19.932316
3000 jobs processed. Time taken: 0:07:31.625077
3200 jobs processed. Time taken: 0:08:33.047903
3400 jobs processed. Time taken: 0:06:17.887602
3600 jobs processed. Time taken: 0:07:17.592962
3800 jobs processed. Time taken: 0:09:14.081823
4000 jobs processed. Time taken: 0:06:09.387244
4200 jobs processed. Time taken: 0:06:42.515

Unnamed: 0,title,company,country,date_posted,description,title_keywords,indeed_skills,skills,no_skills
0,Data Scientist,Mission Consultancy Services Malaysia Sdn Bhd,Malaysia,2020-07-26,Identify valuable data sources and automate co...,,"'C', 'C++', 'Excel', 'Hadoop', 'Java', 'R', 'S...","[Business, Analytical, Communication, Developm...",27
1,Customer Data Analyst,Senheng Electric (KL) Sdn Bhd,Malaysia,2020-07-26,Manages the company’s customer loyalty program...,,"'C', 'Excel', 'Go', 'R', 'Statistics'","[Business, Administration, Business Strategy, ...",15
2,Data Management Analyst,Samsung SDS Asia Pacific Pte Ltd,Malaysia,2020-08-17,Responsibility:Handling whole process related ...,,"'C', 'R'","[Business, Computer Science, Accounting, Admin...",17
3,Data Analyst,Asia Online Publishing Group,Malaysia,2020-08-22,Job Description: Ability to analyze and accura...,,"'C', 'Excel', 'Go', 'R'","[Excel, Processing, Database, Database System,...",10
4,"Data Analyst, SQL",Grab Taxi,Malaysia,2020-08-26,Get to know the Role :Collaborate with product...,,"'C', 'CG', 'Excel', 'R', 'SQL', 'Statistics', ...","[Business, Computer Science, Analytical, Colla...",29


In [21]:
df_dsa.to_csv('indeed-insights/data_analyst.csv', index=False)

In [22]:
df_de = extract_df('Data_Engineer')
df_de.head()

Number of rows 9322


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords
0,4a0d4e1f4736c11b,Software Engineer,Enovision Sdn Bbd,Malaysia,2020-07-26,Are you getting boring to work in office every...,"'Artificial Intelligence', 'C', 'C++', 'Go', 'R'",
1,d2771a7dd040fe7e,Planning Engineer,Grand Dynamic Builders Sdn Bhd,Malaysia,2020-08-26,Responsibilities: -o To assist Planning Manage...,"'C', 'R', 'SAS'",
2,08c07ebc566bde97,Electrical Engineer,Easun Engineering Sdn Bhd,Malaysia,2020-08-17,Diploma or Degree in Electrical Engineering· 2...,"'C', 'R'",
3,50adbda0e1c6ec38,Maintenance Master Data Engineer,Air Products,Malaysia,2020-08-19,PurposeAs a member of the Global Master Data T...,"'C', 'Excel', 'R'",
4,925ba321ef02d64b,Industrial Engineer,KJCF Group,Malaysia,2020-08-19,Packaging is part and parcel of marketing a pr...,"'C', 'Excel', 'Factor', 'Go', 'R'",


In [24]:
df_dej = extract_skills_all(df_de)
df_dej.head()

200 jobs processed. Time taken: 0:07:02.167641
400 jobs processed. Time taken: 0:07:12.173663
600 jobs processed. Time taken: 0:07:36.249357
800 jobs processed. Time taken: 0:07:44.981286
1000 jobs processed. Time taken: 0:07:23.198946
1200 jobs processed. Time taken: 0:07:53.841603
1400 jobs processed. Time taken: 0:06:56.944614
1600 jobs processed. Time taken: 0:06:58.121925
1800 jobs processed. Time taken: 0:06:36.549107
2000 jobs processed. Time taken: 0:06:36.890634
2200 jobs processed. Time taken: 0:07:07.737757
2400 jobs processed. Time taken: 0:06:03.714149
2600 jobs processed. Time taken: 0:07:04.841540
2800 jobs processed. Time taken: 0:06:49.909986
3000 jobs processed. Time taken: 0:41:08.402092
3200 jobs processed. Time taken: 1:16:49.509436
3400 jobs processed. Time taken: 0:06:28.527476
3600 jobs processed. Time taken: 0:06:24.852576
3800 jobs processed. Time taken: 0:06:36.242494
4000 jobs processed. Time taken: 0:07:39.315882
4200 jobs processed. Time taken: 0:06:58.162

Unnamed: 0,title,company,country,date_posted,description,title_keywords,indeed_skills,skills,no_skills
0,Software Engineer,Enovision Sdn Bbd,Malaysia,2020-07-26,Are you getting boring to work in office every...,,"'Artificial Intelligence', 'C', 'C++', 'Go', 'R'","[Computer Science, Analytical, Development, Pr...",30
1,Planning Engineer,Grand Dynamic Builders Sdn Bhd,Malaysia,2020-08-26,Responsibilities: -o To assist Planning Manage...,,"'C', 'R', 'SAS'","[Analytical, Communication, Construction, Indu...",27
2,Electrical Engineer,Easun Engineering Sdn Bhd,Malaysia,2020-08-17,Diploma or Degree in Electrical Engineering· 2...,,"'C', 'R'","[Communication, Nonverbal Communication, Plann...",10
3,Maintenance Master Data Engineer,Air Products,Malaysia,2020-08-19,PurposeAs a member of the Global Master Data T...,,"'C', 'Excel', 'R'","[Business, Accuracy, Communication, Documentat...",34
4,Industrial Engineer,KJCF Group,Malaysia,2020-08-19,Packaging is part and parcel of marketing a pr...,,"'C', 'Excel', 'Factor', 'Go', 'R'","[Analytical, Communication, Decision Making, D...",30


In [25]:
df_dej.to_csv('indeed-insights/data_engineer.csv', index=False)

In [31]:
engine = create_engine(secrets['indeed_db'])
df_mle = extract_df('Machine_Learning_Engineer')
df_mle.head()

Number of rows 2372


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords
0,4becba66641bee03,Machine Learning Engineer,3E Accounting Malaysia,Malaysia,2020-07-27,Position TitleMachine Learning Engineer – Acco...,"'C', 'Machine Learning', 'R'",
1,3b94feea299270df,Machine Learning Engineer / Data Scientist,Neural Technologies,Malaysia,2020-07-27,Main Accountabilities & Responsibilities:Apply...,"'C', 'Go', 'Hive', 'Python', 'R', 'SQL'",
2,5cb1009fe6976657,Artificial Intelligence AI Software Engineer,3E Accounting Malaysia,Malaysia,2020-07-27,Position TitleArtificial Intelligence AI Softw...,"'Artificial Intelligence', 'C', 'Go', 'Machine...",
3,f98889dc5f47b972,"Software Engineer, Backend",Grab Taxi,Malaysia,2020-08-26,"Get to know the Role:As such, we are seeking t...","'AWS', 'C', 'C++', 'Elasticsearch', 'Erlang', ...",
4,66b3d489fb6431d7,"Executive, Data Engineer",AirAsia,Malaysia,2020-08-04,We are all different - one talent to another -...,"'BigQuery', 'C', 'Excel', 'Python', 'R', 'SQL'",


In [32]:
df_mlej = extract_skills_all(df_mle)
df_mlej.head()

200 jobs processed. Time taken: 0:07:09.458571
400 jobs processed. Time taken: 0:06:32.570051
600 jobs processed. Time taken: 0:07:07.645440
800 jobs processed. Time taken: 0:07:33.678055
1000 jobs processed. Time taken: 0:07:22.380791
1200 jobs processed. Time taken: 0:08:13.954454
1400 jobs processed. Time taken: 0:07:58.255262
1600 jobs processed. Time taken: 0:08:27.147819
1800 jobs processed. Time taken: 0:08:20.285030
2000 jobs processed. Time taken: 0:07:56.394660
2200 jobs processed. Time taken: 0:08:58.032652
Total time taken: 1:32:38.731267


Unnamed: 0,title,company,country,date_posted,description,title_keywords,indeed_skills,skills,no_skills
0,Machine Learning Engineer,3E Accounting Malaysia,Malaysia,2020-07-27,Position TitleMachine Learning Engineer – Acco...,,"'C', 'Machine Learning', 'R'","[Computer Science, Accounting, Email, Service,...",19
1,Machine Learning Engineer / Data Scientist,Neural Technologies,Malaysia,2020-07-27,Main Accountabilities & Responsibilities:Apply...,,"'C', 'Go', 'Hive', 'Python', 'R', 'SQL'","[Computer Science, Commercial, Communication, ...",14
2,Artificial Intelligence AI Software Engineer,3E Accounting Malaysia,Malaysia,2020-07-27,Position TitleArtificial Intelligence AI Softw...,,"'Artificial Intelligence', 'C', 'Go', 'Machine...","[Computer Science, Accounting, Development, Do...",19
3,"Software Engineer, Backend",Grab Taxi,Malaysia,2020-08-26,"Get to know the Role:As such, we are seeking t...",,"'AWS', 'C', 'C++', 'Elasticsearch', 'Erlang', ...","[Computer Science, Analytical, Communication, ...",42
4,"Executive, Data Engineer",AirAsia,Malaysia,2020-08-04,We are all different - one talent to another -...,,"'BigQuery', 'C', 'Excel', 'Python', 'R', 'SQL'","[Business, Data Science, Airlines, Analytical,...",51


In [33]:
df_mlej.to_csv('indeed-insights/machine_learning_engineer.csv', index=False)