In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt 
import seaborn as sns
from sqlalchemy import create_engine
from datetime import datetime as dt
from secrets import secrets
from skill_api import extract_skills, extract_ignore

In [2]:
# --- Skill vocabularies used by the extraction functions ---

# Master list: every distinct value in the 'Skill' column of the skills CSV.
df_skills = pd.read_csv('skills_db2/skill.csv')
SKILLS = df_skills['Skill'].unique().tolist()

# Redundant skills: distinct 'Skill' values from the workbook's default sheet.
df_redskills = pd.read_excel('skills/Other Skills.xlsx')
RED_SKILLS = df_redskills['Skill'].unique().tolist()

# Duplicate skills: the 'Duplicates' sheet turned into a {Skill: Parent} lookup.
df_dupskills = pd.read_excel('skills/Other Skills.xlsx', sheet_name='Duplicates')
DUP_SKILLS = df_dupskills.set_index('Skill')['Parent'].to_dict()

# The duplicate spellings are appended to the master list as well.
SKILLS += list(DUP_SKILLS)

In [6]:
# SQL template for one job table: `{}` is filled with the table name via
# str.format (extract_df does this). It selects the posting columns for rows
# dated after 2020-06-01 whose Description is not the literal 'No Description'.
# NOTE(review): the table name is spliced into the SQL text rather than bound
# as a parameter, so only trusted, hard-coded table names should be used.
query = """
select Job_ID, Job_Title, Company, Country, Date_Posted, Description, Keywords_Present, Title_Keywords 
from {}
where Date_Posted > '2020-06-01'
and Description != 'No Description'
"""

def extract_df(job, max_skills=3):
    """Load one job table and keep postings with at most `max_skills` keywords.

    The module-level ``query`` template is formatted with ``job`` and executed
    through the module-level ``engine``; both must be defined before calling.

    Parameters
    ----------
    job : str
        Table name substituted into ``query``. It is interpolated into the SQL
        text (not bound as a parameter), so pass only trusted names.
    max_skills : int, default 3
        Largest ``Num_Skill`` a row may have to be kept. The default matches
        the threshold that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated query result plus an integer ``Num_Skill`` column:
        the number of comma-separated entries in ``Keywords_Present``, or 0
        when that value is null. Only rows with ``Num_Skill <= max_skills``
        are returned; the index of the query result is preserved.
    """
    # drop_duplicates() returns a new frame, so nothing is mutated in place.
    df = pd.read_sql(query.format(job), engine).drop_duplicates()

    # Null keywords count as zero skills; otherwise count the comma-separated entries.
    df['Num_Skill'] = (
        df['Keywords_Present']
        .map(lambda keywords: 0 if pd.isna(keywords) else len(keywords.split(',')))
        .astype(int)
    )

    df = df.loc[df['Num_Skill'] <= max_skills]
    print('Number of rows', len(df))
    return df

def extract_skills_all(df):
    """Extract the kept skills for every posting in `df`.

    For each row the title and description are joined with a space and passed
    to ``extract_skills`` together with the module-level ``SKILLS`` list. The
    result is passed to ``extract_ignore`` with ``RED_SKILLS`` and
    ``DUP_SKILLS``; the first element of what it returns is sorted in place
    and stored. Progress and elapsed time are printed about every 5% of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Job_Title``, ``Company``, ``Country``,
        ``Date_Posted``, ``Description`` and ``Keywords_Present``.
        ``Job_Title`` and ``Description`` are concatenated with ``+``, so they
        need to be strings.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``title``, ``company``,
        ``country``, ``date_posted``, ``description``, ``indeed_skills`` and
        ``skills``. An empty input yields an empty frame with these columns.
    """
    columns = ['title', 'company', 'country', 'date_posted',
               'description', 'indeed_skills', 'skills']
    job_info_list = []
    initial = dt.now()
    interval = initial

    # len(df) // 20 is 0 for frames with fewer than 20 rows, which used to make
    # `i % print_every` raise ZeroDivisionError; clamp to 1 so small frames
    # simply report after every row.
    print_every = max(1, len(df) // 20)

    for i, (_, job) in enumerate(df.iterrows(), start=1):
        all_info = job['Job_Title'] + ' ' + job['Description']
        all_skills = extract_skills(all_info, SKILLS)
        keep_skills, _ignored = extract_ignore(all_skills, RED_SKILLS, DUP_SKILLS)
        keep_skills.sort()
        job_info_list.append({
            'title': job['Job_Title'],
            'company': job['Company'],
            'country': job['Country'],
            'date_posted': job['Date_Posted'],
            'description': job['Description'],
            'indeed_skills': job['Keywords_Present'],
            'skills': keep_skills,
        })
        # Report once row i is actually finished, so the printed count is accurate.
        if i % print_every == 0:
            print("{} jobs processed. Time taken: {}".format(i, dt.now() - interval))
            interval = dt.now()

    print("Total time taken: {}".format(dt.now() - initial))
    # Passing `columns` keeps the schema stable even when no rows were processed.
    return pd.DataFrame(job_info_list, columns=columns)

In [4]:
# Create the SQLAlchemy engine from the value stored under secrets['indeed_db']
# (presumably the database URL), then load the filtered postings from the
# Data_Scientist table and preview the first rows.
# extract_df reads the module-level `engine`, so this assignment must run first.
engine = create_engine(secrets['indeed_db'])
df_ds = extract_df('Data_Scientist')
df_ds.head()

Number of rows 673


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
1,0af114638b83542d,Data Scientist,AirAsia,Malaysia,2020-08-15,Job DescriptionOverviewThis role will responsi...,"'C', 'R'",,2
4,caacc69df1a0c0e0,Data Scientist,PLUS SOLAR SYSTEMS SDN BHD,Malaysia,2020-07-26,Work With Stakeholders Throughout The Organiza...,'C',,1
41,aa123e23bb46c569,Scientist,Mediven,Malaysia,2020-07-26,MedivenTM aspires to be a provider of innovati...,"'C', 'R'",,2
46,5bd740a8c7cdc204,Data & Advanced Analytics Architect,Orsted,Malaysia,2020-08-14,Imagine a future where you architect revolutio...,"'C', 'Python', 'R'",,3
49,05d59d7d9c321ca9,AI Analyst,"ACE Digital, Ace Resource Advisory Services Sd...",Malaysia,2020-08-19,Key Responsibilities:Assist Data Scientists in...,"'C', 'R', 'Statistics'",,3


In [7]:
# Run skill extraction over every Data Scientist posting and preview the result.
# Slow cell: the recorded run below took about 27 minutes for 673 rows.
df_dsj = extract_skills_all(df_ds)
df_dsj.head()

33 jobs processed. Time taken: 0:01:47.458453
66 jobs processed. Time taken: 0:01:17.021420
99 jobs processed. Time taken: 0:01:10.464390
132 jobs processed. Time taken: 0:01:07.321380
165 jobs processed. Time taken: 0:01:16.280228
198 jobs processed. Time taken: 0:01:58.571726
231 jobs processed. Time taken: 0:01:16.134256
264 jobs processed. Time taken: 0:01:12.380399
297 jobs processed. Time taken: 0:01:31.025138
330 jobs processed. Time taken: 0:01:27.038425
363 jobs processed. Time taken: 0:01:04.263817
396 jobs processed. Time taken: 0:01:13.438820
429 jobs processed. Time taken: 0:01:29.788202
462 jobs processed. Time taken: 0:01:27.361734
495 jobs processed. Time taken: 0:01:40.655782
528 jobs processed. Time taken: 0:01:19.395035
561 jobs processed. Time taken: 0:01:34.437820
594 jobs processed. Time taken: 0:01:20.666006
627 jobs processed. Time taken: 0:00:33.139818
660 jobs processed. Time taken: 0:01:13.757290
Total time taken: 0:27:28.133012


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Data Scientist,AirAsia,Malaysia,2020-08-15,Job DescriptionOverviewThis role will responsi...,"'C', 'R'","[A/B Testing, Accuracy, Analytical, Anomaly De..."
1,Data Scientist,PLUS SOLAR SYSTEMS SDN BHD,Malaysia,2020-07-26,Work With Stakeholders Throughout The Organiza...,'C',"[Accuracy, Algorithm, Data Gathering, Database..."
2,Scientist,Mediven,Malaysia,2020-07-26,MedivenTM aspires to be a provider of innovati...,"'C', 'R'","[Accuracy, Analysis, Analytical, Biotechnology..."
3,Data & Advanced Analytics Architect,Orsted,Malaysia,2020-08-14,Imagine a future where you architect revolutio...,"'C', 'Python', 'R'","[Analytical, Architecture, Arts, Coaching, Com..."
4,AI Analyst,"ACE Digital, Ace Resource Advisory Services Sd...",Malaysia,2020-08-19,Key Responsibilities:Assist Data Scientists in...,"'C', 'R', 'Statistics'","[Analytical, Artificial Intelligence (AI), Clo..."


In [8]:
# Save the Data Scientist results; index=False leaves the DataFrame index out of the file.
# NOTE(review): the `skills` column holds Python lists, which end up in the CSV
# as their string form — anything reading this file has to parse them back.
df_dsj.to_csv('indeed-insights/data_scientist2.csv', index=False)

In [9]:
# Load the filtered postings from the Data_Analyst table and preview the first rows.
df_da = extract_df('Data_Analyst')
df_da.head()

Number of rows 4206


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
2,edad2206e0edd9c4,Data Management Analyst,Samsung SDS Asia Pacific Pte Ltd,Malaysia,2020-08-17,Responsibility:Handling whole process related ...,"'C', 'R'",,2
8,7aec3cff1d32c3e0,Data Analyst,Canvass Luuminate,Malaysia,2020-08-12,- Data administration- Assist founders on dail...,"'C', 'R'",,2
9,1ff2afdb40430056,Data Management Analyst,AstraZeneca,Malaysia,2020-08-21,"AstraZeneca is a global, innovation-driven bio...","'C', 'Go', 'R'",,3
10,ed48c162d7db36c5,Data Analyst,SWIFT,Malaysia,2020-08-13,About the RoleIn line with the company and div...,"'C', 'Excel', 'R'",,3
11,b3e757a2c8e7ecf8,Know Your Client (KYC) Analyst,Deutsche Bank,Malaysia,2020-08-19,The Know Your Client (KYC) Operator is respons...,"'C', 'R'",,2


In [10]:
# Run skill extraction over every Data Analyst posting and preview the result.
# Slow cell: the recorded run below took about 2 h 36 min for 4206 rows.
df_dsa = extract_skills_all(df_da)
df_dsa.head()

210 jobs processed. Time taken: 0:08:09.004624
420 jobs processed. Time taken: 0:08:53.844750
630 jobs processed. Time taken: 0:07:15.065782
840 jobs processed. Time taken: 0:08:41.575859
1050 jobs processed. Time taken: 0:08:04.682529
1260 jobs processed. Time taken: 0:07:19.264402
1470 jobs processed. Time taken: 0:07:32.063397
1680 jobs processed. Time taken: 0:08:47.507725
1890 jobs processed. Time taken: 0:07:24.935709
2100 jobs processed. Time taken: 0:07:53.305211
2310 jobs processed. Time taken: 0:07:25.557392
2520 jobs processed. Time taken: 0:08:07.604161
2730 jobs processed. Time taken: 0:08:40.922121
2940 jobs processed. Time taken: 0:08:10.687705
3150 jobs processed. Time taken: 0:08:11.434869
3360 jobs processed. Time taken: 0:07:32.758726
3570 jobs processed. Time taken: 0:06:12.425841
3780 jobs processed. Time taken: 0:07:28.938498
3990 jobs processed. Time taken: 0:06:49.200784
4200 jobs processed. Time taken: 0:07:05.088214
Total time taken: 2:35:58.531517


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Data Management Analyst,Samsung SDS Asia Pacific Pte Ltd,Malaysia,2020-08-17,Responsibility:Handling whole process related ...,"'C', 'R'","[Accounting, Administration, Analysis, Compute..."
1,Data Analyst,Canvass Luuminate,Malaysia,2020-08-12,- Data administration- Assist founders on dail...,"'C', 'R'",[Database Administration]
2,Data Management Analyst,AstraZeneca,Malaysia,2020-08-21,"AstraZeneca is a global, innovation-driven bio...","'C', 'Go', 'R'","[Brand Management, Business Administration, Bu..."
3,Data Analyst,SWIFT,Malaysia,2020-08-13,About the RoleIn line with the company and div...,"'C', 'Excel', 'R'","[Accuracy, Analysis, Analytical, Budget, Commu..."
4,Know Your Client (KYC) Analyst,Deutsche Bank,Malaysia,2020-08-19,The Know Your Client (KYC) Operator is respons...,"'C', 'R'","[Accuracy, Banking, Brand Management, Effectiv..."


In [11]:
# Save the Data Analyst results; index=False leaves the DataFrame index out of the file.
df_dsa.to_csv('indeed-insights/data_analyst2.csv', index=False)

In [12]:
# Load the filtered postings from the Data_Engineer table and preview the first rows.
df_de = extract_df('Data_Engineer')
df_de.head()

Number of rows 4971


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
1,d2771a7dd040fe7e,Planning Engineer,Grand Dynamic Builders Sdn Bhd,Malaysia,2020-08-26,Responsibilities: -o To assist Planning Manage...,"'C', 'R', 'SAS'",,3
2,08c07ebc566bde97,Electrical Engineer,Easun Engineering Sdn Bhd,Malaysia,2020-08-17,Diploma or Degree in Electrical Engineering· 2...,"'C', 'R'",,2
3,50adbda0e1c6ec38,Maintenance Master Data Engineer,Air Products,Malaysia,2020-08-19,PurposeAs a member of the Global Master Data T...,"'C', 'Excel', 'R'",,3
5,f529772747f229c4,Data Engineer,Axrail Sdn Bhd,Malaysia,2020-07-26,Data engineers are an essential part of our an...,"'AWS', 'R', 'Tableau'",,3
7,2e84c8b017751381,Proposal Engineer,Precision Control Sdn Bhd,Malaysia,2020-08-04,· To provide pre-sales to customers.· Collabor...,"'C', 'Go', 'R'",,3


In [13]:
# Run skill extraction over every Data Engineer posting and preview the result.
# Slow cell: the recorded run below took about 2 h 43 min for 4971 rows.
df_dej = extract_skills_all(df_de)
df_dej.head()

248 jobs processed. Time taken: 0:08:24.520942
496 jobs processed. Time taken: 0:09:04.752090
744 jobs processed. Time taken: 0:08:38.314209
992 jobs processed. Time taken: 0:07:33.047894
1240 jobs processed. Time taken: 0:07:42.373214
1488 jobs processed. Time taken: 0:08:16.943567
1736 jobs processed. Time taken: 0:07:17.542231
1984 jobs processed. Time taken: 0:07:22.964189
2232 jobs processed. Time taken: 0:08:23.632669
2480 jobs processed. Time taken: 0:07:59.648578
2728 jobs processed. Time taken: 0:09:03.807301
2976 jobs processed. Time taken: 0:07:41.267650
3224 jobs processed. Time taken: 0:07:52.400941
3472 jobs processed. Time taken: 0:08:29.487179
3720 jobs processed. Time taken: 0:09:01.837624
3968 jobs processed. Time taken: 0:07:51.955621
4216 jobs processed. Time taken: 0:07:47.963288
4464 jobs processed. Time taken: 0:08:26.175348
4712 jobs processed. Time taken: 0:07:36.152229
4960 jobs processed. Time taken: 0:08:20.051774
Total time taken: 2:43:18.690907


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Planning Engineer,Grand Dynamic Builders Sdn Bhd,Malaysia,2020-08-26,Responsibilities: -o To assist Planning Manage...,"'C', 'R', 'SAS'","[Analytical, Civil Engineering, Communication,..."
1,Electrical Engineer,Easun Engineering Sdn Bhd,Malaysia,2020-08-17,Diploma or Degree in Electrical Engineering· 2...,"'C', 'R'","[Assembly, Electrical Engineering, Electronic ..."
2,Maintenance Master Data Engineer,Air Products,Malaysia,2020-08-19,PurposeAs a member of the Global Master Data T...,"'C', 'Excel', 'R'","[Accuracy, Automation, Chemical Engineering, C..."
3,Data Engineer,Axrail Sdn Bhd,Malaysia,2020-07-26,Data engineers are an essential part of our an...,"'AWS', 'R', 'Tableau'","[Amazon EMR, Amazon S3, Amazon Web Service (AW..."
4,Proposal Engineer,Precision Control Sdn Bhd,Malaysia,2020-08-04,· To provide pre-sales to customers.· Collabor...,"'C', 'Go', 'R'","[Collaboration, Communication, Design, Electro..."


In [14]:
# Save the Data Engineer results; index=False leaves the DataFrame index out of the file.
df_dej.to_csv('indeed-insights/data_engineer2.csv', index=False)

In [15]:
# The extraction cells above ran for hours, so a fresh engine is created here
# (presumably to avoid reusing stale pooled connections — confirm).
# Dispose the previous engine before rebinding the name; otherwise its pooled
# connections are orphaned and a later engine.dispose() only closes the new pool.
engine.dispose()
engine = create_engine(secrets['indeed_db'])

# Load the filtered postings from the Machine_Learning_Engineer table and preview them.
df_mle = extract_df('Machine_Learning_Engineer')
df_mle.head()

Number of rows 458


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
0,4becba66641bee03,Machine Learning Engineer,3E Accounting Malaysia,Malaysia,2020-07-27,Position TitleMachine Learning Engineer – Acco...,"'C', 'Machine Learning', 'R'",,3
6,28aefe300cf14774,Robotics Engineer,NDR Medical Technology Sdn. Bhd.,Malaysia,2020-08-25,Job Responsibilities:Conduct research to deter...,"'C', 'Excel', 'R'",,3
27,aa8f2e607f63e88b,Maintenance Technician,JACOBS DOUWE EGBERTS,Malaysia,2020-08-22,Company DescriptionIt’s amazing what can happe...,"'C', 'R'",,2
31,f7837ed0331dac73,Special Training Programme for Project Enginee...,RWNA Engineering,Malaysia,2020-07-27,TRAINING PROGRAMME TITLE:Preparation for Proje...,"'C', 'R'",,2
32,386bd8e583629cab,Programming & SMT Engineer,NALA EMPLOYMENT SDN BHD,Malaysia,2020-07-29,"ResponsibilityPROGRAMMING-- to prepare, to con...","'C', 'R'",,2


In [17]:
# Run skill extraction over every Machine Learning Engineer posting and preview the result.
# Slow cell: the recorded run below took about 18 minutes for 458 rows.
df_mlej = extract_skills_all(df_mle)
df_mlej.head()

22 jobs processed. Time taken: 0:00:49.093413
44 jobs processed. Time taken: 0:00:43.951808
66 jobs processed. Time taken: 0:00:43.885003
88 jobs processed. Time taken: 0:00:40.798607
110 jobs processed. Time taken: 0:00:51.184228
132 jobs processed. Time taken: 0:00:44.716359
154 jobs processed. Time taken: 0:00:45.444496
176 jobs processed. Time taken: 0:00:50.546184
198 jobs processed. Time taken: 0:01:04.047386
220 jobs processed. Time taken: 0:00:57.697364
242 jobs processed. Time taken: 0:01:12.792243
264 jobs processed. Time taken: 0:00:56.434899
286 jobs processed. Time taken: 0:00:47.415359
308 jobs processed. Time taken: 0:00:54.252302
330 jobs processed. Time taken: 0:00:52.611237
352 jobs processed. Time taken: 0:01:09.746094
374 jobs processed. Time taken: 0:00:50.470090
396 jobs processed. Time taken: 0:00:56.928567
418 jobs processed. Time taken: 0:00:42.239570
440 jobs processed. Time taken: 0:00:38.992700
Total time taken: 0:18:16.471403


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Machine Learning Engineer,3E Accounting Malaysia,Malaysia,2020-07-27,Position TitleMachine Learning Engineer – Acco...,"'C', 'Machine Learning', 'R'","[Accounting, Azure Machine Learning, Computer ..."
1,Robotics Engineer,NDR Medical Technology Sdn. Bhd.,Malaysia,2020-08-25,Job Responsibilities:Conduct research to deter...,"'C', 'Excel', 'R'","[Analytical, Applied Physics, Communication, C..."
2,Maintenance Technician,JACOBS DOUWE EGBERTS,Malaysia,2020-08-22,Company DescriptionIt’s amazing what can happe...,"'C', 'R'","[Coaching, Imagine, Machinery, Mechanical Engi..."
3,Special Training Programme for Project Enginee...,RWNA Engineering,Malaysia,2020-07-27,TRAINING PROGRAMME TITLE:Preparation for Proje...,"'C', 'R'","[Assembly, Communication, Control System, ISO ..."
4,Programming & SMT Engineer,NALA EMPLOYMENT SDN BHD,Malaysia,2020-07-29,"ResponsibilityPROGRAMMING-- to prepare, to con...","'C', 'R'","[Analysis, Process Engineering, Profiling, Pro..."


In [18]:
# Save the Machine Learning Engineer results; index=False leaves the DataFrame index out of the file.
df_mlej.to_csv('indeed-insights/machine_learning_engineer2.csv', index=False)

In [19]:
# All tables are processed: release the connections held by the engine's pool.
engine.dispose()