In [11]:
import pandas as pd
import re
from datetime import datetime as dt
from sqlalchemy import create_engine
from secrets import secrets
from skill_api import extract_skills, extract_ignore

In [2]:
# Master skill vocabulary: distinct values of the 'Skill' column.
df_skills = pd.read_csv('skills_db2/skill.csv')
SKILLS = pd.unique(df_skills['Skill']).tolist()

# Both auxiliary lists live in the same workbook (first sheet + 'Duplicates').
other_skills_path = 'skills/Other Skills.xlsx'

# Redundant skills: distinct 'Skill' values from the workbook's default sheet.
df_redskills = pd.read_excel(other_skills_path)
RED_SKILLS = pd.unique(df_redskills['Skill']).tolist()

# Duplicate skills: mapping of each 'Skill' to its 'Parent'.
df_dupskills = pd.read_excel(other_skills_path, sheet_name='Duplicates')
DUP_SKILLS = df_dupskills.set_index('Skill')['Parent'].to_dict()

# The duplicate spellings are added to the vocabulary as well.
SKILLS += list(DUP_SKILLS)

In [5]:
# SQL template used by extract_df: the `{}` placeholder is filled with a table
# name via str.format. It selects postings dated after 2020-04-01 whose
# description is not the 'No Description' placeholder and whose lower-cased
# title contains "manager".
# NOTE(review): the doubled `%%` is presumably an escape so that the DB driver's
# %-style parameter handling sees a literal `%` -- confirm for the driver behind
# `engine`.
query = """
select Job_ID, Job_Title, Company, Country, Date_Posted, Description, Keywords_Present, Title_Keywords 
from {}
where Date_Posted > '2020-04-01'
and Description != 'No Description'
and lower(Job_Title) like '%%manager%%'
"""

def extract_df(job):
    """Load the filtered postings of one job table and keep sparsely tagged rows.

    Parameters
    ----------
    job : str
        Table name substituted into the module-level ``query`` template.
        It is formatted straight into the SQL text, so only pass trusted names.

    Returns
    -------
    pandas.DataFrame
        De-duplicated query result with an extra ``Num_Skill`` column: the
        number of comma-separated entries in ``Keywords_Present`` (0 when that
        value is missing). Only rows with ``Num_Skill`` of at most 3 are kept.

    Notes
    -----
    Reads through the module-level ``engine`` and prints the number of rows kept.
    """
    postings = pd.read_sql(query.format(job), engine).drop_duplicates()

    # Count keywords only where the column is populated; everything else stays 0.
    has_keywords = postings['Keywords_Present'].notna()
    postings['Num_Skill'] = 0
    postings.loc[has_keywords, 'Num_Skill'] = (
        postings.loc[has_keywords, 'Keywords_Present']
        .apply(lambda keywords: len(keywords.split(',')))
    )

    postings = postings.loc[postings['Num_Skill'] <= 3]
    print('Number of rows', len(postings))
    return postings

def extract_skills_all(df):
    """Extract the retained skills for every job posting in ``df``.

    For each row the job title and description are joined with a space and
    passed to ``extract_skills`` together with the ``SKILLS`` vocabulary. The
    matches are then passed to ``extract_ignore`` with ``RED_SKILLS`` and
    ``DUP_SKILLS``; only the first element of its result is kept, sorted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Job_Title``, ``Company``, ``Country``,
        ``Date_Posted``, ``Description`` and ``Keywords_Present``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with the columns ``title``,
        ``company``, ``country``, ``date_posted``, ``description``,
        ``indeed_skills`` and ``skills`` (a sorted list). The columns are
        present even when ``df`` has no rows.

    Notes
    -----
    Prints a progress line roughly every 5% of the rows (every row when there
    are fewer than 20) and the total elapsed time at the end.
    """
    output_columns = [
        'title', 'company', 'country', 'date_posted',
        'description', 'indeed_skills', 'skills',
    ]
    records = []
    started = dt.now()
    lap_started = started
    # With fewer than 20 rows the integer step would be 0 and the modulo below
    # would raise ZeroDivisionError, so never let the step drop below 1.
    print_every = max(1, len(df) // 20)

    for i, (_index, job) in enumerate(df.iterrows(), start=1):
        if i % print_every == 0:
            print("{} jobs processed. Time taken: {}".format(i, dt.now() - lap_started))
            lap_started = dt.now()

        all_info = job['Job_Title'] + ' ' + job['Description']
        all_skills = extract_skills(all_info, SKILLS)
        # The second value returned by extract_ignore is not used here.
        keep_skills, _ignored = extract_ignore(all_skills, RED_SKILLS, DUP_SKILLS)

        records.append({
            'title': job['Job_Title'],
            'company': job['Company'],
            'country': job['Country'],
            'date_posted': job['Date_Posted'],
            'description': job['Description'],
            'indeed_skills': job['Keywords_Present'],
            'skills': sorted(keep_skills),
        })

    print("Total time taken: {}".format(dt.now() - started))
    # Explicit columns keep the schema stable for an empty input frame.
    return pd.DataFrame(records, columns=output_columns)

In [6]:
# Create the SQLAlchemy engine from the URL stored under 'indeed_db' in the
# local `secrets` mapping, then load the filtered postings from the
# Data_Scientist table and preview the first rows.
engine = create_engine(secrets['indeed_db'])
df_ds = extract_df('Data_Scientist')
df_ds.head()

Number of rows 260


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
16,b8259e44f8dc12ad,Principal Data Scientist Manager - Cloud Hardw...,Microsoft,USA,2020-05-02,To delight our customers in a Cloud First worl...,"'C', 'R'",,2
21,e8565fa183b873d5,Arity-Data Scientist-Sr Manager,Allstate,USA,2020-04-10,"Founded by The Allstate Corporation in 2016, A...","'C', 'Excel', 'R'",'Sr',3
25,21df6fd71777d30d,Project Manager - Data Science | Chicago IL,Photon,USA,2020-04-28,Project Manager - Data Science | Chicago IL - ...,"'C', 'R', 'SQL'",,3
31,6560a73b8f1fee47,Corporate Communication Manager - Artificial I...,TuSimple,USA,2020-05-11,"This role is located in San Diego, CA. Relocat...","'Artificial Intelligence', 'C', 'R'",,3
40,92fc3ac15f8c1c81,Project Manager (Data Science & Public Health),ICF,USA,2020-04-29,ICF seeks a Project Manager with a data scienc...,"'C', 'Excel', 'R'",,3


In [7]:
# Load the filtered postings from the Data_Analyst table and preview them.
df_da = extract_df('Data_Analyst')
df_da.head()

Number of rows 696


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
2,c406ae5189abe01b,Business Monitoring Analyst / Manager,Wirecard,Malaysia,2020-05-05,The role sits in Business Monitoring function ...,"'C', 'Excel', 'R'",,3
5,653ee53c6262cb44,Assistant Investment Manager,BW INDUSTRIAL DEVELOPMENT JSC,Vietnam,2020-04-10,Làm việc tại: Hồ Chí Minh Mức lương: Cạnh Tran...,"'C', 'Excel', 'R'",,3
7,4c2ca84fd914a953,Financial Analyst/Assistant Manager - Finance,In.Corp Global Pte. Ltd.,Singapore,2020-05-04,Duties and responsibilities· Experienced in gr...,"'C', 'Excel', 'R'",,3
8,ed45e3c6b4603630,Accounting Manager,Span systems,Thailand,2020-04-10,Role & ResponsibilitiesManage the financial & ...,'R',,1
9,1e71c925b0a6415c,Product Manager (Internal Product),NVG Technology,Vietnam,2020-04-10,job descriptionDefine and further develop inte...,"'C', 'Excel', 'R'",'Intern',3


In [8]:
# Load the filtered postings from the Data_Engineer table and preview them.
df_de = extract_df('Data_Engineer')
df_de.head()

Number of rows 2690


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
0,4d081f53e4763438,PROJECT MANAGER / PROJECT ENGINEER,Misi Setia Oil & Gas,Malaysia,2020-04-10,Location:BalakongResponsibilities:Define proje...,"'C', 'R'",,2
1,499250d99acee1c0,MANUFACTURING / PRODUCTION - SENIOR ENGINEER /...,APM Malaysia,Malaysia,2020-04-10,"APM Automotive Holdings Berhad, is one of the ...","'Go', 'R'",,2
3,879f1027dbe4d926,Technical Manager_Gelang Patah_18644,Godzilink.com,Malaysia,2020-04-10,1. 协助项目经理或项目总工进行项目图纸与设计管理工作Assist Project Mana...,"'C', 'R'",,2
5,6a3d3ff8e77a86ce,Product Manager (Payments),Fave Group Pte Ltd,Malaysia,2020-04-10,Fave is the next generation Digital Merchant P...,"'C', 'Excel', 'R'",,3
8,0904424ee04fb55b,Corporate Planning Manager/ Planners,Fircroft,Malaysia,2020-04-25,"The Role:Location: Kuala Lumpur, MalaysiaTenur...","'C', 'R'",,2


In [9]:
# Load the filtered postings from the Machine_Learning_Engineer table and preview them.
df_mle = extract_df('Machine_Learning_Engineer')
df_mle.head()

Number of rows 440


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
2,b9946084c93f3877,Engineer Manager,Siam M&M,Thailand,2020-04-10,Job DescriptionResponsibilities:Handle for Eng...,"'C', 'Go', 'R'",,3
3,8caaff8a4e42e3de,Data Scientist Manager,Pasona HR Consulting Recruitment (Thailand) Co...,Thailand,2020-04-10,Job ID 12563Occupation Data Scientist ManagerB...,"'C', 'R'",,2
5,0aae98815a6a20a4,Program Manager,Program Manager,Indonesia,2020-04-10,Wavemaker Partners is Southeast Asia’s leading...,"'C', 'Excel', 'R'",,3
12,69edc117a66b2bb7,Senior Product Manager,Singtel,Singapore,2020-05-11,DataSpark was created from a vision to transfo...,"'C', 'Excel', 'R'",'Senior',3
14,0196cb5a9fee96b4,APJC Competitive Business Development Manager,CISCO SYSTEMS (USA) PTE. LTD.,Singapore,2020-04-24,Roles & ResponsibilitiesWhat You'll doAs Compe...,"'C', 'Machine Learning', 'R'",,3


In [12]:
# All four tables have been read by this point in the notebook, so release the
# engine's connections before the long-running extraction starts.
engine.dispose()
# Run skill extraction over the Data Scientist postings and preview the result.
df_dsj = extract_skills_all(df_ds)
df_dsj.head()

13 jobs processed. Time taken: 0:00:47.412693
26 jobs processed. Time taken: 0:00:51.422837
39 jobs processed. Time taken: 0:00:34.137897
52 jobs processed. Time taken: 0:00:41.589995
65 jobs processed. Time taken: 0:00:46.706060
78 jobs processed. Time taken: 0:00:43.167474
91 jobs processed. Time taken: 0:00:38.465031
104 jobs processed. Time taken: 0:00:42.419194
117 jobs processed. Time taken: 0:00:29.890351
130 jobs processed. Time taken: 0:00:48.374952
143 jobs processed. Time taken: 0:00:40.822881
156 jobs processed. Time taken: 0:00:52.491858
169 jobs processed. Time taken: 0:01:27.793107
182 jobs processed. Time taken: 0:00:45.115368
195 jobs processed. Time taken: 0:00:40.794447
208 jobs processed. Time taken: 0:00:33.862468
221 jobs processed. Time taken: 0:00:28.591172
234 jobs processed. Time taken: 0:00:38.845923
247 jobs processed. Time taken: 0:00:28.568055
260 jobs processed. Time taken: 0:00:49.763342
Total time taken: 0:14:33.551391


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Principal Data Scientist Manager - Cloud Hardw...,Microsoft,USA,2020-05-02,To delight our customers in a Cloud First worl...,"'C', 'R'","[Applied Statistics, Automation, Cloud Service..."
1,Arity-Data Scientist-Sr Manager,Allstate,USA,2020-04-10,"Founded by The Allstate Corporation in 2016, A...","'C', 'Excel', 'R'","[Agile, Architecture, Business Analysis, Busin..."
2,Project Manager - Data Science | Chicago IL,Photon,USA,2020-04-28,Project Manager - Data Science | Chicago IL - ...,"'C', 'R', 'SQL'","[Apache Spark, Business Intelligence (BI), Dat..."
3,Corporate Communication Manager - Artificial I...,TuSimple,USA,2020-05-11,"This role is located in San Diego, CA. Relocat...","'Artificial Intelligence', 'C', 'R'","[Algorithm, Array, Artificial Intelligence (AI..."
4,Project Manager (Data Science & Public Health),ICF,USA,2020-04-29,ICF seeks a Project Manager with a data scienc...,"'C', 'Excel', 'R'","[Analytical, Business Analysis, Collaboration,..."


In [13]:
# Run skill extraction over the Data Analyst postings and preview the result.
df_daj = extract_skills_all(df_da)
df_daj.head()

34 jobs processed. Time taken: 0:01:20.820214
68 jobs processed. Time taken: 0:01:37.464968
102 jobs processed. Time taken: 0:01:43.006114
136 jobs processed. Time taken: 0:02:03.294674
170 jobs processed. Time taken: 0:01:47.042629
204 jobs processed. Time taken: 0:01:56.214504
238 jobs processed. Time taken: 0:02:00.264164
272 jobs processed. Time taken: 0:01:38.296331
306 jobs processed. Time taken: 0:01:22.984659
340 jobs processed. Time taken: 0:01:50.277857
374 jobs processed. Time taken: 0:01:47.676128
408 jobs processed. Time taken: 0:02:02.348663
442 jobs processed. Time taken: 0:01:56.399032
476 jobs processed. Time taken: 0:01:16.338490
510 jobs processed. Time taken: 0:01:28.163783
544 jobs processed. Time taken: 0:01:28.396423
578 jobs processed. Time taken: 0:01:32.174170
612 jobs processed. Time taken: 0:01:44.265618
646 jobs processed. Time taken: 0:01:21.433254
680 jobs processed. Time taken: 0:01:24.078403
Total time taken: 0:34:03.115963


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Business Monitoring Analyst / Manager,Wirecard,Malaysia,2020-05-05,The role sits in Business Monitoring function ...,"'C', 'Excel', 'R'","[Automation, Brand Management, Credit Risk, Da..."
1,Assistant Investment Manager,BW INDUSTRIAL DEVELOPMENT JSC,Vietnam,2020-04-10,Làm việc tại: Hồ Chí Minh Mức lương: Cạnh Tran...,"'C', 'Excel', 'R'","[Accuracy, Analysis, Analytical, Annual Report..."
2,Financial Analyst/Assistant Manager - Finance,In.Corp Global Pte. Ltd.,Singapore,2020-05-04,Duties and responsibilities· Experienced in gr...,"'C', 'Excel', 'R'","[Analytical, Analytical Thinking, Audit, Budge..."
3,Accounting Manager,Span systems,Thailand,2020-04-10,Role & ResponsibilitiesManage the financial & ...,'R',"[Accounting, Analytical, Budget, Finance, Fore..."
4,Product Manager (Internal Product),NVG Technology,Vietnam,2020-04-10,job descriptionDefine and further develop inte...,"'C', 'Excel', 'R'","[Administration, Agile, Analytical Thinking, A..."


In [14]:
# Run skill extraction over the Data Engineer postings and preview the result.
# This is the largest of the four frames (see the row counts printed by extract_df).
df_dej = extract_skills_all(df_de)
df_dej.head()

134 jobs processed. Time taken: 0:04:17.496911
268 jobs processed. Time taken: 0:07:59.705177
402 jobs processed. Time taken: 0:09:11.208289
536 jobs processed. Time taken: 0:09:50.369051
670 jobs processed. Time taken: 0:09:19.447551
804 jobs processed. Time taken: 0:09:34.888542
938 jobs processed. Time taken: 0:09:14.067171
1072 jobs processed. Time taken: 0:09:17.812506
1206 jobs processed. Time taken: 0:08:38.646464
1340 jobs processed. Time taken: 0:10:26.397910
1474 jobs processed. Time taken: 0:09:07.829619
1608 jobs processed. Time taken: 0:08:29.045128
1742 jobs processed. Time taken: 0:09:37.203054
1876 jobs processed. Time taken: 0:10:05.764167
2010 jobs processed. Time taken: 0:10:21.065463
2144 jobs processed. Time taken: 0:09:29.141505
2278 jobs processed. Time taken: 0:09:20.831176
2412 jobs processed. Time taken: 0:09:17.313930
2546 jobs processed. Time taken: 0:06:02.922331
2680 jobs processed. Time taken: 0:06:05.263118
Total time taken: 2:56:12.466360


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,PROJECT MANAGER / PROJECT ENGINEER,Misi Setia Oil & Gas,Malaysia,2020-04-10,Location:BalakongResponsibilities:Define proje...,"'C', 'R'","[Administration, Change Management, Chart, Con..."
1,MANUFACTURING / PRODUCTION - SENIOR ENGINEER /...,APM Malaysia,Malaysia,2020-04-10,"APM Automotive Holdings Berhad, is one of the ...","'Go', 'R'","[Automotive, Design, Documentation, Email, Hum..."
2,Technical Manager_Gelang Patah_18644,Godzilink.com,Malaysia,2020-04-10,1. 协助项目经理或项目总工进行项目图纸与设计管理工作Assist Project Mana...,"'C', 'R'","[Brand Management, Civil Engineering, Communic..."
3,Product Manager (Payments),Fave Group Pte Ltd,Malaysia,2020-04-10,Fave is the next generation Digital Merchant P...,"'C', 'Excel', 'R'","[Application Programming Interface (API), Coll..."
4,Corporate Planning Manager/ Planners,Fircroft,Malaysia,2020-04-25,"The Role:Location: Kuala Lumpur, MalaysiaTenur...","'C', 'R'","[Audit, Business Intelligence (BI), Business P..."


In [15]:
# Run skill extraction over the Machine Learning Engineer postings and preview the result.
df_mlej = extract_skills_all(df_mle)
df_mlej.head()

22 jobs processed. Time taken: 0:01:05.948893
44 jobs processed. Time taken: 0:01:26.009153
66 jobs processed. Time taken: 0:01:25.844878
88 jobs processed. Time taken: 0:01:20.957422
110 jobs processed. Time taken: 0:01:22.718905
132 jobs processed. Time taken: 0:01:18.102775
154 jobs processed. Time taken: 0:01:12.583946
176 jobs processed. Time taken: 0:01:28.683881
198 jobs processed. Time taken: 0:01:24.298506
220 jobs processed. Time taken: 0:01:09.703242
242 jobs processed. Time taken: 0:01:09.360919
264 jobs processed. Time taken: 0:01:25.874231
286 jobs processed. Time taken: 0:01:14.530301
308 jobs processed. Time taken: 0:01:28.812989
330 jobs processed. Time taken: 0:01:36.846727
352 jobs processed. Time taken: 0:01:23.724539
374 jobs processed. Time taken: 0:01:22.534612
396 jobs processed. Time taken: 0:01:20.730154
418 jobs processed. Time taken: 0:01:05.178908
440 jobs processed. Time taken: 0:01:26.059972
Total time taken: 0:26:51.031079


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Engineer Manager,Siam M&M,Thailand,2020-04-10,Job DescriptionResponsibilities:Handle for Eng...,"'C', 'Go', 'R'","[Analytical Thinking, Budget, Communication, D..."
1,Data Scientist Manager,Pasona HR Consulting Recruitment (Thailand) Co...,Thailand,2020-04-10,Job ID 12563Occupation Data Scientist ManagerB...,"'C', 'R'","[Analysis, Business Acumen, Business Administr..."
2,Program Manager,Program Manager,Indonesia,2020-04-10,Wavemaker Partners is Southeast Asia’s leading...,"'C', 'Excel', 'R'","[Advertising, Agile, Artificial Intelligence (..."
3,Senior Product Manager,Singtel,Singapore,2020-05-11,DataSpark was created from a vision to transfo...,"'C', 'Excel', 'R'","[Agile, Analytical Thinking, Business Analysis..."
4,APJC Competitive Business Development Manager,CISCO SYSTEMS (USA) PTE. LTD.,Singapore,2020-04-24,Roles & ResponsibilitiesWhat You'll doAs Compe...,"'C', 'Machine Learning', 'R'","[Architectural Engineering, Architecture, Arti..."


In [17]:
def get_skill_count(df, d):
    """Add the skill occurrences found in ``df`` to the running tally ``d``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``skills`` column holds an iterable of skill names per row.
    d : dict
        Mapping of skill name to count. It is updated in place, so the same
        dict can be passed for several frames to accumulate a combined tally.

    Returns
    -------
    None
    """
    for _, posting in df.iterrows():
        for skill in posting['skills']:
            # Unseen skills start from 0, so a first sighting is stored as 1.
            d[skill] = d.get(skill, 0) + 1

In [18]:
# Accumulate one combined skill tally across the four job families,
# in the same order the frames were produced.
skill_count = {}

for jobs_with_skills in (df_dsj, df_daj, df_dej, df_mlej):
    get_skill_count(jobs_with_skills, skill_count)

In [19]:
# Re-order the tally from most to least frequent; ties keep their existing order.
ranked_skills = sorted(skill_count.items(), key=lambda pair: pair[1], reverse=True)
skill_count = dict(ranked_skills)
skill_count

{'Design': 1488,
 'Leadership': 1472,
 'Communication': 1433,
 'Training': 1313,
 'Project Management': 1168,
 'Analytical': 1159,
 'Engineering': 1116,
 'Market': 1013,
 'Budget': 966,
 'Platform': 887,
 'Reporting': 841,
 'Sales': 836,
 'Innovation': 834,
 'Management': 832,
 'Collaboration': 821,
 'Problem Solving': 798,
 'Presentation': 759,
 'Computer Science': 731,
 'Analysis': 710,
 'Documentation': 692,
 'TestNG': 662,
 'Law': 643,
 'Security': 628,
 'Research': 610,
 'Testing': 579,
 'Information Technology (IT)': 575,
 'Construction': 568,
 'Microsoft Excel': 558,
 'Marketing': 556,
 'Microsoft Access': 555,
 'Machine Learning': 543,
 'Product Management': 536,
 'Written Communication': 529,
 'Network': 516,
 'Administration': 496,
 'Agile': 492,
 'Writing': 487,
 'Audit': 486,
 'Decision Making': 480,
 'Architecture': 479,
 'Manufacturing': 459,
 'Insurance': 443,
 'Brand Management': 441,
 'Customer Service': 418,
 'English': 415,
 'Finance': 413,
 'Product Development': 39

In [28]:
# Share of the combined postings contributed by each job family
# (data scientist, data analyst, data engineer, ML engineer).
job_frames = (df_dsj, df_daj, df_dej, df_mlej)
total = sum(len(frame) for frame in job_frames)
tuple(len(frame) / total for frame in job_frames)

(0.06363191385217817,
 0.17033773861967694,
 0.6583455702398434,
 0.10768477728830152)

In [27]:
# Hand-picked skills to report on, in display order.
skills = [
    'Presentation', 'Computer Science', 'Documentation', 'Testing',
    'Microsoft Excel', 'Machine Learning', 'Agile', 'Architecture',
    'Business Analysis', 'Automation', 'Artificial Intelligence (AI)',
    'Data Analysis', 'Consulting', 'Golang', 'Database',
    'Quality Assurance (QA)', 'Data Science', 'Microsoft PowerPoint',
    'Software Engineering', 'C', 'Scrum', 'Data Management', 'Statistics',
]

# Derive the denominator from the frames themselves. The previous hard-coded
# 4086 only matched one particular run and would silently go stale whenever
# the query results change.
total_jobs = len(df_dsj) + len(df_daj) + len(df_dej) + len(df_mlej)

# Percentage of all processed postings that mention each skill, as text.
# A skill missing from skill_count raises KeyError, which surfaces typos in
# the list above instead of reporting a silent 0%.
s_count = {
    s: str(round(skill_count[s] / total_jobs * 100, 2)) + '%'
    for s in skills
}

s_count

{'Presentation': '18.58%',
 'Computer Science': '17.89%',
 'Documentation': '16.94%',
 'Testing': '14.17%',
 'Microsoft Excel': '13.66%',
 'Machine Learning': '13.29%',
 'Agile': '12.04%',
 'Architecture': '11.72%',
 'Business Analysis': '8.59%',
 'Automation': '8.42%',
 'Artificial Intelligence (AI)': '8.3%',
 'Data Analysis': '7.76%',
 'Consulting': '7.71%',
 'Golang': '7.59%',
 'Database': '7.46%',
 'Quality Assurance (QA)': '7.42%',
 'Data Science': '6.66%',
 'Microsoft PowerPoint': '6.66%',
 'Software Engineering': '6.34%',
 'C': '5.24%',
 'Scrum': '4.67%',
 'Data Management': '4.6%',
 'Statistics': '4.36%'}

In [20]:
# Persist each job family's extracted-skills frame as CSV (row index omitted).
# The target directory 'indeed-insights/' is expected to exist already.
csv_exports = {
    'indeed-insights/data_scientist_managers.csv': df_dsj,
    'indeed-insights/data_analyst_managers.csv': df_daj,
    'indeed-insights/data_engineer_managers.csv': df_dej,
    'indeed-insights/machine_learning_engineer_managers.csv': df_mlej,
}
for csv_path, jobs_frame in csv_exports.items():
    jobs_frame.to_csv(csv_path, index=False)