In [11]:
import pandas as pd
import re
from datetime import datetime as dt
from sqlalchemy import create_engine
from secrets import secrets
from skill_api import extract_skills, extract_ignore

In [2]:
# Skill vocabulary: the distinct values of the 'Skill' column in skill.csv.
df_skills = pd.read_csv('skills_db2/skill.csv')
SKILLS = df_skills['Skill'].drop_duplicates().tolist()

# Redundant skills: distinct 'Skill' values on the workbook's default sheet.
df_redskills = pd.read_excel('skills/Other Skills.xlsx')
RED_SKILLS = df_redskills['Skill'].drop_duplicates().tolist()

# Duplicate skills: the 'Duplicates' sheet maps each 'Skill' to its 'Parent'.
df_dupskills = pd.read_excel('skills/Other Skills.xlsx', sheet_name='Duplicates')
DUP_SKILLS = df_dupskills.set_index('Skill')['Parent'].to_dict()

# The duplicate spellings are added to the vocabulary as well.
SKILLS += list(DUP_SKILLS)

In [55]:
# SQL template used by extract_df: the `{}` placeholder is filled with a table
# name via str.format. It selects postings dated after 2020-04-01 that have a
# real description and whose title contains "manager" (case-insensitive).
# NOTE(review): the doubled `%%` presumably survives the DB driver's %-style
# parameter handling as a literal `%` wildcard -- confirm against the driver.
query = """
select Job_ID, Job_Title, Company, Country, Date_Posted, Description, Keywords_Present, Title_Keywords 
from {}
where Date_Posted > '2020-04-01'
and Description != 'No Description'
and lower(Job_Title) like '%%manager%%'
"""

def extract_df(job):
    """Load the postings selected by ``query`` from table ``job``.

    The table name is formatted into the module-level ``query`` template and
    the statement is run on the module-level ``engine``. Fully duplicated rows
    are dropped, and a ``Num_Skill`` column is added holding the number of
    comma-separated entries in ``Keywords_Present`` (0 where it is missing).
    The resulting row count is printed.

    Parameters
    ----------
    job : str
        Table name substituted into the ``from {}`` clause of ``query``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated query result with the extra ``Num_Skill`` column.
    """
    df = pd.read_sql(query.format(job), engine).drop_duplicates()

    has_keywords = df['Keywords_Present'].notna()
    df['Num_Skill'] = 0
    keyword_counts = df.loc[has_keywords, 'Keywords_Present'].map(
        lambda text: len(text.split(','))
    )
    df.loc[has_keywords, 'Num_Skill'] = keyword_counts

    print('Number of rows', len(df))
    return df

def extract_skills_all(df):
    """Run skill extraction over every posting in ``df``.

    For each row the job title and description are joined with a space and
    passed to ``extract_skills`` together with the module-level ``SKILLS``
    list. The matches are then passed to ``extract_ignore`` with
    ``RED_SKILLS`` and ``DUP_SKILLS``; only the first element of its result is
    kept, sorted in place.

    A progress line is printed every ``len(df) // 20`` rows (at least every
    row) with the time elapsed since the previous line, and the total elapsed
    time is printed at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Job_ID``, ``Job_Title``, ``Company``,
        ``Country``, ``Date_Posted``, ``Description`` and ``Keywords_Present``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``job_id``, ``title``,
        ``company``, ``country``, ``date_posted``, ``description``,
        ``indeed_skills`` and ``skills`` (a sorted list per row).
    """
    job_info_list = []
    initial = dt.now()
    interval = dt.now()
    # Clamp to 1: with fewer than 20 rows the interval would be 0 and the
    # modulo below would raise ZeroDivisionError.
    print_every = max(1, len(df) // 20)
    for i, (_, job) in enumerate(df.iterrows(), start=1):
        if i % print_every == 0:
            print("{} jobs processed. Time taken: {}".format(i, dt.now() - interval))
            interval = dt.now()
        all_info = job['Job_Title'] + ' ' + job['Description']
        all_skills = extract_skills(all_info, SKILLS)
        # The second element returned by extract_ignore is not used here.
        keep_skills, _ignored = extract_ignore(all_skills, RED_SKILLS, DUP_SKILLS)
        keep_skills.sort()
        job_info_list.append({
            'job_id': job['Job_ID'],
            'title': job['Job_Title'],
            'company': job['Company'],
            'country': job['Country'],
            'date_posted': job['Date_Posted'],
            'description': job['Description'],
            'indeed_skills': job['Keywords_Present'],
            'skills': keep_skills,
        })
    print("Total time taken: {}".format(dt.now() - initial))
    return pd.DataFrame(job_info_list)

In [6]:
# Build the SQLAlchemy engine from the URL stored under secrets['indeed_db'],
# then load the manager postings from the Data_Scientist table and preview them.
engine = create_engine(secrets['indeed_db'])
df_ds = extract_df('Data_Scientist')
df_ds.head()

Number of rows 260


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
16,b8259e44f8dc12ad,Principal Data Scientist Manager - Cloud Hardw...,Microsoft,USA,2020-05-02,To delight our customers in a Cloud First worl...,"'C', 'R'",,2
21,e8565fa183b873d5,Arity-Data Scientist-Sr Manager,Allstate,USA,2020-04-10,"Founded by The Allstate Corporation in 2016, A...","'C', 'Excel', 'R'",'Sr',3
25,21df6fd71777d30d,Project Manager - Data Science | Chicago IL,Photon,USA,2020-04-28,Project Manager - Data Science | Chicago IL - ...,"'C', 'R', 'SQL'",,3
31,6560a73b8f1fee47,Corporate Communication Manager - Artificial I...,TuSimple,USA,2020-05-11,"This role is located in San Diego, CA. Relocat...","'Artificial Intelligence', 'C', 'R'",,3
40,92fc3ac15f8c1c81,Project Manager (Data Science & Public Health),ICF,USA,2020-04-29,ICF seeks a Project Manager with a data scienc...,"'C', 'Excel', 'R'",,3


In [7]:
# Manager postings from the Data_Analyst table (same query template as above).
df_da = extract_df('Data_Analyst')
df_da.head()

Number of rows 696


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
2,c406ae5189abe01b,Business Monitoring Analyst / Manager,Wirecard,Malaysia,2020-05-05,The role sits in Business Monitoring function ...,"'C', 'Excel', 'R'",,3
5,653ee53c6262cb44,Assistant Investment Manager,BW INDUSTRIAL DEVELOPMENT JSC,Vietnam,2020-04-10,Làm việc tại: Hồ Chí Minh Mức lương: Cạnh Tran...,"'C', 'Excel', 'R'",,3
7,4c2ca84fd914a953,Financial Analyst/Assistant Manager - Finance,In.Corp Global Pte. Ltd.,Singapore,2020-05-04,Duties and responsibilities· Experienced in gr...,"'C', 'Excel', 'R'",,3
8,ed45e3c6b4603630,Accounting Manager,Span systems,Thailand,2020-04-10,Role & ResponsibilitiesManage the financial & ...,'R',,1
9,1e71c925b0a6415c,Product Manager (Internal Product),NVG Technology,Vietnam,2020-04-10,job descriptionDefine and further develop inte...,"'C', 'Excel', 'R'",'Intern',3


In [8]:
# Manager postings from the Data_Engineer table.
df_de = extract_df('Data_Engineer')
df_de.head()

Number of rows 2690


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
0,4d081f53e4763438,PROJECT MANAGER / PROJECT ENGINEER,Misi Setia Oil & Gas,Malaysia,2020-04-10,Location:BalakongResponsibilities:Define proje...,"'C', 'R'",,2
1,499250d99acee1c0,MANUFACTURING / PRODUCTION - SENIOR ENGINEER /...,APM Malaysia,Malaysia,2020-04-10,"APM Automotive Holdings Berhad, is one of the ...","'Go', 'R'",,2
3,879f1027dbe4d926,Technical Manager_Gelang Patah_18644,Godzilink.com,Malaysia,2020-04-10,1. 协助项目经理或项目总工进行项目图纸与设计管理工作Assist Project Mana...,"'C', 'R'",,2
5,6a3d3ff8e77a86ce,Product Manager (Payments),Fave Group Pte Ltd,Malaysia,2020-04-10,Fave is the next generation Digital Merchant P...,"'C', 'Excel', 'R'",,3
8,0904424ee04fb55b,Corporate Planning Manager/ Planners,Fircroft,Malaysia,2020-04-25,"The Role:Location: Kuala Lumpur, MalaysiaTenur...","'C', 'R'",,2


In [9]:
# Manager postings from the Machine_Learning_Engineer table.
df_mle = extract_df('Machine_Learning_Engineer')
df_mle.head()

Number of rows 440


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
2,b9946084c93f3877,Engineer Manager,Siam M&M,Thailand,2020-04-10,Job DescriptionResponsibilities:Handle for Eng...,"'C', 'Go', 'R'",,3
3,8caaff8a4e42e3de,Data Scientist Manager,Pasona HR Consulting Recruitment (Thailand) Co...,Thailand,2020-04-10,Job ID 12563Occupation Data Scientist ManagerB...,"'C', 'R'",,2
5,0aae98815a6a20a4,Program Manager,Program Manager,Indonesia,2020-04-10,Wavemaker Partners is Southeast Asia’s leading...,"'C', 'Excel', 'R'",,3
12,69edc117a66b2bb7,Senior Product Manager,Singtel,Singapore,2020-05-11,DataSpark was created from a vision to transfo...,"'C', 'Excel', 'R'",'Senior',3
14,0196cb5a9fee96b4,APJC Competitive Business Development Manager,CISCO SYSTEMS (USA) PTE. LTD.,Singapore,2020-04-24,Roles & ResponsibilitiesWhat You'll doAs Compe...,"'C', 'Machine Learning', 'R'",,3


In [12]:
# Release the engine's pooled connections before the slow extraction step;
# the cells in view from here on work on in-memory frames and CSV files.
engine.dispose()
# Skill extraction for the Data_Scientist manager postings.
df_dsj = extract_skills_all(df_ds)
df_dsj.head()

13 jobs processed. Time taken: 0:00:47.412693
26 jobs processed. Time taken: 0:00:51.422837
39 jobs processed. Time taken: 0:00:34.137897
52 jobs processed. Time taken: 0:00:41.589995
65 jobs processed. Time taken: 0:00:46.706060
78 jobs processed. Time taken: 0:00:43.167474
91 jobs processed. Time taken: 0:00:38.465031
104 jobs processed. Time taken: 0:00:42.419194
117 jobs processed. Time taken: 0:00:29.890351
130 jobs processed. Time taken: 0:00:48.374952
143 jobs processed. Time taken: 0:00:40.822881
156 jobs processed. Time taken: 0:00:52.491858
169 jobs processed. Time taken: 0:01:27.793107
182 jobs processed. Time taken: 0:00:45.115368
195 jobs processed. Time taken: 0:00:40.794447
208 jobs processed. Time taken: 0:00:33.862468
221 jobs processed. Time taken: 0:00:28.591172
234 jobs processed. Time taken: 0:00:38.845923
247 jobs processed. Time taken: 0:00:28.568055
260 jobs processed. Time taken: 0:00:49.763342
Total time taken: 0:14:33.551391


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Principal Data Scientist Manager - Cloud Hardw...,Microsoft,USA,2020-05-02,To delight our customers in a Cloud First worl...,"'C', 'R'","[Applied Statistics, Automation, Cloud Service..."
1,Arity-Data Scientist-Sr Manager,Allstate,USA,2020-04-10,"Founded by The Allstate Corporation in 2016, A...","'C', 'Excel', 'R'","[Agile, Architecture, Business Analysis, Busin..."
2,Project Manager - Data Science | Chicago IL,Photon,USA,2020-04-28,Project Manager - Data Science | Chicago IL - ...,"'C', 'R', 'SQL'","[Apache Spark, Business Intelligence (BI), Dat..."
3,Corporate Communication Manager - Artificial I...,TuSimple,USA,2020-05-11,"This role is located in San Diego, CA. Relocat...","'Artificial Intelligence', 'C', 'R'","[Algorithm, Array, Artificial Intelligence (AI..."
4,Project Manager (Data Science & Public Health),ICF,USA,2020-04-29,ICF seeks a Project Manager with a data scienc...,"'C', 'Excel', 'R'","[Analytical, Business Analysis, Collaboration,..."


In [13]:
# Skill extraction for the Data_Analyst manager postings.
df_daj = extract_skills_all(df_da)
df_daj.head()

34 jobs processed. Time taken: 0:01:20.820214
68 jobs processed. Time taken: 0:01:37.464968
102 jobs processed. Time taken: 0:01:43.006114
136 jobs processed. Time taken: 0:02:03.294674
170 jobs processed. Time taken: 0:01:47.042629
204 jobs processed. Time taken: 0:01:56.214504
238 jobs processed. Time taken: 0:02:00.264164
272 jobs processed. Time taken: 0:01:38.296331
306 jobs processed. Time taken: 0:01:22.984659
340 jobs processed. Time taken: 0:01:50.277857
374 jobs processed. Time taken: 0:01:47.676128
408 jobs processed. Time taken: 0:02:02.348663
442 jobs processed. Time taken: 0:01:56.399032
476 jobs processed. Time taken: 0:01:16.338490
510 jobs processed. Time taken: 0:01:28.163783
544 jobs processed. Time taken: 0:01:28.396423
578 jobs processed. Time taken: 0:01:32.174170
612 jobs processed. Time taken: 0:01:44.265618
646 jobs processed. Time taken: 0:01:21.433254
680 jobs processed. Time taken: 0:01:24.078403
Total time taken: 0:34:03.115963


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Business Monitoring Analyst / Manager,Wirecard,Malaysia,2020-05-05,The role sits in Business Monitoring function ...,"'C', 'Excel', 'R'","[Automation, Brand Management, Credit Risk, Da..."
1,Assistant Investment Manager,BW INDUSTRIAL DEVELOPMENT JSC,Vietnam,2020-04-10,Làm việc tại: Hồ Chí Minh Mức lương: Cạnh Tran...,"'C', 'Excel', 'R'","[Accuracy, Analysis, Analytical, Annual Report..."
2,Financial Analyst/Assistant Manager - Finance,In.Corp Global Pte. Ltd.,Singapore,2020-05-04,Duties and responsibilities· Experienced in gr...,"'C', 'Excel', 'R'","[Analytical, Analytical Thinking, Audit, Budge..."
3,Accounting Manager,Span systems,Thailand,2020-04-10,Role & ResponsibilitiesManage the financial & ...,'R',"[Accounting, Analytical, Budget, Finance, Fore..."
4,Product Manager (Internal Product),NVG Technology,Vietnam,2020-04-10,job descriptionDefine and further develop inte...,"'C', 'Excel', 'R'","[Administration, Agile, Analytical Thinking, A..."


In [14]:
# Skill extraction for the Data_Engineer manager postings (the largest frame).
df_dej = extract_skills_all(df_de)
df_dej.head()

134 jobs processed. Time taken: 0:04:17.496911
268 jobs processed. Time taken: 0:07:59.705177
402 jobs processed. Time taken: 0:09:11.208289
536 jobs processed. Time taken: 0:09:50.369051
670 jobs processed. Time taken: 0:09:19.447551
804 jobs processed. Time taken: 0:09:34.888542
938 jobs processed. Time taken: 0:09:14.067171
1072 jobs processed. Time taken: 0:09:17.812506
1206 jobs processed. Time taken: 0:08:38.646464
1340 jobs processed. Time taken: 0:10:26.397910
1474 jobs processed. Time taken: 0:09:07.829619
1608 jobs processed. Time taken: 0:08:29.045128
1742 jobs processed. Time taken: 0:09:37.203054
1876 jobs processed. Time taken: 0:10:05.764167
2010 jobs processed. Time taken: 0:10:21.065463
2144 jobs processed. Time taken: 0:09:29.141505
2278 jobs processed. Time taken: 0:09:20.831176
2412 jobs processed. Time taken: 0:09:17.313930
2546 jobs processed. Time taken: 0:06:02.922331
2680 jobs processed. Time taken: 0:06:05.263118
Total time taken: 2:56:12.466360


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,PROJECT MANAGER / PROJECT ENGINEER,Misi Setia Oil & Gas,Malaysia,2020-04-10,Location:BalakongResponsibilities:Define proje...,"'C', 'R'","[Administration, Change Management, Chart, Con..."
1,MANUFACTURING / PRODUCTION - SENIOR ENGINEER /...,APM Malaysia,Malaysia,2020-04-10,"APM Automotive Holdings Berhad, is one of the ...","'Go', 'R'","[Automotive, Design, Documentation, Email, Hum..."
2,Technical Manager_Gelang Patah_18644,Godzilink.com,Malaysia,2020-04-10,1. 协助项目经理或项目总工进行项目图纸与设计管理工作Assist Project Mana...,"'C', 'R'","[Brand Management, Civil Engineering, Communic..."
3,Product Manager (Payments),Fave Group Pte Ltd,Malaysia,2020-04-10,Fave is the next generation Digital Merchant P...,"'C', 'Excel', 'R'","[Application Programming Interface (API), Coll..."
4,Corporate Planning Manager/ Planners,Fircroft,Malaysia,2020-04-25,"The Role:Location: Kuala Lumpur, MalaysiaTenur...","'C', 'R'","[Audit, Business Intelligence (BI), Business P..."


In [15]:
# Skill extraction for the Machine_Learning_Engineer manager postings.
df_mlej = extract_skills_all(df_mle)
df_mlej.head()

22 jobs processed. Time taken: 0:01:05.948893
44 jobs processed. Time taken: 0:01:26.009153
66 jobs processed. Time taken: 0:01:25.844878
88 jobs processed. Time taken: 0:01:20.957422
110 jobs processed. Time taken: 0:01:22.718905
132 jobs processed. Time taken: 0:01:18.102775
154 jobs processed. Time taken: 0:01:12.583946
176 jobs processed. Time taken: 0:01:28.683881
198 jobs processed. Time taken: 0:01:24.298506
220 jobs processed. Time taken: 0:01:09.703242
242 jobs processed. Time taken: 0:01:09.360919
264 jobs processed. Time taken: 0:01:25.874231
286 jobs processed. Time taken: 0:01:14.530301
308 jobs processed. Time taken: 0:01:28.812989
330 jobs processed. Time taken: 0:01:36.846727
352 jobs processed. Time taken: 0:01:23.724539
374 jobs processed. Time taken: 0:01:22.534612
396 jobs processed. Time taken: 0:01:20.730154
418 jobs processed. Time taken: 0:01:05.178908
440 jobs processed. Time taken: 0:01:26.059972
Total time taken: 0:26:51.031079


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Engineer Manager,Siam M&M,Thailand,2020-04-10,Job DescriptionResponsibilities:Handle for Eng...,"'C', 'Go', 'R'","[Analytical Thinking, Budget, Communication, D..."
1,Data Scientist Manager,Pasona HR Consulting Recruitment (Thailand) Co...,Thailand,2020-04-10,Job ID 12563Occupation Data Scientist ManagerB...,"'C', 'R'","[Analysis, Business Acumen, Business Administr..."
2,Program Manager,Program Manager,Indonesia,2020-04-10,Wavemaker Partners is Southeast Asia’s leading...,"'C', 'Excel', 'R'","[Advertising, Agile, Artificial Intelligence (..."
3,Senior Product Manager,Singtel,Singapore,2020-05-11,DataSpark was created from a vision to transfo...,"'C', 'Excel', 'R'","[Agile, Analytical Thinking, Business Analysis..."
4,APJC Competitive Business Development Manager,CISCO SYSTEMS (USA) PTE. LTD.,Singapore,2020-04-24,Roles & ResponsibilitiesWhat You'll doAs Compe...,"'C', 'Machine Learning', 'R'","[Architectural Engineering, Architecture, Arti..."


In [105]:
def get_skill_count(df, d):
    """Add the skill mentions found in ``df`` to the running tally ``d``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``skills`` column holds an iterable of skill names per row.
    d : dict
        Mapping of skill name to count; updated in place. Unseen skills start
        at 1, known skills are incremented.

    Returns
    -------
    None
    """
    for _, record in df.iterrows():
        for name in record['skills']:
            d[name] = d.get(name, 0) + 1

In [18]:
# Tally skill mentions across the four role-specific frames into one dict.
skill_count = {}

for _frame in (df_dsj, df_daj, df_dej, df_mlej):
    get_skill_count(_frame, skill_count)

In [19]:
# Rebuild the tally ordered by count, highest first (dicts keep insertion order).
skill_count = dict(sorted(skill_count.items(), key=lambda pair: pair[1], reverse=True))
skill_count

{'Design': 1488,
 'Leadership': 1472,
 'Communication': 1433,
 'Training': 1313,
 'Project Management': 1168,
 'Analytical': 1159,
 'Engineering': 1116,
 'Market': 1013,
 'Budget': 966,
 'Platform': 887,
 'Reporting': 841,
 'Sales': 836,
 'Innovation': 834,
 'Management': 832,
 'Collaboration': 821,
 'Problem Solving': 798,
 'Presentation': 759,
 'Computer Science': 731,
 'Analysis': 710,
 'Documentation': 692,
 'TestNG': 662,
 'Law': 643,
 'Security': 628,
 'Research': 610,
 'Testing': 579,
 'Information Technology (IT)': 575,
 'Construction': 568,
 'Microsoft Excel': 558,
 'Marketing': 556,
 'Microsoft Access': 555,
 'Machine Learning': 543,
 'Product Management': 536,
 'Written Communication': 529,
 'Network': 516,
 'Administration': 496,
 'Agile': 492,
 'Writing': 487,
 'Audit': 486,
 'Decision Making': 480,
 'Architecture': 479,
 'Manufacturing': 459,
 'Insurance': 443,
 'Brand Management': 441,
 'Customer Service': 418,
 'English': 415,
 'Finance': 413,
 'Product Development': 39

In [103]:
# Total number of postings across the four role frames, then each frame's share
# of that total (in the order: scientist, analyst, engineer, ML engineer).
_role_frames = (df_dsj, df_daj, df_dej, df_mlej)
total = sum(len(_frame) for _frame in _role_frames)
tuple(len(_frame) / total for _frame in _role_frames)

(0.06363191385217817,
 0.17033773861967694,
 0.6583455702398434,
 0.10768477728830152)

In [27]:
# Hand-picked skills to report on, given as one ', '-separated string.
skills = 'Presentation, Computer Science, Documentation, Testing, Microsoft Excel, Machine Learning, Agile, Architecture, Business Analysis, Automation, Artificial Intelligence (AI), Data Analysis, Consulting, Golang, Database, Quality Assurance (QA), Data Science, Microsoft PowerPoint, Software Engineering, C, Scrum, Data Management, Statistics'
skills = skills.split(', ')
s_count = {}

for s in skills:
    # Share of all postings (`total`) that mention the skill, rendered as a
    # percentage string rounded to two decimals, e.g. '18.58%'.
    # A skill missing from `skill_count` raises KeyError rather than showing 0%.
    s_count[s] = '{}%'.format(round(skill_count[s] / total * 100, 2))

s_count

{'Presentation': '18.58%',
 'Computer Science': '17.89%',
 'Documentation': '16.94%',
 'Testing': '14.17%',
 'Microsoft Excel': '13.66%',
 'Machine Learning': '13.29%',
 'Agile': '12.04%',
 'Architecture': '11.72%',
 'Business Analysis': '8.59%',
 'Automation': '8.42%',
 'Artificial Intelligence (AI)': '8.3%',
 'Data Analysis': '7.76%',
 'Consulting': '7.71%',
 'Golang': '7.59%',
 'Database': '7.46%',
 'Quality Assurance (QA)': '7.42%',
 'Data Science': '6.66%',
 'Microsoft PowerPoint': '6.66%',
 'Software Engineering': '6.34%',
 'C': '5.24%',
 'Scrum': '4.67%',
 'Data Management': '4.6%',
 'Statistics': '4.36%'}

In [20]:
# Write each role's extracted-skill frame to its own CSV, without the index.
_exports = [
    (df_dsj, 'indeed-insights/data_scientist_managers.csv'),
    (df_daj, 'indeed-insights/data_analyst_managers.csv'),
    (df_dej, 'indeed-insights/data_engineer_managers.csv'),
    (df_mlej, 'indeed-insights/machine_learning_engineer_managers.csv'),
]
for _frame, _path in _exports:
    _frame.to_csv(_path, index=False)

In [53]:
# Load the generic "Manager" postings and keep one row per Job_ID.
df_m = pd.read_csv('resource/Manager.csv').drop_duplicates(subset=['Job_ID'])
print(len(df_m))
df_m.head()

170


Unnamed: 0.1,Unnamed: 0,Job_ID,Job_Title,Company,Salary,Country,State,Location,Metadata,Date_Posted,Description,Job_URL,Keywords_Present,Title_Keywords
0,0,b93858aaa66216b5,Front Desk Manager,"Marriott International, Inc",,Thailand,Not Applicable,Thailand,,2021-05-06,"Posting Date May 05, 2021Job Number 21042992Jo...",https://th.indeed.com/rc/clk?jk=b93858aaa66216...,"'R', 'C'",
1,0,cb335346567f593d,Customer Service Manager,L'Oreal,,Vietnam,Not Applicable,Thành phố Hồ Chí Minh,,2021-05-05,KEY JOB ACCOUNTABILITIES:Ensure the order-to-c...,https://vn.indeed.com/rc/clk?jk=cb335346567f59...,"'C', 'CG', 'Excel', 'R'",
2,0,f311790e03591da4,IT MANAGER,POWER STEEL & ELECTRO-PLATING WORKS SDN BHD,"RM 6,000 - RM 7,999 a month",Malaysia,Not Applicable,Shah Alam,,2021-05-05,We are an established manufacturing company in...,https://www.indeed.com.my/rc/clk?jk=f311790e03...,"'Microsoft Office', 'D3', 'Golang', 'C', 'Exce...",
3,0,0c0af4a0af366017,"Security Manager, Asia Pacific",Wiley,,Singapore,Not Applicable,Singapore,,2021-04-29,"At Wiley, we welcome you for who you are, the ...",https://www.indeed.com.sg/rc/clk?jk=0c0af4a0af...,"'R', 'Microsoft Office', 'C'",
4,0,961527fcfa62f250,STELLA – EVENT MANAGER,THE ADORA,,Vietnam,Not Applicable,Thành phố Hồ Chí Minh,,2021-05-06,Mô tả công việc:Lên ý tưởng nội dung và hình ả...,https://vn.indeed.com/rc/clk?jk=961527fcfa62f2...,"'C', 'Excel', 'PowerPoint'",


In [54]:
# Load the USA "Manager" postings and keep one row per Job_ID.
df_musa = pd.read_csv('resource/Manager_USA.csv').drop_duplicates(subset=['Job_ID'])
print(len(df_musa))
df_musa.head()

  interactivity=interactivity, compiler=compiler, result=result)


15412


Unnamed: 0.1,Unnamed: 0,Job_ID,Job_Title,Company,Salary,Country,State,Location,Metadata,Date_Posted,Description,Job_URL,Keywords_Present,Title_Keywords
0,0,650c976838d7f54a,Operations Manager,ACE Hardware,,USA,AZ,"Prescott Valley, AZ 86314",,2021-05-05,We have an exciting opportunity for a Operatio...,https://www.indeed.com/company/Ace-Hardware/jo...,"'SAP', 'Excel', 'Microsoft Office', 'C', 'R', ...",
1,0,8400fc2fcd84644c,Assistant Project Manager,Fairbanks North Star Borough,$30.37 an hour,USA,AK,"Fairbanks, AK 99701",,2021-05-04,POSITION TITLE: Assistant Project ManagerSTAT...,https://www.indeed.com/company/Fairbanks-North...,"'R', 'Microsoft Office', 'C', 'AutoCAD'",
2,0,06046b646b0c71d1,Corporate Store Assistant Marketing Manager,Murphy USA,,USA,AR,"El Dorado, AR 71730",,2021-05-05,The Corporate Store Assistant Manager (ASM) wi...,https://www.indeed.com/rc/clk?jk=06046b646b0c7...,"'R', 'C'",
3,0,41fe5651c81f91a0,Manager In Training,McDonald's,Up to $14 an hour,USA,AL,"Stevenson, AL 35772",,2021-05-06,Are you looking for a long-term career withgre...,https://www.indeed.com/rc/clk?jk=41fe5651c81f9...,"'R', 'C'",
5,1,0c6442357823c030,Warehouse Manager,ACE Hardware,,USA,AZ,"Prescott Valley, AZ 86314",,2021-05-06,PURPOSE AND SCOPE:The Warehouse Manager will b...,https://www.indeed.com/rc/clk?jk=0c6442357823c...,"'R', 'C'",


In [56]:
# Drop postings without a description, then keep one row per distinct
# description. The de-duplication is chained onto the dropna result (not onto
# df_m) so the missing-description filter is not discarded; extract_skills_all
# concatenates Description as a string and cannot handle NaN.
df_m2 = (
    df_m
    .dropna(subset=['Description'])
    .drop_duplicates(subset=['Description'])
)
len(df_m2)

149

In [57]:
# Drop postings without a description, then keep one row per distinct
# description. The de-duplication is chained onto the dropna result (not onto
# df_musa) so the missing-description filter is not discarded;
# extract_skills_all concatenates Description as a string and cannot handle NaN.
df_musa2 = (
    df_musa
    .dropna(subset=['Description'])
    .drop_duplicates(subset=['Description'])
)
len(df_musa2)

436

In [59]:
# Skill extraction for the de-duplicated generic "Manager" postings.
df_m2j = extract_skills_all(df_m2)
df_m2j.head()

7 jobs processed. Time taken: 0:00:16.367012
14 jobs processed. Time taken: 0:00:15.432640
21 jobs processed. Time taken: 0:00:10.965708
28 jobs processed. Time taken: 0:00:19.574557
35 jobs processed. Time taken: 0:00:14.091822
42 jobs processed. Time taken: 0:00:16.882405
49 jobs processed. Time taken: 0:00:13.077847
56 jobs processed. Time taken: 0:00:18.615172
63 jobs processed. Time taken: 0:00:10.135088
70 jobs processed. Time taken: 0:00:17.145819
77 jobs processed. Time taken: 0:00:13.359265
84 jobs processed. Time taken: 0:00:13.703946
91 jobs processed. Time taken: 0:00:24.502049
98 jobs processed. Time taken: 0:00:15.934104
105 jobs processed. Time taken: 0:00:15.277584
112 jobs processed. Time taken: 0:00:12.556205
119 jobs processed. Time taken: 0:00:13.679552
126 jobs processed. Time taken: 0:00:18.760842
133 jobs processed. Time taken: 0:00:12.343077
140 jobs processed. Time taken: 0:00:13.434660
147 jobs processed. Time taken: 0:00:11.196514
Total time taken: 0:05:20.30

Unnamed: 0,job_id,title,company,country,date_posted,description,indeed_skills,skills
0,b93858aaa66216b5,Front Desk Manager,"Marriott International, Inc",Thailand,2021-05-06,"Posting Date May 05, 2021Job Number 21042992Jo...","'R', 'C'","[Accounting, Accuracy, Asset Management, Coach..."
1,cb335346567f593d,Customer Service Manager,L'Oreal,Vietnam,2021-05-05,KEY JOB ACCOUNTABILITIES:Ensure the order-to-c...,"'C', 'CG', 'Excel', 'R'","[Accuracy, Change Management, Collaboration, C..."
2,f311790e03591da4,IT MANAGER,POWER STEEL & ELECTRO-PLATING WORKS SDN BHD,Malaysia,2021-05-05,We are an established manufacturing company in...,"'Microsoft Office', 'D3', 'Golang', 'C', 'Exce...","[Analysis, Analytical Thinking, Audit, Budget,..."
3,0c0af4a0af366017,"Security Manager, Asia Pacific",Wiley,Singapore,2021-04-29,"At Wiley, we welcome you for who you are, the ...","'R', 'Microsoft Office', 'C'","[Collaboration, Creativity, Customer Service, ..."
4,961527fcfa62f250,STELLA – EVENT MANAGER,THE ADORA,Vietnam,2021-05-06,Mô tả công việc:Lên ý tưởng nội dung và hình ả...,"'C', 'Excel', 'PowerPoint'","[C, Design, Microsoft Excel, Microsoft PowerPo..."


In [60]:
# Skill extraction for the de-duplicated USA "Manager" postings.
df_musa2j = extract_skills_all(df_musa2)
df_musa2j.head()

21 jobs processed. Time taken: 0:01:00.830605
42 jobs processed. Time taken: 0:01:11.067973
63 jobs processed. Time taken: 0:01:24.587132
84 jobs processed. Time taken: 0:01:09.385854
105 jobs processed. Time taken: 0:01:16.604754
126 jobs processed. Time taken: 0:01:03.175028
147 jobs processed. Time taken: 0:01:25.282795
168 jobs processed. Time taken: 0:00:55.207781
189 jobs processed. Time taken: 0:01:08.355705
210 jobs processed. Time taken: 0:01:15.352851
231 jobs processed. Time taken: 0:01:22.021525
252 jobs processed. Time taken: 0:01:17.943607
273 jobs processed. Time taken: 0:01:11.040318
294 jobs processed. Time taken: 0:01:10.057737
315 jobs processed. Time taken: 0:01:14.616535
336 jobs processed. Time taken: 0:01:23.722176
357 jobs processed. Time taken: 0:00:58.714897
378 jobs processed. Time taken: 0:01:40.596053
399 jobs processed. Time taken: 0:01:10.983391
420 jobs processed. Time taken: 0:01:15.558137
Total time taken: 0:25:32.271599


Unnamed: 0,job_id,title,company,country,date_posted,description,indeed_skills,skills
0,650c976838d7f54a,Operations Manager,ACE Hardware,USA,2021-05-05,We have an exciting opportunity for a Operatio...,"'SAP', 'Excel', 'Microsoft Office', 'C', 'R', ...","[Analytical, Data Analysis, Hardware, Leadersh..."
1,8400fc2fcd84644c,Assistant Project Manager,Fairbanks North Star Borough,USA,2021-05-04,POSITION TITLE: Assistant Project ManagerSTAT...,"'R', 'Microsoft Office', 'C', 'AutoCAD'","[Accuracy, Administration, Architectural Engin..."
2,06046b646b0c71d1,Corporate Store Assistant Marketing Manager,Murphy USA,USA,2021-05-05,The Corporate Store Assistant Manager (ASM) wi...,"'R', 'C'","[Administration, Closing, Customer Service, Ma..."
3,41fe5651c81f91a0,Manager In Training,McDonald's,USA,2021-05-06,Are you looking for a long-term career withgre...,"'R', 'C'","[Insurance, Management, Metals, Microsoft Exce..."
4,0c6442357823c030,Warehouse Manager,ACE Hardware,USA,2021-05-06,PURPOSE AND SCOPE:The Warehouse Manager will b...,"'R', 'C'","[Budget, Customer Service, Ergonomics, Hardwar..."


In [115]:
def get_skill_count(df, d):
    """Append one ``{'Skill': name, 'Count': 1}`` record per skill mention.

    NOTE: this redefines the earlier dict-based ``get_skill_count`` in this
    notebook; from here on ``d`` must be a list, not a dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``skills`` column holds an iterable of skill names per row.
    d : list
        Output list, extended in place in row order.

    Returns
    -------
    None
    """
    for _, record in df.iterrows():
        d.extend({'Skill': name, 'Count': 1} for name in record['skills'])

# Long-format tally: one record per skill mention, collected across all frames.
skill_count = []

# df_m2j and df_musa2j are each passed twice, which matches the `* 2` factors
# used when `total` is computed further down.
# NOTE(review): presumably a deliberate double weighting -- confirm.
for _frame in (df_dsj, df_daj, df_dej, df_mlej, df_m2j, df_musa2j, df_m2j, df_musa2j):
    get_skill_count(_frame, skill_count)

df_sc = pd.DataFrame.from_dict(skill_count)
df_sc.head()

Unnamed: 0,Skill,Count
0,Applied Statistics,1
1,Automation,1
2,Cloud Service,1
3,Cloud Technology,1
4,Collaboration,1


In [121]:
# Stack every extracted frame into one. pd.concat replaces the chained
# DataFrame.append calls, which are deprecated and removed in pandas 2.0.
# df_m2j and df_musa2j appear twice, matching the `* 2` factors used for `total`.
# NOTE(review): presumably a deliberate double weighting -- confirm.
df_all = pd.concat(
    [df_dsj, df_daj, df_dej, df_mlej, df_m2j, df_m2j, df_musa2j, df_musa2j],
    sort=False,
)
# Shuffle the rows (unseeded, so the order differs on every run) and renumber.
df_all = df_all.sample(frac=1).reset_index(drop=True)
df_all.head()

Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills,job_id
0,assistant manager design engineer (electronic ...,บริษัท จัดหางาน คะเรียลิงค์ (ประเทศไทย) จำกัด,Thailand,2020-05-23,รายละเอียดงานรูปแบบงาน งานประจำจำนวนที่รับ ...,,"[Communication, Design, Electronics, English, ...",
1,Factory Manager,AA PRODUCTS SDN BHD,Malaysia,2021-05-06,"Manufacturing managers plan, oversee and direc...",'C',"[Budget, Engineering, Manufacturing, Sales]",040c1448846b6eb3
2,"Manager, Enrichment/Service Excellence, Custom...",Grab,Singapore,2021-05-06,Get to know our Team :We design and provide pr...,"'R', 'Excel', 'Golang', 'PowerPoint', 'C'","[Analytical, Audit, Automation, Banking, Brain...",b0fbfc0eb67911d5
3,Manufacturing Engineer manager,Hot Shot Staffing,USA,2020-04-10,"• Hire, train, and supervise engineering staff...","'C', 'R'","[Accuracy, Analytical, Budget, Business Admini...",
4,Senior Product Manager,Hopper,USA,2020-05-23,at hopper we’re on a mission to make booking ...,,"[Airlines, Banking, Big Data, Construction, Cr...",


In [122]:
# Postings whose title contains "data", matched case-insensitively.
df_all.loc[df_all['title'].str.contains('data', case=False)]

Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills,job_id
21,Data & AI Senior Manager,Monroe Consulting Group,Indonesia,2021-03-26,Data & AI Strategy Senior ManagerAccenture is ...,"'C', 'Machine Learning', 'Artificial Intellige...","[Analysis, Analytical, Artificial Intelligence...",
25,Data / Analytics Manager,Bioworld Merchandising,USA,2020-05-28,bioworld merchandising is a leading design and...,"'Excel', 'SQL'","[Analytical, Apparel, Artificial Intelligence ...",
31,Data Analytics & BI Manager,Michael Page TH,Thailand,2021-04-13,Reputable with top-notch products diverse comp...,"'Data Analysis', 'R', 'C'","[Analytical, Business Intelligence (BI), Data ...",
36,"Program Manager, Data Science and Applied Comp...",University of Wisconsin Extended Campus,USA,2020-05-09,NOTE: The University of Wisconsin System is en...,"'C', 'R'","[Administrative Law, Array, Budget, Collaborat...",
45,Data Modeling Assistant Manager,Argyll Scott,Thailand,2020-05-19,one of the leading insurance companies is recr...,"'Python', 'SAS', 'SQL'","[Analytical Thinking, Applied Mathematics, Com...",
...,...,...,...,...,...,...,...,...
5207,Engineering Manager - Data,Convr,USA,2020-05-23,about us convr is a growing startup in the ins...,,"[Agile, Architecture, Communication, D3.js, Da...",
5223,Sr. Engineering Manager - Data & Analytics,ProClinical,USA,2020-05-14,proclinical is currently recruiting for a seni...,"'C', 'R'","[Agile, Analytical, Apache Kafka, Apache Spark...",
5224,"Global Compensation Manager / Data Analyst, Re...",Wilbur-Ellis Company,USA,2020-05-12,do you have experience in north america and as...,"'C', 'Excel', 'R'","[Agricultural Production, Analysis, Architectu...",
5227,Data & Applied Scientist Manager,Microsoft,USA,2020-04-10,Core Services Engineering builds and manages t...,"'C', 'Go', 'R'","[Business Acumen, Communication, Data Analytic...",


In [120]:
# Fraction of rows per country. df_all contains df_m2j and df_musa2j twice,
# so those postings are counted double in these shares.
df_all['country'].value_counts(normalize=True)

USA            0.713470
Singapore      0.094178
Malaysia       0.064117
Thailand       0.047374
Vietnam        0.033866
Indonesia      0.025495
Philippines    0.021499
Name: country, dtype: float64

In [116]:
# Denominator for the skill percentages: every role frame counted once and the
# two generic manager frames counted twice, mirroring how they were tallied.
total = (
    sum(len(_frame) for _frame in (df_dsj, df_daj, df_dej, df_mlej))
    + 2 * (len(df_m2j) + len(df_musa2j))
)
total

5256

In [117]:
# Collapse the long-format records to one row per skill, add each skill's count
# as a percentage of `total` (two decimals), and order by count, highest first.
df_sc2 = (
    df_sc.groupby('Skill').sum().reset_index()
    .assign(Percent=lambda counts: counts['Count'].map(lambda n: round(n / total * 100, 2)))
    .sort_values(by='Count', ascending=False)
)
df_sc2.head()

Unnamed: 0,Skill,Count,Percent
777,Leadership,1854,35.27
275,Communication,1849,35.18
1434,Training,1829,34.8
424,Design,1624,30.9
1108,Project Management,1308,24.89


In [118]:
# Persist the ranked skill table. The index is written too (to_csv default), so
# the first CSV column holds the pre-sort row labels, not a 0..n-1 rank.
df_sc2.to_csv('manager_skills.csv')