In [12]:
import numpy as np
import pandas as pd
import re
from datetime import datetime as dt
from sqlalchemy import create_engine
from secrets import secrets
from skill_api import extract_skills, extract_ignore

In [2]:
SKILLS = []

with open("skills/new_skills.txt") as f:
    for s in f:
        SKILLS.append(s.strip('\n'))

SKILLS = list(set(SKILLS))
SKILLS.sort()
print(SKILLS)

['AB Initio', 'AWS Glue', 'Airbyte', 'Alooma', 'Alteryx', 'Apache Accumulo', 'Apache Airflow', 'Apache Camel', 'Apache Kafka', 'Apache Mahout', 'Apache NiFi', 'Apache Nutch', 'Apatar', 'Atom', 'Azure Data Factory', 'BusinessObjects Data Integrator', 'CloverDX', 'DBConvert Studio', 'DBSoftlab', 'Data Integration Studio', 'Denodo', 'Etleap', 'FME', 'Fivetran', 'FlyData', 'GeoKettle', 'HPCC Systems', 'Hevo Data', 'Hypertable', 'IRI Voracity', 'InfoSphere DataStage', 'InfoSphere Information Server', 'Informatica PowerCenter', 'Jasper', 'Jaspersoft ETL', 'KETL', 'Logstash', 'Matillion', 'Oracle Data Integrator (ODI)', 'Oracle Golden Gate (OGG)', 'Oracle Warehouse Builder', 'Panoply', 'Pentaho', 'Pentaho Data Integration (PDI)', 'Qlik Replicate', 'SQL Server Integrated Services (SSIS)', 'Scriptella', 'Segment', 'Singer', 'Skyvia', 'Sprinkle', 'Stitch', 'Striim', 'Sybase ETL', 'TIBCO Messaging', 'Talend', 'Xplenty']


In [10]:
query = """
select Job_ID, Description
from Data_Analyst
where Date_Posted > '2020-04-01'
and Description != 'No Description'
union
select Job_ID, Description
from Data_Engineer
where Date_Posted > '2020-04-01'
and Description != 'No Description'
union
select Job_ID, Description
from Data_Scientist
where Date_Posted > '2020-04-01'
and Description != 'No Description'
union
select Job_ID, Description
from Machine_Learning_Engineer
where Date_Posted > '2020-04-01'
and Description != 'No Description'
"""

engine = create_engine(secrets['indeed_db'])
df = pd.read_sql(query, engine)
engine.dispose()
df = df.drop_duplicates()
print(len(df))
df.head()

110361


Unnamed: 0,Job_ID,Description
0,52559f1046316681,Looking for a candidate who loves crunching nu...
1,e61ab28dad96531e,"APAC Data Informatics Analyst, APAC Sales Info..."
2,45d36230efb28afa,"APAC Data Informatics Analyst, APAC Sales Info..."
3,4a24763c53d22529,Data Analyst (Quality)ResponsibilitiesLocate a...
4,8b805d72b542a3d3,Job Description:Work closely with Warehouse te...


In [13]:
descriptions = df['Description'].unique().tolist()
len(descriptions)

103334

In [14]:
i = 0
initial = dt.now()
interval = dt.now()
print_every = int(len(df) / 20)
skill_count = {s: 0 for s in SKILLS}

for d in descriptions:
    i += 1
    if i % print_every == 0:
        print("{} jobs processed. Time taken: {}".format(i, dt.now() - interval))
        interval = dt.now()
    skills = extract_skills(d, SKILLS)
    for s in skills:
        skill_count[s] += 1

print("Total time taken: {}".format(dt.now() - initial))

5518 jobs processed. Time taken: 0:04:12.095799
11036 jobs processed. Time taken: 0:05:12.495792
16554 jobs processed. Time taken: 0:05:06.056115
22072 jobs processed. Time taken: 0:03:59.341730
27590 jobs processed. Time taken: 0:04:17.448146
33108 jobs processed. Time taken: 0:03:50.534780
38626 jobs processed. Time taken: 0:05:32.008812
44144 jobs processed. Time taken: 0:05:41.172955
49662 jobs processed. Time taken: 0:05:43.978704
55180 jobs processed. Time taken: 0:05:47.236158
60698 jobs processed. Time taken: 0:05:47.791418
66216 jobs processed. Time taken: 0:05:47.308954
71734 jobs processed. Time taken: 0:05:54.197182
77252 jobs processed. Time taken: 0:04:37.469573
82770 jobs processed. Time taken: 0:03:37.162193
88288 jobs processed. Time taken: 0:04:34.912025
93806 jobs processed. Time taken: 0:04:34.925985
99324 jobs processed. Time taken: 0:05:07.093099
Total time taken: 1:32:34.452708


In [26]:
temp_dict = []

for s in skill_count:
    temp_dict.append({'skill': s, 'count': skill_count[s]})

df_s = pd.DataFrame.from_dict(temp_dict)
df_s.head()

Unnamed: 0,skill,count
0,AB Initio,98
1,AWS Glue,164
2,Airbyte,0
3,Alooma,7
4,Alteryx,478


In [28]:
df_s['percent %'] = round(df_s['count'] * 100 / len(descriptions), 2)
df_s = df_s.sort_values(by='count', ascending=False).reset_index(drop=True)
df_s.iloc[1:21]

Unnamed: 0,skill,count,percent %
1,SQL Server Integrated Services (SSIS),835,0.81
2,Xplenty,505,0.49
3,Alteryx,478,0.46
4,Talend,476,0.46
5,Apache Kafka,299,0.29
6,Azure Data Factory,235,0.23
7,Logstash,185,0.18
8,Pentaho,172,0.17
9,Apache Airflow,170,0.16
10,AWS Glue,164,0.16
