In [1]:
import pandas as pd
import so_utils

# Load Job Postings

Prerequisite: before running this notebook, save the `all_jobs` DataFrame produced by `1.0-dn-job-posting-ingestion.ipynb` to `data/postings/all_jobs.csv`.

In [13]:
# Path to the job-postings CSV; the final save cell of this notebook writes
# back to this same file.
all_jobs_file = './data/postings/all_jobs.csv'
all_jobs = pd.read_csv(all_jobs_file)
# Preview ten random postings. The seed is fixed so the displayed rows are the
# same on every Restart & Run All.
all_jobs.sample(10, random_state=0)

Unnamed: 0,job_title,company,location,description,date_posted
4056,Operations Research Analyst / Data Scientist,"Apogee Integration, LLC","Alexandria, VA","Bachelor of Science in Operations Research, St...",2019-08-28 17:19:11+00:00
3706,Data Engineer,Cars.com,"Chicago, IL 60604",We are looking for a Data Engineer to join the...,2019-09-06 16:44:45+00:00
3968,Data Scientist,Oracle,"Austin, TX","BS, MS or advanced professional certificate in...",2019-09-28 02:15:54+00:00
1745,Front End Developer,Fetch,"Washington, DC","A Bachelorâ€™s Degree in Computer Science, Eng...",2019-09-09 22:11:08+00:00
4649,Data Engineer,Logistics Management Institute,"Washington, DC","Develop data architecture and design products,...",2019-08-08 21:20:22+00:00
1197,"Front End Developer, Consulting",Yext,"Washington, DC","Develop, maintain and improve front-end testin...",2019-09-30 20:31:50+00:00
1381,React & Redux Developer,Prescribe Wellness LLC,"Quincy, MA 02169",They are also responsible with gaining an unde...,2019-10-01 17:42:03+00:00
3664,Data Support Analyst - Entry-Level - Tech Startup,Enterprise Peak,"Chicago, IL",Opportunities will be presented to develop new...,2019-09-30 15:22:49+00:00
4341,Big Data Engineer,Chase,"San Francisco, CA 94107",Youâ€™ll support the delivery of award winning...,2019-08-02 04:40:52+00:00
150,Senior Web Engineer - Frontend,"{'id': 11755, 'short_name': 'beeswax', 'name':...","[{'name': 'New York, NY'}]",<p><span>Do you love building intuitive UX and...,2019-08-17T02:30:28.420480Z


## Get Stack Overflow Tags from Description

**Note**: Tags are taken from the [Stack Exchange Data Explorer](https://data.stackexchange.com/stackoverflow/query/new) using the query:

```
    Select TagName, [Count]
    From Tags
    Order By Count Desc;
```

For convenience, the Stack Overflow tags can also be downloaded from this [Google Drive link](https://drive.google.com/open?id=1hFhUIBN7_Ov859cg7ONrUzlPg3VjVrS0) and saved to `data/stackoverflow/QueryResults.csv`.

In [4]:
# Location of the Stack Exchange Data Explorer export (TagName / Count columns).
tags_file = './data/stackoverflow/QueryResults.csv'

# Read the export, discard rows with missing values, then hand the frame to the
# project helper for preprocessing.
# NOTE(review): min_count=3000 presumably drops tags whose Count is below 3000;
# confirm against so_utils.preprocess_stackoverflow_tags.
tags = (
    pd.read_csv(tags_file)
    .dropna()
    .pipe(so_utils.preprocess_stackoverflow_tags, min_count=3000)
)

print(tags.shape)
tags.head(3)

(1798, 2)


Unnamed: 0,TagName,Count
0,JAVASCRIPT,1872748
1,JAVA,1588289
2,C#,1344882


In [5]:
# Sanity-check the tag extraction on a single posting (positional row 3):
# clean its description, extract the tags, then print both for inspection.
text = so_utils.clean_text(all_jobs['description'].iloc[3])
extracted_tags = so_utils.get_stackoverflow_tags_from_text(text, tags)

# Element [0] of each extracted item is what gets displayed as the tag.
tag_names = [tag[0] for tag in extracted_tags]
print('Tags: {}\n'.format(tag_names))
print('Text:\n===== \n{}'.format(text))

Tags: ['JAVA', 'PYTHON', 'SQL', 'REACT', 'NODEJS', 'DJANGO', 'GIT', 'POSTGRESQL', 'TYPESCRIPT', 'APACHE', 'REST', 'REACT-NATIVE', 'HTTP', 'ELASTICSEARCH', 'AWS', 'XAMARIN', 'JENKINS', 'D3JS', 'FLASK', 'WEBPACK', 'MOBILE', 'KAFKA', 'HTTPS', 'APACHE-KAFKA', 'STATISTICS', 'NOSQL', 'LEAFLET', 'JWT', 'JIRA', 'IONIC', 'NATIVE', 'BOKEH', 'COMPUTER-SCIENCE']

Text:
===== 
Aon Cyber Solutions is looking for a Senior Developer in NY or LA. As part of an industryleading team, you will help empower results for our clients by delivering innovative and effective solutions supporting Risk.
APPLY HERE: https://usstrozfriedbergaon.icims.com/jobs/24315/seniordeveloper/job
Position Overview
Join our expert software development team to create new customerfacing web applications. You will apply your expertise in web development and user interface design, working in concert with subject matter experts in digital forensics, incident response, cybersecurity, and threat intelligence. Your applications will con

In [6]:
from tqdm import tqdm

# Extract the Stack Overflow tags of every posting and keep them as one
# space-separated string per row, in the same order as `all_jobs`.
#
# The `description` column is iterated directly: `iterrows()` would build a
# full Series for each row just to read this one field. Using a comprehension
# also keeps the per-row temporaries local, so the `text` / `extracted_tags`
# variables set by the single-posting preview cell are not overwritten.
#
# NOTE(review): the preview cell passes the description through
# `so_utils.clean_text` before extraction, while the raw description is used
# here. Confirm whether `get_stackoverflow_tags_from_text` cleans its input
# itself or whether `clean_text` should be applied here as well.
skills = [
    ' '.join(
        tag[0]
        for tag in so_utils.get_stackoverflow_tags_from_text(description, tags)
    )
    for description in tqdm(all_jobs['description'], total=len(all_jobs))
]

100%|██████████████████████████████████████████████████████████████████████████████| 4822/4822 [06:13<00:00, 12.90it/s]


In [14]:
# Attach the extracted tags as the `languages` column (one space-separated
# string per posting) and rebind `all_jobs` to the extended frame.
all_jobs = all_jobs.assign(languages=skills)
all_jobs.head(3)

Unnamed: 0,job_title,company,location,description,date_posted,languages
0,Lead Software Engineer,Simon & Schuster,"New York, NY",<p>Simon &amp; Schuster is seeking a Lead Soft...,2019-09-11 18:59:59+00:00,JAVASCRIPT PYTHON SQL LINUX AZURE AWS GITHUB N...
1,Web Developer,Noe & Associates,"New York, NY",<p>A leading international branding and design...,2019-09-09 15:29:25+00:00,JAVASCRIPT PHP HTML RUBY LINUX WORDPRESS HTML5...
2,Front End Developer,Prosek Partners,"Fairfield, CT","<p>Background</p>\n<p>Prosek Partners, one of ...",2018-11-19 19:40:01+00:00,JAVASCRIPT UI CSS UX WINDOWS HTML5 CSS3 DOM CR...


In [15]:
# Persist the postings, now including the `languages` column, back to the CSV
# that the load cell reads.
# `index=False` keeps the row index out of the file: the load cell calls
# `pd.read_csv` without `index_col`, so a written index would come back as a
# spurious `Unnamed: 0` column, with another one added on every re-run.
# NOTE(review): this overwrites the notebook's own input file in place; any
# other reader that relies on `index_col=0` for this file must be adjusted.
all_jobs.to_csv('./data/postings/all_jobs.csv', index=False)