In [2]:
import pandas as pd
import so_utils

# Load Job Postings

Save the `all_jobs` dataframe output of `1.0-dn-job-posting-ingestion.ipynb` to `data/postings/all_jobs.csv`.

In [8]:
all_jobs_file = '../data/postings/all_jobs.csv'
all_jobs = pd.read_csv(all_jobs_file)
all_jobs.sample(10)

Unnamed: 0,job_title,company,location,description,date_posted
3714,Temporary Data Scientist,The University of Chicago,"Chicago, IL",Under the guidance of the faculty director con...,2019-09-25 22:42:10+00:00
1240,"Drupal Developer, Junior",Booz Allen Hamilton,"Washington, DC","Weâ€™ll help you develop the career you want, ...",2019-09-15 14:30:10+00:00
2739,"Full Stack Developer, Mid",Booz Allen Hamilton,"Washington, DC","Weâ€™ll help you develop the career you want, ...",2019-06-21 05:37:34+00:00
3530,Software Engineer - Junior (C++/Python),Boston Dynamics,"Waltham, MA 02451","BS in Computer Science, Engineering, Math or e...",2019-09-26 23:26:57+00:00
3901,Project Scientist - Research Center for Health...,Cedars-Sinai,"West Hollywood, CA",Project Scientists are not required to carry o...,2019-09-20 18:47:37+00:00
4200,Data Science Manager,Freebird,"Cambridge, MA 02139",We are looking for a Data Science Manager to h...,2019-09-10 09:49:05+00:00
4730,Full Stack Engineer - Product Development,cloudtamer.io,"Fulton, MD 20759",The Full Stack Engineer will take point in dat...,2019-09-14 15:23:07+00:00
1884,Backend Engineer,SmartAsset,"New York, NY 10012","Our personal finance tools, calculators and co...",2019-09-19 22:02:05+00:00
1406,Front End Web Developer @ Harvard Innovation L...,Ignite Mental Health,"Boston, MA",Own the mission of our organization and wear w...,2019-09-10 20:50:21+00:00
2291,Senior ServiceNow Developer,Guidehouse,"Washington, DC","As a senior platform developer on our team, yo...",2019-09-12 01:34:24+00:00


## Get Stackoverflow Tags from Description

**Note**: Tags are taken from [StackExchange Data Explorer](https://data.stackexchange.com/stackoverflow/query/new) using the query:

```
    Select TagName, [Count]
    From Tags
    Order By Count Desc;
````
For convenience, Stackoverflow Tags can be downloaded from this [gdrive link](https://drive.google.com/open?id=1hFhUIBN7_Ov859cg7ONrUzlPg3VjVrS0). 

In [4]:
tags_file = '../data/stackoverflow/QueryResults.csv'

tags = pd.read_csv(tags_file).dropna()
tags = so_utils.preprocess_stackoverflow_tags(tags, min_count=3000)

print(tags.shape)
tags.head(3)

(1796, 2)


Unnamed: 0,TagName,Count
0,JAVASCRIPT,1872748
1,JAVA,1588289
2,C#,1344882


In [5]:
text = all_jobs.iloc[3]['description']
text = so_utils.clean_text(text)
extracted_tags = so_utils.get_stackoverflow_tags_from_text(text, tags)
print('Tags: {}\n'.format([tag[0] for tag in extracted_tags]))
print('Text:\n===== \n{}'.format(text))

Tags: ['JAVA', 'PYTHON', 'SQL', 'REACT', 'NODEJS', 'DJANGO', 'GIT', 'POSTGRESQL', 'TYPESCRIPT', 'APACHE', 'REST', 'REACT-NATIVE', 'HTTP', 'ELASTICSEARCH', 'AWS', 'XAMARIN', 'JENKINS', 'D3JS', 'FLASK', 'WEBPACK', 'MOBILE', 'KAFKA', 'HTTPS', 'APACHE-KAFKA', 'STATISTICS', 'NOSQL', 'LEAFLET', 'JWT', 'JIRA', 'IONIC', 'NATIVE', 'BOKEH', 'COMPUTER-SCIENCE']

Text:
===== 
Aon Cyber Solutions is looking for a Senior Developer in NY or LA. As part of an industryleading team, you will help empower results for our clients by delivering innovative and effective solutions supporting Risk.
APPLY HERE: https://usstrozfriedbergaon.icims.com/jobs/24315/seniordeveloper/job
Position Overview
Join our expert software development team to create new customerfacing web applications. You will apply your expertise in web development and user interface design, working in concert with subject matter experts in digital forensics, incident response, cybersecurity, and threat intelligence. Your applications will con

In [6]:
from tqdm import tqdm

skills = []
for index, row in tqdm(all_jobs.iterrows(), total=len(all_jobs)):
    text = row['description']
    extracted_tags = so_utils.get_stackoverflow_tags_from_text(text, tags)
    skills.append(' '.join([tag[0] for tag in extracted_tags]))

100%|██████████████████████████████████████████████████████████████████████████████| 4822/4822 [06:39<00:00, 12.06it/s]


In [9]:
all_jobs['expected_skills'] = skills
all_jobs.head(3)

Unnamed: 0,job_title,company,location,description,date_posted,expected_skills
0,Lead Software Engineer,Simon & Schuster,"New York, NY",<p>Simon &amp; Schuster is seeking a Lead Soft...,2019-09-11 18:59:59+00:00,JAVASCRIPT PYTHON SQL LINUX AZURE AWS GITHUB N...
1,Web Developer,Noe & Associates,"New York, NY",<p>A leading international branding and design...,2019-09-09 15:29:25+00:00,JAVASCRIPT PHP HTML RUBY LINUX WORDPRESS HTML5...
2,Front End Developer,Prosek Partners,"Fairfield, CT","<p>Background</p>\n<p>Prosek Partners, one of ...",2018-11-19 19:40:01+00:00,JAVASCRIPT UI CSS UX WINDOWS HTML5 CSS3 DOM CR...


In [10]:
all_jobs.to_csv('../data/postings/all_jobs.csv', index=False)