In [1]:
import pandas as pd
import so_utils

# Load Job Postings

Before running this notebook, save the `all_jobs` DataFrame produced by `1.0-dn-job-posting-ingestion.ipynb` to `data/postings/all_jobs.csv`.

In [2]:
# Load the combined job postings exported by the ingestion notebook.
all_jobs_file = '../data/postings/all_jobs.csv'
all_jobs = pd.read_csv(all_jobs_file)

# Fixed seed so the preview shows the same rows on every Restart & Run All.
all_jobs.sample(10, random_state=42)

Unnamed: 0,job_title,company,location,description,date_posted,type,categories,levels,expired,language,data_source,new_description,cleaned_description
834,"Performance Engineer (Web Performance, CDN)",InfoObjects Inc,"San Francisco, CA","Bachelor or advanced degree in IT, Computer Sc...",2019-09-23 22:29:21+00:00,,,,0.0,en,indeed,,.
1123,Jr. Software Developer,Windhaven Insurance,"Austin, TX 78759",Bachelor's degree in Computer Science or relat...,2019-06-25 17:47:22+00:00,,,,0.0,en,indeed,,.
519,Principal Engineer Electronics,Northrop Grumman,"Baltimore, MD",<p><b>Northrop Grumman Mission Systems Airborn...,2019-09-17T21:33:39.833784Z,,[{'name': 'Engineering'}],[],,,muse,,.
223,Front-End Developer - CIMD - Marcus by Goldman...,Goldman Sachs,"Chicago, IL",<p><br><br><strong>CONSUMER (MARCUS BY GOLDMAN...,2019-08-28T17:48:36.227623Z,,[{'name': 'Engineering'}],"[{'name': 'Mid Level', 'short_name': 'mid'}]",,,muse,,.
4002,Reporting and Analytics Analyst,Babylon Health,"Austin, TX",Develop strong understanding of both internal ...,2019-06-27 12:21:17+00:00,,,,0.0,en,indeed,,.
319,Software Engineer,Alto Pharmacy,"San Francisco, CA","<p>As one of the first 15 engineers at Alto, y...",2019-09-24T05:32:30.488569Z,,[{'name': 'Engineering'}],"[{'name': 'Mid Level', 'short_name': 'mid'}]",,,muse,"As one of the first 15 engineers at Alto, you ...",As one of the first engineers at Alto you ...
4233,Data Engineer,Slice,"New York, NY 10017",Experience performing root cause analysis on i...,2019-09-19 16:36:30+00:00,,,,0.0,en,indeed,,.
2232,Senior Software Engineer,Amherst,"Austin, TX 78746",BS/BA degree in a technical field such as Comp...,2019-09-06 18:27:17+00:00,,,,0.0,en,indeed,,.
944,Web and Mobile Test engineers,World IP Phone,"Mountain View, CA",Knowledge of how to use mobile/web testing too...,2019-10-01 02:09:01+00:00,,,,0.0,en,indeed,,.
3186,Data Strategist,Retina AI,"Santa Monica, CA","At Retina, we enable businesses to tell their ...",2019-09-19 22:05:10+00:00,,,,0.0,en,indeed,,.


## Get Stackoverflow Tags from Description

**Note**: Tags are taken from the [Stack Exchange Data Explorer](https://data.stackexchange.com/stackoverflow/query/new) using the query:

```
    Select TagName, [Count]
    From Tags
    Order By Count Desc;
```

For convenience, the Stack Overflow tags can also be downloaded from this [gdrive link](https://drive.google.com/open?id=1hFhUIBN7_Ov859cg7ONrUzlPg3VjVrS0).

In [3]:
# CSV of Stack Overflow tag names and counts (see the query above).
tags_file = '../data/stackoverflow/QueryResults.csv'

# Drop rows with missing values, then hand the table to the project helper.
# NOTE(review): min_count=3000 presumably keeps only tags used at least
# 3000 times — confirm against so_utils.preprocess_stackoverflow_tags.
raw_tags = pd.read_csv(tags_file).dropna()
tags = so_utils.preprocess_stackoverflow_tags(raw_tags, min_count=3000)

print(tags.shape)
tags.head(3)

(1796, 2)


Unnamed: 0,TagName,Count
0,JAVASCRIPT,1872748
1,JAVA,1588289
2,C#,1344882


In [4]:
# Try the tag extraction on a single posting (row position 3) and show
# both the matched tag names and the cleaned text they were found in.
sample_description = all_jobs['description'].iloc[3]
text = so_utils.clean_text(sample_description)
extracted_tags = so_utils.get_stackoverflow_tags_from_text(text, tags)

# Each extracted entry is indexable; element 0 is what gets printed as the tag.
tag_names = [entry[0] for entry in extracted_tags]
print('Tags: {}\n'.format(tag_names))
print('Text:\n===== \n{}'.format(text))

Tags: ['JAVA', 'PYTHON', 'SQL', 'REACT', 'NODEJS', 'DJANGO', 'GIT', 'POSTGRESQL', 'TYPESCRIPT', 'APACHE', 'REST', 'REACT-NATIVE', 'HTTP', 'ELASTICSEARCH', 'AWS', 'XAMARIN', 'JENKINS', 'D3JS', 'FLASK', 'WEBPACK', 'MOBILE', 'KAFKA', 'HTTPS', 'APACHE-KAFKA', 'STATISTICS', 'NOSQL', 'LEAFLET', 'JWT', 'JIRA', 'IONIC', 'NATIVE', 'BOKEH', 'COMPUTER-SCIENCE']

Text:
===== 
Aon Cyber Solutions is looking for a Senior Developer in NY or LA. As part of an industryleading team, you will help empower results for our clients by delivering innovative and effective solutions supporting Risk.
APPLY HERE: https://usstrozfriedbergaon.icims.com/jobs/24315/seniordeveloper/job
Position Overview
Join our expert software development team to create new customerfacing web applications. You will apply your expertise in web development and user interface design, working in concert with subject matter experts in digital forensics, incident response, cybersecurity, and threat intelligence. Your applications will con

In [5]:
from tqdm import tqdm

# Build one space-separated string of Stack Overflow tag names per posting,
# in the same order as the rows of `all_jobs`.
#
# Iterating the 'description' column directly avoids the per-row Series that
# DataFrame.iterrows() constructs, and the comprehension keeps the loop
# variables from overwriting notebook-level names set by earlier cells.
#
# NOTE(review): the single-posting demo above runs so_utils.clean_text()
# before extraction, while this cell passes the raw description — confirm
# whether that difference is intentional.
skills = [
    ' '.join(
        tag[0]
        for tag in so_utils.get_stackoverflow_tags_from_text(description, tags)
    )
    for description in tqdm(all_jobs['description'], total=len(all_jobs))
]

100%|██████████████████████████████████████████████████████████████████████████████| 4822/4822 [06:41<00:00, 12.02it/s]


In [6]:
# Attach the per-posting skill strings as the 'expected_skills' column
# and preview the first rows of the enriched frame.
all_jobs = all_jobs.assign(expected_skills=skills)
all_jobs.head(3)

Unnamed: 0,job_title,company,location,description,date_posted,type,categories,levels,expired,language,data_source,new_description,cleaned_description,expected_skills
0,Lead Software Engineer,Simon & Schuster,"New York, NY",<p>Simon &amp; Schuster is seeking a Lead Soft...,2019-09-11 18:59:59+00:00,Full Time,,,,,github,Simon and Schuster is seeking a Lead Software ...,Simon and Schuster is seeking a Lead Software ...,JAVASCRIPT PYTHON SQL LINUX AZURE AWS GITHUB N...
1,Web Developer,Noe & Associates,"New York, NY",<p>A leading international branding and design...,2019-09-09 15:29:25+00:00,Full Time,,,,,github,A leading international branding and design co...,A leading international branding and design co...,JAVASCRIPT PHP HTML RUBY LINUX WORDPRESS HTML5...
2,Front End Developer,Prosek Partners,"Fairfield, CT","<p>Background</p>\n<p>Prosek Partners, one of ...",2018-11-19 19:40:01+00:00,Full Time,,,,,github,"Background Prosek Partners, one of the top 25 ...",Background Prosek Partners one of the top ...,JAVASCRIPT UI CSS UX WINDOWS HTML5 CSS3 DOM CR...


In [7]:
# Persist the postings together with their extracted skills.
# index=False keeps the DataFrame's row index out of the written file.
all_jobs_v1_file = '../data/postings/all_jobs_v1.csv'
all_jobs.to_csv(all_jobs_v1_file, index=False)

In [None]:
# TODO: unfinished scratch cell. The original statement `for i in range()`
# had no arguments, colon, or body and raised a SyntaxError when run;
# implement the intended loop here or delete this cell.