# Extracting Text from Online Job Postings

Read in the saved HTML job postings and extract the title, body text, and bullet-point list items from each one.

## Import required packages

In [1]:
import glob
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import pickle

Check the working directory

In [2]:
# Display the current working directory; the relative "./data/..." paths
# used in this notebook are resolved against it.
os.getcwd()

'E:\\resume-job-posting-nlp-project'

## Load in the html files and parse them

In [3]:
# Glob pattern (relative to the working directory) matching the HTML
# job-posting files to parse
path = "./data/html_job_postings/*.html"

# function to read and parse every HTML file matching a glob pattern
def read_files_in_folder(path):
    """Parse every HTML file matching ``path`` into a dataframe of job postings.

    Parameters
    ----------
    path : str
        Glob pattern selecting the HTML files to read,
        e.g. ``"./data/html_job_postings/*.html"``.

    Returns
    -------
    pandas.DataFrame
        One row per matched file with the columns ``title`` (text of the
        first ``<title>`` tag), ``body`` (text of the first ``<body>`` tag)
        and ``bullets`` (list holding the text of every ``<li>`` inside the
        body). When a document has no ``<title>`` or ``<body>`` the
        corresponding text is ``''`` (and ``bullets`` is ``[]``) instead of
        raising ``IndexError``. If no file matches, the frame is empty but
        still has the three columns.
    """
    columns = ['title', 'body', 'bullets']
    records = []
    # glob returns matches in arbitrary order; sorting keeps the row order
    # reproducible from one run (and one machine) to the next.
    for file in sorted(glob.glob(path)):
        # The with-block closes the file on exit, so no explicit close() is needed.
        with open(file, 'r', encoding='utf-8') as f:
            html_string = f.read()
        soup = BeautifulSoup(html_string, 'lxml')
        # find() returns the first matching tag, or None when there is none.
        title = soup.find('title')
        body = soup.find('body')
        if body is not None:
            bullets = [bullet.text for bullet in body.find_all('li')]
        else:
            bullets = []
        records.append({
            'title': title.text if title is not None else '',
            'body': body.text if body is not None else '',
            'bullets': bullets,
        })
    # Build the frame once from the collected records: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row copies it on
    # every iteration.
    return pd.DataFrame(records, columns=columns)

# Parse every file matched by `path` into a single dataframe
df = read_files_in_folder(path)

Check the parsed information in the dataframe

In [4]:
# Preview the first rows of the parsed postings (title, body, bullets)
df.head()

Unnamed: 0,title,body,bullets
0,"Data Engineer - Columbus, GA 31909","Data Engineer - Columbus, GA 31909\nCelebratin...","[Bachelor’s or Master’s degree in statistics, ..."
1,"Data Analyst - St. Louis, MO","Data Analyst - St. Louis, MO\nDuties\nSummary\...",[Job family (Series)\n1501 General Mathematics...
2,"Data Scientist - Newark, CA","Data Scientist - Newark, CA\nData Scientist\n\...","[ Design, develop, document and maintain machi..."
3,Patient Care Assistant / PCA - Med/Surg (Fayet...,Patient Care Assistant / PCA - Med/Surg (Fayet...,[Provides all personal care services in accord...
4,"Scientific Programmer - Berkeley, CA","Scientific Programmer - Berkeley, CA\nCaribou ...","[Demonstrated proficiency with Python, JavaScr..."


## Filter to only return data scientist roles

In [5]:
# Keep postings whose title mentions "Data Scientist" or "Data Science".
# One combined mask replaces the filter-twice-and-append approach:
# DataFrame.append was removed in pandas 2.0, and appending the two filtered
# frames duplicated any posting whose title contains both phrases.
# na=False treats a missing title as "no match"; .copy() makes ds_df an
# independent frame so later modifications do not touch (or warn about) df.
is_ds_title = df.title.str.contains('Data Scientist|Data Science', na=False)
ds_df = df.loc[is_ds_title, :].copy()

Drop duplicate postings (rows with the same title and body), printing the dataframe shape before and after

In [6]:
# Renumber the rows from 0, discarding the index carried over from df
ds_df = ds_df.reset_index(drop=True)
print(ds_df.shape)  # shape before de-duplication
# Two postings count as duplicates when both title and body are identical;
# the first occurrence is kept
ds_df = ds_df.drop_duplicates(subset=['title', 'body'])
print(ds_df.shape)  # shape after de-duplication

(491, 3)
(485, 3)


Check the filtered, de-duplicated data science postings

In [7]:
# Preview the first rows of the filtered data science postings
ds_df.head()

Unnamed: 0,title,body,bullets
0,"Data Scientist - Newark, CA","Data Scientist - Newark, CA\nData Scientist\n\...","[ Design, develop, document and maintain machi..."
1,PwC Labs - Jr. Data Scientist - Machine Learni...,PwC Labs - Jr. Data Scientist - Machine Learni...,[Invite and provide evidence-based feedback in...
2,"Senior Data Scientist - Sunnyvale, CA 94089","Senior Data Scientist - Sunnyvale, CA 94089\nI...",[Ability to mentor and up level junior data sc...
3,"Data Scientist - Seattle, WA","Data Scientist - Seattle, WA\nMS with 2+ years...",[MS with 2+ years of industry experience or Ba...
4,"Data Scientist - Pasadena, CA 91107","Data Scientist - Pasadena, CA 91107\nJob Type:...",[Use statistical and programming software comb...


## Pickle the object for later use

In [8]:
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes, rather than being left open until garbage
# collection.
with open("./data/data_science_jobs.pkl", "wb") as pkl_file:
    pickle.dump(ds_df, pkl_file)