# Imports

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup as bs

# Data

In [2]:
posting_dir = Path('data/html_job_postings/')

In [3]:
posting_list = []

for file in posting_dir.glob('*.html'):
    posting_list.append([file, file.read_text()])

In [4]:
posting_list[0]

[PosixPath('data/html_job_postings/1e92960a19ffdd34_fccid.html'),
 '<html><head><title>Quantitative Analyst - Boston, MA 02116</title></head>\n<body><h2>Quantitative Analyst - Boston, MA 02116</h2>\n<p>Quantitative Analyst (State Street Bank and Trust Company; Boston, MA): The Quantitative Analyst will be part of State Street Treasury’s Treasury Quantitative Analytics (TQA) group. TQA is responsible for developing/implementing/monitoring advanced financial models that are used in company’s capital management, liquidity management, investment portfolio construction, and balance sheet optimization. The group is accountable for in-depth understanding, modeling, and representation of the complex interaction of global markets, customer behaviors, and regulatory oversights to create a view of risk/revenue opportunities and exposures to the investment committee, Board of Directors, senior management, and regulatory agencies. The Quantitative Analyst role is a key contributor to the realizatio

# Data Extraction

In [5]:
def extract_data(html_content):
    
    soup = bs(html_content)

    title_text = soup.find('title').text
    body_text = soup.find('body').text

    bullets = soup.find_all('li')
    bullets_text = []
    for b in bullets:
        bullets_text.append(b.text.strip())
    
    return title_text, body_text, bullets_text

In [6]:
posting_df = pd.DataFrame(columns=['job_posting', 'title', 'body', 'bullets'])

for file_path, html in posting_list:

    title, body, bullets = extract_data(html)
    file_name = file_path.name
    posting_df = posting_df.append(pd.Series([file_name, title, body, bullets], 
                                             index=posting_df.columns), 
                                   ignore_index=True)
                                            

In [7]:
posting_df.head()

Unnamed: 0,job_posting,title,body,bullets
0,1e92960a19ffdd34_fccid.html,"Quantitative Analyst - Boston, MA 02116","Quantitative Analyst - Boston, MA 02116\nQuant...",[]
1,3157fcef3ee474da_fccid.html,"Data Scientist - Mountain View, CA","Data Scientist - Mountain View, CA\nGroundTrut...","[Help senior members of the team to explore, d..."
2,b423ca22a6e2c10f_fccid.html,"Data Scientist - Seattle, WA","Data Scientist - Seattle, WA\nA Bachelor or Ma...",[A Bachelor or Masters Degree in a highly quan...
3,ea487254a487beb5_fccid.html,Senior Natural Language Processing (NLP) Engin...,Senior Natural Language Processing (NLP) Engin...,[Join a small team creating a proprietary NLU ...
4,cb8a5bce330854e9_fccid.html,"FLEXO FOLDER GLUER OPER - McClellan, CA - McCl...","FLEXO FOLDER GLUER OPER - McClellan, CA - McCl...",[]


In [8]:
posting_df.shape

(1337, 4)

## Extract postings related to 'Data Science' and 'ML'

In [9]:
posting_df = posting_df.iloc[np.where(posting_df.title.str.contains('(data science)|(data scientist)|(machine learning)', 
                                                       case=False))]

  return func(self, *args, **kwargs)


In [10]:
posting_df.shape

(547, 4)

## Dropping duplicates

In [11]:
posting_df['bullets'] = posting_df['bullets'].apply(tuple, 1)

In [12]:
posting_df['bullets'].head(n=1).values

array([('Help senior members of the team to explore, develop, productionize and optimize machine learning algorithms and pipelines.', 'Use Hadoop, Spark and Amazon Athena on a daily basis to explore our petabytes of data.', 'Dive deep into our rich set of location data, derive insights and build product prototypes.', 'Collaborate with peer data scientists, engineers and product managers closely.', 'Master degree in Computer Science, Statistics, Mathematics, Engineering; PhD is a plus.', 'Has experience in Statistics, Machine Learning.', 'Fluency in Python.', 'Significant experience with SQL; some relational databases or NoSQL data stores.', 'Familiarity with open source machine learning libraries such as scikit-learn and Spark MLlib.', 'Has experience in Amazon Web Services is a plus.', 'Excellent communication skills.')],
      dtype=object)

In [13]:
posting_df = posting_df.drop_duplicates()

In [14]:
posting_df.shape

(547, 4)

# Persisting Data

In [16]:
#posting_df.to_csv('data/postings.csv', index=False)
posting_df.to_pickle('data/postings.pk')