# Objective

Extract relevant sections from the job posting html files. 

In [54]:
import pathlib
from bs4 import BeautifulSoup 

cur_path = pathlib.Path()
data_path = cur_path.joinpath('data', 'html_job_postings')
html_files = data_path.iterdir() # iterator of html files

# Extract relevant texts from html 

In [55]:
def extract_text(html): 
    """
    Extract position title and job requirements from job positng html. 
    """
    with open(html, "r", encoding = "utf-8") as h:
        html_content = h.read()
    
    soup = BeautifulSoup(html_content)
    title = soup.find('title').text
    position, *location = title.split(" - ")

    if len(location) == 2:
        company, location = location 
    else:
        company, location = None, location[0]

    body = soup.find('body')
    if body.find('li'):
        jobreq = [bullet.text for bullet in body.find_all('li')]
    else:
        jobreq = [bullet.text for bullet in body.find_all('p')]
    
    return {'html': html,
            'position': position,
            'location': location,
            'company': company, 
            'jobreq': ' '.join(jobreq).strip('\n').replace('\n', '')}
    


In [56]:
data = []
for html in html_files:
    data.append(extract_text(html))


In [57]:
import pandas as pd 

df = pd.DataFrame(data)
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   html      1337 non-null   object
 1   position  1337 non-null   object
 2   location  1337 non-null   object
 3   company   244 non-null    object
 4   jobreq    1337 non-null   object
dtypes: object(5)
memory usage: 52.4+ KB
None
                                                html  \
0  data/html_job_postings/1e92960a19ffdd34_fccid....   
1  data/html_job_postings/3157fcef3ee474da_fccid....   
2  data/html_job_postings/b423ca22a6e2c10f_fccid....   
3  data/html_job_postings/ea487254a487beb5_fccid....   
4  data/html_job_postings/cb8a5bce330854e9_fccid....   

                                            position             location  \
0                               Quantitative Analyst     Boston, MA 02116   
1                                     Data Scientist    Mountain View, CA   
2             

In [58]:
df.loc[0, 'jobreq']

'Quantitative Analyst (State Street Bank and Trust Company; Boston, MA): The Quantitative Analyst will be part of State Street Treasury’s Treasury Quantitative Analytics (TQA) group. TQA is responsible for developing/implementing/monitoring advanced financial models that are used in company’s capital management, liquidity management, investment portfolio construction, and balance sheet optimization. The group is accountable for in-depth understanding, modeling, and representation of the complex interaction of global markets, customer behaviors, and regulatory oversights to create a view of risk/revenue opportunities and exposures to the investment committee, Board of Directors, senior management, and regulatory agencies. The Quantitative Analyst role is a key contributor to the realization of the GT’s mission of optimizing net interest income within the desired risk appetite position. Specific responsibilities include: apply advanced statistical techniques to analyze the characteristic

# Remove job postings unrelated to data scientist position

In [59]:
df['position'] = df['position'].map(lambda p: p.lower())
print(df['position'].unique())

['quantitative analyst' 'data scientist'
 'senior natural language processing (nlp) engineer'
 'flexo folder gluer oper' 'junior data scientist' 'business analyst'
 '(entry-level) data scientist' 'data scientist, analytics'
 'data scientist intern' 'software engineering intern'
 'data scientist bslef8' 'computational biologist' 'quality engineer'
 'chief data scientist' 'data science manager' 'data science intern'
 'real estate development manager' 'healthcare data analytics'
 'research statistician' 'staff assessor-pca (entry level)' 'manager'
 'bioinformatics data scientist' 'financial representative'
 'lead data scientist' 'data engineer' 'test automation engineer'
 'software developer – machine-learning/artificial-intelligence'
 'senior data consultant' 'postdoctoral scholar'
 'research engineer – deep learning and generative design'
 'biostatistician i'
 'senior architect, corporate social responsibility platform'
 'statistical analyst' 'senior data scientist' 'epidemiologist'
 'p

In [60]:
def keep_positions(df):
    """
    Keep positions that contain the word "data scientist" or are adjacent to data scientist. 
    """
    df['position'] = df['position'].map(lambda p: p.lower()) # lower case
    roles = ('data scientist', 'data analyst', 'statistical analyst', 
             'quantitative analyst', 'business analyst',
             'data science', 'data consultant', 
             'machine learning', 'data engineer', 'research analyst',
             'business insights analyst', 'quantitative',
             'analyst', 'intellligence', 'natural language processing', 'data',
             'research scientist')
    keep = [p for p in df['position'] if any(role in p for role in roles)]
    return df[[p in keep for p in df['position']]]
             

In [61]:
df = keep_positions(df)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 828 entries, 0 to 1336
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   html      828 non-null    object
 1   position  828 non-null    object
 2   location  828 non-null    object
 3   company   123 non-null    object
 4   jobreq    828 non-null    object
dtypes: object(5)
memory usage: 38.8+ KB
None


# Remove redundant job postings

In [1]:
def uniques_only(df):
    """
    Identify html's of job postings whose job requirements are unique.
    """
    seen = []
    out = []
    for row in df.itertuples(index = True):
        if row.jobreq not in seen:
            out.append(row.html)
            seen.append(row.jobreq)
    return out

In [63]:
df = df[[h in uniques_only(df) for h in df['html']]]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 754 entries, 0 to 1336
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   html      754 non-null    object
 1   position  754 non-null    object
 2   location  754 non-null    object
 3   company   110 non-null    object
 4   jobreq    754 non-null    object
dtypes: object(5)
memory usage: 35.3+ KB
None


In [64]:
df.to_csv('output/data.csv', index = False)