## Latest Job Posts Build

> A project to analyze Hacker News stories using nbdbt

In [1]:
#| echo: false
%reload_ext autoreload
%autoreload 2

In [2]:
#| echo: false
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
#| echo: false
%reload_ext nbdbt.dbt_cellmagic

In [4]:
#| echo: false
%dbtconfig -p ../hn_whos_hiring -n notebooks/latest_job_posts_build.ipynb

### Filter "Who is hiring?" articles to the last 6 months (see the whos_hiring page)
> These models contain only the latest hiring posts. Working with just the most recent subset of
> hiring posts and job posts saves both storage and compute.

In [5]:
%%dbt -a latest_whos_hiring  models/latest_whos_hiring_articles.sql
{{ config(materialized='table') }}
-- Keep only the 6 most recent rows of whos_hiring_articles, newest first.
-- This is a row limit, not a date filter: it corresponds to "the last
-- 6 months" only under the assumption of one article per month.
select *
from {{ ref('whos_hiring_articles') }}
order by submit_timestamp desc
limit 6 

In [None]:
# Disabled manual alternative: build the model with the dbt CLI from the
# project directory, then return to the notebooks directory.
# Uncomment all three lines together to use it.
# %cd ../hn_whos_hiring
# !dbt run -s models/latest_whos_hiring_articles.sql
# %cd ../notebooks

In [6]:
latest_whos_hiring.ref()

Unnamed: 0,post_id,title,url,submitter_id,content,submit_timestamp,dead,post_score,parent_id,post_type,ranking,deleted,descendants
0,32306920,Ask HN: Who is hiring? (August 2022),,whoishiring,"Please state the location and include REMOTE, ...",2022-08-01 15:01:33+00:00,False,446,,story,,,769
1,31947297,Ask HN: Who is hiring? (July 2022),,whoishiring,"Please state the location and include REMOTE, ...",2022-07-01 15:00:14+00:00,False,328,,story,,,620
2,31582796,Ask HN: Who is hiring? (June 2022),,whoishiring,"Please state the location and include REMOTE, ...",2022-06-01 15:01:51+00:00,False,390,,story,,,861
3,31235968,Ask HN: Who is hiring? (May 2022),,whoishiring,"Please state the location and include REMOTE, ...",2022-05-02 15:01:38+00:00,False,390,,story,,,851
4,30878761,Ask HN: Who is hiring? (April 2022),,whoishiring,"Please state the location and include REMOTE, ...",2022-04-01 15:02:05+00:00,False,264,,story,,,805
5,30515750,Ask HN: Who is hiring? (March 2022),,whoishiring,"Please state the location and include REMOTE, ...",2022-03-01 16:01:56+00:00,False,327,,story,,,875


### Filter job posts to the latest "Who is hiring?" articles only

In [7]:
%%dbt -a latest_job_posts models/latest_job_posts.sql
{{ config(materialized='table') }}
-- Every hn_posts row whose parent_id is the post_id of one of the
-- latest who's-hiring articles (i.e. the direct replies to those articles).
with latest_hiring as (
    select post_id
    from {{ ref('latest_whos_hiring_articles') }}
)
select posts.*
from {{ ref('hn_posts') }} as posts
inner join latest_hiring
    on posts.parent_id = latest_hiring.post_id

In [8]:
# Disabled manual alternative: build the model with the dbt CLI from the
# project directory, then return to the notebooks directory.
# NOTE(review): the saved output of this cell looks like it came from an
# earlier run made while these lines were still active — confirm before relying on it.
# %cd ../hn_whos_hiring
# !dbt run -s models/latest_job_posts.sql
# %cd ../notebooks

/home/butch2/play/experiments/hn_whos_hiring/notebooks
13:56:26  Running with dbt=1.1.1
13:56:26  Found 5 models, 0 tests, 0 snapshots, 15 analyses, 191 macros, 0 operations, 0 seed files, 1 source, 0 exposures, 0 metrics
13:56:26  
13:56:27  Concurrency: 1 threads (target='dev')
13:56:27  
13:56:28  1 of 1 START table model 00dev.latest_job_posts ................................ [RUN]
13:56:33  1 of 1 OK created table model 00dev.latest_job_posts ........................... [[32mCREATE TABLE (4.5k rows, 11.9 GB processed)[0m in 5.02s]
13:56:33  
13:56:33  Finished running 1 table model in 6.47s.
13:56:33  
13:56:33  [32mCompleted successfully[0m
13:56:33  
13:56:33  Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1
/home/butch2/play/experiments/hn_whos_hiring/notebooks


In [9]:
# Load a sample of the latest_job_posts model for exploration.
# NOTE(review): presumably `limit=100` caps the number of rows fetched and the
# result is a pandas DataFrame (it is used with .head()/.iloc below) — confirm against nbdbt.
jobs_df = latest_job_posts.ref(limit=100)

In [10]:
jobs_df.head()

Unnamed: 0,post_id,title,url,submitter_id,content,submit_timestamp,dead,post_score,parent_id,post_type,ranking,deleted,descendants
0,30607463,,,charlottewan,Current | Fintech | New York City (US) | Full ...,2022-03-08 21:38:33+00:00,False,,30515750,comment,,,
1,30517402,,,taline,1build | YC 2019 | Staff Software Engineer + M...,2022-03-01 17:53:17+00:00,False,,30515750,comment,,,
2,31236564,,,cgvagenas,Disney Streaming | Mid&#x2F;Sr. Software Engin...,2022-05-02 15:39:28+00:00,False,,31235968,comment,,,
3,31236566,,,ineptech,InComm | Engineering Manager &#x2F; Dev Manage...,2022-05-02 15:39:48+00:00,False,,31235968,comment,,,
4,31236562,,,mikikian,CourtDrive | Senior Perl Developer | REMOTE| M...,2022-05-02 15:39:22+00:00,False,,31235968,comment,,,


In [13]:
import html

In [20]:
from bs4 import BeautifulSoup
import re

In [20]:
def html_to_text(html):
    """Convert an HTML fragment to plain text with normalised whitespace.

    Slightly modified from an example on python.hotexamples:
    https://python.hotexamples.com/examples/bs4/BeautifulSoup/strip/python-beautifulsoup-strip-method-examples.html#0xd95fc8f3bead4ee436033a38ce9788cca28b74bac5a0be60d5b34fd0b457f102-33,,39,

    Args:
        html: HTML markup as a ``str``. Any falsy value (``None``, ``""``)
            yields ``""`` instead of raising. Inside this function the
            parameter name shadows the standard-library ``html`` module.

    Returns:
        The text with all tags removed, every run of whitespace collapsed
        to a single space, and no leading or trailing whitespace.
    """
    # Nothing to parse. This also keeps a missing (None) value from
    # raising TypeError inside re.sub.
    if not html:
        return ""
    # Hack: get_text() removes tags without leaving a separator, so text on
    # either side of <br>, <p> or <li> would be glued together. Insert a
    # space in front of those tags first. The pattern is a prefix match, so
    # it also fires on other tags starting with these letters (e.g. <pre>).
    spaced_markup = re.sub("<(br|p|li)", r" \g<0>", html, flags=re.IGNORECASE)
    text = BeautifulSoup(spaced_markup, "html.parser").get_text()
    # Idea from http://stackoverflow.com/a/1546251
    # split() with no arguments already discards leading/trailing whitespace.
    return " ".join(text.split())

In [12]:
# Raw (still HTML-escaped) body of the first sampled post
content0 = jobs_df["content"].iloc[0]
content0

'Current | Fintech | New York City (US) | Full Time |<a href="https:&#x2F;&#x2F;current.com&#x2F;careers&#x2F;" rel="nofollow">https:&#x2F;&#x2F;current.com&#x2F;careers&#x2F;</a><p>Current is a leading U.S. financial technology platform serving the needs of Americans who are working to create a better future for themselves. Our mission is to enable members to change their lives by creating better financial outcomes.<p>We’ve raised over $400 million in funding, backed by investments from Andreessen Horowitz, Tiger Global Management, TQ Ventures, Avenir, Sapphire Ventures, Foundation Capital, Wellington Management, QED Investors, and EXPA.<p>No matter your title, we welcome everyone at Current to build great products, grow quickly, and make an impact with us.<p>Featured Roles:\nBackend Engineer: <a href="https:&#x2F;&#x2F;grnh.se&#x2F;b30f29331us" rel="nofollow">https:&#x2F;&#x2F;grnh.se&#x2F;b30f29331us</a> \niOS Engineer: <a href="https:&#x2F;&#x2F;grnh.se&#x2F;62b81e071us" rel="nofol

In [16]:
# Decode HTML character references (e.g. &#x2F; becomes /)
hcontent0 = html.unescape(content0)
hcontent0

'Current | Fintech | New York City (US) | Full Time |<a href="https://current.com/careers/" rel="nofollow">https://current.com/careers/</a><p>Current is a leading U.S. financial technology platform serving the needs of Americans who are working to create a better future for themselves. Our mission is to enable members to change their lives by creating better financial outcomes.<p>We’ve raised over $400 million in funding, backed by investments from Andreessen Horowitz, Tiger Global Management, TQ Ventures, Avenir, Sapphire Ventures, Foundation Capital, Wellington Management, QED Investors, and EXPA.<p>No matter your title, we welcome everyone at Current to build great products, grow quickly, and make an impact with us.<p>Featured Roles:\nBackend Engineer: <a href="https://grnh.se/b30f29331us" rel="nofollow">https://grnh.se/b30f29331us</a> \niOS Engineer: <a href="https://grnh.se/62b81e071us" rel="nofollow">https://grnh.se/62b81e071us</a> \nAndroid Engineer: <a href="https://grnh.se/26f

`html_to_text` is a slightly modified example from [python.hotexamples](https://python.hotexamples.com/examples/bs4/BeautifulSoup/strip/python-beautifulsoup-strip-method-examples.html#0xd95fc8f3bead4ee436033a38ce9788cca28b74bac5a0be60d5b34fd0b457f102-33,,39,)

In [22]:
# Drop the markup and collapse whitespace to get flat text
text_content = html_to_text(hcontent0)
text_content

'Current | Fintech | New York City (US) | Full Time |https://current.com/careers/ Current is a leading U.S. financial technology platform serving the needs of Americans who are working to create a better future for themselves. Our mission is to enable members to change their lives by creating better financial outcomes. We’ve raised over $400 million in funding, backed by investments from Andreessen Horowitz, Tiger Global Management, TQ Ventures, Avenir, Sapphire Ventures, Foundation Capital, Wellington Management, QED Investors, and EXPA. No matter your title, we welcome everyone at Current to build great products, grow quickly, and make an impact with us. Featured Roles: Backend Engineer: https://grnh.se/b30f29331us iOS Engineer: https://grnh.se/62b81e071us Android Engineer: https://grnh.se/26f763531us QA Engineer: https://grnh.se/877dd6061us Check out other roles at https://grnh.se/1686fb701us'

In [23]:
# Break the flat text on the pipe separator and preview the first four pieces
fields = text_content.split('|')
fields[:4]

['Current ', ' Fintech ', ' New York City (US) ', ' Full Time ']

In [24]:
fields[4]

'https://current.com/careers/ Current is a leading U.S. financial technology platform serving the needs of Americans who are working to create a better future for themselves. Our mission is to enable members to change their lives by creating better financial outcomes. We’ve raised over $400 million in funding, backed by investments from Andreessen Horowitz, Tiger Global Management, TQ Ventures, Avenir, Sapphire Ventures, Foundation Capital, Wellington Management, QED Investors, and EXPA. No matter your title, we welcome everyone at Current to build great products, grow quickly, and make an impact with us. Featured Roles: Backend Engineer: https://grnh.se/b30f29331us iOS Engineer: https://grnh.se/62b81e071us Android Engineer: https://grnh.se/26f763531us QA Engineer: https://grnh.se/877dd6061us Check out other roles at https://grnh.se/1686fb701us'

In [27]:
# Trim surrounding whitespace from every piece, then preview the first four
fixed_fields = list(map(str.strip, fields))
fixed_fields[:4]

['Current', 'Fintech', 'New York City (US)', 'Full Time']

In [28]:
fixed_fields[4]

'https://current.com/careers/ Current is a leading U.S. financial technology platform serving the needs of Americans who are working to create a better future for themselves. Our mission is to enable members to change their lives by creating better financial outcomes. We’ve raised over $400 million in funding, backed by investments from Andreessen Horowitz, Tiger Global Management, TQ Ventures, Avenir, Sapphire Ventures, Foundation Capital, Wellington Management, QED Investors, and EXPA. No matter your title, we welcome everyone at Current to build great products, grow quickly, and make an impact with us. Featured Roles: Backend Engineer: https://grnh.se/b30f29331us iOS Engineer: https://grnh.se/62b81e071us Android Engineer: https://grnh.se/26f763531us QA Engineer: https://grnh.se/877dd6061us Check out other roles at https://grnh.se/1686fb701us'

In [30]:
# Pattern for locating URLs so they can be extracted or stripped:
# "http://" or "https://" followed by a run of non-whitespace characters.
url_matcher = r'https?://\S+'

In [32]:
# Collect every URL found in the fifth piece (index 4)
links = re.compile(url_matcher).findall(fixed_fields[4])
links

['https://current.com/careers/',
 'https://grnh.se/b30f29331us',
 'https://grnh.se/62b81e071us',
 'https://grnh.se/26f763531us',
 'https://grnh.se/877dd6061us',
 'https://grnh.se/1686fb701us']

In [34]:
# The fifth piece with every URL deleted outright
clear_text = re.compile(url_matcher).sub('', fixed_fields[4])
clear_text

' Current is a leading U.S. financial technology platform serving the needs of Americans who are working to create a better future for themselves. Our mission is to enable members to change their lives by creating better financial outcomes. We’ve raised over $400 million in funding, backed by investments from Andreessen Horowitz, Tiger Global Management, TQ Ventures, Avenir, Sapphire Ventures, Foundation Capital, Wellington Management, QED Investors, and EXPA. No matter your title, we welcome everyone at Current to build great products, grow quickly, and make an impact with us. Featured Roles: Backend Engineer:  iOS Engineer:  Android Engineer:  QA Engineer:  Check out other roles at '

In [35]:
# The fifth piece with every URL swapped for a placeholder token
linked_token_text = re.compile(url_matcher).sub('__LINK_TOKEN__', fixed_fields[4])
linked_token_text

'__LINK_TOKEN__ Current is a leading U.S. financial technology platform serving the needs of Americans who are working to create a better future for themselves. Our mission is to enable members to change their lives by creating better financial outcomes. We’ve raised over $400 million in funding, backed by investments from Andreessen Horowitz, Tiger Global Management, TQ Ventures, Avenir, Sapphire Ventures, Foundation Capital, Wellington Management, QED Investors, and EXPA. No matter your title, we welcome everyone at Current to build great products, grow quickly, and make an impact with us. Featured Roles: Backend Engineer: __LINK_TOKEN__ iOS Engineer: __LINK_TOKEN__ Android Engineer: __LINK_TOKEN__ QA Engineer: __LINK_TOKEN__ Check out other roles at __LINK_TOKEN__'

In [30]:
# Pattern for locating URLs so they can be extracted or stripped:
# "http://" or "https://" followed by a run of non-whitespace characters.
url_matcher = r'https?://\S+'

In [None]:
# Raw (still HTML-escaped) body of the first sampled post
content0 = jobs_df["content"].iloc[0]
content0

In [None]:
# Decode HTML character references in the raw post body
hcontent0 = html.unescape(content0)
hcontent0

In [None]:
# Drop the markup and collapse whitespace to get flat text
text_content = html_to_text(hcontent0)
text_content

In [None]:
# Break the flat text on the pipe separator and preview the first four pieces
fields = text_content.split('|')
fields[:4]

In [None]:
fields[4]

In [None]:
# Trim surrounding whitespace from every piece, then preview the first four
fixed_fields = list(map(str.strip, fields))
fixed_fields[:4]

In [None]:
fixed_fields[4]

In [32]:
# Collect every URL found in the fifth piece (index 4)
links = re.compile(url_matcher).findall(fixed_fields[4])
links

['https://current.com/careers/',
 'https://grnh.se/b30f29331us',
 'https://grnh.se/62b81e071us',
 'https://grnh.se/26f763531us',
 'https://grnh.se/877dd6061us',
 'https://grnh.se/1686fb701us']

In [None]:
# The fifth piece with every URL deleted outright
clear_text = re.compile(url_matcher).sub('', fixed_fields[4])
clear_text

In [None]:
# The fifth piece with every URL swapped for a placeholder token
linked_token_text = re.compile(url_matcher).sub('__LINK_TOKEN__', fixed_fields[4])
linked_token_text