In [None]:
%load_ext autoreload
%autoreload 2

## Setup

In [1]:
import os
from pathlib import Path

import pandas as pd
import polars as pl
from tqdm.notebook import tqdm

import job_search.dataset as dataset
import job_search.company as com
import job_search.resume as res
import job_search.utils as utils
from job_search.config import P_ROOT, P_JOBS, P_URLS, P_PROCESSED
from job_search.utils import path_names

import aw
utils.jupyter_css_style()

In [None]:
# P_job_list = []

# jobs_paths = pd.Series([p for p in P_JOBS.glob('*.md')])
# for P_job in (pbar := tqdm(jobs_paths)):
#     # pbar.set_description(P_job.name)
#     if os.stat(P_job).st_size == 0:
#         P_job_list.append(P_job)
#         P_url = P_URLS / f"{P_job.stem}.html"
#         print(f'"{P_url}"')
#         if P_url.exists():
#             P_url.unlink()
#         P_job.unlink()

In [4]:
QUERY_LIST = [
    DS_NORCAL := 'DS_NorCal',
    HEALTH := 'Healthcare',
    SF := 'SF',
    DS_REMOTE := 'DS_Remote',
    # DS_SOCAL := 'DS_SoCal',
    # DS_SEATTLE := 'DS_Seattle',
    # DS_NY := 'DS_NY',
    # DS_MIDWEST := 'DS_Midwest',
    # DS_DC := 'DS_DC',
    # SW := 'SW',
    # SW_REMOTE := 'SW_Remote',
]
STEM = 'Healthcare'
_date = com.now(time=False)
# _date = "2025-10-12"
# _date = "2025-11-06"
P_save_dict = {stem: P_PROCESSED / (f'{_date}/{stem}/{stem}.html') for stem in QUERY_LIST}
P_companies_dict = {stem: P_save_dict[stem].parents[3] / f"cache/{stem}_company_urls" for stem in QUERY_LIST}
P_query_dict = {stem: P_save_dict[stem].parents[3] / f"queries/{stem}.txt" for stem in QUERY_LIST}
P_save = P_save_dict[STEM]
P_companies = P_companies_dict[STEM]
P_query = P_query_dict[STEM]

In [5]:
# jdf = com.load_jdf(P_save)
# cdf = com.load_cdf(P_companies)
# jdf_dict = {stem: com.load_jdf(P_save_dict[stem]) for stem in QUERY_LIST}
# cdf_dict = {stem: com.load_cdf(P_companies_dict[stem]) for stem in QUERY_LIST}
df_dict = {stem: com.load_df(P_save_dict[stem]).to_pandas() for stem in QUERY_LIST}
dfc_dict = {stem: com.load_dfc(P_save_dict[stem]).to_pandas() for stem in QUERY_LIST}

In [6]:
df = pd.concat(df_dict).drop_duplicates(subset='hash').pipe(pl.DataFrame)
dfc = df.group_by('company').agg([
    pl.len(),
    pl.col('title').implode(),
    pl.col('company_summary').first(),
    pl.col('hours').min().alias('days') // 24,
    pl.col('bay').pipe(com.pl_reduce_list),
    pl.col('location').pipe(com.pl_reduce_list),
    pl.col('skills').pipe(com.pl_reduce_list),
    # pl.col('onsite').pipe(com.pl_enum_min, df['onsite'].dtype),
    # pl.col('full_time').pipe(com.pl_enum_max, df['full_time'].dtype),
    pl.col('lower').min(),
    pl.col('median').mean().round(2),
    pl.col('upper').max(),
    # pl.col('url2').n_unique(),
]).to_pandas()
df = df.to_pandas()
df_hash_set = set(df['hash'])

In [7]:
print(f"{'ALL':12}: {len(df):,} jobs among {len(dfc):,} companies")
for stem in QUERY_LIST:
    print(f"{stem:12}: {len(df_dict[stem]):,} jobs among {len(dfc_dict[stem]):,} companies")
# ALL         : 3,648 jobs among 1,769 companies
# DS_NorCal   : 767 jobs among 404 companies
# Healthcare  : 930 jobs among 627 companies
# SF          : 1,193 jobs among 634 companies
# DS_Remote   : 1,582 jobs among 859 companies

ALL         : 3,712 jobs among 1,784 companies
DS_NorCal   : 969 jobs among 586 companies
Healthcare  : 502 jobs among 283 companies
SF          : 977 jobs among 556 companies
DS_Remote   : 2,208 jobs among 1,223 companies


In [None]:
P_urls_df_parquet = com.P_DATA / 'cache' / f'P_urls_df_{_date}.parquet'
if not P_urls_df_parquet.exists():
    com.path_df(P_URLS).to_parquet(P_urls_df_parquet)
urls_df = pd.read_parquet(P_urls_df_parquet).query('stsize > 0').sort_values('ctime', ascending=False).drop_duplicates('hash').reset_index(drop=True)
urls_df['company'] = urls_df['name'].str.split(' - ', n=1).str[0]
for stem in QUERY_LIST:
    urls_df[stem] = urls_df['hash'].isin(set(df_dict[stem]['hash']))
urls_df = urls_df.sort_values(QUERY_LIST, ascending=False, kind='stable').reset_index(drop=True)
urls_df['DS'] = urls_df['DS_NorCal'] | urls_df['DS_Remote']
urls_df['ALL'] = urls_df['hash'].isin(set(df['hash']))
companies_df = urls_df.drop_duplicates('company').reset_index(drop=True)

In [10]:
# aw.combo_sizes([set(urls_df.query(stem)['hash']) for stem in QUERY_LIST], QUERY_LIST)
# aw.combo_sizes([set(urls_df.query(stem)['hash']) for stem in QUERY_LIST[:3]], QUERY_LIST[:3])
# aw.combo_sizes([set(urls_df.query(stem)['hash']) for stem in QUERY_LIST[:2]], QUERY_LIST[:2])
aw.combo_sizes([set(urls_df.query(stem)['hash']) for stem in ['DS', HEALTH]], ['DS', HEALTH])
#  	DS	Healthcare	Size	%
# 1	-	-	2735	100.0
# 2	Yes	-	1852	67.7
# 3	-	Yes	929	34.0
# 4	Yes	Yes	46	1.7


Unnamed: 0,DS,Healthcare,Size,%
1,-,-,2932,100.0
2,Yes,-,2484,84.7
3,-,Yes,502,17.1
4,Yes,Yes,54,1.8


In [11]:
import pickle

P_DICT = P_JOBS.parent / 'dicts'
P_DICT.mkdir(exist_ok=True)

industries_list = []
# _companies_hash_names = list(companies_df.query('ALL')[['hash', 'name']].iterrows())
_hash_names = list(urls_df.query('ALL')[['hash', 'name']].iterrows())
for _, (hash, name) in tqdm(_hash_names):
    P_dict = P_DICT / f'{name}.pkl'
    if P_dict.exists():
        continue
    next_data_dict = com.viewhash(hash)
    with open(P_dict, 'wb') as f:
        pickle.dump(next_data_dict, f)

  0%|          | 0/3647 [00:00<?, ?it/s]

In [48]:
all_dsize_list = []
_hash_names = list(urls_df.query('ALL')[['hash', 'name']].iterrows())
for _, (hash, name) in tqdm(_hash_names):
    P_dict = P_DICT / f'{name}.pkl'
    all_dsize_list.append(os.path.getsize(P_dict))

  0%|          | 0/3647 [00:00<?, ?it/s]

In [None]:
all_urls_df = urls_df.query('ALL').copy()
all_urls_df['dsize'] = all_dsize_list

In [11]:
industries_list = []
_hash_names = list(companies_df.query('ALL')[['hash', 'name']].iterrows())
# _hash_names = list(urls_df.query('ALL')[['hash', 'name']].iterrows())
for _, (hash, name) in tqdm(_hash_names):
    next_data_dict = com.hash2dict(hash)
    try:
        industries = next_data_dict['props']['pageProps']['job']['v5_processed_company_data']['industries']
    except KeyError:
        industries = None
    industries_list.append(industries)

  0%|          | 0/1776 [00:00<?, ?it/s]

Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Avispa - 11h.elqqytqcx05h4r9z.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Outschool - 12h.79x6x52ey5cndj96.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Blue Yonder - 2mo.0syhcg0w6sqlyusr.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Acubed - AI Engineer – LLM,RAG.7wq9509zoojuo633.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Weave - AI Research Scientist.vwugdze68nj3j5hp.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Owner.com - Staff Product Data Scientist, Risk and Payments.bxq6d49uwhpyighn.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Trunk Tools - NLP Engineer.wepeuneok4ur4p3w.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Toma - Prompt Engineer & Support Specialist (AI_LLM SaaS).moxtzpqc7lwhhw45.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/ca

In [15]:
import pickle
P1_dict = com.P_DICT / f"{companies_df.query('ALL').iloc[0]['name']}.pkl"
with open(P1_dict, 'rb') as f:
    next_data_dict = pickle.load(f)
# next_data_dict

In [None]:
companies_df['industries'] = pd.Series(industries_list)

In [28]:
companies_df['industries'].dropna().explode().drop_duplicates().reset_index(drop=True).to_csv('industries.txt', header=False, index=False)
companies_df.query('Healthcare')['industries'].dropna().explode().drop_duplicates().reset_index(drop=True).to_csv('healthcare_industries.txt', header=False, index=False)

In [18]:
for hash in urls_df.query('company == "Genentech" and DS')['hash']:
    print(com.viewhash(hash)['props']['pageProps']['job']['v5_processed_company_data']['industries'])

['Biotechnology Companies', 'Medical Organizations', 'Pharmaceutical Companies', 'Health Care Companies']
['Biotechnology Companies', 'Medical Organizations', 'Pharmaceutical Companies', 'Health Care Companies']
['Biotechnology Companies', 'Medical Organizations', 'Pharmaceutical Companies', 'Health Care Companies']


## END

In [4]:
import json
import sys
import pickle

import job_search.config as conf
from job_search.config import P_CACHE, P_DICT, P_QUERIES, _date

In [7]:
P_query = P_QUERIES / 'DS.txt'
P_save = dataset.main0(P_query)
jdf = dataset.load_jdf(P_save)

2025-11-19 15:17:12,697 - INFO - /mnt/c/Users/alexa/Dev/Companies/data/processed/2025-11-19/DS/DS.html already exists...


In [7]:
# P_job_data_list = P_CACHE / f'job_data_list_{_date}.pkl'
# with open(P_job_data_list, 'rb') as f:
#     job_data_list = pickle.load(f)

In [None]:
## Takes ...
# P_DICT size is 848 MB (as of 11/13/25 (Thursday))
# P_dict_list = [p for p in conf.P_DICT.glob("*.pkl")]
# Out: 18756
P_dict_list = [p for p in P_DICT.glob("*.pkl") if os.stat(p).st_size > 5]
len(P_dict_list)
# Out: 18520

# job_data_list = []
# for path in tqdm(P_dict_list):
#     with open(path, 'rb') as f:
#         job_data = pickle.load(f)
#     job_data_list.append(job_data)

# P_job_data_list = P_CACHE / f'job_data_list_{_date}.pkl'
# with open(P_job_data_list, 'wb') as f:
#     pickle.dump(job_data_list, f, protocol=pickle.HIGHEST_PROTOCOL)

20776

In [9]:
job_data_list = []
for path in tqdm(P_dict_list):
    with open(path, 'rb') as f:
        job_data = pickle.load(f)
    job_data_list.append(job_data)

P_job_data_list = P_CACHE / f'job_data_list_{_date}.pkl'
with open(P_job_data_list, 'wb') as f:
    pickle.dump(job_data_list, f, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/20776 [00:00<?, ?it/s]

In [10]:
_ctime = [os.path.getctime(p) for p in P_dict_list]
_ctime = pd.Series(_ctime).pipe(pd.to_datetime, unit='s').dt.floor('s')

In [11]:
job_series = pd.Series([d['props']['pageProps'].get('job', None) for d in job_data_list])
V5_JOB = 'v5_processed_job_data'
V5_COMP = 'v5_processed_company_data'
JOB_INFO = 'job_information'
job_v5 = job_series.str[V5_JOB]
job_comp = job_series.str[V5_COMP]
job_info = job_series.str[JOB_INFO]
_est_pub_date = job_v5.str['estimated_publish_date'].str[:10].pipe(pd.to_datetime)
_location = job_v5.str['formatted_workplace_location'].str.replace(', California', '').str.replace(', United States', '')
_id = [p.name.removesuffix('.html.pkl') for p in P_dict_list]

job_df = pd.DataFrame({
    '_id': _id,
    '_ctime': _ctime,
    '_est_pub_date': _est_pub_date,
    'title': job_info.str['title'],
    '_loc': _location,
    'yearly_min_compensation': job_v5.str['yearly_min_compensation'],
    'yearly_max_compensation': job_v5.str['yearly_max_compensation'],
    'workplace_type': job_v5.str['workplace_type'],
    'commitment': job_v5.str['commitment'],
    'company_name': job_v5.str['company_name'],
    'company_tagline': job_v5.str['company_tagline'],
    '_yoe': job_v5.str['min_industry_and_role_yoe'],
    '_mgmt': job_v5.str['min_management_and_leadership_yoe'],
    'requirements_summary': job_v5.str['requirements_summary'],
    'technical_tools': job_v5.str['technical_tools'],
    '_hash': job_series.str['requisition_id'],
    'collapse_key': job_series.str['collapse_key'],
    '_geoloc': job_series.str['_geoloc']
})

In [None]:
comp_df = pd.DataFrame({
    '_id': _id,
    'company_name': job_v5.str['company_name'],
    # 'company_tagline': job_v5.str['company_tagline'],
    'image_url': job_comp.str['image_url'],
    'website': job_comp.str['website'],
    'tagline': job_comp.str['tagline'],
    'is_non_profit': job_comp.str['is_non_profit'],
    'stock_symbol': job_comp.str['stock_symbol'],
    'collapse_key': job_series.str['collapse_key'],
    'year_founded': job_comp.str['year_founded'],
    'num_employees': job_comp.str['num_employees'],
    'industries': job_comp.str['industries'].apply(lambda x: tuple(x) if x else tuple()),
    'activities': job_comp.str['activities'].apply(lambda x: tuple(x) if x else tuple()),
    'latest_investment_amount': job_comp.str['latest_investment_amount'],
    'latest_investment_year': job_comp.str['latest_investment_year'],
    'latest_investment_series': job_comp.str['latest_investment_series'],
    'investors': job_comp.str['investors'].apply(lambda x: tuple(x) if x else tuple()),
    'parent_company': job_comp.str['parent_company'],
    'headquarters_country': job_comp.str['headquarters_country'],
    'linkedin_url': job_comp.str['linkedin_url'],
})
comp_df.nunique()

_id                         20776
company_name                 5591
image_url                    4363
website                      4962
tagline                      6863
is_non_profit                   2
stock_symbol                  795
collapse_key                 5280
year_founded                  196
num_employees                1534
industries                   1359
activities                   6169
latest_investment_amount      695
latest_investment_year         36
latest_investment_series       25
investors                    2466
parent_company                779
headquarters_country           62
linkedin_url                 4372
dtype: int64

In [13]:
from markdownify import markdownify as md
src_df = pd.DataFrame({
    '_id': _id,
    '_ctime': _ctime,
    '_est_pub_date': _est_pub_date,
    'title': job_info.str['title'],
    'job_title_raw': job_info.str['job_title_raw'],
    'description': job_info.str['description'],
    'stripped_description': job_info.str['stripped_description'],
    'viewedByUsers': job_info.str['viewedByUsers'],
    'appliedFromUsers': job_info.str['appliedFromUsers'],
    '_hash': job_series.str['requisition_id'],
})
# src_df['_md'] = src_df['description'].fillna('').map(md)

In [20]:
N = job_df.shape[0]  # 20776
# N = 18_520
_perc = lambda x: print(f"{x.sum():,} of {N:,} ({100*x.mean():.1f}%)")
# _perc_N = lambda x: print(f"{x:,} of {N:,} ({100*x/N:.1f}%)")
# _perc_N(df['company_name'].isna().sum())
_perc(job_df['_hash'].isna())

465 of 20,776 (2.2%)


In [21]:
# df['collapse_key'].nunique()
# any(df['collapse_key'] == "Z3JuaHNlX19fMTB4Z2Vub21pY3NfX18xMHggR2Vub21pY3NfX18xMHhnZW5vbWljcy5jb")
job_df[(job_df['company_name'] == "Genentech")].T

Unnamed: 0,6991
_id,Genentech - Principal Supply Chain Data Scient...
_ctime,2025-11-11 23:50:39
_est_pub_date,2025-08-06 00:00:00
title,Principal Supply Chain Data Scientist
_loc,"Louisville, Kentucky"
yearly_min_compensation,127100.0
yearly_max_compensation,236100.0
workplace_type,Onsite
commitment,[Full Time]
company_name,Genentech


In [22]:
_position = job_df['_id'].str.rsplit('.', n=1).str[0]
job_df['hash'] = job_df['_id'].str.rsplit('.', n=1).str[1]

In [24]:
__DS_regex = r"(?:data|\bml\b|machine learning|\bai\b|artificial intelligence|\bnlp\b|statistical|\bbi\b|business intelligence|devops|mlops).+(?:engineer|scientist|science|programmer)"
_sw_mask = job_df['title'].fillna('').str.contains(r'software.+engineer', case=False)
__DS_mask = job_df['title'].fillna('').str.contains(__DS_regex, case=False)
_DS_mask = __DS_mask & ~_sw_mask
_perc(_DS_mask)

6,149 of 20,776 (29.6%)


In [None]:
ds_mask = job_df['title'].fillna('').str.contains(r'data.+scien', case=False)
ml_mask = job_df['title'].fillna('').str.contains(r'\bml\b|machine.+learning', case=False)
ai_mask = job_df['title'].fillna('').str.contains(r'\bai\b|artificial.+intelligence', case=False)
nlp_mask = job_df['title'].fillna('').str.contains(r'\bnlp\b|natural.+lang.+process', case=False)
_eng_mask = job_df['title'].fillna('').str.contains(r'engineer', case=False)

_health_industries = []
bay_mask = job_df['_bay'].notna()
health_mask = comp_df['industries'].apply(set) & pd.Series([_health_industries]*len(job_df))
yoe_mask = job_df['_yoe'].fillna(0) < 7
_45d_mask = job_df['_days'].fillna(0) <= 45
remote_mask = job_df['workplace_type'] == "Remote"
yoe_45d_mask = yoe_mask & _45d_mask
bay_remote_mask = bay_mask | remote_mask
DS_mask = _DS_mask & bay_remote_mask & yoe_45d_mask
DS_health_mask = _DS_mask & health_mask & bay_remote_mask & yoe_45d_mask

In [26]:
_DS_set = job_df[_DS_mask]['_hash'].pipe(set)
bay_set = job_df[bay_mask]['_hash'].pipe(set)
health_set = job_df[health_mask]['_hash'].pipe(set)
yoe_set = job_df[yoe_mask]['_hash'].pipe(set)
_45d_set = job_df[_45d_mask]['_hash'].pipe(set)
remote_set = job_df[remote_mask]['_hash'].pipe(set)
yoe_45d_set = yoe_set & _45d_set
bay_remote_set = bay_set | remote_set

DS_set = _DS_set & bay_remote_set & yoe_45d_set
DS_health_set = _DS_set & health_set & bay_remote_set & yoe_45d_set

In [27]:
aw.displays(
    aw.combo_sizes([bay_remote_set, yoe_45d_set, _DS_set, health_set], ['Bay_Remote', 'yoe_45d', 'DS', 'Health']),
    aw.combo_sizes2([bay_remote_set, yoe_45d_set, _DS_set, health_set], ['Bay_Remote', 'yoe_45d', 'DS', 'Health']),
)

Unnamed: 0,Bay_Remote,yoe_45d,DS,Health,Size,%
1,-,-,-,-,17065,100.0
2,Yes,-,-,-,11885,69.65
3,-,Yes,-,-,8641,50.64
4,-,-,Yes,-,5916,34.67
5,Yes,Yes,-,-,4933,28.91
6,Yes,-,Yes,-,3761,22.04
7,-,-,-,Yes,2594,15.2
8,-,Yes,Yes,-,2416,14.16
9,Yes,-,-,Yes,1427,8.36
10,Yes,Yes,Yes,-,1362,7.98

Unnamed: 0,Bay_Remote,yoe_45d,DS,Health,Size,%
1,-,-,-,-,17065,100.0
2,Yes,No,No,No,4073,23.87
3,Yes,Yes,No,No,3135,18.37
4,No,Yes,No,No,2198,12.88
5,Yes,No,Yes,No,2111,12.37
6,Yes,Yes,Yes,No,1139,6.67
7,No,No,Yes,No,948,5.56
8,No,Yes,Yes,No,867,5.08
9,Yes,No,No,Yes,480,2.81
10,No,Yes,No,Yes,456,2.67


In [None]:
# aw.displays(
#     aw.combo_sizes([bay_remote_set, yoe_45d_set, _DS_set, health_set], ['Bay_Remote', 'yoe_45d', 'DS', 'Health']),
#     aw.combo_sizes2([bay_remote_set, yoe_45d_set, _DS_set, health_set], ['Bay_Remote', 'yoe_45d', 'DS', 'Health']),
# )

Unnamed: 0,Bay_Remote,yoe_45d,DS,Health,Size,%
1,-,-,-,-,15346,100.0
2,Yes,-,-,-,11164,72.75
3,-,Yes,-,-,7979,51.99
4,-,-,Yes,-,5146,33.53
5,Yes,Yes,-,-,4995,32.55
6,Yes,-,Yes,-,3490,22.74
7,-,-,-,Yes,2346,15.29
8,-,Yes,Yes,-,2113,13.77
9,Yes,Yes,Yes,-,1343,8.75
10,Yes,-,-,Yes,1330,8.67

Unnamed: 0,Bay_Remote,yoe_45d,DS,Health,Size,%
1,-,-,-,-,15346,100.0
2,Yes,No,No,No,3590,23.39
3,Yes,Yes,No,No,3219,20.98
4,Yes,No,Yes,No,1898,12.37
5,No,Yes,No,No,1772,11.55
6,Yes,Yes,Yes,No,1127,7.34
7,No,No,Yes,No,771,5.02
8,No,Yes,Yes,No,623,4.06
9,No,Yes,No,Yes,442,2.88
10,Yes,Yes,No,Yes,433,2.82


In [None]:
_DS_df = job_df[_DS_mask].sort_values(['_est_pub_date', '_ctime'], ascending=False).drop_duplicates('_hash')
DS_df = job_df[DS_mask].sort_values(['_est_pub_date', '_ctime'], ascending=False).drop_duplicates('_hash')
DS_health_df = job_df[DS_health_mask].sort_values(['_est_pub_date', '_ctime'], ascending=False).drop_duplicates('_hash')
_DS_df.shape[0], DS_df.shape[0], DS_health_df.shape[0]
# Out: (5146, 1343, 215)
# Out: (5916, 1362, 222)

(5916, 1362, 222)

In [29]:
aw.displays(
    DS_df['company_name'].pipe(aw.vcounts),
    DS_health_df['company_name'].pipe(aw.vcounts),
)

Unnamed: 0,company_name,N,%
1,Google,22,1.63
2,Adobe,18,1.33
3,Capital One,14,1.04
4,Generalmotors,12,0.89
5,UnitedHealth Group,12,0.89
6,Netflix,11,0.81
7,Reducto,11,0.81
8,CapitalOne,10,0.74
9,Walmart,10,0.74
10,Experian,10,0.74

Unnamed: 0,company_name,N,%
1,UnitedHealth Group,12,5.45
2,Centene,9,4.09
3,Humana,9,4.09
4,Cvshealth,8,3.64
5,Optum Insight Technology,5,2.27
6,Roche,5,2.27
7,Mayo Clinic,5,2.27
8,CVS Health,5,2.27
9,Lilly,4,1.82
10,UnitedHealth Group / Optum,4,1.82


In [30]:
aw.displays(
    # _DS_df['technical_tools'].explode().pipe(aw.vcounts, cutoff=30),
    DS_df['technical_tools'].explode().pipe(aw.vcounts, cutoff=30),
    DS_health_df['technical_tools'].explode().pipe(aw.vcounts, cutoff=30),
)

Unnamed: 0,technical_tools,N,%
1,Python,1073,8.6
2,SQL,631,5.05
3,AWS,378,3.03
4,PyTorch,286,2.29
5,TensorFlow,227,1.82
6,Docker,215,1.72
7,Kubernetes,214,1.71
8,Azure,208,1.67
9,Snowflake,206,1.65
10,Spark,197,1.58

Unnamed: 0,technical_tools,N,%
1,Python,167,8.46
2,SQL,104,5.27
3,AWS,64,3.24
4,Azure,49,2.48
5,PyTorch,45,2.28
6,TensorFlow,44,2.23
7,R,38,1.93
8,Snowflake,38,1.93
9,Docker,33,1.67
10,GCP,31,1.57


In [31]:
DL_mask = job_df['technical_tools'].apply(lambda x: bool(set(x) & {'PyTorch', 'TensorFlow', 'LangChain', 'LLMs', 'JAX', 'Keras', 'Hugging Face'}) if x else False)
_perc(DL_mask)

2,342 of 20,776 (11.3%)


In [None]:
hash = "8w6m7h1f6lz3ybv0"
(job_df.query('_hash == @hash')[['technical_tools']])

Unnamed: 0,technical_tools
10634,"[Python, R, Tableau, Hadoop, Epic]"


In [33]:
_merged_df = pd.merge(job_df, src_df, on=['_id', '_hash', '_ctime', '_est_pub_date', 'title'])
merged_df = pd.merge(_merged_df, comp_df, on=['_id', 'company_name', 'collapse_key'])

In [None]:
from IPython.display import display
COLS = ['_id', '_ctime', '_est_pub_date', 'title', '_loc', 'yearly_min_compensation', 'yearly_max_compensation',
    'workplace_type', 'commitment', 'company_name', 'company_tagline', '_yoe', '_mgmt', 'requirements_summary',
    'technical_tools',
    # '_hash',
    '_bay', '_days', 'website', 'tagline', 'is_non_profit', 'year_founded', 'num_employees',
    'industries', 'latest_investment_amount', 'latest_investment_year', 'latest_investment_series', 'investors',
    'parent_company', 'headquarters_country', 'linkedin_url'
]
def viewhash(hash):
    from IPython.display import HTML
    row_df = merged_df.query('_hash == @hash').sort_values(['_est_pub_date', '_ctime'], ascending=False)
    row = row_df.iloc[0]

    # display(row_df.drop(columns=['description', 'stripped_description']).T.style)
    display(row_df[COLS].T.dropna().style)
    print(com.viewjob(hash))
    # display(HTML(row['description']))
    print()
    print(md(row['description']))

viewhash("jvtqcgnyftowpgt5")

Unnamed: 0,14762
_id,"Pomelo Care - Data Scientist, Care Intelligence (San Francisco, hybrid).jvtqcgnyftowpgt5"
_ctime,2025-11-11 19:30:37
_est_pub_date,2025-11-10 00:00:00
title,"Data Scientist, Care Intelligence (San Francisco, hybrid)"
_loc,San Francisco
yearly_min_compensation,140000.000000
yearly_max_compensation,160000.000000
workplace_type,Hybrid
commitment,['Full Time']
company_name,Pomelo Care


https://hiring.cafe/viewjob/jvtqcgnyftowpgt5

### About us

[Pomelo Care](http://www.pomelocare.com) is a multi-disciplinary team of clinicians, engineers and problem solvers who are passionate about improving care for moms and babies. We are transforming outcomes for pregnant people and babies with evidence-based pregnancy and newborn care at scale. Our technology-driven care platform enables us to engage patients early, conduct individualized risk assessments for poor pregnancy outcomes, and deliver coordinated, personalized virtual care throughout pregnancy, NICU stays, and the first postpartum year. We measure ourselves by reductions in preterm births, NICU admissions, c-sections and maternal mortality; we improve outcomes and reduce healthcare spend.

### Role Description

**Your North Star**: Enable AI product development through data modeling, measurement, and data-driven insights

We're looking to bring on an experienced data scientist to join a team developing AI-powered produ