In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

## Setup

In [3]:
import os
from pathlib import Path

import pandas as pd
import polars as pl
from IPython.display import HTML, display
from tqdm.notebook import tqdm

import job_search.dataset as dataset
import job_search.company as com
import job_search.resume as res
import job_search.utils as utils
from job_search.config import P_ROOT, P_JOBS, P_URLS, P_PROCESSED
from job_search.utils import path_names

import aw
# Apply the project's notebook styling helper from job_search.utils
# (presumably injects custom CSS; the exact styling is not visible here).
utils.jupyter_css_style()

In [None]:
# P_job_list = []

# jobs_paths = pd.Series([p for p in P_JOBS.glob('*.md')])
# for P_job in (pbar := tqdm(jobs_paths)):
#     # pbar.set_description(P_job.name)
#     if os.stat(P_job).st_size == 0:
#         P_job_list.append(P_job)
#         P_url = P_URLS / f"{P_job.stem}.html"
#         print(f'"{P_url}"')
#         if P_url.exists():
#             P_url.unlink()
#         P_job.unlink()

In [4]:
# Query "stems": each one names a saved search and keys every per-query
# path / dataframe dictionary used in this notebook.
DS_NORCAL = 'DS_NorCal'
HEALTH = 'Healthcare'
SF = 'SF'
DS_REMOTE = 'DS_Remote'
# Disabled stems (define the constant and add it to QUERY_LIST to re-enable):
# DS_SOCAL = 'DS_SoCal'
# DS_SEATTLE = 'DS_Seattle'
# DS_NY = 'DS_NY'
# DS_MIDWEST = 'DS_Midwest'
# DS_DC = 'DS_DC'
# SW = 'SW'
# SW_REMOTE = 'SW_Remote'
QUERY_LIST = [DS_NORCAL, HEALTH, SF, DS_REMOTE]

# Stem that the single-query shortcuts (P_save, P_companies, P_query) refer to.
STEM = 'Healthcare'

# Snapshot identifier used as a directory name under P_PROCESSED
# (presumably a date string, judging by the commented overrides below).
_date = com.now(time=False)
# _date = "2025-10-12"
# _date = "2025-11-06"

# Processed results live at <P_PROCESSED>/<_date>/<stem>/<stem>.html.
P_save_dict = {
    stem: P_PROCESSED / (f'{_date}/{stem}/{stem}.html')
    for stem in QUERY_LIST
}
# parents[3] is the fourth ancestor of the .html path, i.e. the directory that
# contains P_PROCESSED (assuming _date holds no path separator).
P_companies_dict = {
    stem: P_save_dict[stem].parents[3] / f"cache/{stem}_company_urls"
    for stem in QUERY_LIST
}
P_query_dict = {
    stem: P_save_dict[stem].parents[3] / f"queries/{stem}.txt"
    for stem in QUERY_LIST
}

# Convenience handles for the currently selected stem.
P_save, P_companies, P_query = (
    P_save_dict[STEM],
    P_companies_dict[STEM],
    P_query_dict[STEM],
)

In [5]:
# Earlier loaders, kept for reference:
# jdf = com.load_jdf(P_save)
# cdf = com.load_cdf(P_companies)
# jdf_dict = {stem: com.load_jdf(P_save_dict[stem]) for stem in QUERY_LIST}
# cdf_dict = {stem: com.load_cdf(P_companies_dict[stem]) for stem in QUERY_LIST}


def _load_per_query(loader):
    """Call ``loader`` on every query's save path and convert each result to pandas.

    Returns a ``{stem: pandas.DataFrame}`` dict in QUERY_LIST order.
    """
    return {stem: loader(P_save_dict[stem]).to_pandas() for stem in QUERY_LIST}


# Per-query job tables and per-query company tables.
df_dict = _load_per_query(com.load_df)
dfc_dict = _load_per_query(com.load_dfc)

In [6]:
# Union of every query's jobs, one row per unique job hash, as a polars frame.
_jobs_pl = pl.DataFrame(pd.concat(df_dict).drop_duplicates(subset='hash'))

# One output row per company.
_company_aggs = [
    pl.len(),
    pl.col('title').implode(),
    pl.col('company_summary').first(),
    # 'days': the smallest 'hours' value of the company, floor-divided by 24.
    pl.col('hours').min().alias('days') // 24,
    pl.col('bay').pipe(com.pl_reduce_list),
    pl.col('location').pipe(com.pl_reduce_list),
    pl.col('skills').pipe(com.pl_reduce_list),
    # pl.col('onsite').pipe(com.pl_enum_min, df['onsite'].dtype),
    # pl.col('full_time').pipe(com.pl_enum_max, df['full_time'].dtype),
    pl.col('lower').min(),
    pl.col('median').mean().round(2),
    pl.col('upper').max(),
    # pl.col('url2').n_unique(),
]
dfc = _jobs_pl.group_by('company').agg(_company_aggs).to_pandas()

# pandas view of the de-duplicated jobs, plus the set of their hashes.
df = _jobs_pl.to_pandas()
df_hash_set = set(df['hash'])

In [7]:
# Report job / company counts for the de-duplicated union, then for each query.
_count_rows = [('ALL', df, dfc)]
_count_rows += [(stem, df_dict[stem], dfc_dict[stem]) for stem in QUERY_LIST]
for _label, _jobs, _companies in _count_rows:
    print(f"{_label:12}: {len(_jobs):,} jobs among {len(_companies):,} companies")
# Counts recorded from an earlier run, kept for comparison:
# ALL         : 3,648 jobs among 1,769 companies
# DS_NorCal   : 767 jobs among 404 companies
# Healthcare  : 930 jobs among 627 companies
# SF          : 1,193 jobs among 634 companies
# DS_Remote   : 1,582 jobs among 859 companies

ALL         : 3,712 jobs among 1,784 companies
DS_NorCal   : 969 jobs among 586 companies
Healthcare  : 502 jobs among 283 companies
SF          : 977 jobs among 556 companies
DS_Remote   : 2,208 jobs among 1,223 companies


In [None]:
# Per-snapshot parquet cache of the saved-page listing; built only when missing.
P_urls_df_parquet = com.P_DATA / 'cache' / f'P_urls_df_{_date}.parquet'
if not P_urls_df_parquet.exists():
    com.path_df(P_URLS).to_parquet(P_urls_df_parquet)

# Keep non-empty files only and, per hash, the row with the largest ctime.
urls_df = (
    pd.read_parquet(P_urls_df_parquet)
    .query('stsize > 0')
    .sort_values('ctime', ascending=False)
    .drop_duplicates('hash')
    .reset_index(drop=True)
)
# The company is whatever precedes the first ' - ' in the page name.
urls_df = urls_df.assign(company=urls_df['name'].str.split(' - ', n=1).str[0])

# One boolean membership column per query, then order rows so members of the
# earlier queries in QUERY_LIST come first.
_stem_flags = {
    stem: urls_df['hash'].isin(set(df_dict[stem]['hash']))
    for stem in QUERY_LIST
}
urls_df = (
    urls_df
    .assign(**_stem_flags)
    .sort_values(QUERY_LIST, ascending=False, kind='stable')
    .reset_index(drop=True)
)

# Derived flags: either DS query, and membership in the de-duplicated union.
urls_df = urls_df.assign(
    DS=urls_df['DS_NorCal'] | urls_df['DS_Remote'],
    ALL=urls_df['hash'].isin(set(df['hash'])),
)

# First row per company, in the sort order established above.
companies_df = urls_df.drop_duplicates('company').reset_index(drop=True)

In [10]:
# Other groupings tried previously:
# aw.combo_sizes([set(urls_df.query(stem)['hash']) for stem in QUERY_LIST], QUERY_LIST)
# aw.combo_sizes([set(urls_df.query(stem)['hash']) for stem in QUERY_LIST[:3]], QUERY_LIST[:3])
# aw.combo_sizes([set(urls_df.query(stem)['hash']) for stem in QUERY_LIST[:2]], QUERY_LIST[:2])

# Overlap between the combined DS flag and the Healthcare query, by job hash.
_combo_labels = ['DS', HEALTH]
_combo_hash_sets = [set(urls_df.query(label)['hash']) for label in _combo_labels]
aw.combo_sizes(_combo_hash_sets, _combo_labels)
# Result recorded from an earlier run:
#  	DS	Healthcare	Size	%
# 1	-	-	2735	100.0
# 2	Yes	-	1852	67.7
# 3	-	Yes	929	34.0
# 4	Yes	Yes	46	1.7


Unnamed: 0,DS,Healthcare,Size,%
1,-,-,2932,100.0
2,Yes,-,2484,84.7
3,-,Yes,502,17.1
4,Yes,Yes,54,1.8


In [11]:
import pickle

# Directory holding one pickled page-data object per saved job page.
P_DICT = P_JOBS.parent / 'dicts'
P_DICT.mkdir(parents=True, exist_ok=True)

# Cache the page data of every job that belongs to at least one query.
# The loop variable is `job_hash`, not `hash`, so the builtin `hash()` stays
# usable in the rest of the kernel session.
_all_jobs = urls_df.query('ALL')[['hash', 'name']]
for job_hash, name in tqdm(_all_jobs.itertuples(index=False, name=None), total=len(_all_jobs)):
    P_dict = P_DICT / f'{name}.pkl'
    if P_dict.exists():
        continue  # already cached by an earlier run
    next_data_dict = com.viewhash(job_hash)
    # Write to a sibling temp file and rename it into place: an interrupted
    # dump must not leave a truncated .pkl that later runs would skip as
    # "already cached".
    P_tmp = P_dict.with_name(P_dict.name + '.tmp')
    with open(P_tmp, 'wb') as f:
        pickle.dump(next_data_dict, f)
    P_tmp.replace(P_dict)

  0%|          | 0/3647 [00:00<?, ?it/s]

In [None]:
# On-disk size in bytes of each cached page-data pickle, in the row order of
# urls_df.query('ALL'). Only the page name is needed to locate the file, so
# the loop no longer unpacks (and shadows the builtin) `hash`.
_all_names = urls_df.query('ALL')['name']
all_dsize_list = [
    os.path.getsize(P_DICT / f'{name}.pkl')
    for name in tqdm(_all_names)
]

  0%|          | 0/3647 [00:00<?, ?it/s]

In [None]:
# Jobs belonging to at least one query, with the size of each job's cached
# pickle attached as 'dsize' (all_dsize_list is in the same row order).
all_urls_df = urls_df.query('ALL').assign(dsize=all_dsize_list)

In [11]:
# Collect the 'industries' entry for the representative job of every company
# that appears in at least one query; one list entry per
# companies_df.query('ALL') row, in that order.
# To scan every job instead, iterate urls_df.query('ALL')['hash'].
industries_list = []
_company_hashes = companies_df.query('ALL')['hash']
for job_hash in tqdm(_company_hashes):  # `job_hash`: do not shadow builtin hash()
    next_data_dict = com.hash2dict(job_hash)
    try:
        industries = next_data_dict['props']['pageProps']['job']['v5_processed_company_data']['industries']
    except (KeyError, TypeError):
        # KeyError: a level of the path is missing.
        # TypeError: an intermediate level is None / not a mapping.
        industries = None
    industries_list.append(industries)

  0%|          | 0/1776 [00:00<?, ?it/s]

Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Avispa - 11h.elqqytqcx05h4r9z.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Outschool - 12h.79x6x52ey5cndj96.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Blue Yonder - 2mo.0syhcg0w6sqlyusr.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Acubed - AI Engineer – LLM,RAG.7wq9509zoojuo633.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Weave - AI Research Scientist.vwugdze68nj3j5hp.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Owner.com - Staff Product Data Scientist, Risk and Payments.bxq6d49uwhpyighn.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Trunk Tools - NLP Engineer.wepeuneok4ur4p3w.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/cache/dicts/Toma - Prompt Engineer & Support Specialist (AI_LLM SaaS).moxtzpqc7lwhhw45.html.pkl
Writing to /mnt/c/Users/alexa/Dev/Companies/data/ca

In [15]:
import pickle

# Load the cached page data of the first company row that belongs to a query.
# NOTE(review): this reads from com.P_DICT; confirm it is the same directory
# the caching cell writes to.
_first_all_name = companies_df.query('ALL').iloc[0]['name']
P1_dict = com.P_DICT / f"{_first_all_name}.pkl"
with P1_dict.open('rb') as f:
    next_data_dict = pickle.load(f)
# next_data_dict

In [None]:
# industries_list has one entry per companies_df.query('ALL') row, in that
# order. Attach it on exactly those rows' index labels instead of relying on
# the ALL rows happening to occupy positions 0..n-1; other rows get NaN.
# A length mismatch (stale industries_list) now raises instead of silently
# misaligning.
_all_company_index = companies_df.query('ALL').index
companies_df['industries'] = pd.Series(industries_list, index=_all_company_index, dtype=object)

In [28]:
def _write_unique_industries(frame, out_path):
    """Write the distinct values found in ``frame['industries']`` to ``out_path``.

    Missing entries are dropped, list entries are exploded to one value per
    row, duplicates are removed, and the result is written one value per line
    without header or index.
    """
    unique_industries = (
        frame['industries']
        .dropna()
        .explode()
        .drop_duplicates()
        .reset_index(drop=True)
    )
    unique_industries.to_csv(out_path, header=False, index=False)


# All companies, then only the companies flagged for the Healthcare query.
_write_unique_industries(companies_df, 'industries.txt')
_write_unique_industries(companies_df.query('Healthcare'), 'healthcare_industries.txt')

In [18]:
# Spot check: print the industries recorded for every Genentech job that is
# flagged DS. The loop variable is `job_hash` so the builtin hash() is not
# rebound at notebook scope.
for job_hash in urls_df.query('company == "Genentech" and DS')['hash']:
    print(com.viewhash(job_hash)['props']['pageProps']['job']['v5_processed_company_data']['industries'])

['Biotechnology Companies', 'Medical Organizations', 'Pharmaceutical Companies', 'Health Care Companies']
['Biotechnology Companies', 'Medical Organizations', 'Pharmaceutical Companies', 'Health Care Companies']
['Biotechnology Companies', 'Medical Organizations', 'Pharmaceutical Companies', 'Health Care Companies']


## Keywords

In [19]:
# df_dict[STEM].head()['hash'].apply(com.viewhash)

In [12]:
# Inspect the job payload embedded in one saved posting's page data.
_page_data = com.viewhash('diudfs6trmpabwjn')
_page_data['props']['pageProps']['job']  # .keys()

{'id': 'workday___roche-wd3-rog-a2o-gene___medical-data---analytics-engineer_202510-127004-1',
 'board_token': 'roche-wd3-rog-a2o-gene',
 'source': 'workday',
 'apply_url': 'https://roche.wd3.myworkdayjobs.com/rog-a2o-gene/job/South-San-Francisco/Medical-Data---Analytics-Engineer_202510-127004-1',
 'source_and_board_token': 'workday_roche-wd3-rog-a2o-gene',
 'job_information': {'title': 'Medical Data & Analytics Engineer',
  'job_title_raw': 'Medical Data & Analytics Engineer',
  'description': '<p><span>A healthier future. It’s what drives us to innovate. To continuously advance science and ensure everyone has access to the healthcare they need today and for generations to come. Creating a world where we all have more time with the people we love.\xa0</span></p><p></p><p><b>The Opportunity</b></p><p><span>The Medical Data &amp; Analytics Engineer plays a crucial role in accelerating evidence generation, decision-making, and innovation within USM and beyond, by bridging medical data st

In [None]:
# next_data_dict['props']['__N_SSG']#.keys()
# next_data_dict['query']['requisitionId']#.keys()
# next_data_dict['props']['pageProps']['validThrough']
# next_data_job = next_data_dict['props']['pageProps']['job']
# next_data_job.keys()

dict_keys(['id', 'board_token', 'source', 'apply_url', 'source_and_board_token', 'job_information', 'v5_processed_job_data', 'v5_processed_company_data', '_geoloc', 'requisition_id', 'collapse_key', 'is_expired', 'objectID'])

In [156]:
from IPython.display import Markdown
from job_search.dataset import md

# Bind the job payload explicitly: the only other definition of
# next_data_job in this notebook is commented out, so without this line the
# cell raises NameError after Restart & Run All.
next_data_job = next_data_dict['props']['pageProps']['job']

# The description is stored as HTML; convert it with md() using ATX ('#')
# headings and render the result as Markdown.
_next_data_job_description = next_data_job['job_information']['description']
data_job_description = md(_next_data_job_description, heading_style='ATX')
Markdown(data_job_description)

## ****About the role****

As an early engineer at Galileo, you will play a foundational role in designing, building, and scaling our products and team. We’re looking for an exceptional Senior Software Engineer, interested in solving complex problems at the intersection of Data and ML and passionate about the opportunities in Observability and Reliability for GenAI.

## What you'll be doing

* ****Technical Design and Architecture**** - you will champion for the right scalable, reliable architecture and obtain buy-in from all the stakeholders
* ****Planning and Execution -****  you will work with your team to help plan the roadmap and execute it
* ****PR and Design Reviews -**** you will uphold the engineering excellence/quality bar by reviewing your peer’s PRs
* ****Collaboration -**** you will collaborate closely with Product Managers, designers and other TLs to ensure we are building the right strategy and leveraging each other’s work.
* ****Advance Engineering**** - you will participate in design reviews, on-call and support, participate in and present during Tech Talks and learning sessions & help interview other engineering candidates

## ****What we're looking for****

* 5+ years of experience building Data or AI/ML products
* Experience building APIs in Python (FastAPI preferred)
* Experience building large scale distributed systems and familiarity with
* + Messaging and Pub-Sub systems such as RabbitMQ/Kafka
  + Storage/database systems similar to Postgres, MongoDB, Cassandra, S3
  + OLAP systems such as ClickHouse, Pinot
  + Orchestration systems such as Celery
* Experience building test suites using frameworks such as PyTest/PyUnit
* Experience working effectively during the product development process, working cross-functionally with a product manager, designer, user researcher, and data scientist
* Excellent communication skills for collaborating with cross-functional partners
* High productivity and care to help teams collaborate more effectively and efficiently
* A startup mindset, biasing towards thoughtful action with minimal direction.

## ****Bonus Points****

* Experience working with Docker and Kubernetes.
* Experience building applications on at least one of AWS, Google Cloud, or Microsoft Azure.
* Experience building customer facing products with ML frameworks (like PyTorch, Tensorflow, Keras, etc)
* Experience building agentic applications (like LLMs, rags, agents, etc.) with common frameworks (like OpenAI, langChain, crewAI, etc.)

## ****Why Galileo****

* Join a seasoned founding team that has previously led product and engineering teams from 0 to $100M+ in revenue and from 0 to 1B+ users globally
* We obsess over our team’s culture driven by inclusivity, empathy and curiosity
* We invest in our team’s development and happiness because our employees are the keys to our success and ensuring happy customers – towards that end, we offer:
* + 🏥 Medical, Dental and Vision Insurance
  + 🌴 Unlimited PTO
  + 👶 Parental Leave (birthing & non-birthing) - 100% pay for 8 weeks
  + 💰 401(k) Retirement Savings Plan
  + 📈 Pre-IPO Stock Options
  + 🚌 Commuter Benefits (pre-tax + company sponsored)
  + 🧘‍♂️ Mental & Physical Wellness Stipend
  + 🍱 Catered Meals on in-office days
  + 🏢 HQ in Burlingame + hub in NYC + hub in Bangalore
  + 🤝 Build the company alongside the Founders

**\*Benefits/perks may vary by state, country and employment type - please reach out to your recruiter for more information**

**Galileo is an equal opportunity employer supporting workforce diversity. We do not discriminate on the basis of race, religion, color, national origin, gender identity, sexual orientation, age, marital status, protected veteran status, disability status, or any other unlawful factor.**

**Galileo is committed to providing any necessary accommodations for individuals with disabilities within our application and interview process. To request accommodation due to a disability, please inform your recruiter.**

Galileo is committed to fair and equitable compensation practices. The pay range for this role is listed below and represents the expected salary range for non-commissionable roles or on-target earnings for commissionable roles. Actual compensation packages are based on several factors that are unique to each candidate, including but not limited to job-related skills, depth of experience, relevant certifications and training, and specific work location. The total compensation package for this position may also include eligibility for commission, equity, and company sponsored benefits/perks.

* The annual pay range for this role is $180,000 - $225,000 USD.

#LI-JF1

In [108]:
# Show the application link stored on the current job payload.
next_data_job['apply_url']

'https://ats.rippling.com/galileo/jobs/6fef1da5-107b-44cf-8fb9-d8b52dce5342'

In [140]:
def disp_job(html_description):
    """Render a job description's HTML inside a padded, light-grey box.

    Args:
        html_description: HTML markup of the job description. ``None`` (a
            posting without a description) is rendered as an empty box
            instead of raising ``TypeError``.

    Returns:
        None. The markup is shown in the notebook via ``display``.

    Note:
        The markup is inserted verbatim (it is meant to be rendered, so it is
        not escaped). It comes from scraped postings; do not use this helper
        on content you would not open in a browser.
    """
    html_output = ("""
        <style>
        #alex {
            background-color: #eee;
            padding: 2rem;
        }
        </style><div id='alex'>
    """ + (html_description or "") + "</div>")
    # HTML and display come from IPython.display, imported in the notebook's
    # top import cell so this works on a fresh kernel.
    display(HTML(html_output))

# Render the HTML description of the job currently bound to next_data_job.
disp_job(next_data_job['job_information']['description'])

In [7]:
from IPython.display import HTML
# HTML(next_data_job['job_information']['description'])
# next_data_job['v5_processed_job_data']#['requirements_summary']
# next_data_job['v5_processed_job_data']#['company_activities_synonyms']
# next_data_job['v5_processed_job_data']#['company_name']
# next_data_job['v5_processed_job_data']#['estimated_publish_date']
# next_data_job['v5_processed_job_data']#['position_employer_type']
# next_data_job['v5_processed_job_data']#['company_tagline']
# next_data_job['v5_processed_job_data']#['company_activities']
# next_data_job['v5_processed_job_data']#['company_sector_and_industry']
# next_data_job['v5_processed_job_data']#['technical_tools']
# next_data_job['v5_processed_job_data']#['requirements_summary']

In [71]:
# P_url
# Look up one posting by its hash; the recorded output of this cell is the
# posting's hiring.cafe viewjob URL.
com.viewjob('zsugiy65ldqr3xnv')

'https://hiring.cafe/viewjob/zsugiy65ldqr3xnv'

In [None]:
# <script src="https://cdn.jsdelivr.net/npm/@tailwindcss/browser@4"></script>