In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Setup

In [2]:
import os
from pathlib import Path

import pandas as pd
import polars as pl
from tqdm.notebook import tqdm

import job_search.dataset as dataset
import job_search.company as com
import job_search.resume as res
import job_search.utils as utils
from job_search.config import P_ROOT, P_JOBS, P_URLS, P_PROCESSED
from job_search.utils import path_names
import aw
# NOTE(review): project helper; judging by its name it applies CSS styling to
# the notebook — confirm in job_search.utils.
utils.jupyter_css_style()

In [3]:
# Names of the saved searches to process. Each walrus assignment also binds the
# name to an upper-case notebook variable (e.g. HEALTH == 'Healthcare').
QUERY_LIST = [
    DS_NORCAL := 'DS_NorCal',
    HEALTH := 'Healthcare',
    SF := 'SF',
    DS_REMOTE := 'DS_Remote',
    # DS_SOCAL := 'DS_SoCal',
    # DS_SEATTLE := 'DS_Seattle',
    # DS_NY := 'DS_NY',
    # DS_MIDWEST := 'DS_Midwest',
    # DS_DC := 'DS_DC',
    # SW := 'SW',
    # SW_REMOTE := 'SW_Remote',
]

# The single query whose paths are exposed as P_save / P_companies / P_query.
STEM = 'Healthcare'

# Date component of the output path; the commented line pins a fixed date.
_date = com.now(time=False)
# _date = "2025-10-12"

# Per-query HTML path: P_PROCESSED/<date>/<stem>/<stem>.html
P_save_dict = {
    stem: P_PROCESSED / (f'{_date}/{stem}/{stem}.html')
    for stem in QUERY_LIST
}

# Cache and query files hang off the fourth ancestor of each HTML path
# (presumably the directory that contains P_PROCESSED — TODO confirm).
P_companies_dict = {
    stem: p_html.parents[3] / f"cache/{stem}_company_urls"
    for stem, p_html in P_save_dict.items()
}
P_query_dict = {
    stem: p_html.parents[3] / f"queries/{stem}.txt"
    for stem, p_html in P_save_dict.items()
}

# Convenience handles for the selected STEM.
P_save, P_companies, P_query = (
    mapping[STEM]
    for mapping in (P_save_dict, P_companies_dict, P_query_dict)
)

In [4]:
# Frame for the selected STEM only.
jdf = com.load_jdf(P_save)
# cdf = com.load_cdf(P_companies)

# The same loaders applied to every query, keyed by query name.
jdf_dict = {
    stem: com.load_jdf(p_html)
    for stem, p_html in P_save_dict.items()
}
cdf_dict = {
    stem: com.load_cdf(p_cache)
    for stem, p_cache in P_companies_dict.items()
}

# load_df / load_dfc results are converted to pandas via .to_pandas().
df_dict = {
    stem: com.load_df(p_html).to_pandas()
    for stem, p_html in P_save_dict.items()
}
dfc_dict = {
    stem: com.load_dfc(p_html).to_pandas()
    for stem, p_html in P_save_dict.items()
}

In [5]:
# Stack the per-query frames, keep one row per job hash, and hand the result to
# polars for the per-company aggregation below.
df = pl.DataFrame(pd.concat(df_dict).drop_duplicates(subset='hash'))

# One row per company.
dfc = (
    df.group_by('company')
    .agg(
        pl.len(),
        pl.col('title').implode(),
        pl.col('company_summary').first(),
        # Smallest 'hours' value in the group, floor-divided by 24.
        (pl.col('hours').min() // 24).alias('days'),
        com.pl_reduce_list(pl.col('bay')),
        com.pl_reduce_list(pl.col('location')),
        com.pl_reduce_list(pl.col('skills')),
        # pl.col('onsite').pipe(com.pl_enum_min, df['onsite'].dtype),
        # pl.col('full_time').pipe(com.pl_enum_max, df['full_time'].dtype),
        pl.col('lower').min(),
        pl.col('median').mean().round(2),
        pl.col('upper').max(),
        # pl.col('url2').n_unique(),
    )
    .to_pandas()
)

# Downstream cells use the combined frame as pandas.
df = df.to_pandas()

In [6]:
# Totals for the combined (de-duplicated) frame first, then one line per query.
_n_jobs, _n_companies = len(df), len(dfc)
print(f"{'ALL':12}: {_n_jobs:,} jobs among {_n_companies:,} companies")
for stem in QUERY_LIST:
    _n_jobs, _n_companies = len(df_dict[stem]), len(dfc_dict[stem])
    print(f"{stem:12}: {_n_jobs:,} jobs among {_n_companies:,} companies")
# Counts recorded from another run, kept for comparison:
# ALL         : 3,548 jobs among 1,778 companies
# DS_NorCal   : 956 jobs among 583 companies
# Healthcare  : 499 jobs among 327 companies
# SF          : 903 jobs among 526 companies
# DS_Remote   : 2,149 jobs among 1,208 companies

ALL         : 3,439 jobs among 1,621 companies
DS_NorCal   : 959 jobs among 578 companies
Healthcare  : 397 jobs among 221 companies
SF          : 958 jobs among 540 companies
DS_Remote   : 2,039 jobs among 1,102 companies


## Keywords

In [22]:
# NOTE(review): this name shadows the builtin hash() for the rest of the
# kernel session; kept because other cells may read it.
hash = 'diudfs6trmpabwjn'

# Drill into the structure returned by com.viewhash down to the job record.
_page_props = com.viewhash(hash)['props']['pageProps']
next_data_job = _page_props['job']#.keys()
# next_data_job

In [21]:
from IPython.display import Markdown
from job_search.dataset import md
# Pull the description out of the job record and run it through `md` with
# ATX-style headings (presumably HTML -> Markdown — confirm in job_search.dataset).
_job_information = next_data_job['job_information']
_next_data_job_description = _job_information['description']
data_job_description = md(
    _next_data_job_description,
    heading_style='ATX',
)
# Markdown(data_job_description)

In [20]:
def disp_job(html_description):
    """Render a job description's HTML inside a padded, light-grey box.

    Args:
        html_description: HTML markup (str) that is concatenated verbatim,
            without escaping, into a wrapper ``<div id='alex'>``.

    Returns:
        None. The wrapped markup is shown with ``IPython.display.display``.
    """
    # Imported locally so the function works on a fresh kernel: the visible
    # notebook imports only bring in ``Markdown``, so a bare ``HTML`` would
    # raise NameError.
    from IPython.display import HTML, display

    html_output = ("""
        <style>
        #alex {
            background-color: #eee;
            padding: 2rem;
        }
        </style><div id='alex'>
    """ + html_description + "</div>")
    display(HTML(html_output))

# disp_job(next_data_job['job_information']['description'])

In [None]:
# HTML(next_data_job['job_information']['description'])
# next_data_job['v5_processed_job_data']#['requirements_summary']
# next_data_job['v5_processed_job_data']#['company_activities_synonyms']
# next_data_job['v5_processed_job_data']#['company_name']
# next_data_job['v5_processed_job_data']#['estimated_publish_date']
# next_data_job['v5_processed_job_data']#['position_employer_type']
# next_data_job['v5_processed_job_data']#['company_tagline']
# next_data_job['v5_processed_job_data']#['company_activities']
# next_data_job['v5_processed_job_data']#['company_sector_and_industry']
# next_data_job['v5_processed_job_data']#['technical_tools']
# next_data_job['v5_processed_job_data']#['requirements_summary']

# next_data_dict['props']['__N_SSG']#.keys()
# next_data_dict['query']['requisitionId']#.keys()
# next_data_dict['props']['pageProps']['validThrough']
# next_data_job = next_data_dict['props']['pageProps']['job']
# next_data_job.keys()

In [None]:
# <script src="https://cdn.jsdelivr.net/npm/@tailwindcss/browser@4"></script>