# Hudi + Lance Demo: Intelligent Recruitment Platform
**(Hybrid Search + Analytics on the Lakehouse)**

### Flow:
1. Load real job postings from HuggingFace
2. User "Uploads" a Resume (Vector Search)
3. Apply Business Rules (Hybrid Search: Vector + SQL Filters)
4. Show Executive Dashboard (Analytics on the same data)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datasets import load_dataset
import shutil
import os
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

## Configuration

In [2]:
# Demo-wide settings shared by the ingestion and query cells.
CONFIG = dict(
    # Filesystem location the Hudi table is written to and queried from.
    table_path="/tmp/hudi_recruiting_lake",
    # Value used for the "hoodie.table.name" write option.
    table_name="job_market",
    # SentenceTransformer model name used to embed job text and the resume.
    embedding_model="all-MiniLM-L6-v2",
    # When true, an existing table_path directory is deleted before writing.
    clean_start=True,
)

## 1. Spark Setup

In [9]:
def create_spark(hudi_bundle_jar=None):
    """Build (or reuse) the SparkSession for the demo with Hudi SQL support.

    The session enables the Kryo serializer, the Hudi session extension and
    the Hudi catalog, and turns off console progress bars.

    Args:
        hudi_bundle_jar: Optional path to the Hudi Spark bundle JAR (or a
            comma-separated list of JARs). When given, it is passed to Spark
            as ``spark.jars`` so the ``hudi`` data source can be resolved.
            When omitted, the configuration is unchanged and the bundle must
            be supplied externally (for example via ``--jars`` or
            ``PYSPARK_SUBMIT_ARGS``).

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    builder = (
        SparkSession.builder.appName("Recruiting-Lakehouse")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
        .config("spark.ui.showConsoleProgress", "false")
    )

    if hudi_bundle_jar:
        # Without the bundle on the classpath, format("hudi") fails with
        # DATA_SOURCE_NOT_FOUND. NOTE(review): classpath settings presumably
        # only apply when this call starts the JVM, not when an existing
        # session is reused -- confirm against the Spark docs.
        builder = builder.config("spark.jars", hudi_bundle_jar)

    return builder.getOrCreate()

## 2. Load Dataset from HuggingFace

In [4]:
def load_job_data():
    """Fetch the data science job postings dataset and reshape it for ingestion.

    Returns:
        A list of dicts, one per posting, with the keys ``job_id`` (a
        zero-padded sequential id), ``company``, ``title``,
        ``job_description`` and ``text_for_vector`` (title and description
        joined by a space, used later as the embedding input).
    """
    print("Loading job descriptions from HuggingFace...")
    postings = load_dataset("nathansutton/data-science-job-descriptions", split="train")

    records = [
        {
            "job_id": f"job_{idx:04d}",
            "company": posting["company"],
            "title": posting["title"],
            "job_description": posting["job_description"],
            "text_for_vector": f"{posting['title']} {posting['job_description']}",
        }
        for idx, posting in enumerate(postings)
    ]

    # Distinct company names, only needed for the summary line below.
    distinct_companies = {record["company"] for record in records}
    print(f"\u2713 Loaded {len(records)} job postings from {len(distinct_companies)} companies.")
    return records

## 3. Ingestion (The "Lakehouse" Foundation)

In [5]:
def ingest_data(spark, data):
    """Embed the job postings and write them to a Hudi table in Lance format.

    Args:
        spark: SparkSession used to build and write the DataFrame.
        data: List of job dicts containing at least the schema's string
            columns. Each dict is modified in place: an ``"embedding"`` list
            is added to it.

    Returns:
        The SentenceTransformer model that produced the embeddings, so the
        caller can encode queries with the same model.
    """
    # 1. Embed descriptions and attach each vector to its posting.
    encoder = SentenceTransformer(CONFIG["embedding_model"])
    texts = [record["text_for_vector"] for record in data]
    vectors = encoder.encode(texts, show_progress_bar=True)

    for record, vector in zip(data, vectors):
        record["embedding"] = vector.tolist()

    # 2. Schema: five non-nullable string columns plus the float vector.
    string_columns = ("job_id", "company", "title", "job_description", "text_for_vector")
    fields = [StructField(column, StringType(), False) for column in string_columns]
    fields.append(StructField("embedding", ArrayType(FloatType()), False))
    schema = StructType(fields)

    # 3. Optionally wipe any previous table, then write to Hudi (Lance format).
    table_path = CONFIG["table_path"]
    if CONFIG["clean_start"] and os.path.exists(table_path):
        shutil.rmtree(table_path)

    jobs_df = spark.createDataFrame(data, schema=schema)

    write_options = {
        "hoodie.table.name": CONFIG["table_name"],
        "hoodie.datasource.write.recordkey.field": "job_id",
        "hoodie.datasource.write.partitionpath.field": "company",
        "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
        "hoodie.datasource.write.operation": "upsert",
        "hoodie.table.base.file.format": "lance",
        "hoodie.write.record.merge.custom.implementation.classes": "org.apache.hudi.DefaultSparkRecordMerger",
    }

    writer = jobs_df.write.format("hudi").options(**write_options).mode("overwrite")
    writer.save(table_path)
    print(f"\u2713 Ingested {len(data)} jobs into the Lakehouse.")
    return encoder

## 4. The Demo: Resume Matching

In [None]:
def demo_resume_matching(spark, model):
    """Run the resume-matching demo: pure vector search, then hybrid search.

    A hard-coded sample resume is embedded with ``model`` and registered as
    the temp view ``query_input``. Two queries then run through the
    ``hudi_vector_search`` table function against ``CONFIG['table_path']``:

    * Scenario A: the 5 nearest postings by cosine distance.
    * Scenario B: the 500 nearest postings, filtered to company 'Reddit',
      keeping the best 5.

    Args:
        spark: SparkSession used to register the query vector and run SQL.
        model: Embedding model exposing ``encode(list_of_str)``; presumably
            the same model used at ingestion so the vectors are comparable.

    Returns:
        None. Results are printed.
    """
    print("\n" + "="*50)
    print("DEMO PART 1: The 'Smart' Candidate Match")
    print("="*50)

    # Simulate a Resume Upload
    resume_text = """
    EXPERIENCE:
    - 5 years building Machine Learning models using Python and Scikit-Learn.
    - Deployed Large Language Models (LLMs) to production.
    - Strong background in backend engineering and API design.
    """
    # Emoji outside the BMP need the 8-digit \U escape. A \uD83D\uDCC4-style
    # surrogate pair becomes two lone surrogates in a Python str, and print()
    # then raises UnicodeEncodeError on a UTF-8 stream.
    print(f"\U0001F4C4 User Resume Uploaded: \n{resume_text.strip()}\n")

    # Vectorize Resume
    resume_vector = model.encode([resume_text])[0].tolist()

    # Register Query Vector so the SQL below can reference it as a subquery
    spark.createDataFrame([(resume_vector,)], ["q_vec"]).createOrReplaceTempView("query_input")

    # --- SCENARIO A: Pure Vector Search ---
    print("\U0001F50E Executing Vector Search (Semantic Match)...")
    matches = spark.sql(f"""
        SELECT title, company, (1 - _distance) as score
        FROM hudi_vector_search(
            '{CONFIG['table_path']}', 'embedding', (SELECT q_vec FROM query_input), 5, 'cosine'
        )
    """).collect()

    print("\nTop Matches for your Resume:")
    for row in matches:
        print(f"  \u2022 {row.title} at {row.company} \u2014 Score: {row.score:.2f}")

    # --- SCENARIO B: Hybrid Search (The Business Requirement) ---
    print("\n\u26a0\ufe0f  User Feedback: 'I specifically want to work at Reddit.'")
    print("\U0001F50E Executing Hybrid Search (Vector + SQL Filters)...")

    # NOTE(review): the company filter runs AFTER the vector search, so only
    # the 500 nearest postings are considered; Reddit postings ranked below
    # that cut-off cannot appear in the result.
    hybrid_query = f"""
        SELECT * FROM (
            SELECT title, company, (1 - _distance) as score
            FROM hudi_vector_search(
                '{CONFIG['table_path']}', 'embedding', (SELECT q_vec FROM query_input), 500, 'cosine'
            )
        )
        WHERE company = 'Reddit'
        ORDER BY score DESC
        LIMIT 5
    """
    hybrid_matches = spark.sql(hybrid_query).collect()

    print("\nTop HYBRID Matches (Reddit only):")
    if not hybrid_matches:
        print("  (No matches found with these constraints)")
    for row in hybrid_matches:
        print(f"  \u2022 {row.title} at {row.company} \u2014 Score: {row.score:.2f}")

## 5. The Demo: Analytics Dashboard

In [7]:
def demo_analytics_dashboard(spark):
    """Render two bar charts of aggregates computed over the Hudi table.

    The table at ``CONFIG["table_path"]`` is registered as the temp view
    ``jobs_table``. Two top-15 aggregations (postings per company, postings
    per title) are pulled into pandas and drawn side by side.

    Args:
        spark: SparkSession used to read the table and run the SQL.

    Returns:
        None. The figure is shown and status lines are printed.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("DEMO PART 2: The Executive Dashboard")
    print("Value: The SAME data matches resumes AND powers BI.")
    print(banner)

    jobs = spark.read.format("hudi").load(CONFIG["table_path"])
    jobs.createOrReplaceTempView("jobs_table")

    # 1. Hiring activity by company
    print("Generating 'Hiring Activity' Chart...")
    hiring_sql = """
        SELECT company, count(*) as job_count
        FROM jobs_table
        GROUP BY company
        ORDER BY job_count DESC
        LIMIT 15
    """
    company_df = spark.sql(hiring_sql).toPandas()

    # 2. Most common job titles
    print("Generating 'Top Roles' Chart...")
    roles_sql = """
        SELECT title, count(*) as title_count
        FROM jobs_table
        GROUP BY title
        ORDER BY title_count DESC
        LIMIT 15
    """
    title_df = spark.sql(roles_sql).toPandas()

    # Plotting: one horizontal bar chart per aggregate, largest bar on top.
    figure, (activity_ax, roles_ax) = plt.subplots(1, 2, figsize=(15, 5))

    panels = (
        (activity_ax, company_df["company"], company_df["job_count"],
         "green", "Hiring Activity: Postings by Company"),
        (roles_ax, title_df["title"], title_df["title_count"],
         "skyblue", "Most Common Data Science Roles"),
    )
    for ax, labels, counts, bar_color, heading in panels:
        ax.barh(labels, counts, color=bar_color)
        ax.set_title(heading)
        ax.set_xlabel("Number of Postings")
        ax.invert_yaxis()

    plt.tight_layout()
    plt.show()

    print("\u2713 Dashboard generated from Hudi table.")
    print("  (In a real app, this would be a live Streamlit/Tableau view)")

## Run the Demo

In [10]:
# Create (or reuse) the Hudi-enabled Spark session; later cells depend on `spark`.
spark = create_spark()

26/02/12 17:36:29 WARN Utils: Your hostname, Andys-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 10.255.41.247 instead (on interface en0)
26/02/12 17:36:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
26/02/12 17:36:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Exception in thread "main" org.apache.spark.SparkException: Failed to get main class in JAR with error 'File file:/Users/andywalner/onehouse_demos/pydata_vector_search/kaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.2.0-SNAPSHOT.jar does not exist'.  Please specify one with --class.
	at org.apache.spark.deploy.SparkSubmit.error(SparkSubmit.scala:1047)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:528)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:964)
	at org.apache.spark.deploy.SparkSubmit.doRunM

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [9]:
# Load the job postings, then embed them and write them to the Hudi table.
# ingest_data returns the embedding model, which is kept for encoding queries.
jobs_data = load_job_data()
model = ingest_data(spark, jobs_data)

Repo card metadata block was not found. Setting CardData to empty.


Loading job descriptions from HuggingFace...
✓ Loaded 2921 job postings from 969 companies.


Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Py4JJavaError: An error occurred while calling o55.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: hudi. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:738)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:256)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: hudi.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 16 more


In [None]:
# Part 1: match a sample resume against the ingested postings (vector + hybrid search).
demo_resume_matching(spark, model)

In [None]:
# Part 2: chart aggregate statistics computed over the same Hudi table.
demo_analytics_dashboard(spark)

In [None]:
# Stop the Spark session once the demo is finished.
spark.stop()