Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
78633de
feat(huggingFace): add HuggingFaceModelResource for model browsing an…
PG1204 May 17, 2026
935ccc1
fix: address review feedback on HuggingFaceModelResource
PG1204 May 19, 2026
089c3c4
Merge branch 'apache:main' into hf/01-backend-skeleton
ELin2025 May 20, 2026
2aa865c
Merge branch 'apache:main' into hf/01-backend-skeleton
PG1204 May 27, 2026
0c30beb
Merge branch 'apache:main' into hf/01-backend-skeleton
PG1204 May 28, 2026
309ac54
fix(huggingFace): cap heap usage on /media-proxy and /audio-preview
PG1204 May 28, 2026
6857e34
chore: retrigger CI
PG1204 May 28, 2026
6f0f5fb
Merge branch 'apache:main' into hf/01-backend-skeleton
PG1204 May 28, 2026
fec6dfb
Merge branch 'main' into hf/01-backend-skeleton
xuang7 May 28, 2026
5e95bcd
Merge branch 'apache:main' into hf/01-backend-skeleton
PG1204 May 29, 2026
2b852ae
chore(huggingFace): annotate HuggingFaceModelResource with @RolesAllowed
PG1204 May 29, 2026
734de73
feat(huggingFace): refactor operator into per-task codegen + text-gen…
PG1204 May 27, 2026
8af456b
test(huggingFace): boost PR 2 patch coverage with null-tolerance + co…
PG1204 May 28, 2026
8350eb9
fix: scala lint fixes
PG1204 May 28, 2026
2efa337
Merge branch 'apache:main' into hf/02-operator-textgen
PG1204 Jun 2, 2026
c44d7d0
Merge branch 'apache:main' into hf/02-operator-textgen
PG1204 Jun 3, 2026
28fcab0
refactor(huggingFace): cap HTTP error detail + lift CHAT_ROUTES / OPE…
PG1204 Jun 5, 2026
8de9da9
feat(huggingFace): add image task family via ImageTaskCodegen
PG1204 May 28, 2026
2b46a9c
style: apply scalafmt and prettier to HF inference spec and image upl…
PG1204 Jun 3, 2026
0815d14
chore: add Apache license header to HF image upload template and styles
PG1204 Jun 3, 2026
76f606a
test(frontend): cover HuggingFaceImageUploadComponent
PG1204 Jun 3, 2026
ea3ea63
Merge branch 'apache:main' into hf/03-image-tasks
PG1204 Jun 5, 2026
ef59a1e
fix(huggingFace): zero-shot labels, polling progress logs, data-URL c…
PG1204 Jun 5, 2026
3975e0a
Merge branch 'apache:main' into hf/03-image-tasks
PG1204 Jun 7, 2026
8a83dc2
feat(huggingface): add audio and media tasks
anishshiva7 May 29, 2026
8507ca5
feat(huggingface): add qa and ranking tasks
anishshiva7 May 29, 2026
ca26c02
feat(frontend): add HuggingFace task selector and model browser compo…
ELin2025 May 29, 2026
e538271
feat(frontend): add HuggingFace audio upload component
ELin2025 May 29, 2026
45a9e3a
Merge branch 'apache:main' into hf/06b-audio-upload
ELin2025 Jun 8, 2026
95dd805
fix(frontend): add explicit type annotations to rxjs error callbacks
ELin2025 Jun 8, 2026
3a1389e
style(frontend): format HuggingFace components with prettier
ELin2025 Jun 8, 2026
177d404
fix(frontend): add takeUntil to rxjs subscribe calls in HuggingFaceCo…
ELin2025 Jun 8, 2026
56286b9
style(frontend): format audio upload spec with prettier
ELin2025 Jun 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import org.apache.texera.amber.operator.dummy.DummyOpDesc
import org.apache.texera.amber.operator.filter.SpecializedFilterOpDesc
import org.apache.texera.amber.operator.hashJoin.HashJoinOpDesc
import org.apache.texera.amber.operator.huggingFace.{
HuggingFaceInferenceOpDesc,
HuggingFaceIrisLogisticRegressionOpDesc,
HuggingFaceSentimentAnalysisOpDesc,
HuggingFaceSpamSMSDetectionOpDesc,
Expand Down Expand Up @@ -396,6 +397,7 @@ trait StateTransferFunc
),
new Type(value = classOf[SklearnDummyClassifierOpDesc], name = "SklearnDummyClassifier"),
new Type(value = classOf[SklearnPredictionOpDesc], name = "SklearnPrediction"),
new Type(value = classOf[HuggingFaceInferenceOpDesc], name = "HuggingFace"),
new Type(
value = classOf[HuggingFaceSentimentAnalysisOpDesc],
name = "HuggingFaceSentimentAnalysis"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.texera.amber.operator.huggingFace

import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
import org.apache.texera.amber.operator.PythonOperatorDescriptor
import org.apache.texera.amber.operator.huggingFace.codegen.{
AudioTaskCodegen,
CodegenContext,
ImageTaskCodegen,
MediaGenCodegen,
PythonCodegenBase,
QaRankingCodegen,
TaskCodegen,
TextGenCodegen
}
import org.apache.texera.amber.operator.metadata.annotations.AutofillAttributeName
import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString

/**
  * Generic Hugging Face inference operator.
  *
  * One descriptor serves several Hugging Face task families. The `task`
  * property selects a [[TaskCodegen]] from `registeredCodegens`, which
  * currently holds the text-generation, image, audio, media-generation and
  * QA/ranking codegens.
  *
  * The Python script that runs at execution time is assembled by
  * `PythonCodegenBase.render(ctx, codegen)`, which composes the shared
  * provider-fallback / request-loop infrastructure with the per-task
  * payload + parse snippets supplied by the selected `TaskCodegen`.
  *
  * User-provided string fields are typed as [[EncodableString]] so the
  * `pyb"..."` macro inside `PythonCodegenBase` emits them as
  * base64-decoded expressions at runtime instead of raw Python literals —
  * this is what allows the operator to satisfy
  * `PythonCodeRawInvalidTextSpec`'s contract that arbitrary `@JsonProperty`
  * values must not leak into generated source.
  *
  * The properties are `var`s because they are populated through the
  * `@JsonProperty` bindings; any of them may therefore be `null`, and
  * `generatePythonCode` normalizes every one before use.
  */
class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {

  @JsonProperty(value = "hfApiToken", required = true)
  @JsonSchemaTitle("HF API Token")
  @JsonPropertyDescription(
    "Your Hugging Face API token (from https://huggingface.co/settings/tokens)"
  )
  var hfApiToken: EncodableString = ""

  @JsonProperty(value = "task", required = true, defaultValue = "text-generation")
  @JsonSchemaTitle("Task")
  @JsonPropertyDescription("The Hugging Face pipeline task type")
  var task: EncodableString = "text-generation"

  @JsonProperty(
    value = "modelId",
    required = true,
    defaultValue = "Qwen/Qwen2.5-72B-Instruct"
  )
  @JsonSchemaTitle("Model")
  @JsonPropertyDescription("Select a Hugging Face model")
  var modelId: EncodableString = "Qwen/Qwen2.5-72B-Instruct"

  @JsonProperty(value = "promptColumn", required = true)
  @JsonSchemaTitle("Prompt Column")
  @JsonPropertyDescription("Column in the input table to use as the user prompt")
  @AutofillAttributeName
  var promptColumn: EncodableString = ""

  @JsonProperty(value = "imageInput", required = false)
  @JsonSchemaTitle("Image Upload")
  @JsonPropertyDescription("Upload an image for Hugging Face image tasks")
  var imageInput: EncodableString = ""

  @JsonProperty(value = "inputImageColumn", required = false)
  @JsonSchemaTitle("Input Image Column")
  @JsonPropertyDescription("Column containing image data from the input table")
  @AutofillAttributeName
  var inputImageColumn: EncodableString = ""

  @JsonProperty(value = "audioInput", required = false)
  @JsonSchemaTitle("Audio Upload")
  @JsonPropertyDescription("Upload audio for Hugging Face audio tasks")
  var audioInput: EncodableString = ""

  @JsonProperty(value = "inputAudioColumn", required = false)
  @JsonSchemaTitle("Input Audio Column")
  @JsonPropertyDescription("Column containing audio data from the input table")
  @AutofillAttributeName
  var inputAudioColumn: EncodableString = ""

  @JsonProperty(value = "contextColumn", required = false)
  @JsonSchemaTitle("Context Column")
  @JsonPropertyDescription("Column containing the context passage for question answering")
  @AutofillAttributeName
  var contextColumn: EncodableString = ""

  @JsonProperty(value = "candidateLabels", required = false)
  @JsonSchemaTitle("Candidate Labels")
  @JsonPropertyDescription("Comma-separated candidate labels for zero-shot classification")
  var candidateLabels: EncodableString = ""

  @JsonProperty(value = "sentencesColumn", required = false)
  @JsonSchemaTitle("Sentences Column")
  @JsonPropertyDescription(
    "Column with comma-separated sentences for sentence similarity and text ranking"
  )
  @AutofillAttributeName
  var sentencesColumn: EncodableString = ""

  @JsonProperty(
    value = "systemPrompt",
    required = false,
    defaultValue = "You are a helpful assistant."
  )
  @JsonSchemaTitle("System Prompt")
  @JsonPropertyDescription("Optional system message to set model behavior")
  var systemPrompt: EncodableString = "You are a helpful assistant."

  @JsonProperty(value = "maxNewTokens", required = false, defaultValue = "256")
  @JsonSchemaTitle("Max New Tokens")
  @JsonPropertyDescription("Maximum number of tokens to generate (1-4096)")
  var maxNewTokens: java.lang.Integer = 256

  @JsonProperty(value = "temperature", required = false)
  @JsonSchemaTitle("Temperature")
  @JsonPropertyDescription("Sampling temperature (0.0 = deterministic, up to 2.0)")
  var temperature: java.lang.Double = 0.7

  @JsonProperty(
    value = "resultColumn",
    required = false,
    defaultValue = "hf_response"
  )
  @JsonSchemaTitle("Result Column Name")
  @JsonPropertyDescription("Name of the new column added to the output table")
  var resultColumn: EncodableString = "hf_response"

  /**
    * Per-task code generators, keyed by Hugging Face task name.
    *
    * Families are registered in the order listed below; if two families
    * ever claim the same task name, the later one wins. To support a new
    * task family, add its codegen to `families`.
    *
    * An unrecognized task string falls back to [[TextGenCodegen]] (see
    * `codegenForTask`); the generated Python's `else` branch then produces
    * a generic `{"inputs": prompt_value}` payload and the HF endpoint
    * surfaces the real error at runtime. This matches the original
    * monolithic operator's behavior and keeps `generatePythonCode` total
    * (it never throws on arbitrary input, which is required by
    * `PythonCodeRawInvalidTextSpec`).
    */
  private val registeredCodegens: Map[String, TaskCodegen] = {
    val families: Seq[(Iterable[String], TaskCodegen)] = Seq(
      (Seq(TextGenCodegen.task), TextGenCodegen),
      (ImageTaskCodegen.tasks, ImageTaskCodegen),
      (AudioTaskCodegen.tasks, AudioTaskCodegen),
      (MediaGenCodegen.tasks, MediaGenCodegen),
      (QaRankingCodegen.tasks, QaRankingCodegen)
    )
    // `toMap` keeps the last binding for a duplicated key, which preserves
    // the registration-order precedence described above.
    families.flatMap {
      case (taskNames, codegen) => taskNames.map(name => name -> codegen)
    }.toMap
  }

  /** Returns the codegen registered for `t`, or [[TextGenCodegen]] when `t` is unknown. */
  private def codegenForTask(t: String): TaskCodegen =
    registeredCodegens.getOrElse(t, TextGenCodegen)

  /** Maps a `null` property value to the empty string and leaves any other value untouched. */
  private def orEmpty(value: EncodableString): EncodableString =
    if (value == null) "" else value

  /**
    * The output column name actually used: `resultColumn`, or `hf_response`
    * when it is `null` or blank.
    *
    * Shared by `generatePythonCode` and `getOutputSchemas` so the column the
    * script writes and the column the schema declares cannot drift apart.
    */
  private def effectiveResultColumn: EncodableString =
    if (resultColumn == null || resultColumn.trim.isEmpty) "hf_response" else resultColumn

  /**
    * Renders the Python source for this operator.
    *
    * Every property is normalized first (nulls replaced, numeric settings
    * clamped to the ranges advertised in the property descriptions) so that
    * rendering never fails on arbitrary user input.
    *
    * @return the Python script produced by `PythonCodegenBase.render` for
    *         the selected task's codegen
    */
  override def generatePythonCode(): String = {
    // The task is trimmed before use so that a value padded with whitespace
    // still resolves to its registered codegen instead of silently falling
    // back to text generation.
    val safeTask: EncodableString =
      if (task == null || task.trim.isEmpty) "text-generation" else task.trim
    val safeModelId: EncodableString = orEmpty(modelId).trim

    // Clamp to the ranges documented on the properties: 1-4096 and 0.0-2.0.
    val requestedTokens: Int = if (maxNewTokens != null) maxNewTokens.intValue else 256
    val safeMaxTokens = math.max(1, math.min(requestedTokens, 4096))
    val requestedTemp: Double = if (temperature != null) temperature.doubleValue else 0.7
    val safeTemp = math.max(0.0, math.min(requestedTemp, 2.0))

    val ctx = CodegenContext(
      hfApiToken = orEmpty(hfApiToken),
      modelId = safeModelId,
      promptColumn = orEmpty(promptColumn),
      resultColumn = effectiveResultColumn,
      task = safeTask,
      systemPrompt = orEmpty(systemPrompt),
      safeMaxTokens = safeMaxTokens,
      safeTemp = safeTemp,
      imageInput = orEmpty(imageInput),
      inputImageColumn = orEmpty(inputImageColumn),
      audioInput = orEmpty(audioInput),
      inputAudioColumn = orEmpty(inputAudioColumn),
      contextColumn = orEmpty(contextColumn),
      candidateLabels = orEmpty(candidateLabels),
      sentencesColumn = orEmpty(sentencesColumn)
    )

    PythonCodegenBase.render(ctx, codegenForTask(safeTask))
  }

  /** Operator metadata: a single input port and a single output port in the Hugging Face group. */
  override def operatorInfo: OperatorInfo =
    OperatorInfo(
      "Hugging Face",
      "Call a Hugging Face model via the Inference API",
      OperatorGroupConstants.HUGGINGFACE_GROUP,
      inputPorts = List(InputPort()),
      outputPorts = List(OutputPort())
    )

  /**
    * Output schema: the first input schema with one extra STRING column
    * named by `effectiveResultColumn`.
    *
    * @param inputSchemas schemas of the connected input ports; expected to
    *                     be non-empty (`values.head` is taken)
    * @return the schema for the operator's only output port
    */
  override def getOutputSchemas(
      inputSchemas: Map[PortIdentity, Schema]
  ): Map[PortIdentity, Schema] = {
    val resCol: String = effectiveResultColumn
    Map(
      operatorInfo.outputPorts.head.id -> inputSchemas.values.head
        .add(resCol, AttributeType.STRING)
    )
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.texera.amber.operator.huggingFace.codegen

/**
* Codegen for Hugging Face audio task families.
*
* ASR and audio-classification send audio bytes as the raw request body.
* Text-to-speech is prompt-driven and sends a JSON payload; its providers
* return either audio bytes directly or a JSON envelope pointing to audio.
*
* Both methods return fixed Python fragments as strings; neither reads its
* `ctx` argument. The fragments refer to names they do not define
* (`task`, `body`, `prompt_value`, `audio_only_tasks`, `current_audio_bytes`,
* `audio_headers`, `self._audio_url_to_data_url`, `json`); these are
* presumably supplied by the script that embeds the fragments — TODO confirm
* against `PythonCodegenBase`.
*/
object AudioTaskCodegen extends TaskCodegen {

// Primary task name of this codegen; it is also one of the entries of `tasks`.
override val task: String = "automatic-speech-recognition"

// Every task name handled by this codegen. `HuggingFaceInferenceOpDesc`
// registers this object under each of these names.
override val tasks: Set[String] = Set(
"automatic-speech-recognition",
"audio-classification",
"text-to-speech"
)

/**
* Python fragment that builds the request payload.
*
* - Tasks contained in `audio_only_tasks`: the payload is
*   `current_audio_bytes`, and `use_raw_binary_body` / `raw_binary_headers`
*   are set so the bytes are sent as the raw body with `audio_headers`.
* - `text-to-speech` and any other task: the payload is the JSON object
*   `{"inputs": prompt_value}` (the two branches are currently identical).
*
* @param ctx codegen context; not read by this implementation
* @return the Python source fragment (margin-stripped)
*/
override def payloadPython(ctx: CodegenContext): String =
""" if task in audio_only_tasks:
| payload = current_audio_bytes
| use_raw_binary_body = True
| raw_binary_headers = audio_headers
| elif task == "text-to-speech":
| payload = {"inputs": prompt_value}
| else:
| payload = {"inputs": prompt_value}""".stripMargin

/**
* Python fragment that turns the response `body` into the result string.
*
* - `text-to-speech`: when `body` is a dict, the keys are probed in the
*   order `output`, `audio`, `data`. An `output` value (or its first list
*   element) that is a string starting with "http", or a `url` found under
*   `audio` / `data[0]`, is passed to `self._audio_url_to_data_url`. A
*   `b64_json` value is wrapped as a `data:audio/mpeg;base64,...` URL.
*   Anything else is returned as `json.dumps(body)`.
* - `automatic-speech-recognition`: returns `body["text"]`, else
*   `body["generated_text"]`, else `json.dumps(body)`.
* - `audio-classification`: returns `json.dumps(body)`.
*
* The fragment has no trailing `else`, so it returns nothing itself for any
* other task name.
*
* @param ctx codegen context; not read by this implementation
* @return the Python source fragment (margin-stripped)
*/
override def parsePython(ctx: CodegenContext): String =
""" if task == "text-to-speech":
| if isinstance(body, dict):
| if "output" in body:
| out = body["output"]
| url = out[0] if isinstance(out, list) else out
| if isinstance(url, str) and url.startswith("http"):
| return self._audio_url_to_data_url(url)
| if "audio" in body:
| audio = body["audio"]
| if isinstance(audio, dict):
| if "url" in audio:
| return self._audio_url_to_data_url(audio["url"])
| if "b64_json" in audio:
| return f"data:audio/mpeg;base64,{audio['b64_json']}"
| if "data" in body:
| data = body["data"]
| if data and isinstance(data[0], dict):
| if "url" in data[0]:
| return self._audio_url_to_data_url(data[0]["url"])
| if "b64_json" in data[0]:
| return f"data:audio/mpeg;base64,{data[0]['b64_json']}"
| return json.dumps(body)
| elif task == "automatic-speech-recognition":
| if isinstance(body, dict):
| if "text" in body:
| return body["text"]
| if "generated_text" in body:
| return body["generated_text"]
| return json.dumps(body)
| elif task == "audio-classification":
| return json.dumps(body)""".stripMargin
}
Loading
Loading