feat:Adapt TongYi LLM speech transcription by Spring AI API (#3733)

Signed-off-by: yuluo-yx <yuluo08290126@gmail.com> Co-authored-by: yuluo-yx <yuluo08290126@gmail.com>
alibaba · May 21, 2024 · c52d1cf · c52d1cf
1 parent 4300772
commit c52d1cf
Show file tree

Hide file tree

Showing 36 changed files with 1,175 additions and 72 deletions.
diff --git a/05-17-10-23-13.txt b/05-17-10-23-13.txt
@@ -0,0 +1 @@
+这是由阿里巴巴达摩院语音实验室提供的实时语音识别技术。
diff --git a/...xample/src/main/java/com/alibaba/cloud/ai/example/tongyi/controller/TongYiController.java b/...xample/src/main/java/com/alibaba/cloud/ai/example/tongyi/controller/TongYiController.java
@@ -22,11 +22,13 @@
 import com.alibaba.cloud.ai.example.tongyi.models.ActorsFilms;
 import com.alibaba.cloud.ai.example.tongyi.models.Completion;
 import com.alibaba.cloud.ai.example.tongyi.service.TongYiService;
+import com.alibaba.dashscope.audio.asr.transcription.TranscriptionParam;
 
 import org.springframework.ai.chat.messages.AssistantMessage;
 import org.springframework.ai.image.ImageResponse;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.core.io.Resource;
 import org.springframework.web.bind.annotation.CrossOrigin;
 import org.springframework.web.bind.annotation.GetMapping;
 import org.springframework.web.bind.annotation.RequestMapping;
@@ -138,6 +140,24 @@ public String genAudio(@RequestParam(value = "prompt",
 		return tongYiAudioService.genAudio(prompt);
 	}
 
+	@Autowired
+	@Qualifier("tongYiAudioTranscriptionServiceImpl")
+	private TongYiService tongYiAudioTranscriptionService;
+
+	/**
+	 * Audio transcription endpoint: hands the audio url over to the transcription
+	 * service and returns whatever string that service produces.
+	 * Related types: {@link Resource}, {@link TranscriptionParam}.
+	 * @param url audio url, bound from the {@code audioUrls} request parameter; the
+	 * default value is a DashScope sample wav url.
+	 * @return the string returned by the transcription service.
+	 */
+	@GetMapping("/audio/transcription")
+	public String audioTranscription(
+			@RequestParam(value = "audioUrls",
+					defaultValue = "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/realtime_asr_example.wav")
+			String url) {
+
+		final String transcriptionOutput = tongYiAudioTranscriptionService.audioTranscription(url);
+		return transcriptionOutput;
+	}
+
 	@Autowired
 	@Qualifier("tongYiTextEmbeddingServiceImpl")
 	private TongYiService tongYiTextEmbeddingService;

diff --git a/.../src/main/java/com/alibaba/cloud/ai/example/tongyi/service/AbstractTongYiServiceImpl.java b/.../src/main/java/com/alibaba/cloud/ai/example/tongyi/service/AbstractTongYiServiceImpl.java
@@ -98,4 +98,11 @@ public List<Double> textEmbedding(String text) {
 				.getStackTrace()[2].getMethodName() + INFO_SUFFIX);
 	}
 
+	/**
+	 * Default implementation of the audio transcription operation: always fails.
+	 *
+	 * <p>The exception message is {@code INFO_PREFIX}, followed by the method name read
+	 * from index 2 of the current thread's stack trace, followed by {@code INFO_SUFFIX}.
+	 * NOTE(review): index 2 is presumably the caller of this method; the lookup depends
+	 * on the call depth, so confirm the index before moving this code into a helper.
+	 * @param url url of the audio file; not used by this implementation.
+	 * @return never returns normally.
+	 * @throws RuntimeException always.
+	 */
+	@Override
+	public String audioTranscription(String url) {
+
+		throw new RuntimeException(INFO_PREFIX + Thread.currentThread()
+				.getStackTrace()[2].getMethodName() + INFO_SUFFIX);
+	}
+
 }
diff --git a/...d-ai-example/src/main/java/com/alibaba/cloud/ai/example/tongyi/service/TongYiService.java b/...d-ai-example/src/main/java/com/alibaba/cloud/ai/example/tongyi/service/TongYiService.java
@@ -100,6 +100,13 @@ public interface TongYiService {
 	 */
 	String genAudio(String text);
 
+	/**
+	 * TongYI LLM audio transcription.
+	 * @param audioUrls url of the audio file to be transcribed.
+	 * @return path of the file that holds the transcription result.
+	 */
+	String audioTranscription(String audioUrls);
+
 	/**
 	 * TongYI LLM Text embedding.
 	 * @param text input text.

diff --git a/...ample/tongyi/service/impl/audio/README.md → ...ongyi/service/impl/audio/speech/README.md b/...ample/tongyi/service/impl/audio/README.md → ...ongyi/service/impl/audio/speech/README.md
diff --git a/...l/audio/TongYiAudioSimpleServiceImpl.java → .../speech/TongYiAudioSimpleServiceImpl.java b/...l/audio/TongYiAudioSimpleServiceImpl.java → .../speech/TongYiAudioSimpleServiceImpl.java
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-package com.alibaba.cloud.ai.example.tongyi.service.impl.audio;
+package com.alibaba.cloud.ai.example.tongyi.service.impl.audio.speech;
 
 import java.io.File;
 import java.io.FileOutputStream;
@@ -24,7 +24,7 @@
 
 import com.alibaba.cloud.ai.example.tongyi.service.AbstractTongYiServiceImpl;
 import com.alibaba.cloud.ai.example.tongyi.service.TongYiService;
-import com.alibaba.cloud.ai.tongyi.audio.api.SpeechClient;
+import com.alibaba.cloud.ai.tongyi.audio.speech.api.SpeechClient;
 import com.alibaba.dashscope.audio.tts.SpeechSynthesisAudioFormat;
 import lombok.extern.slf4j.Slf4j;
 import org.slf4j.Logger;

diff --git a/.../com/alibaba/cloud/ai/example/tongyi/service/impl/audio/transcription/README.md b/.../com/alibaba/cloud/ai/example/tongyi/service/impl/audio/transcription/README.md
@@ -0,0 +1,25 @@
+# Spring Cloud Alibaba AI Audio Transcription
+
+`TongYiController` 接受一个 HTTP GET 请求 `http://localhost:8080/ai/audio/transcription`
+`controller` 将会调用 `TongYiService` 中的 `audioTranscription` 方法，完成服务请求得到响应。
+
+可设置 `audioUrls` 请求参数，提供一个或多个需要进行语音识别的音视频文件的 URL。
+
+## 构建和运行
+
+1. 修改配置文件 `application.yml` 中的 apikey 为有效的 apikey；
+2. 通过 IDE 或者 `./mvnw spring-boot:run` 运行应用程序。
+
+## 访问接口
+
+使用 curl 工具对接口发起请求：
+
+```shell
+$ curl -X GET "http://localhost:8080/ai/audio/transcription?audioUrls=url1&audioUrls=url2"
+
+# Response:
+D:\Code\spring-cloud-alibaba\05-13-20-47-08.txt
+D:\Code\spring-cloud-alibaba\05-13-20-47-09.txt
+```
+
+返回参数为保存到当前根路径下的音频转录文本文件的路径。
diff --git a/.../example/tongyi/service/impl/audio/transcription/TongYiAudioTranscriptionServiceImpl.java b/.../example/tongyi/service/impl/audio/transcription/TongYiAudioTranscriptionServiceImpl.java
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2023-2024 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.alibaba.cloud.ai.example.tongyi.service.impl.audio.transcription;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.List;
+
+import com.alibaba.cloud.ai.example.tongyi.service.AbstractTongYiServiceImpl;
+import com.alibaba.cloud.ai.example.tongyi.service.TongYiService;
+import com.alibaba.cloud.ai.tongyi.audio.transcription.TongYiAudioTranscriptionClient;
+import com.alibaba.cloud.ai.tongyi.audio.transcription.api.AudioTranscriptionPrompt;
+import com.alibaba.cloud.ai.tongyi.audio.transcription.api.AudioTranscriptionResult;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+import lombok.extern.slf4j.Slf4j;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.core.io.Resource;
+import org.springframework.core.io.UrlResource;
+import org.springframework.stereotype.Service;
+
+/**
+ * Example service that transcribes an audio resource with the TongYi audio
+ * transcription client and stores the recognised text in files under the working
+ * directory. Only {@link TongYiService#audioTranscription(String)} is overridden.
+ *
+ * @author xYLiu
+ * @since 2024/5/15 14:55
+ */
+@Slf4j
+@Service
+public class TongYiAudioTranscriptionServiceImpl extends AbstractTongYiServiceImpl {
+
+	private static final Logger logger = LoggerFactory.getLogger(TongYiAudioTranscriptionServiceImpl.class);
+
+	/**
+	 * Timestamp pattern used for the output file names. DateTimeFormatter is
+	 * immutable and thread-safe, so a single shared instance is enough.
+	 */
+	private static final DateTimeFormatter FILE_NAME_FORMATTER = DateTimeFormatter.ofPattern("MM-dd-HH-mm-ss");
+
+	private final TongYiAudioTranscriptionClient audioTranscriptionClient;
+
+	@Autowired
+	public TongYiAudioTranscriptionServiceImpl(final TongYiAudioTranscriptionClient audioTranscriptionClient) {
+		this.audioTranscriptionClient = audioTranscriptionClient;
+	}
+
+	/**
+	 * Wraps the url in a {@link UrlResource}, sends it to the transcription client
+	 * and saves the text of every returned result to a file.
+	 * @param audioUrls url of the audio file to be transcribed.
+	 * @return the paths of the files that were written, each followed by a line feed.
+	 * @throws RuntimeException if no resource can be created from the url.
+	 */
+	@Override
+	public String audioTranscription(String audioUrls) {
+
+		Resource resource;
+
+		try {
+			resource = new UrlResource(audioUrls);
+		}
+		catch (IOException e) {
+			logger.error("Failed to create resource.", e);
+			throw new RuntimeException(e);
+		}
+		AudioTranscriptionPrompt audioTranscriptionPrompt = new AudioTranscriptionPrompt(resource);
+
+		return save(audioTranscriptionClient.call(audioTranscriptionPrompt).getResults());
+	}
+
+	/**
+	 * Downloads the transcript behind every result and writes it to its own file in
+	 * the working directory. A result that cannot be downloaded is logged and
+	 * skipped, so only files that really exist are reported.
+	 * @param resultList transcription results whose output is a url of a JSON document.
+	 * @return the paths of the written files, each followed by a line feed.
+	 */
+	private String save(List<AudioTranscriptionResult> resultList) {
+		String currentPath = System.getProperty("user.dir");
+		StringBuilder retPaths = new StringBuilder();
+		for (AudioTranscriptionResult audioTranscriptionResult : resultList) {
+			try {
+				String text = downloadTranscript(audioTranscriptionResult.getOutput());
+				if (text == null) {
+					// The reason has already been logged by downloadTranscript.
+					continue;
+				}
+				// The file is opened only once the text is complete, so a failed
+				// download never leaves an empty file behind.
+				String fileName = nextFileName(currentPath);
+				try (FileOutputStream fileOutputStream = new FileOutputStream(fileName)) {
+					fileOutputStream.write(text.getBytes(StandardCharsets.UTF_8));
+				}
+				retPaths.append(fileName).append("\n");
+				logger.info("File downloaded successfully：{}\n", fileName);
+			}
+			catch (IOException e) {
+				logger.error("An error occurred during the file download process.", e);
+			}
+		}
+		return retPaths.toString();
+	}
+
+	/**
+	 * Fetches the JSON document at the given url and concatenates the {@code text}
+	 * member of every element of its {@code transcripts} array.
+	 * @param resultUrl url of the transcription result document.
+	 * @return the concatenated text, or {@code null} when the response code is not
+	 * 200 or the document has no {@code transcripts} member.
+	 * @throws IOException if the connection or the read fails.
+	 */
+	private String downloadTranscript(String resultUrl) throws IOException {
+		HttpURLConnection connection = (HttpURLConnection) new URL(resultUrl).openConnection();
+		try {
+			connection.setRequestMethod("GET");
+			int responseCode = connection.getResponseCode();
+			if (responseCode != HttpURLConnection.HTTP_OK) {
+				logger.error("The download failed, and the response code：{}",
+						responseCode);
+				return null;
+			}
+			String body;
+			try (BufferedInputStream in = new BufferedInputStream(connection.getInputStream())) {
+				// Decode the complete payload in one step: decoding fixed-size chunks
+				// breaks multi-byte characters that straddle a chunk boundary.
+				body = new String(in.readAllBytes(), StandardCharsets.UTF_8);
+			}
+			JsonObject rootObj = JsonParser.parseString(body).getAsJsonObject();
+			JsonArray transcriptsArray = rootObj.getAsJsonArray("transcripts");
+			if (transcriptsArray == null) {
+				logger.error("The download failed, the result contains no transcripts.");
+				return null;
+			}
+			StringBuilder text = new StringBuilder();
+			for (var transcriptElement : transcriptsArray) {
+				JsonObject transcriptObj = transcriptElement.getAsJsonObject();
+				text.append(transcriptObj.get("text").getAsString());
+			}
+			return text.toString();
+		}
+		finally {
+			// Release the connection on every path, including failures.
+			connection.disconnect();
+		}
+	}
+
+	/**
+	 * Builds a timestamp based file name inside the given directory. The timestamp
+	 * has a resolution of one second, so a numeric suffix is appended when the name
+	 * is already taken; otherwise results saved within the same second would
+	 * overwrite each other.
+	 * @param directory directory the file is going to be created in.
+	 * @return a path that does not exist yet.
+	 */
+	private static String nextFileName(String directory) {
+		String base = directory + File.separator + LocalDateTime.now().format(FILE_NAME_FORMATTER);
+		String candidate = base + ".txt";
+		for (int suffix = 1; new File(candidate).exists(); suffix++) {
+			candidate = base + "-" + suffix + ".txt";
+		}
+		return candidate;
+	}
+}
diff --git a/...ud-alibaba-examples/ai-example/spring-cloud-ai-example/src/main/resources/application.yml b/...ud-alibaba-examples/ai-example/spring-cloud-ai-example/src/main/resources/application.yml
@@ -21,7 +21,6 @@ spring:
   application:
     name: tongyi-example
 
-  cloud:
-    ai:
-      tongyi:
-        api-key: sk-0e6c387446ff45d0924111475a82462e
+# Please set the api-key; providing it through an environment variable is recommended.
+# Note: the api-key shown below is invalid, please apply for your own.
+# export SPRING_CLOUD_AI_TONGYI_API_KEY=sk-a3d73b1709bf4a178c28ed7c8b3b5a345
diff --git a/...n/java/com/alibaba/cloud/ai/example/tongyi/service/impl/textembedding/README.md b/...n/java/com/alibaba/cloud/ai/example/tongyi/service/impl/textembedding/README.md
diff --git a/...starter-alibaba-ai/src/main/java/com/alibaba/cloud/ai/tongyi/TongYiAutoConfiguration.java b/...starter-alibaba-ai/src/main/java/com/alibaba/cloud/ai/tongyi/TongYiAutoConfiguration.java
@@ -18,8 +18,10 @@
 
 import java.util.Objects;
 
-import com.alibaba.cloud.ai.tongyi.audio.TongYiAudioSpeechClient;
-import com.alibaba.cloud.ai.tongyi.audio.TongYiAudioSpeechProperties;
+import com.alibaba.cloud.ai.tongyi.audio.speech.TongYiAudioSpeechClient;
+import com.alibaba.cloud.ai.tongyi.audio.speech.TongYiAudioSpeechProperties;
+import com.alibaba.cloud.ai.tongyi.audio.transcription.TongYiAudioTranscriptionClient;
+import com.alibaba.cloud.ai.tongyi.audio.transcription.TongYiAudioTranscriptionProperties;
 import com.alibaba.cloud.ai.tongyi.chat.TongYiChatClient;
 import com.alibaba.cloud.ai.tongyi.chat.TongYiChatProperties;
 import com.alibaba.cloud.ai.tongyi.constants.TongYiConstants;
@@ -30,6 +32,7 @@
 import com.alibaba.cloud.ai.tongyi.image.TongYiImagesProperties;
 import com.alibaba.dashscope.aigc.generation.Generation;
 import com.alibaba.dashscope.aigc.imagesynthesis.ImageSynthesis;
+import com.alibaba.dashscope.audio.asr.transcription.Transcription;
 import com.alibaba.dashscope.audio.tts.SpeechSynthesizer;
 import com.alibaba.dashscope.common.MessageManager;
 import com.alibaba.dashscope.embeddings.TextEmbedding;
@@ -58,14 +61,16 @@
 		MessageManager.class,
 		TongYiChatClient.class,
 		TongYiImagesClient.class,
-		TongYiAudioSpeechClient.class
+		TongYiAudioSpeechClient.class,
+		TongYiAudioTranscriptionClient.class
 })
 @EnableConfigurationProperties({
 		TongYiChatProperties.class,
 		TongYiImagesProperties.class,
 		TongYiAudioSpeechProperties.class,
 		TongYiConnectionProperties.class,
-		TongYiTextEmbeddingProperties.class
+		TongYiTextEmbeddingProperties.class,
+		TongYiAudioTranscriptionProperties.class
 })
 public class TongYiAutoConfiguration {
 
@@ -101,6 +106,13 @@ public SpeechSynthesizer speechSynthesizer() {
 		return new SpeechSynthesizer();
 	}
 
+	/**
+	 * Provides a default DashScope {@link Transcription} bean. Because of
+	 * {@code @ConditionalOnMissingBean} it is only registered when the application
+	 * has not defined one itself.
+	 * @return a new {@link Transcription} instance.
+	 */
+	@Bean
+	@ConditionalOnMissingBean
+	public Transcription transcription() {
+
+		return new Transcription();
+	}
+
 	@Bean
 	@ConditionalOnMissingBean
 	public TextEmbedding textEmbedding() {
@@ -173,19 +185,37 @@ public TongYiAudioSpeechClient tongYiAudioSpeechClient(
 
+	/**
+	 * Creates the {@link TongYiAudioTranscriptionClient}. The bean is registered when
+	 * the {@code enabled} property under
+	 * {@code TongYiAudioTranscriptionProperties.CONFIG_PREFIX} is {@code true} or is
+	 * not set at all. {@code settingApiKey} is called with the connection properties
+	 * before the client is built.
+	 * @param transcription transcription instance handed to the client.
+	 * @param transcriptionProperties properties whose options are handed to the client.
+	 * @param connectionProperties connection properties passed to {@code settingApiKey}.
+	 * @return the audio transcription client.
+	 */
 	@Bean
 	@ConditionalOnProperty(
-			prefix = TongYiAudioSpeechProperties.CONFIG_PREFIX,
+			prefix = TongYiAudioTranscriptionProperties.CONFIG_PREFIX,
+			name = "enabled",
+			havingValue = "true",
+			matchIfMissing = true
+	)
+	public TongYiAudioTranscriptionClient tongYiAudioTranscriptionClient(
+			Transcription transcription,
+			TongYiAudioTranscriptionProperties transcriptionProperties,
+			TongYiConnectionProperties connectionProperties) {
+
+		settingApiKey(connectionProperties);
+
+		return new TongYiAudioTranscriptionClient(
+				transcriptionProperties.getOptions(),
+				transcription
+		);
+	}
+
+	@Bean
+	@ConditionalOnProperty(
+			prefix = TongYiTextEmbeddingProperties.CONFIG_PREFIX,
 			name = "enabled",
 			havingValue = "true",
 			matchIfMissing = true
 	)
 	public TongYiTextEmbeddingClient tongYiTextEmbeddingClient(
 			TextEmbedding textEmbedding,
-			TongYiTextEmbeddingProperties textEmbeddingProperties,
 			TongYiConnectionProperties connectionProperties
 	) {
 
 		settingApiKey(connectionProperties);
-
 		return new TongYiTextEmbeddingClient(textEmbedding);
 	}
 

diff --git a/...ba/cloud/ai/tongyi/audio/AudioModels.java → ...ud/ai/tongyi/audio/AudioSpeechModels.java b/...ba/cloud/ai/tongyi/audio/AudioModels.java → ...ud/ai/tongyi/audio/AudioSpeechModels.java
@@ -25,9 +25,9 @@
  * @since 2023.0.0.0-RC1
  */
 
-public final class AudioModels {
+public final class AudioSpeechModels {
 
-	private AudioModels() {
+	private AudioSpeechModels() {
 	}
 
 	/**
@@ -37,8 +37,4 @@ private AudioModels() {
 	 */
 	public static final String SAMBERT_ZHICHU_V1 = "sambert-zhichu-v1";
 
-
-
 }
-
-