diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..44dbcd6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +.idea/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..cd519f1 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,28 @@ +sudo: required +language: generic +compiler: + - gcc +notifications: + email: + on_success: change + on_failure: always + recipients: + - ros-contributions@amazon.com +env: + matrix: + - ROS_DISTRO="kinetic" ROS_REPOSITORY_PATH=http://packages.ros.org/ros/ubuntu + - ROS_DISTRO="kinetic" ROS_REPOSITORY_PATH=http://packages.ros.org/ros-shadow-fixed/ubuntu + - ROS_DISTRO="lunar" ROS_REPOSITORY_PATH=http://packages.ros.org/ros/ubuntu + - ROS_DISTRO="lunar" ROS_REPOSITORY_PATH=http://packages.ros.org/ros-shadow-fixed/ubuntu + - ROS_DISTRO="melodic" ROS_REPOSITORY_PATH=http://packages.ros.org/ros/ubuntu + - ROS_DISTRO="melodic" ROS_REPOSITORY_PATH=http://packages.ros.org/ros-shadow-fixed/ubuntu +matrix: + allow_failures: + - env: ROS_DISTRO="lunar" ROS_REPOSITORY_PATH=http://packages.ros.org/ros/ubuntu + - env: ROS_DISTRO="lunar" ROS_REPOSITORY_PATH=http://packages.ros.org/ros-shadow-fixed/ubuntu + - env: ROS_DISTRO="melodic" ROS_REPOSITORY_PATH=http://packages.ros.org/ros/ubuntu + - env: ROS_DISTRO="melodic" ROS_REPOSITORY_PATH=http://packages.ros.org/ros-shadow-fixed/ubuntu +install: + - git clone https://github.com/ros-industrial/industrial_ci.git .ros_ci +script: + - .ros_ci/travis.sh diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index 3b64466..0000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,4 +0,0 @@ -## Code of Conduct -This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). -For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact -opensource-codeofconduct@amazon.com with any additional questions or comments. 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 43b98b6..0000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,61 +0,0 @@ -# Contributing Guidelines - -Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional -documentation, we greatly value feedback and contributions from our community. - -Please read through this document before submitting any issues or pull requests to ensure we have all the necessary -information to effectively respond to your bug report or contribution. - - -## Reporting Bugs/Feature Requests - -We welcome you to use the GitHub issue tracker to report bugs or suggest features. - -When filing an issue, please check [existing open](https://github.com/aws/aws-ros-tts-ros1/issues), or [recently closed](https://github.com/aws/aws-ros-tts-ros1/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already -reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: - -* A reproducible test case or series of steps -* The version of our code being used -* Any modifications you've made relevant to the bug -* Anything unusual about your environment or deployment - - -## Contributing via Pull Requests -Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: - -1. You are working against the latest source on the *master* branch. -2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. -3. You open an issue to discuss any significant work - we would hate for your time to be wasted. - -To send us a pull request, please: - -1. Fork the repository. -2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. -3. Ensure local tests pass. -4. 
Commit to your fork using clear commit messages. -5. Send us a pull request, answering any default questions in the pull request interface. -6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. - -GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and -[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). - - -## Finding contributions to work on -Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-ros-tts-ros1/labels/help%20wanted) issues is a great place to start. - - -## Code of Conduct -This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). -For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact -opensource-codeofconduct@amazon.com with any additional questions or comments. - - -## Security issue notifications -If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. - - -## Licensing - -See the [LICENSE](https://github.com/aws/aws-ros-tts-ros1/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. - -We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. diff --git a/NOTICE b/NOTICE deleted file mode 100644 index 14c7012..0000000 --- a/NOTICE +++ /dev/null @@ -1,2 +0,0 @@ -AWS Ros Tts Ros1 -Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
diff --git a/README.md b/README.md index 1bb2e76..37392d7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,246 @@ -## AWS Ros Tts Ros1 +# tts -ROS packages for facilitating text-to-speech and the use of Amazon Polly. -## License +## Overview +The `tts` ROS node enables a robot to speak with a human voice by providing a Text-To-Speech service. +Out of the box this package listens to a speech topic, submits text to the Amazon Polly cloud service to generate an audio stream file, +retrieves the audio stream from Amazon Polly, and plays the audio stream via the default output device. +The nodes can be configured to use different voices as well as custom lexicons and SSML tags which enable you to control aspects of speech, +such as pronunciation, volume, pitch, speed rate, etc. A [sample ROS application] that uses this node is available, +and more details on speech customization can be found in the [Amazon Polly documentation]. -This library is licensed under the Apache 2.0 License. +**Amazon Polly Summary**: Amazon Polly is a service that turns text into lifelike speech, allowing you to create applications that talk, +and build entirely new categories of speech-enabled products. Amazon Polly is a Text-to-Speech service that uses advanced deep learning technologies to synthesize speech that sounds like a human voice. +With dozens of lifelike voices across a variety of languages, you can select the ideal voice and build speech-enabled applications that work in many different countries. + +**Features in Active Development**: +- Offline TTS + +### License +The source code is released under the [Apache 2.0] license. + +**Author**: AWS RoboMaker
+**Affiliation**: [Amazon Web Services (AWS)]
+**Maintainer**: AWS RoboMaker, ros-contributions@amazon.com + +### Supported ROS Distributions +- Kinetic +- Lunar +- Melodic + + +## Installation + +### AWS Credentials +You will need to create an AWS Account and configure the credentials to be able to communicate with AWS services. You may find [AWS Configuration and Credential Files] helpful. + +This node will require the following AWS account IAM role permissions: +- `polly:SynthesizeSpeech` + +### Build and Test + +#### Build from Source + +If you test this package on versions of Ubuntu older than 18.x (16.04 for example), please upgrade `mock` to the latest version by using `pip`. + + sudo apt-get install python-pip + pip install -U mock requests + +Create a ROS workspace and a source directory + + mkdir -p ~/ros-workspace/src + +To install from source, clone the latest version from master branch and compile the package + +- Clone the package into the source directory + + cd ~/ros-workspace/src + git clone https://github.com/aws-robotics/tts-ros1.git + +- Install dependencies + + cd ~/ros-workspace && sudo apt-get update + rosdep install --from-paths src --ignore-src -r -y + +- Install the packages + + cd ~/ros-workspace && colcon build + +- Configure ROS library Path + + source ~/ros-workspace/install/setup.bash + +- Build and run the unit tests + + colcon test --packages-select tts && colcon test-result --all + +#### Test on Containers/Virtual Machines + +Even if your container or virtual machine does not have an audio device, you can still test TTS by leveraging an audio server. + +The following is an example setup on a MacBook with PulseAudio as the audio server. +If you are new to PulseAudio, you may want to read the [PulseAudio Documentation]. 
+ +**Step 1: Start PulseAudio on your laptop** + +After installation, start the audio server with *module-native-protocol-tcp* loaded: + + pulseaudio --load=module-native-protocol-tcp --exit-idle-time=-1 --log-target=stderr -v + +Note the extra arguments `-v` and `--log-target` are used for easier troubleshooting. + +**Step 2: Run TTS nodes in container** + +In your container, make sure you set the right environment variables. +For example, you can start the container using `docker run -it -e PULSE_SERVER=docker.for.mac.localhost ubuntu:16.04`. + +Then you will be able to run ROS nodes in the container and hear the audio from your laptop speakers. + +**Troubleshooting** + +If your laptop has multiple audio output devices, make sure the right one has the right volume. +This command will give you a list of output devices and tell you which one has been selected: + + pacmd list-sinks | grep -E '(index:|name:|product.name)' + +## Launch Files +An example launch file called `sample_application.launch` is provided. + + +## Usage + +### Run the node +- **Plain text** + - `roslaunch tts sample_application.launch` + - `rosrun tts voicer.py 'Hello World'` + +- **SSML** + - `roslaunch tts sample_application.launch` + - `rosrun tts voicer.py 'Mary has a little lamb.' '{"text_type":"ssml"}'` + + +## Configuration File and Parameters +| Parameter Name | Type | Description | +| -------------- | ---- | ----------- | +| polly_action | *string* | Currently only one action named `SynthesizeSpeech` is supported. | +| text | *string* | The text to be synthesized. It can be plain text or SSML. See also `text_type`. | +| text_type | *string* | A user can choose from `text` and `ssml`. Default: `text`. | +| voice_id | *string* | The list of supported voices can be found on [official Amazon Polly document]. Default: Joanna | +| output_format | *string* | Valid formats are `ogg_vorbis`, `mp3` and `pcm`. 
Default: `ogg_vorbis` | +| output_path | *string* | The audio data will be saved as a local file for playback and reuse/inspection purposes. This parameter is to provide a preferred path to save the file. Default: `.` | +| sample_rate | *string* | Note `16000` is a valid sample rate for all supported formats. Default: `16000`. | + + +## Performance and Benchmark Results +We evaluated the performance of this node by running the following scenario on a Raspberry Pi 3 Model B: +- Launch a baseline graph containing the talker and listener nodes from the [roscpp_tutorials package](https://wiki.ros.org/roscpp_tutorials), plus two additional nodes that collect CPU and memory usage statistics. Allow the nodes to run for 60 seconds. +- Launch the nodes `polly_node`, `synthesizer_node` and `tts_node` by using the launch file `sample_application.launch` as described above. At the same time, perform several calls to the action `tts/action/Speech.action` using the `voicer.py` script described above, by running the following script in the background: + +```bash +rosrun tts voicer.py 'Amazon Polly is a Text-to-Speech (TTS) cloud service' '{"text_type":"ssml"}' ; sleep 1 +rosrun tts voicer.py 'that converts text into lifelike speech' '{"text_type":"ssml"}' ; sleep 1 +rosrun tts voicer.py 'You can use Amazon Polly to develop applications that increase engagement and accessibility' '{"text_type":"ssml"}' ; sleep 1 +rosrun tts voicer.py 'Amazon Polly supports multiple languages and includes a variety of lifelike voices' '{"text_type":"ssml"}' ; sleep 1 +rosrun tts voicer.py 'so you can build speech-enabled applications that work in multiple locations' '{"text_type":"ssml"}' ; sleep 1 +rosrun tts voicer.py 'and use the ideal voice for your customers' '{"text_type":"ssml"}' ; sleep 1 +``` + +- Allow the nodes to run for 180 seconds. +- Terminate the `polly_node`, `synthesizer_node` and `tts_node` nodes, and allow the remaining nodes to run for 60 seconds. 
+ +The following graph shows the CPU usage during that scenario. The 1 minute average CPU usage starts at 16.75% during the launch of the baseline graph, and stabilizes at 6%. When we launch the Polly nodes around second 85, the 1 minute average CPU increases up to a peak of 22.25% and stabilizes around 20%. After we stop making requests with the script `voicer.py` around second 206 the 1 minute average CPU usage moves to around 12%, and decreases gradually, and goes down again to 2.5% after we stop the Polly nodes at the end of the scenario. + +![cpu](wiki/images/cpu.svg) + +The following graph shows the memory usage during that scenario. We start with a memory usage of around 227 MB that increases to around 335 MB (+47.58%) when we launch the Polly nodes around second 85, and gets to a peak of 361 MB (+59% relative to the initial value) while we are calling the script `voicer.py`. The memory usage goes back to the initial values after stopping the Polly nodes. + +![memory](wiki/images/memory.svg) + + +## Nodes + +### polly +Polly node is the engine for the synthesizing job. It provides user-friendly yet powerful APIs so a user doesn't have to deal with technical details of AWS service calls. + +#### Services +- **`polly (tts/Polly)`** + + Call the service to use Amazon Polly to synthesize the audio. + +#### Reserved for future usage +- `language_code (string, default: None)` + + A user doesn't have to provide a language code and this is reserved for future usage. 
+ +- `lexicon_content (string, default: None)` + +- `lexicon_name (string, default: None)` + +- `lexicon_names (string[], default: empty)` + +- `speech_mark_types (string[], default: empty)` + +- `max_results (uint32, default: None)` + +- `next_token (string, default: None)` + +- `sns_topic_arn (string, default: None)` + +- `task_id (string, default: None)` + +- `task_status (string, default: None)` + +- `output_s3_bucket_name (string, default: None)` + +- `output_s3_key_prefix (string, default: None)` + +- `include_additional_language_codes (bool, default: None)` + +### synthesizer node + +#### Services +- **`synthesizer (tts/Synthesizer)`** + + Call the service to synthesize. + +#### Parameters + +- **`text (string)`** + + The text to be synthesized. + +- **`metadata (string, JSON format)`** + + Optional, for user to have control over how synthesis happens. + +### tts node + +#### Action + +- **`speech`** + +#### Parameters + +- **`text (string)`** + + The text to be synthesized. + +- **`metadata (string, JSON format)`** + + Optional, for user to have control over how synthesis happens. + + +## Bugs & Feature Requests +Please contact the team directly if you would like to request a feature. + +Please report bugs in the [Issue Tracker]. 
+ + +[AWS Configuration and Credential Files]: https://docs.aws.amazon.com/cli/latest/userguide/cli-config-files.html +[Amazon Polly documentation]: https://docs.aws.amazon.com/polly/latest/dg/what-is.html +[Amazon Web Services (AWS)]: https://aws.amazon.com/ +[Apache 2.0]: https://aws.amazon.com/apache-2-0/ +[Issue Tracker]: https://github.com/aws-robotics/tts-ros1/issues +[PulseAudio Documentation]: https://www.freedesktop.org/wiki/Software/PulseAudio/Documentation/ +[official Amazon Polly document]: https://docs.aws.amazon.com/polly/latest/dg/voicelist.html +[sample ROS application]: https://github.com/aws-robotics/aws-robomaker-sample-application-voiceinteraction diff --git a/tts/CMakeLists.txt b/tts/CMakeLists.txt new file mode 100644 index 0000000..e5a05e9 --- /dev/null +++ b/tts/CMakeLists.txt @@ -0,0 +1,71 @@ +cmake_minimum_required(VERSION 2.8.3) +project(tts) + +find_package(catkin REQUIRED COMPONENTS actionlib_msgs message_generation rospy rosunit rostest std_msgs sound_play) + +catkin_python_setup() + +################################################ +## Declare ROS messages, services and actions ## +################################################ + +## Generate services in the 'srv' folder +add_service_files(FILES Synthesizer.srv Polly.srv) + +## Generate actions in the 'action' folder +add_action_files(FILES Speech.action) + +## Generate added messages and services with any dependencies listed here +generate_messages(DEPENDENCIES actionlib_msgs std_msgs) + +################################### +## catkin specific configuration ## +################################### +## The catkin_package macro generates cmake config files for your package +## Declare things to be passed to dependent projects +## LIBRARIES: libraries you create in this project that dependent projects also need +## CATKIN_DEPENDS: catkin_packages dependent projects also need +## DEPENDS: system dependencies of this project that dependent projects also need +catkin_package( + LIBRARIES 
tts + CATKIN_DEPENDS actionlib_msgs message_runtime rospy std_msgs +) + +############# +## Install ## +############# + +# all install targets should use catkin DESTINATION variables +# See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html + +## Mark executable scripts (Python etc.) for installation +## in contrast to setup.py, you can choose the destination +install(PROGRAMS + scripts/polly_node.py + scripts/synthesizer_node.py + scripts/tts_node.py + scripts/voicer.py + DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} +) + +install(DIRECTORY + config + launch + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} +) + +############# +## Testing ## +############# +if(CATKIN_ENABLE_TESTING) + ## Add folders to be run by python nosetests + catkin_add_nosetests(test/test_unit_synthesizer.py) + catkin_add_nosetests(test/test_unit_polly.py) + + if(BUILD_AWS_TESTING) + find_package(rostest REQUIRED COMPONENTS tts) + add_rostest(test/integration_tests.test DEPENDENCIES ${tts_EXPORTED_TARGETS}) + endif() +endif() + + diff --git a/LICENSE b/tts/LICENSE.txt similarity index 99% rename from LICENSE rename to tts/LICENSE.txt index d645695..d8a44c5 100644 --- a/LICENSE +++ b/tts/LICENSE.txt @@ -187,7 +187,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2018 Amazon.com, Inc. or its affiliates Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/tts/NOTICE.txt b/tts/NOTICE.txt new file mode 100644 index 0000000..13033fe --- /dev/null +++ b/tts/NOTICE.txt @@ -0,0 +1,4 @@ +Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +This product includes software developed by +Amazon Technologies, Inc (http://www.amazon.com/). 
diff --git a/tts/action/Speech.action b/tts/action/Speech.action new file mode 100644 index 0000000..2da3ffe --- /dev/null +++ b/tts/action/Speech.action @@ -0,0 +1,9 @@ +#goal definition +string text +string metadata +--- +#result definition +string response +--- +#feedback +string data diff --git a/tts/config/sample_configuration.yaml b/tts/config/sample_configuration.yaml new file mode 100644 index 0000000..4c74e77 --- /dev/null +++ b/tts/config/sample_configuration.yaml @@ -0,0 +1,6 @@ +# This is the AWS Client Configuration used by the AWS service client in the Node. If given the node will load the +# provided configuration when initializing the client. +aws_client_configuration: + # Specifies where you want the client to communicate. Examples include us-east-1 or us-west-1. You must ensure that + # the service you want to use has an endpoint in the region you configure. + region: "us-west-2" diff --git a/tts/launch/sample_application.launch b/tts/launch/sample_application.launch new file mode 100644 index 0000000..42137f3 --- /dev/null +++ b/tts/launch/sample_application.launch @@ -0,0 +1,13 @@ + + + + + + + + + + + + diff --git a/tts/launch/tts_polly.launch b/tts/launch/tts_polly.launch new file mode 100644 index 0000000..0e6fcec --- /dev/null +++ b/tts/launch/tts_polly.launch @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tts/package.xml b/tts/package.xml new file mode 100644 index 0000000..1e3d80f --- /dev/null +++ b/tts/package.xml @@ -0,0 +1,38 @@ + + + tts + 1.0.0 + Package enabling a robot to speak with a human voice by providing a Text-To-Speech ROS service + http://wiki.ros.org/tts + + AWS RoboMaker + AWS RoboMaker + + Apache 2.0 + + catkin + + actionlib_msgs + message_generation + rospy + std_msgs + python-boto3 + sound_play + rosunit + rostest + + actionlib_msgs + rospy + std_msgs + sound_play + + actionlib_msgs + rospy + std_msgs + message_runtime + python-boto3 + sound_play + + rosunit + rostest + diff --git 
a/tts/scripts/polly_node.py b/tts/scripts/polly_node.py new file mode 100755 index 0000000..d07dfd3 --- /dev/null +++ b/tts/scripts/polly_node.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. + + +if __name__ == '__main__': + import tts.amazonpolly + tts.amazonpolly.main() diff --git a/tts/scripts/synthesizer_node.py b/tts/scripts/synthesizer_node.py new file mode 100755 index 0000000..937870b --- /dev/null +++ b/tts/scripts/synthesizer_node.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. + + +if __name__ == "__main__": + import tts.synthesizer + tts.synthesizer.main() diff --git a/tts/scripts/tts_node.py b/tts/scripts/tts_node.py new file mode 100755 index 0000000..b8bdd6a --- /dev/null +++ b/tts/scripts/tts_node.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. + +"""A very simple Action Server that does TTS. + +It is a combination of a synthesizer and a player. Being an action server, it can be used in two different manners. + +1. Play and wait for it to finish +--------------------------------- + +A user can choose to be blocked until the audio playing is done. This is especially useful in interactive scenarios. + +Example:: + + rospy.init_node('tts_action_client') + client = actionlib.SimpleActionClient('tts', SpeechAction) + client.wait_for_server() + goal = SpeechGoal() + goal.text = 'Let me ask you a question, please give me your answer.' + client.send_goal(goal) + client.wait_for_result() + + # start listening to a response or waiting for some input to continue the interaction + +2. Play and forget +------------------ + +A user can also choose not to wait:: + + rospy.init_node('tts_action_client') + client = actionlib.SimpleActionClient('tts', SpeechAction) + client.wait_for_server() + goal = SpeechGoal() + goal.text = 'Let me talk, you can to something else in the meanwhile.' + client.send_goal(goal) + +This is useful when the robot wants to do stuff while the audio is being played. For example, a robot may start to +read some instructions and immediately get ready for any input. 
+""" + +import json + +import actionlib +import rospy +from tts.msg import SpeechAction, SpeechResult +from tts.srv import Synthesizer + +from sound_play.libsoundplay import SoundClient + + +def play(filename): + """plays the wav or ogg file using sound_play""" + SoundClient(blocking=True).playWave(filename) + + +def do_synthesize(goal): + """calls synthesizer service to do the job""" + rospy.wait_for_service('synthesizer') + synthesize = rospy.ServiceProxy('synthesizer', Synthesizer) + return synthesize(goal.text, goal.metadata) + + +def finish_with_result(s): + """responds the client""" + tts_server_result = SpeechResult(s) + server.set_succeeded(tts_server_result) + rospy.loginfo(tts_server_result) + + +def do_speak(goal): + """The action handler. + + Note that although it responds to client after the audio play is finished, a client can choose + not to wait by not calling ``SimpleActionClient.waite_for_result()``. + """ + rospy.loginfo('speech goal: {}'.format(goal)) + + res = do_synthesize(goal) + rospy.loginfo('synthesizer returns: {}'.format(res)) + + try: + r = json.loads(res.result) + except Exception as e: + s = 'Expecting JSON from synthesizer but got {}'.format(res.result) + rospy.logerr('{}. Exception: {}'.format(s, e)) + finish_with_result(s) + return + + result = '' + + if 'Audio File' in r: + audio_file = r['Audio File'] + rospy.loginfo('Will play {}'.format(audio_file)) + play(audio_file) + result = audio_file + + if 'Exception' in r: + result = '[ERROR] {}'.format(r) + rospy.logerr(result) + + finish_with_result(result) + + +if __name__ == '__main__': + rospy.init_node('tts_node') + server = actionlib.SimpleActionServer('tts', SpeechAction, do_speak, False) + server.start() + rospy.spin() diff --git a/tts/scripts/voicer.py b/tts/scripts/voicer.py new file mode 100755 index 0000000..9bbd551 --- /dev/null +++ b/tts/scripts/voicer.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. + +"""Usage: + +(assuming TTS action server has been started via `roslaunch tts tts_polly.launch`) + +Plain text:: + + $ rosrun tts voicer.py 'Hello World' + +SSML:: + + $ rosrun tts voicer.py \ + 'Mary has a little lamb.' \ + '{"text_type":"ssml"}' +""" + + +import sys +import actionlib +import rospy +from tts.msg import SpeechAction, SpeechGoal + + +if __name__ == '__main__': + rospy.init_node('tts_action_client') + client = actionlib.SimpleActionClient('tts', SpeechAction) + client.wait_for_server() + + goal = SpeechGoal() + + goal.text = sys.argv[1] if len(sys.argv) > 1 else 'I got no idea.' + goal.metadata = sys.argv[2] if len(sys.argv) > 2 else '' + + client.send_goal(goal) + client.wait_for_result() + print('\n' + client.get_result().response) diff --git a/tts/setup.py b/tts/setup.py new file mode 100644 index 0000000..66098bd --- /dev/null +++ b/tts/setup.py @@ -0,0 +1,33 @@ +# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. 
#!/usr/bin/env python

# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#  http://aws.amazon.com/apache2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

import json
import os
import sys
import wave
import traceback
import requests
from boto3 import Session
from botocore.credentials import CredentialProvider, RefreshableCredentials
from botocore.session import get_session
from botocore.exceptions import UnknownServiceError
from contextlib import closing
from optparse import OptionParser

import rospy
from tts.srv import Polly, PollyRequest, PollyResponse


def get_ros_param(param, default=None):
    """Read a ROS parameter, falling back to ``default`` on any problem.

    The name is first resolved with ``rospy.search_param``; if that finds
    nothing, or if any step raises, ``default`` is returned. This function
    never raises.

    :param param: name of the parameter to look up
    :param default: value to return when the parameter cannot be read
    :return: the parameter value, or ``default``
    """
    try:
        key = rospy.search_param(param)
        return default if key is None else rospy.get_param(key, default)
    except Exception as e:
        # The message needs three placeholders: name, default and the exception itself.
        rospy.logwarn('Failed to get ros param {}, will use default {}. Exception: {}'.format(param, default, e))
        return default


class AwsIotCredentialProvider(CredentialProvider):
    """A botocore credential provider that fetches temporary credentials from an
    AWS IoT credentials endpoint using a client certificate.

    All settings are read from ROS parameters under the ``iot/`` prefix:
    ``certfile``, ``keyfile``, ``endpoint``, ``role``, ``thing_name``,
    ``connect_timeout_ms`` and ``total_timeout_ms``.
    """
    METHOD = 'aws-iot'
    CANONICAL_NAME = 'customIoTwithCertificate'

    DEFAULT_AUTH_CONNECT_TIMEOUT_MS = 5000
    DEFAULT_AUTH_TOTAL_TIMEOUT_MS = 10000

    def __init__(self):
        super(AwsIotCredentialProvider, self).__init__()
        self.ros_param_prefix = 'iot/'

    def get_param(self, param, default=None):
        """Read ``param`` from the ``iot/`` ROS parameter namespace.

        :param param: parameter name without the prefix
        :param default: value to return when the parameter is not available
        :return: the parameter value, or ``default``
        """
        return get_ros_param(self.ros_param_prefix + param, default)

    def retrieve_credentials(self):
        """Request temporary credentials from the configured IoT endpoint.

        It never raises; every failure is logged as a warning.

        :return: a dict with keys ``access_key``, ``secret_key``, ``token`` and
            ``expiry_time``, or ``None`` when a required parameter is missing or
            the request fails.
        """
        try:
            cert_file = self.get_param('certfile')
            key_file = self.get_param('keyfile')
            endpoint = self.get_param('endpoint')
            role_alias = self.get_param('role')
            connect_timeout = self.get_param('connect_timeout_ms', self.DEFAULT_AUTH_CONNECT_TIMEOUT_MS)
            total_timeout = self.get_param('total_timeout_ms', self.DEFAULT_AUTH_TOTAL_TIMEOUT_MS)
            thing_name = self.get_param('thing_name', '')

            if any(v is None for v in (cert_file, key_file, endpoint, role_alias, thing_name)):
                return None

            headers = {'x-amzn-iot-thingname': thing_name} if len(thing_name) > 0 else None
            url = 'https://{}/role-aliases/{}/credentials'.format(endpoint, role_alias)

            # The ROS parameters are in milliseconds, but `requests` expects seconds.
            # The read timeout is whatever remains of the total budget after connecting.
            connect_timeout_s = float(connect_timeout) / 1000.0
            read_timeout_s = float(total_timeout - connect_timeout) / 1000.0
            timeout = (connect_timeout_s, read_timeout_s)  # see also: urllib3/util/timeout.py

            response = requests.get(url, cert=(cert_file, key_file), headers=headers, timeout=timeout)
            # Surface HTTP errors (e.g. 403) with a meaningful message instead of a KeyError below.
            response.raise_for_status()
            d = response.json()['credentials']

            rospy.loginfo('Credentials expiry time: {}'.format(d['expiration']))

            return {
                'access_key': d['accessKeyId'],
                'secret_key': d['secretAccessKey'],
                'token': d['sessionToken'],
                'expiry_time': d['expiration'],
            }
        except Exception as e:
            rospy.logwarn('Failed to fetch credentials from AWS IoT: {}'.format(e))
            return None

    def load(self):
        """Entry point called by botocore's credential resolver.

        :return: a ``RefreshableCredentials`` instance, or ``None`` when the
            initial fetch yields nothing, so that the resolver can move on to
            the next provider in the chain.
        """
        metadata = self.retrieve_credentials()
        if metadata is None:
            # create_from_metadata() indexes into the metadata dict and would raise on None.
            return None

        return RefreshableCredentials.create_from_metadata(
            metadata,
            self.retrieve_credentials,
            'aws-iot-with-certificate'
        )


class AmazonPolly:
    """A TTS engine that can be used in two different ways.

    Usage
    -----

    1. It can run as a ROS service node.

    Start a polly node::

        $ rosrun tts polly_node.py

    Call the service from command line::

        $ rosservice call /polly SynthesizeSpeech 'hello polly' '' '' '' '' '' '' '' '' [] [] 0 '' '' '' '' '' '' false

    Call the service programmatically::

        from tts.srv import Polly
        rospy.wait_for_service('polly')
        polly = rospy.ServiceProxy('polly', Polly)
        res = polly(**kw)

    2. It can also be used as a normal python class::

        AmazonPolly().synthesize(text='hi polly')

    PollyRequest supports many parameters, but the majority of the users can safely ignore most of them and just
    use the vanilla version which involves only one argument, ``text``.

    If in some use cases more control is needed, SSML will come in handy. Example::

        AmazonPolly().synthesize(
            text='<speak>Mary has a little lamb.</speak>',
            text_type='ssml'
        )

    A user can also control the voice, output format and so on. Example::

        AmazonPolly().synthesize(
            text='<speak>Mary has a little lamb.</speak>',
            text_type='ssml',
            voice_id='Joey',
            output_format='mp3',
            output_path='/tmp/blah'
        )


    Parameters
    ----------

    Among the parameters defined in Polly.srv, the following are supported while others are reserved for future.

    * polly_action : currently only ``SynthesizeSpeech`` is supported
    * text : the text to speak
    * text_type : can be either ``text`` (default) or ``ssml``
    * voice_id : any voice id supported by Amazon Polly, default is Joanna
    * output_format : ogg_vorbis (default), mp3 or pcm
    * output_path : where the audio file is saved
    * sample_rate : default is 16000 for pcm or 22050 for mp3 and ogg_vorbis

    The following are the reserved ones. Note that ``language_code`` is rarely needed (this may seem counter-intuitive).
    See official Amazon Polly documentation for details (link can be found below).

    * language_code
    * lexicon_content
    * lexicon_name
    * lexicon_names
    * speech_mark_types
    * max_results
    * next_token
    * sns_topic_arn
    * task_id
    * task_status
    * output_s3_bucket_name
    * output_s3_key_prefix
    * include_additional_language_codes


    Links
    -----

    Amazon Polly documentation: https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html

    """

    def __init__(self, aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None, region_name=None):
        """Create the Polly client and set the defaults used for incomplete requests.

        :param aws_access_key_id: optional explicit access key, passed to the boto3 session
        :param aws_secret_access_key: optional explicit secret key, passed to the boto3 session
        :param aws_session_token: optional explicit session token, passed to the boto3 session
        :param region_name: AWS region; when ``None`` it is read from the ROS parameter
            ``aws_client_configuration/region`` (default ``us-west-2``)
        """
        if region_name is None:
            region_name = get_ros_param('aws_client_configuration/region', default='us-west-2')

        self.polly = self._get_polly_client(aws_access_key_id, aws_secret_access_key, aws_session_token, region_name)
        self.default_text_type = 'text'
        self.default_voice_id = 'Joanna'
        self.default_output_format = 'ogg_vorbis'
        self.default_output_folder = '.'
        self.default_output_file_basename = 'output'

    def _get_polly_client(self, aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None,
                          region_name=None, with_service_model_patch=False):
        """Build a boto3 Polly client backed by a fresh botocore session.

        Note we get a new botocore session each time this function is called.
        This is to avoid potential problems caused by inner state of the session.

        :param with_service_model_patch: when true, point botocore's ``data_path`` at the service
            model files under ``data/models`` next to this file before creating the client
        :return: a boto3 client for the ``polly`` service
        :raises UnknownServiceError: if the service is still unknown after the patched retry
        """
        botocore_session = get_session()

        if with_service_model_patch:
            # Older versions of botocore don't have polly. We can possibly fix it by appending
            # extra path with polly service model files to the search path.
            current_dir = os.path.dirname(os.path.abspath(__file__))
            service_model_path = os.path.join(current_dir, 'data', 'models')
            botocore_session.set_config_variable('data_path', service_model_path)
            rospy.loginfo('patching service model data path: {}'.format(service_model_path))

        botocore_session.get_component('credential_provider').insert_after('boto-config', AwsIotCredentialProvider())

        botocore_session.user_agent_extra = self._generate_user_agent_suffix()

        session = Session(aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key,
                          aws_session_token=aws_session_token, region_name=region_name,
                          botocore_session=botocore_session)

        try:
            return session.client("polly")
        except UnknownServiceError:
            # the first time we reach here, we try to fix the problem
            if not with_service_model_patch:
                return self._get_polly_client(aws_access_key_id, aws_secret_access_key, aws_session_token, region_name,
                                              with_service_model_patch=True)
            else:
                # we have tried our best, time to panic
                rospy.logerr('Amazon Polly is not available. Please install the latest boto3.')
                raise

    def _generate_user_agent_suffix(self):
        """Compose the extra user-agent text from the ROS parameters ``exec_env``,
        ``robomaker_version``, ``rosdistro`` and ``rosversion``.

        :return: a string of the form ``exec-env/<env> ros-<distro>/<version>``
        """
        exec_env = get_ros_param('exec_env', 'AWS_RoboMaker').strip()
        if 'AWS_RoboMaker' in exec_env:
            ver = get_ros_param('robomaker_version', None)
            if ver:
                exec_env += '-' + ver.strip()
        ros_distro = get_ros_param('rosdistro', 'Unknown_ROS_DISTRO').strip()
        ros_version = get_ros_param('rosversion', 'Unknown_ROS_VERSION').strip()
        return 'exec-env/{} ros-{}/{}'.format(exec_env, ros_distro, ros_version)

    def _pcm2wav(self, audio_data, wav_filename, sample_rate):
        """Wrap raw PCM bytes in a WAV container and write it to ``wav_filename``.

        Per Amazon Polly official doc, the pcm is in a signed 16-bit, 1 channel (mono), little-endian format.

        :param audio_data: raw PCM bytes
        :param wav_filename: path of the WAV file to create
        :param sample_rate: sample rate of ``audio_data``; converted with ``int()``
        """
        # closing() guarantees the file is released even if writing the frames fails.
        with closing(wave.open(wav_filename, 'w')) as wavf:
            wavf.setframerate(int(sample_rate))
            wavf.setnchannels(1)  # 1 channel
            wavf.setsampwidth(2)  # 2 bytes == 16 bits
            wavf.writeframes(audio_data)

    def _make_audio_file_fullpath(self, output_path, output_format):
        """Makes a full path for audio file based on given output path and format.

        If ``output_path`` doesn't have a path, current path is used.

        :param output_path: the output path received
        :param output_format: the audio format, e.g., mp3, ogg_vorbis, pcm
        :return: a full path for the output audio file. File ext will be constructed from audio format.
        :raises KeyError: if ``output_format`` is not one of the three known formats
        """
        head, tail = os.path.split(output_path)
        if not head:
            head = self.default_output_folder
        if not tail:
            tail = self.default_output_file_basename

        file_ext = {'pcm': '.wav', 'mp3': '.mp3', 'ogg_vorbis': '.ogg'}[output_format.lower()]
        if not tail.endswith(file_ext):
            tail += file_ext

        return os.path.realpath(os.path.join(head, tail))

    def _synthesize_speech_and_save(self, request):
        """Calls Amazon Polly and writes the returned audio data to a local file.

        To make it practical, three things will be returned in a JSON form string, which are audio file path,
        audio type and Amazon Polly response metadata.

        If the response carries no audio stream, audio file name will be an empty string and audio type
        will be "N/A".

        Please see https://boto3.readthedocs.io/reference/services/polly.html#Polly.Client.synthesize_speech
        for more details on Amazon Polly API.

        :param request: an instance of PollyRequest
        :return: a string in JSON form with three attributes, "Audio File", "Audio Type" and
            "Amazon Polly Response Metadata".
        """
        kws = {
            'LexiconNames': request.lexicon_names if request.lexicon_names else [],
            'OutputFormat': request.output_format if request.output_format else self.default_output_format,
            'SampleRate': request.sample_rate,
            'SpeechMarkTypes': request.speech_mark_types if request.speech_mark_types else [],
            'Text': request.text,
            'TextType': request.text_type if request.text_type else self.default_text_type,
            'VoiceId': request.voice_id if request.voice_id else self.default_voice_id
        }

        if not kws['SampleRate']:
            kws['SampleRate'] = '16000' if kws['OutputFormat'].lower() == 'pcm' else '22050'

        rospy.loginfo('Amazon Polly Request: {}'.format(kws))
        response = self.polly.synthesize_speech(**kws)
        rospy.loginfo('Amazon Polly Response: {}'.format(response))

        if "AudioStream" in response:
            audiofile = self._make_audio_file_fullpath(request.output_path, kws['OutputFormat'])
            rospy.loginfo('will save audio as {}'.format(audiofile))

            with closing(response["AudioStream"]) as stream:
                if kws['OutputFormat'].lower() == 'pcm':
                    # raw PCM has no header, so it is wrapped in a WAV container
                    self._pcm2wav(stream.read(), audiofile, kws['SampleRate'])
                else:
                    with open(audiofile, "wb") as f:
                        f.write(stream.read())

            audiotype = response['ContentType']
        else:
            audiofile = ''
            audiotype = 'N/A'

        return json.dumps({
            'Audio File': audiofile,
            'Audio Type': audiotype,
            'Amazon Polly Response Metadata': str(response['ResponseMetadata'])
        })

    def _dispatch(self, request):
        """Amazon Polly supports a number of APIs. This will call the right one based on the content of request.

        Currently "SynthesizeSpeech" is the only recognized action. Basically this method just delegates the work
        to ``self._synthesize_speech_and_save`` and returns the result as is. It will simply raise if a different
        action is passed in.

        :param request: an instance of PollyRequest
        :return: whatever returned by the delegate
        :raises RuntimeError: if ``request.polly_action`` is not a recognized action
        """
        actions = {
            'SynthesizeSpeech': self._synthesize_speech_and_save
            # ... more actions could go in here ...
        }

        if request.polly_action not in actions:
            raise RuntimeError('bad or unsupported Amazon Polly action: "' + request.polly_action + '".')

        return actions[request.polly_action](request)

    def _node_request_handler(self, request):
        """The callback function for processing service request.

        It never raises. If anything unexpected happens, it will return a PollyResponse with details of the
        exception, pointing "Audio File" at one of the error sounds in the ``data`` folder next to this file.

        :param request: an instance of PollyRequest
        :return: a PollyResponse
        """
        rospy.loginfo('Amazon Polly Request: {}'.format(request))

        try:
            response = self._dispatch(request)
            rospy.loginfo('will return {}'.format(response))
            return PollyResponse(result=response)
        except Exception as e:
            current_dir = os.path.dirname(os.path.abspath(__file__))
            exc_type = sys.exc_info()[0]

            # not using `issubclass(exc_type, ConnectionError)` for the condition below because some versions
            # of urllib3 raises exception when doing `from requests.exceptions import ConnectionError`
            error_ogg_filename = 'connerror.ogg' if 'ConnectionError' in exc_type.__name__ else 'error.ogg'

            error_details = {
                'Audio File': os.path.join(current_dir, 'data', error_ogg_filename),
                'Audio Type': 'ogg',
                'Exception': {
                    'Type': str(exc_type),
                    'Module': exc_type.__module__,
                    'Name': exc_type.__name__,
                    'Value': str(e),
                },
                'Traceback': traceback.format_exc()
            }

            error_str = json.dumps(error_details)
            rospy.logerr(error_str)
            return PollyResponse(result=error_str)

    def synthesize(self, **kws):
        """Call this method if you want to use polly but don't want to start a node.

        :param kws: input as defined in Polly.srv
        :return: a PollyResponse whose ``result`` is a string in JSON form with detailed information,
            success or failure
        """
        req = PollyRequest(polly_action='SynthesizeSpeech', **kws)
        return self._node_request_handler(req)

    def start(self, node_name='polly_node', service_name='polly'):
        """The entry point of a ROS service node.

        Details of the service API can be found in Polly.srv.

        :param node_name: name of ROS node
        :param service_name: name of ROS service
        :return: it doesn't return
        """
        rospy.init_node(node_name)

        service = rospy.Service(service_name, Polly, self._node_request_handler)

        rospy.loginfo('polly running: {}'.format(service.uri))

        rospy.spin()


def main():
    """Parse the node and service names from the command line and run the Polly node."""
    usage = '''usage: %prog [options]
    '''

    parser = OptionParser(usage)

    parser.add_option("-n", "--node-name", dest="node_name", default='polly_node',
                      help="name of the ROS node",
                      metavar="NODE_NAME")
    parser.add_option("-s", "--service-name", dest="service_name", default='polly',
                      help="name of the ROS service",
                      metavar="SERVICE_NAME")

    (options, args) = parser.parse_args()

    node_name = options.node_name
    service_name = options.service_name

    AmazonPolly().start(node_name=node_name, service_name=service_name)


if __name__ == "__main__":
    main()
"version": "1.0", + "examples": { + "DeleteLexicon": [ + { + "input": { + "Name": "example" + }, + "output": { + }, + "comments": { + "input": { + }, + "output": { + } + }, + "description": "Deletes a specified pronunciation lexicon stored in an AWS Region.", + "id": "to-delete-a-lexicon-1481922498332", + "title": "To delete a lexicon" + } + ], + "DescribeVoices": [ + { + "input": { + "LanguageCode": "en-GB" + }, + "output": { + "Voices": [ + { + "Gender": "Female", + "Id": "Emma", + "LanguageCode": "en-GB", + "LanguageName": "British English", + "Name": "Emma" + }, + { + "Gender": "Male", + "Id": "Brian", + "LanguageCode": "en-GB", + "LanguageName": "British English", + "Name": "Brian" + }, + { + "Gender": "Female", + "Id": "Amy", + "LanguageCode": "en-GB", + "LanguageName": "British English", + "Name": "Amy" + } + ] + }, + "comments": { + "input": { + }, + "output": { + } + }, + "description": "Returns the list of voices that are available for use when requesting speech synthesis. Displayed languages are those within the specified language code. 
If no language code is specified, voices for all available languages are displayed.", + "id": "to-describe-available-voices-1482180557753", + "title": "To describe available voices" + } + ], + "GetLexicon": [ + { + "input": { + "Name": "" + }, + "output": { + "Lexicon": { + "Content": "\r\n\r\n \r\n W3C\r\n World Wide Web Consortium\r\n \r\n", + "Name": "example" + }, + "LexiconAttributes": { + "Alphabet": "ipa", + "LanguageCode": "en-US", + "LastModified": 1478542980.117, + "LexemesCount": 1, + "LexiconArn": "arn:aws:polly:us-east-1:123456789012:lexicon/example", + "Size": 503 + } + }, + "comments": { + "input": { + }, + "output": { + } + }, + "description": "Returns the content of the specified pronunciation lexicon stored in an AWS Region.", + "id": "to-retrieve-a-lexicon-1481912870836", + "title": "To retrieve a lexicon" + } + ], + "ListLexicons": [ + { + "input": { + }, + "output": { + "Lexicons": [ + { + "Attributes": { + "Alphabet": "ipa", + "LanguageCode": "en-US", + "LastModified": 1478542980.117, + "LexemesCount": 1, + "LexiconArn": "arn:aws:polly:us-east-1:123456789012:lexicon/example", + "Size": 503 + }, + "Name": "example" + } + ] + }, + "comments": { + "input": { + }, + "output": { + } + }, + "description": "Returns a list of pronunciation lexicons stored in an AWS Region.", + "id": "to-list-all-lexicons-in-a-region-1481842106487", + "title": "To list all lexicons in a region" + } + ], + "PutLexicon": [ + { + "input": { + "Content": "file://example.pls", + "Name": "W3C" + }, + "output": { + }, + "comments": { + "input": { + }, + "output": { + } + }, + "description": "Stores a pronunciation lexicon in an AWS Region.", + "id": "to-save-a-lexicon-1482272584088", + "title": "To save a lexicon" + } + ], + "SynthesizeSpeech": [ + { + "input": { + "LexiconNames": [ + "example" + ], + "OutputFormat": "mp3", + "SampleRate": "8000", + "Text": "All Gaul is divided into three parts", + "TextType": "text", + "VoiceId": "Joanna" + }, + "output": { + "AudioStream": 
"TEXT", + "ContentType": "audio/mpeg", + "RequestCharacters": 37 + }, + "comments": { + "input": { + }, + "output": { + } + }, + "description": "Synthesizes plain text or SSML into a file of human-like speech.", + "id": "to-synthesize-speech-1482186064046", + "title": "To synthesize speech" + } + ] + } +} diff --git a/tts/src/tts/data/models/polly/2016-06-10/paginators-1.json b/tts/src/tts/data/models/polly/2016-06-10/paginators-1.json new file mode 100644 index 0000000..c24ff03 --- /dev/null +++ b/tts/src/tts/data/models/polly/2016-06-10/paginators-1.json @@ -0,0 +1,9 @@ +{ + "pagination": { + "DescribeVoices": { + "input_token": "NextToken", + "output_token": "NextToken", + "result_key": "Voices" + } + } +} diff --git a/tts/src/tts/data/models/polly/2016-06-10/service-2.json b/tts/src/tts/data/models/polly/2016-06-10/service-2.json new file mode 100644 index 0000000..bfd6491 --- /dev/null +++ b/tts/src/tts/data/models/polly/2016-06-10/service-2.json @@ -0,0 +1,1022 @@ +{ + "version":"2.0", + "metadata":{ + "apiVersion":"2016-06-10", + "endpointPrefix":"polly", + "protocol":"rest-json", + "serviceFullName":"Amazon Polly", + "serviceId":"Polly", + "signatureVersion":"v4", + "uid":"polly-2016-06-10" + }, + "operations":{ + "DeleteLexicon":{ + "name":"DeleteLexicon", + "http":{ + "method":"DELETE", + "requestUri":"/v1/lexicons/{LexiconName}", + "responseCode":200 + }, + "input":{"shape":"DeleteLexiconInput"}, + "output":{"shape":"DeleteLexiconOutput"}, + "errors":[ + {"shape":"LexiconNotFoundException"}, + {"shape":"ServiceFailureException"} + ], + "documentation":"

Deletes the specified pronunciation lexicon stored in an AWS Region. A lexicon which has been deleted is not available for speech synthesis, nor is it possible to retrieve it using either the GetLexicon or ListLexicons APIs.

For more information, see Managing Lexicons.

" + }, + "DescribeVoices":{ + "name":"DescribeVoices", + "http":{ + "method":"GET", + "requestUri":"/v1/voices", + "responseCode":200 + }, + "input":{"shape":"DescribeVoicesInput"}, + "output":{"shape":"DescribeVoicesOutput"}, + "errors":[ + {"shape":"InvalidNextTokenException"}, + {"shape":"ServiceFailureException"} + ], + "documentation":"

Returns the list of voices that are available for use when requesting speech synthesis. Each voice speaks a specified language, is either male or female, and is identified by an ID, which is the ASCII version of the voice name.

When synthesizing speech ( SynthesizeSpeech ), you provide the voice ID for the voice you want from the list of voices returned by DescribeVoices.

For example, you want your news reader application to read news in a specific language, but giving a user the option to choose the voice. Using the DescribeVoices operation you can provide the user with a list of available voices to select from.

You can optionally specify a language code to filter the available voices. For example, if you specify en-US, the operation returns a list of all available US English voices.

This operation requires permissions to perform the polly:DescribeVoices action.

" + }, + "GetLexicon":{ + "name":"GetLexicon", + "http":{ + "method":"GET", + "requestUri":"/v1/lexicons/{LexiconName}", + "responseCode":200 + }, + "input":{"shape":"GetLexiconInput"}, + "output":{"shape":"GetLexiconOutput"}, + "errors":[ + {"shape":"LexiconNotFoundException"}, + {"shape":"ServiceFailureException"} + ], + "documentation":"

Returns the content of the specified pronunciation lexicon stored in an AWS Region. For more information, see Managing Lexicons.

" + }, + "GetSpeechSynthesisTask":{ + "name":"GetSpeechSynthesisTask", + "http":{ + "method":"GET", + "requestUri":"/v1/synthesisTasks/{TaskId}", + "responseCode":200 + }, + "input":{"shape":"GetSpeechSynthesisTaskInput"}, + "output":{"shape":"GetSpeechSynthesisTaskOutput"}, + "errors":[ + {"shape":"InvalidTaskIdException"}, + {"shape":"ServiceFailureException"}, + {"shape":"SynthesisTaskNotFoundException"} + ], + "documentation":"

Retrieves a specific SpeechSynthesisTask object based on its TaskID. This object contains information about the given speech synthesis task, including the status of the task, and a link to the S3 bucket containing the output of the task.

" + }, + "ListLexicons":{ + "name":"ListLexicons", + "http":{ + "method":"GET", + "requestUri":"/v1/lexicons", + "responseCode":200 + }, + "input":{"shape":"ListLexiconsInput"}, + "output":{"shape":"ListLexiconsOutput"}, + "errors":[ + {"shape":"InvalidNextTokenException"}, + {"shape":"ServiceFailureException"} + ], + "documentation":"

Returns a list of pronunciation lexicons stored in an AWS Region. For more information, see Managing Lexicons.

" + }, + "ListSpeechSynthesisTasks":{ + "name":"ListSpeechSynthesisTasks", + "http":{ + "method":"GET", + "requestUri":"/v1/synthesisTasks", + "responseCode":200 + }, + "input":{"shape":"ListSpeechSynthesisTasksInput"}, + "output":{"shape":"ListSpeechSynthesisTasksOutput"}, + "errors":[ + {"shape":"InvalidNextTokenException"}, + {"shape":"ServiceFailureException"} + ], + "documentation":"

Returns a list of SpeechSynthesisTask objects ordered by their creation date. This operation can filter the tasks by their status, for example, allowing users to list only tasks that are completed.

" + }, + "PutLexicon":{ + "name":"PutLexicon", + "http":{ + "method":"PUT", + "requestUri":"/v1/lexicons/{LexiconName}", + "responseCode":200 + }, + "input":{"shape":"PutLexiconInput"}, + "output":{"shape":"PutLexiconOutput"}, + "errors":[ + {"shape":"InvalidLexiconException"}, + {"shape":"UnsupportedPlsAlphabetException"}, + {"shape":"UnsupportedPlsLanguageException"}, + {"shape":"LexiconSizeExceededException"}, + {"shape":"MaxLexemeLengthExceededException"}, + {"shape":"MaxLexiconsNumberExceededException"}, + {"shape":"ServiceFailureException"} + ], + "documentation":"

Stores a pronunciation lexicon in an AWS Region. If a lexicon with the same name already exists in the region, it is overwritten by the new lexicon. Lexicon operations have eventual consistency, therefore, it might take some time before the lexicon is available to the SynthesizeSpeech operation.

For more information, see Managing Lexicons.

" + }, + "StartSpeechSynthesisTask":{ + "name":"StartSpeechSynthesisTask", + "http":{ + "method":"POST", + "requestUri":"/v1/synthesisTasks", + "responseCode":200 + }, + "input":{"shape":"StartSpeechSynthesisTaskInput"}, + "output":{"shape":"StartSpeechSynthesisTaskOutput"}, + "errors":[ + {"shape":"TextLengthExceededException"}, + {"shape":"InvalidS3BucketException"}, + {"shape":"InvalidS3KeyException"}, + {"shape":"InvalidSampleRateException"}, + {"shape":"InvalidSnsTopicArnException"}, + {"shape":"InvalidSsmlException"}, + {"shape":"LexiconNotFoundException"}, + {"shape":"ServiceFailureException"}, + {"shape":"MarksNotSupportedForFormatException"}, + {"shape":"SsmlMarksNotSupportedForTextTypeException"}, + {"shape":"LanguageNotSupportedException"} + ], + "documentation":"

Allows the creation of an asynchronous synthesis task, by starting a new SpeechSynthesisTask. This operation requires all the standard information needed for speech synthesis, plus the name of an Amazon S3 bucket for the service to store the output of the synthesis task and two optional parameters (OutputS3KeyPrefix and SnsTopicArn). Once the synthesis task is created, this operation will return a SpeechSynthesisTask object, which will include an identifier of this task as well as the current status.

" + }, + "SynthesizeSpeech":{ + "name":"SynthesizeSpeech", + "http":{ + "method":"POST", + "requestUri":"/v1/speech", + "responseCode":200 + }, + "input":{"shape":"SynthesizeSpeechInput"}, + "output":{"shape":"SynthesizeSpeechOutput"}, + "errors":[ + {"shape":"TextLengthExceededException"}, + {"shape":"InvalidSampleRateException"}, + {"shape":"InvalidSsmlException"}, + {"shape":"LexiconNotFoundException"}, + {"shape":"ServiceFailureException"}, + {"shape":"MarksNotSupportedForFormatException"}, + {"shape":"SsmlMarksNotSupportedForTextTypeException"}, + {"shape":"LanguageNotSupportedException"} + ], + "documentation":"

Synthesizes UTF-8 input, plain text or SSML, to a stream of bytes. SSML input must be valid, well-formed SSML. Some alphabets might not be available with all the voices (for example, Cyrillic might not be read at all by English voices) unless phoneme mapping is used. For more information, see How it Works.

" + } + }, + "shapes":{ + "Alphabet":{"type":"string"}, + "AudioStream":{ + "type":"blob", + "streaming":true + }, + "ContentType":{"type":"string"}, + "DateTime":{"type":"timestamp"}, + "DeleteLexiconInput":{ + "type":"structure", + "required":["Name"], + "members":{ + "Name":{ + "shape":"LexiconName", + "documentation":"

The name of the lexicon to delete. Must be an existing lexicon in the region.

", + "location":"uri", + "locationName":"LexiconName" + } + } + }, + "DeleteLexiconOutput":{ + "type":"structure", + "members":{ + } + }, + "DescribeVoicesInput":{ + "type":"structure", + "members":{ + "LanguageCode":{ + "shape":"LanguageCode", + "documentation":"

The language identification tag (ISO 639 code for the language name-ISO 3166 country code) for filtering the list of voices returned. If you don't specify this optional parameter, all available voices are returned.

", + "location":"querystring", + "locationName":"LanguageCode" + }, + "IncludeAdditionalLanguageCodes":{ + "shape":"IncludeAdditionalLanguageCodes", + "documentation":"

Boolean value indicating whether to return any bilingual voices that use the specified language as an additional language. For instance, if you request all languages that use US English (en-US), and there is an Italian voice that speaks both Italian (it-IT) and US English, that voice will be included if you specify yes but not if you specify no.

", + "location":"querystring", + "locationName":"IncludeAdditionalLanguageCodes" + }, + "NextToken":{ + "shape":"NextToken", + "documentation":"

An opaque pagination token returned from the previous DescribeVoices operation. If present, this indicates where to continue the listing.

", + "location":"querystring", + "locationName":"NextToken" + } + } + }, + "DescribeVoicesOutput":{ + "type":"structure", + "members":{ + "Voices":{ + "shape":"VoiceList", + "documentation":"

A list of voices with their properties.

" + }, + "NextToken":{ + "shape":"NextToken", + "documentation":"

The pagination token to use in the next request to continue the listing of voices. NextToken is returned only if the response is truncated.

" + } + } + }, + "ErrorMessage":{"type":"string"}, + "Gender":{ + "type":"string", + "enum":[ + "Female", + "Male" + ] + }, + "GetLexiconInput":{ + "type":"structure", + "required":["Name"], + "members":{ + "Name":{ + "shape":"LexiconName", + "documentation":"

Name of the lexicon.

", + "location":"uri", + "locationName":"LexiconName" + } + } + }, + "GetLexiconOutput":{ + "type":"structure", + "members":{ + "Lexicon":{ + "shape":"Lexicon", + "documentation":"

Lexicon object that provides name and the string content of the lexicon.

" + }, + "LexiconAttributes":{ + "shape":"LexiconAttributes", + "documentation":"

Metadata of the lexicon, including phonetic alphabet used, language code, lexicon ARN, number of lexemes defined in the lexicon, and size of lexicon in bytes.

" + } + } + }, + "GetSpeechSynthesisTaskInput":{ + "type":"structure", + "required":["TaskId"], + "members":{ + "TaskId":{ + "shape":"TaskId", + "documentation":"

The Amazon Polly generated identifier for a speech synthesis task.

", + "location":"uri", + "locationName":"TaskId" + } + } + }, + "GetSpeechSynthesisTaskOutput":{ + "type":"structure", + "members":{ + "SynthesisTask":{ + "shape":"SynthesisTask", + "documentation":"

SynthesisTask object that provides information from the requested task, including output format, creation time, task status, and so on.

" + } + } + }, + "IncludeAdditionalLanguageCodes":{"type":"boolean"}, + "InvalidLexiconException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

Amazon Polly can't find the specified lexicon. Verify that the lexicon's name is spelled correctly, and then try again.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "InvalidNextTokenException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The NextToken is invalid. Verify that it's spelled correctly, and then try again.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "InvalidS3BucketException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The provided Amazon S3 bucket name is invalid. Please check your input with S3 bucket naming requirements and try again.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "InvalidS3KeyException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The provided Amazon S3 key prefix is invalid. Please provide a valid S3 object key name.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "InvalidSampleRateException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The specified sample rate is not valid.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "InvalidSnsTopicArnException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The provided SNS topic ARN is invalid. Please provide a valid SNS topic ARN and try again.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "InvalidSsmlException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The SSML you provided is invalid. Verify the SSML syntax, spelling of tags and values, and then try again.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "InvalidTaskIdException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The provided Task ID is not valid. Please provide a valid Task ID and try again.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "LanguageCode":{ + "type":"string", + "enum":[ + "cmn-CN", + "cy-GB", + "da-DK", + "de-DE", + "en-AU", + "en-GB", + "en-GB-WLS", + "en-IN", + "en-US", + "es-ES", + "es-US", + "fr-CA", + "fr-FR", + "is-IS", + "it-IT", + "ja-JP", + "hi-IN", + "ko-KR", + "nb-NO", + "nl-NL", + "pl-PL", + "pt-BR", + "pt-PT", + "ro-RO", + "ru-RU", + "sv-SE", + "tr-TR" + ] + }, + "LanguageCodeList":{ + "type":"list", + "member":{"shape":"LanguageCode"} + }, + "LanguageName":{"type":"string"}, + "LanguageNotSupportedException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The language specified is not currently supported by Amazon Polly in this capacity.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "LastModified":{"type":"timestamp"}, + "LexemesCount":{"type":"integer"}, + "Lexicon":{ + "type":"structure", + "members":{ + "Content":{ + "shape":"LexiconContent", + "documentation":"

Lexicon content in string format. The content of a lexicon must be in PLS format.

" + }, + "Name":{ + "shape":"LexiconName", + "documentation":"

Name of the lexicon.

" + } + }, + "documentation":"

Provides lexicon name and lexicon content in string format. For more information, see Pronunciation Lexicon Specification (PLS) Version 1.0.

" + }, + "LexiconArn":{"type":"string"}, + "LexiconAttributes":{ + "type":"structure", + "members":{ + "Alphabet":{ + "shape":"Alphabet", + "documentation":"

Phonetic alphabet used in the lexicon. Valid values are ipa and x-sampa.

" + }, + "LanguageCode":{ + "shape":"LanguageCode", + "documentation":"

Language code that the lexicon applies to. A lexicon with a language code such as \"en\" would be applied to all English languages (en-GB, en-US, en-AUS, en-WLS, and so on).

" + }, + "LastModified":{ + "shape":"LastModified", + "documentation":"

Date lexicon was last modified (a timestamp value).

" + }, + "LexiconArn":{ + "shape":"LexiconArn", + "documentation":"

Amazon Resource Name (ARN) of the lexicon.

" + }, + "LexemesCount":{ + "shape":"LexemesCount", + "documentation":"

Number of lexemes in the lexicon.

" + }, + "Size":{ + "shape":"Size", + "documentation":"

Total size of the lexicon, in characters.

" + } + }, + "documentation":"

Contains metadata describing the lexicon such as the number of lexemes, language code, and so on. For more information, see Managing Lexicons.

" + }, + "LexiconContent":{"type":"string"}, + "LexiconDescription":{ + "type":"structure", + "members":{ + "Name":{ + "shape":"LexiconName", + "documentation":"

Name of the lexicon.

" + }, + "Attributes":{ + "shape":"LexiconAttributes", + "documentation":"

Provides lexicon metadata.

" + } + }, + "documentation":"

Describes the content of the lexicon.

" + }, + "LexiconDescriptionList":{ + "type":"list", + "member":{"shape":"LexiconDescription"} + }, + "LexiconName":{ + "type":"string", + "pattern":"[0-9A-Za-z]{1,20}", + "sensitive":true + }, + "LexiconNameList":{ + "type":"list", + "member":{"shape":"LexiconName"}, + "max":5 + }, + "LexiconNotFoundException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

Amazon Polly can't find the specified lexicon. This could be caused by a lexicon that is missing, a misspelled lexicon name, or specifying a lexicon that is in a different region.

Verify that the lexicon exists, is in the region (see ListLexicons), and that its name is spelled correctly. Then try again.

", + "error":{"httpStatusCode":404}, + "exception":true + }, + "LexiconSizeExceededException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The maximum size of the specified lexicon would be exceeded by this operation.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "ListLexiconsInput":{ + "type":"structure", + "members":{ + "NextToken":{ + "shape":"NextToken", + "documentation":"

An opaque pagination token returned from the previous ListLexicons operation. If present, indicates where to continue the list of lexicons.

", + "location":"querystring", + "locationName":"NextToken" + } + } + }, + "ListLexiconsOutput":{ + "type":"structure", + "members":{ + "Lexicons":{ + "shape":"LexiconDescriptionList", + "documentation":"

A list of lexicon names and attributes.

" + }, + "NextToken":{ + "shape":"NextToken", + "documentation":"

The pagination token to use in the next request to continue the listing of lexicons. NextToken is returned only if the response is truncated.

" + } + } + }, + "ListSpeechSynthesisTasksInput":{ + "type":"structure", + "members":{ + "MaxResults":{ + "shape":"MaxResults", + "documentation":"

Maximum number of speech synthesis tasks returned in a List operation.

", + "location":"querystring", + "locationName":"MaxResults" + }, + "NextToken":{ + "shape":"NextToken", + "documentation":"

The pagination token to use in the next request to continue the listing of speech synthesis tasks.

", + "location":"querystring", + "locationName":"NextToken" + }, + "Status":{ + "shape":"TaskStatus", + "documentation":"

Status of the speech synthesis tasks returned in a List operation.

", + "location":"querystring", + "locationName":"Status" + } + } + }, + "ListSpeechSynthesisTasksOutput":{ + "type":"structure", + "members":{ + "NextToken":{ + "shape":"NextToken", + "documentation":"

An opaque pagination token returned from the previous List operation in this request. If present, this indicates where to continue the listing.

" + }, + "SynthesisTasks":{ + "shape":"SynthesisTasks", + "documentation":"

List of SynthesisTask objects that provides information from the specified task in the list request, including output format, creation time, task status, and so on.

" + } + } + }, + "MarksNotSupportedForFormatException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

Speech marks are not supported for the OutputFormat selected. Speech marks are only available for content in json format.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "MaxLexemeLengthExceededException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The maximum size of the lexeme would be exceeded by this operation.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "MaxLexiconsNumberExceededException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The maximum number of lexicons would be exceeded by this operation.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "MaxResults":{ + "type":"integer", + "max":100, + "min":1 + }, + "NextToken":{"type":"string"}, + "OutputFormat":{ + "type":"string", + "enum":[ + "json", + "mp3", + "ogg_vorbis", + "pcm" + ] + }, + "OutputS3BucketName":{ + "type":"string", + "pattern":"^[a-z0-9][\\.\\-a-z0-9]{1,61}[a-z0-9]$" + }, + "OutputS3KeyPrefix":{ + "type":"string", + "pattern":"^[0-9a-zA-Z\\/\\!\\-_\\.\\*\\'\\(\\)]{0,800}$" + }, + "OutputUri":{"type":"string"}, + "PutLexiconInput":{ + "type":"structure", + "required":[ + "Name", + "Content" + ], + "members":{ + "Name":{ + "shape":"LexiconName", + "documentation":"

Name of the lexicon. The name must follow the regular expression format [0-9A-Za-z]{1,20}. That is, the name is a case-sensitive alphanumeric string up to 20 characters long.

", + "location":"uri", + "locationName":"LexiconName" + }, + "Content":{ + "shape":"LexiconContent", + "documentation":"

Content of the PLS lexicon as string data.

" + } + } + }, + "PutLexiconOutput":{ + "type":"structure", + "members":{ + } + }, + "RequestCharacters":{"type":"integer"}, + "SampleRate":{"type":"string"}, + "ServiceFailureException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

An unknown condition has caused a service failure.

", + "error":{"httpStatusCode":500}, + "exception":true, + "fault":true + }, + "Size":{"type":"integer"}, + "SnsTopicArn":{ + "type":"string", + "pattern":"^arn:aws(-(cn|iso(-b)?|us-gov))?:sns:.*:\\w{12}:.+$" + }, + "SpeechMarkType":{ + "type":"string", + "enum":[ + "sentence", + "ssml", + "viseme", + "word" + ] + }, + "SpeechMarkTypeList":{ + "type":"list", + "member":{"shape":"SpeechMarkType"}, + "max":4 + }, + "SsmlMarksNotSupportedForTextTypeException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

SSML speech marks are not supported for plain text-type input.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "StartSpeechSynthesisTaskInput":{ + "type":"structure", + "required":[ + "OutputFormat", + "OutputS3BucketName", + "Text", + "VoiceId" + ], + "members":{ + "LexiconNames":{ + "shape":"LexiconNameList", + "documentation":"

List of one or more pronunciation lexicon names you want the service to apply during synthesis. Lexicons are applied only if the language of the lexicon is the same as the language of the voice.

" + }, + "OutputFormat":{ + "shape":"OutputFormat", + "documentation":"

The format in which the returned output will be encoded. For audio stream, this will be mp3, ogg_vorbis, or pcm. For speech marks, this will be json.

" + }, + "OutputS3BucketName":{ + "shape":"OutputS3BucketName", + "documentation":"

Amazon S3 bucket name to which the output file will be saved.

" + }, + "OutputS3KeyPrefix":{ + "shape":"OutputS3KeyPrefix", + "documentation":"

The Amazon S3 key prefix for the output speech file.

" + }, + "SampleRate":{ + "shape":"SampleRate", + "documentation":"

The audio frequency specified in Hz.

The valid values for mp3 and ogg_vorbis are \"8000\", \"16000\", and \"22050\". The default value is \"22050\".

Valid values for pcm are \"8000\" and \"16000\". The default value is \"16000\".

" + }, + "SnsTopicArn":{ + "shape":"SnsTopicArn", + "documentation":"

ARN for the SNS topic optionally used for providing status notification for a speech synthesis task.

" + }, + "SpeechMarkTypes":{ + "shape":"SpeechMarkTypeList", + "documentation":"

The type of speech marks returned for the input text.

" + }, + "Text":{ + "shape":"Text", + "documentation":"

The input text to synthesize. If you specify ssml as the TextType, follow the SSML format for the input text.

" + }, + "TextType":{ + "shape":"TextType", + "documentation":"

Specifies whether the input text is plain text or SSML. The default value is plain text.

" + }, + "VoiceId":{ + "shape":"VoiceId", + "documentation":"

Voice ID to use for the synthesis.

" + }, + "LanguageCode":{ + "shape":"LanguageCode", + "documentation":"

Optional language code for the Speech Synthesis request. This is only necessary if using a bilingual voice, such as Aditi, which can be used for either Indian English (en-IN) or Hindi (hi-IN).

If a bilingual voice is used and no language code is specified, Amazon Polly will use the default language of the bilingual voice. The default language for any voice is the one returned by the DescribeVoices operation for the LanguageCode parameter. For example, if no language code is specified, Aditi will use Indian English rather than Hindi.

" + } + } + }, + "StartSpeechSynthesisTaskOutput":{ + "type":"structure", + "members":{ + "SynthesisTask":{ + "shape":"SynthesisTask", + "documentation":"

SynthesisTask object that provides information and attributes about a newly submitted speech synthesis task.

" + } + } + }, + "SynthesisTask":{ + "type":"structure", + "members":{ + "TaskId":{ + "shape":"TaskId", + "documentation":"

The Amazon Polly generated identifier for a speech synthesis task.

" + }, + "TaskStatus":{ + "shape":"TaskStatus", + "documentation":"

Current status of the individual speech synthesis task.

" + }, + "TaskStatusReason":{ + "shape":"TaskStatusReason", + "documentation":"

Reason for the current status of a specific speech synthesis task, including errors if the task has failed.

" + }, + "OutputUri":{ + "shape":"OutputUri", + "documentation":"

Pathway for the output speech file.

" + }, + "CreationTime":{ + "shape":"DateTime", + "documentation":"

Timestamp for the time the synthesis task was started.

" + }, + "RequestCharacters":{ + "shape":"RequestCharacters", + "documentation":"

Number of billable characters synthesized.

" + }, + "SnsTopicArn":{ + "shape":"SnsTopicArn", + "documentation":"

ARN for the SNS topic optionally used for providing status notification for a speech synthesis task.

" + }, + "LexiconNames":{ + "shape":"LexiconNameList", + "documentation":"

List of one or more pronunciation lexicon names you want the service to apply during synthesis. Lexicons are applied only if the language of the lexicon is the same as the language of the voice.

" + }, + "OutputFormat":{ + "shape":"OutputFormat", + "documentation":"

The format in which the returned output will be encoded. For audio stream, this will be mp3, ogg_vorbis, or pcm. For speech marks, this will be json.

" + }, + "SampleRate":{ + "shape":"SampleRate", + "documentation":"

The audio frequency specified in Hz.

The valid values for mp3 and ogg_vorbis are \"8000\", \"16000\", and \"22050\". The default value is \"22050\".

Valid values for pcm are \"8000\" and \"16000\". The default value is \"16000\".

" + }, + "SpeechMarkTypes":{ + "shape":"SpeechMarkTypeList", + "documentation":"

The type of speech marks returned for the input text.

" + }, + "TextType":{ + "shape":"TextType", + "documentation":"

Specifies whether the input text is plain text or SSML. The default value is plain text.

" + }, + "VoiceId":{ + "shape":"VoiceId", + "documentation":"

Voice ID to use for the synthesis.

" + }, + "LanguageCode":{ + "shape":"LanguageCode", + "documentation":"

Optional language code for a synthesis task. This is only necessary if using a bilingual voice, such as Aditi, which can be used for either Indian English (en-IN) or Hindi (hi-IN).

If a bilingual voice is used and no language code is specified, Amazon Polly will use the default language of the bilingual voice. The default language for any voice is the one returned by the DescribeVoices operation for the LanguageCode parameter. For example, if no language code is specified, Aditi will use Indian English rather than Hindi.

" + } + }, + "documentation":"

SynthesisTask object that provides information about a speech synthesis task.

" + }, + "SynthesisTaskNotFoundException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The Speech Synthesis task with the requested Task ID cannot be found.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "SynthesisTasks":{ + "type":"list", + "member":{"shape":"SynthesisTask"} + }, + "SynthesizeSpeechInput":{ + "type":"structure", + "required":[ + "OutputFormat", + "Text", + "VoiceId" + ], + "members":{ + "LexiconNames":{ + "shape":"LexiconNameList", + "documentation":"

List of one or more pronunciation lexicon names you want the service to apply during synthesis. Lexicons are applied only if the language of the lexicon is the same as the language of the voice. For information about storing lexicons, see PutLexicon.

" + }, + "OutputFormat":{ + "shape":"OutputFormat", + "documentation":"

The format in which the returned output will be encoded. For audio stream, this will be mp3, ogg_vorbis, or pcm. For speech marks, this will be json.

When pcm is used, the content returned is audio/pcm in a signed 16-bit, 1 channel (mono), little-endian format.

" + }, + "SampleRate":{ + "shape":"SampleRate", + "documentation":"

The audio frequency specified in Hz.

The valid values for mp3 and ogg_vorbis are \"8000\", \"16000\", and \"22050\". The default value is \"22050\".

Valid values for pcm are \"8000\" and \"16000\". The default value is \"16000\".

" + }, + "SpeechMarkTypes":{ + "shape":"SpeechMarkTypeList", + "documentation":"

The type of speech marks returned for the input text.

" + }, + "Text":{ + "shape":"Text", + "documentation":"

Input text to synthesize. If you specify ssml as the TextType, follow the SSML format for the input text.

" + }, + "TextType":{ + "shape":"TextType", + "documentation":"

Specifies whether the input text is plain text or SSML. The default value is plain text. For more information, see Using SSML.

" + }, + "VoiceId":{ + "shape":"VoiceId", + "documentation":"

Voice ID to use for the synthesis. You can get a list of available voice IDs by calling the DescribeVoices operation.

" + }, + "LanguageCode":{ + "shape":"LanguageCode", + "documentation":"

Optional language code for the Synthesize Speech request. This is only necessary if using a bilingual voice, such as Aditi, which can be used for either Indian English (en-IN) or Hindi (hi-IN).

If a bilingual voice is used and no language code is specified, Amazon Polly will use the default language of the bilingual voice. The default language for any voice is the one returned by the DescribeVoices operation for the LanguageCode parameter. For example, if no language code is specified, Aditi will use Indian English rather than Hindi.

" + } + } + }, + "SynthesizeSpeechOutput":{ + "type":"structure", + "members":{ + "AudioStream":{ + "shape":"AudioStream", + "documentation":"

Stream containing the synthesized speech.

" + }, + "ContentType":{ + "shape":"ContentType", + "documentation":"

Specifies the type of audio stream. This should reflect the OutputFormat parameter in your request.

", + "location":"header", + "locationName":"Content-Type" + }, + "RequestCharacters":{ + "shape":"RequestCharacters", + "documentation":"

Number of characters synthesized.

", + "location":"header", + "locationName":"x-amzn-RequestCharacters" + } + }, + "payload":"AudioStream" + }, + "TaskId":{ + "type":"string", + "max":128, + "min":1 + }, + "TaskStatus":{ + "type":"string", + "enum":[ + "scheduled", + "inProgress", + "completed", + "failed" + ] + }, + "TaskStatusReason":{"type":"string"}, + "Text":{"type":"string"}, + "TextLengthExceededException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The value of the \"Text\" parameter is longer than the accepted limits. For the SynthesizeSpeech API, the limit for input text is a maximum of 6000 characters total, of which no more than 3000 can be billed characters. For the StartSpeechSynthesisTask API, the maximum is 200,000 characters, of which no more than 100,000 can be billed characters. SSML tags are not counted as billed characters.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "TextType":{ + "type":"string", + "enum":[ + "ssml", + "text" + ] + }, + "UnsupportedPlsAlphabetException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The alphabet specified by the lexicon is not a supported alphabet. Valid values are x-sampa and ipa.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "UnsupportedPlsLanguageException":{ + "type":"structure", + "members":{ + "message":{"shape":"ErrorMessage"} + }, + "documentation":"

The language specified in the lexicon is unsupported. For a list of supported languages, see Lexicon Attributes.

", + "error":{"httpStatusCode":400}, + "exception":true + }, + "Voice":{ + "type":"structure", + "members":{ + "Gender":{ + "shape":"Gender", + "documentation":"

Gender of the voice.

" + }, + "Id":{ + "shape":"VoiceId", + "documentation":"

Amazon Polly assigned voice ID. This is the ID that you specify when calling the SynthesizeSpeech operation.

" + }, + "LanguageCode":{ + "shape":"LanguageCode", + "documentation":"

Language code of the voice.

" + }, + "LanguageName":{ + "shape":"LanguageName", + "documentation":"

Human readable name of the language in English.

" + }, + "Name":{ + "shape":"VoiceName", + "documentation":"

Name of the voice (for example, Salli, Kendra, etc.). This provides a human readable voice name that you might display in your application.

" + }, + "AdditionalLanguageCodes":{ + "shape":"LanguageCodeList", + "documentation":"

Additional codes for languages available for the specified voice in addition to its default language.

For example, the default language for Aditi is Indian English (en-IN) because it was first used for that language. Since Aditi is bilingual and fluent in both Indian English and Hindi, this parameter would show the code hi-IN.

" + } + }, + "documentation":"

Description of the voice.

" + }, + "VoiceId":{ + "type":"string", + "enum":[ + "Geraint", + "Gwyneth", + "Mads", + "Naja", + "Hans", + "Marlene", + "Nicole", + "Russell", + "Amy", + "Brian", + "Emma", + "Raveena", + "Ivy", + "Joanna", + "Joey", + "Justin", + "Kendra", + "Kimberly", + "Matthew", + "Salli", + "Conchita", + "Enrique", + "Miguel", + "Penelope", + "Chantal", + "Celine", + "Lea", + "Mathieu", + "Dora", + "Karl", + "Carla", + "Giorgio", + "Mizuki", + "Liv", + "Lotte", + "Ruben", + "Ewa", + "Jacek", + "Jan", + "Maja", + "Ricardo", + "Vitoria", + "Cristiano", + "Ines", + "Carmen", + "Maxim", + "Tatyana", + "Astrid", + "Filiz", + "Vicki", + "Takumi", + "Seoyeon", + "Aditi", + "Zhiyu" + ] + }, + "VoiceList":{ + "type":"list", + "member":{"shape":"Voice"} + }, + "VoiceName":{"type":"string"} + }, + "documentation":"

Amazon Polly is a web service that makes it easy to synthesize speech from text.

The Amazon Polly service provides API operations for synthesizing high-quality speech from plain text and Speech Synthesis Markup Language (SSML), along with managing pronunciation lexicons that enable you to get the best results for your application domain.

" +} diff --git a/tts/src/tts/synthesizer.py b/tts/src/tts/synthesizer.py new file mode 100755 index 0000000..91b6e07 --- /dev/null +++ b/tts/src/tts/synthesizer.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. + +import os +import time +import json +import rospy +import hashlib +from optparse import OptionParser +from tts.srv import Synthesizer, SynthesizerResponse + + +class SpeechSynthesizer: + """This class serves as a ROS service node that should be an entry point of a TTS task. + + Although the current implementation uses Amazon Polly as the synthesis engine, it is not hard to let it support + more heterogeneous engines while keeping the API the same. + + In order to support a variety of engines, the SynthesizerRequest was designed with flexibility in mind. It + has two fields: text and metadata. Both are strings. In most cases, a user can ignore the metadata and call + the service with some plain text. If the use case needs any control or engine-specific feature, the extra + information can be put into the JSON-form metadata. This class will use the information when calling the engine. + + The decoupling of the synthesizer and the actual synthesis engine will benefit the users in many ways. + + First, a user will be able to use a unified interface to do the TTS job and have the freedom to use different + engines available with no or very little change from the client side. 
+ + Second, by applying some design patterns, the synthesizer can choose an engine dynamically. For example, a user + may prefer to use Amazon Polly but is also OK with an offline solution when network is not reliable. + + Third, engines can be complicated, thus difficult to use. As an example, Amazon Polly supports dozens of parameters + and is able to accomplish nontrivial synthesis jobs, but majority of the users never need those features. This + class provides a clean interface with two parameters only, so that it is much easier and pleasant to use. If by + any chance the advanced features are required, the user can always leverage the metadata field or even go to the + backend engine directly. + + Also, from an engineering perspective, simple and decoupled modules are easier to maintain. + + This class supports two modes of using polly. It can either call a service node or use AmazonPolly as a library. + + Start the service node:: + + $ rosrun tts synthesizer_node.py # use default configuration + $ rosrun tts synthesizer_node.py -e POLLY_LIBRARY # will not call polly service node + + Call the service:: + + $ rosservice call /synthesizer 'hello' '' + $ rosservice call /synthesizer 'hello' '"{\"text_type\":\"ssml\"}"' + """ + + class PollyViaNode: + def __init__(self, polly_service_name='polly'): + self.service_name = polly_service_name + + def __call__(self, **kwargs): + rospy.loginfo('will call service {}'.format(self.service_name)) + from tts.srv import Polly + rospy.wait_for_service(self.service_name) + polly = rospy.ServiceProxy(self.service_name, Polly) + return polly(polly_action='SynthesizeSpeech', **kwargs) + + class PollyDirect: + def __init__(self): + pass + + def __call__(self, **kwargs): + rospy.loginfo('will import amazonpolly.AmazonPolly') + from tts.amazonpolly import AmazonPolly + node = AmazonPolly() + return node.synthesize(**kwargs) + + ENGINES = { + 'POLLY_SERVICE': PollyViaNode, + 'POLLY_LIBRARY': PollyDirect, + } + + class 
BadEngineError(NameError): + pass + + def __init__(self, engine='POLLY_SERVICE', polly_service_name='polly'): + if engine not in self.ENGINES: + msg = 'bad engine {} which is not one of {}'.format(engine, ', '.join(SpeechSynthesizer.ENGINES.keys())) + raise SpeechSynthesizer.BadEngineError(msg) + + engine_kwargs = {'polly_service_name': polly_service_name} if engine == 'POLLY_SERVICE' else {} + self.engine = self.ENGINES[engine](**engine_kwargs) + + self.default_text_type = 'text' + self.default_voice_id = 'Joanna' + self.default_output_format = 'ogg_vorbis' + + def _call_engine(self, **kw): + """Call engine to do the job. + + If no output path is found from input, the audio file will be put into /tmp and the file name will have + a prefix of the md5 hash of the text. + + :param kw: what AmazonPolly needs to synthesize + :return: response from AmazonPolly + """ + if 'output_path' not in kw: + tmp_filename = hashlib.md5(kw['text']).hexdigest() + tmp_filepath = os.path.join(os.sep, 'tmp', 'voice_{}_{}'.format(tmp_filename, str(time.time()))) + kw['output_path'] = os.path.abspath(tmp_filepath) + rospy.loginfo('audio will be saved as {}'.format(kw['output_path'])) + + return self.engine(**kw) + + def _parse_request_or_raise(self, request): + """It will raise if request is malformed. + + :param request: an instance of SynthesizerRequest + :return: a dict + """ + md = json.loads(request.metadata) if request.metadata else {} + + md['output_format'] = md.get('output_format', self.default_output_format) + md['voice_id'] = md.get('voice_id', self.default_voice_id) + md['sample_rate'] = md.get('sample_rate', '16000' if md['output_format'].lower() == 'pcm' else '22050') + md['text_type'] = md.get('text_type', self.default_text_type) + md['text'] = request.text + + return md + + def _node_request_handler(self, request): + """The callback function for processing service request. + + It never raises. 
If anything unexpected happens, it will return a SynthesizerResponse with the exception. + + :param request: an instance of SynthesizerRequest + :return: a SynthesizerResponse + """ + rospy.loginfo(request) + try: + kws = self._parse_request_or_raise(request) + res = self._call_engine(**kws).result + + return SynthesizerResponse(res) + except Exception as e: + return SynthesizerResponse('Exception: {}'.format(e)) + + def start(self, node_name='synthesizer_node', service_name='synthesizer'): + """The entry point of a ROS service node. + + :param node_name: name of ROS node + :param service_name: name of ROS service + :return: it doesn't return + """ + rospy.init_node(node_name) + + service = rospy.Service(service_name, Synthesizer, self._node_request_handler) + + rospy.loginfo('{} running: {}'.format(node_name, service.uri)) + + rospy.spin() + + +def main(): + usage = '''usage: %prog [options] + ''' + + parser = OptionParser(usage) + + parser.add_option("-n", "--node-name", dest="node_name", default='synthesizer_node', + help="name of the ROS node", + metavar="NODE_NAME") + parser.add_option("-s", "--service-name", dest="service_name", default='synthesizer', + help="name of the ROS service", + metavar="SERVICE_NAME") + parser.add_option("-e", "--engine", dest="engine", default='POLLY_SERVICE', + help="name of the synthesis engine", + metavar="ENGINE") + parser.add_option("-p", "--polly-service-name", dest="polly_service_name", default='polly', + help="name of the polly service", + metavar="POLLY_SERVICE_NAME") + + (options, args) = parser.parse_args() + + node_name = options.node_name + service_name = options.service_name + engine = options.engine + polly_service_name = options.polly_service_name + + if engine == 'POLLY_SERVICE': + synthesizer = SpeechSynthesizer(engine=engine, polly_service_name=polly_service_name) + else: + synthesizer = SpeechSynthesizer(engine=engine) + synthesizer.start(node_name=node_name, service_name=service_name) + + +if __name__ == 
"__main__": + main() diff --git a/tts/srv/Polly.srv b/tts/srv/Polly.srv new file mode 100644 index 0000000..02ac7ad --- /dev/null +++ b/tts/srv/Polly.srv @@ -0,0 +1,22 @@ +string polly_action +string text +string text_type +string language_code +string voice_id +string output_format +string output_path +string sample_rate +string lexicon_content +string lexicon_name +string[] lexicon_names +string[] speech_mark_types +uint32 max_results +string next_token +string sns_topic_arn +string task_id +string task_status +string output_s3_bucket_name +string output_s3_key_prefix +bool include_additional_language_codes +--- +string result diff --git a/tts/srv/Synthesizer.srv b/tts/srv/Synthesizer.srv new file mode 100644 index 0000000..d623bb2 --- /dev/null +++ b/tts/srv/Synthesizer.srv @@ -0,0 +1,4 @@ +string text +string metadata +--- +string result diff --git a/tts/test/integration_tests.test b/tts/test/integration_tests.test new file mode 100644 index 0000000..4f662be --- /dev/null +++ b/tts/test/integration_tests.test @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tts/test/test_integration.py b/tts/test/test_integration.py new file mode 100755 index 0000000..3f798f1 --- /dev/null +++ b/tts/test/test_integration.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. 
+ +from __future__ import print_function + +import sys +import json +import unittest + +import rospy +import rostest + +from tts.srv import Polly +from tts.srv import PollyResponse +from tts.srv import Synthesizer +from tts.srv import SynthesizerResponse + +# import tts which is a relay package, otherwise things don't work +# +# devel/lib/python2.7/dist-packages/ +# +-- tts +# | +-- __init__.py +# +-- ... +# +# per http://docs.ros.org/api/catkin/html/user_guide/setup_dot_py.html: +# +# A relay package is a folder with an __init__.py folder and nothing else. +# Importing this folder in python will execute the contents of __init__.py, +# which will in turn import the original python modules in the folder in +# the sourcespace using the python exec() function. + + +PKG = 'tts' +NAME = 'amazonpolly' + + +class TestPlainText(unittest.TestCase): + + def test_plain_text_to_wav_via_polly_node(self): + rospy.wait_for_service('polly') + polly = rospy.ServiceProxy('polly', Polly) + + test_text = 'Mary has a little lamb, little lamb, little lamb.' 
+ res = polly(polly_action='SynthesizeSpeech', text=test_text) + self.assertIsNotNone(res) + self.assertTrue(type(res) is PollyResponse) + + r = json.loads(res.result) + self.assertIn('Audio Type', r, 'result should contain audio type') + self.assertIn('Audio File', r, 'result should contain file path') + self.assertIn('Amazon Polly Response Metadata', r, 'result should contain metadata') + + audio_type = r['Audio Type'] + audio_file = r['Audio File'] + md = r['Amazon Polly Response Metadata'] + self.assertTrue("'HTTPStatusCode': 200," in md) + self.assertEqual('audio/ogg', audio_type) + self.assertTrue(audio_file.endswith('.ogg')) + + import subprocess + o = subprocess.check_output(['file', audio_file], stderr=subprocess.STDOUT) + import re + m = re.search(r'.*Ogg data, Vorbis audi.*', o, flags=re.MULTILINE) + self.assertIsNotNone(m) + + def test_plain_text_using_polly_class(self): + from tts.amazonpolly import AmazonPolly + polly = AmazonPolly() + test_text = 'Mary has a little lamb, little lamb, little lamb.' 
+ res = polly.synthesize(text=test_text) + self.assertIsNotNone(res) + self.assertTrue(type(res) is PollyResponse) + + r = json.loads(res.result) + self.assertIn('Audio Type', r, 'result should contain audio type') + self.assertIn('Audio File', r, 'result should contain file path') + self.assertIn('Amazon Polly Response Metadata', r, 'result should contain metadata') + + audio_type = r['Audio Type'] + audio_file = r['Audio File'] + md = r['Amazon Polly Response Metadata'] + self.assertTrue("'HTTPStatusCode': 200," in md) + self.assertEqual('audio/ogg', audio_type) + self.assertTrue(audio_file.endswith('.ogg')) + + import subprocess + o = subprocess.check_output(['file', audio_file], stderr=subprocess.STDOUT) + import re + m = re.search(r'.*Ogg data, Vorbis audi.*', o, flags=re.MULTILINE) + self.assertIsNotNone(m) + + def test_plain_text_via_synthesizer_node(self): + rospy.wait_for_service('synthesizer') + speech_synthesizer = rospy.ServiceProxy('synthesizer', Synthesizer) + + text = 'Mary has a little lamb, little lamb, little lamb.' 
+ res = speech_synthesizer(text=text) + self.assertIsNotNone(res) + self.assertTrue(type(res) is SynthesizerResponse) + + r = json.loads(res.result) + self.assertIn('Audio Type', r, 'result should contain audio type') + self.assertIn('Audio File', r, 'result should contain file path') + self.assertIn('Amazon Polly Response Metadata', r, 'result should contain metadata') + + audio_type = r['Audio Type'] + audio_file = r['Audio File'] + md = r['Amazon Polly Response Metadata'] + self.assertTrue("'HTTPStatusCode': 200," in md) + self.assertEqual('audio/ogg', audio_type) + self.assertTrue(audio_file.endswith('.ogg')) + + import subprocess + o = subprocess.check_output(['file', audio_file], stderr=subprocess.STDOUT) + import re + m = re.search(r'.*Ogg data, Vorbis audi.*', o, flags=re.MULTILINE) + self.assertIsNotNone(m) + + def test_plain_text_to_mp3_via_polly_node(self): + rospy.wait_for_service('polly') + polly = rospy.ServiceProxy('polly', Polly) + + test_text = 'Mary has a little lamb, little lamb, little lamb.' 
+ res = polly(polly_action='SynthesizeSpeech', text=test_text, output_format='mp3') + self.assertIsNotNone(res) + self.assertTrue(type(res) is PollyResponse) + + r = json.loads(res.result) + self.assertIn('Audio Type', r, 'result should contain audio type') + self.assertIn('Audio File', r, 'result should contain file path') + self.assertIn('Amazon Polly Response Metadata', r, 'result should contain metadata') + + audio_type = r['Audio Type'] + audio_file = r['Audio File'] + md = r['Amazon Polly Response Metadata'] + self.assertTrue("'HTTPStatusCode': 200," in md) + self.assertEqual('audio/mpeg', audio_type) + self.assertTrue(audio_file.endswith('.mp3')) + + import subprocess + o = subprocess.check_output(['file', audio_file], stderr=subprocess.STDOUT) + import re + m = re.search(r'.*MPEG.*layer III.*', o, flags=re.MULTILINE) + self.assertIsNotNone(m) + + def test_simple_ssml_via_polly_node(self): + rospy.wait_for_service('polly') + polly = rospy.ServiceProxy('polly', Polly) + + text = 'Mary has a little lamb, little lamb, little lamb.' 
+ res = polly(polly_action='SynthesizeSpeech', text=text, text_type='ssml') + self.assertIsNotNone(res) + self.assertTrue(type(res) is PollyResponse) + + r = json.loads(res.result) + self.assertIn('Audio Type', r, 'result should contain audio type') + self.assertIn('Audio File', r, 'result should contain file path') + self.assertIn('Amazon Polly Response Metadata', r, 'result should contain metadata') + + audio_type = r['Audio Type'] + audio_file = r['Audio File'] + md = r['Amazon Polly Response Metadata'] + self.assertTrue("'HTTPStatusCode': 200," in md) + self.assertEqual('audio/ogg', audio_type) + self.assertTrue(audio_file.endswith('.ogg')) + + import subprocess + o = subprocess.check_output(['file', audio_file], stderr=subprocess.STDOUT) + import re + m = re.search(r'.*Ogg data, Vorbis audi.*', o, flags=re.MULTILINE) + self.assertIsNotNone(m) + + def test_simple_ssml_via_synthesizer_node(self): + rospy.wait_for_service('synthesizer') + speech_synthesizer = rospy.ServiceProxy('synthesizer', Synthesizer) + + text = 'Mary has a little lamb, little lamb, little lamb.' 
+ res = speech_synthesizer(text=text, metadata='''{"text_type":"ssml"}''') + self.assertIsNotNone(res) + self.assertTrue(type(res) is SynthesizerResponse) + + r = json.loads(res.result) + self.assertIn('Audio Type', r, 'result should contain audio type') + self.assertIn('Audio File', r, 'result should contain file path') + self.assertIn('Amazon Polly Response Metadata', r, 'result should contain metadata') + + audio_type = r['Audio Type'] + audio_file = r['Audio File'] + md = r['Amazon Polly Response Metadata'] + self.assertTrue("'HTTPStatusCode': 200," in md) + self.assertEqual('audio/ogg', audio_type) + self.assertTrue(audio_file.endswith('.ogg')) + + import subprocess + o = subprocess.check_output(['file', audio_file], stderr=subprocess.STDOUT) + import re + m = re.search(r'.*Ogg data, Vorbis audi.*', o, flags=re.MULTILINE) + self.assertIsNotNone(m) + + +if __name__ == '__main__': + rostest.rosrun(PKG, NAME, TestPlainText, sys.argv) diff --git a/tts/test/test_unit_polly.py b/tts/test/test_unit_polly.py new file mode 100755 index 0000000..98d18a2 --- /dev/null +++ b/tts/test/test_unit_polly.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. 
from __future__ import print_function


from mock import patch, MagicMock  # python2 uses backport of unittest.mock(docs.python.org/3/library/unittest.mock.html)
import unittest


class TestPolly(unittest.TestCase):
    """Unit tests for ``tts.amazonpolly``.

    Every test patches a name inside ``tts.amazonpolly`` (``Session`` or
    ``AmazonPolly``) with a mock, so the assertions only inspect how those
    mocks were called and what the code under test returned.
    """

    @staticmethod
    def _default_synthesize_speech_kwargs(text):
        """Build the keyword arguments the tests expect ``synthesize_speech`` to receive.

        :param text: the text that was handed to ``AmazonPolly.synthesize``.
        :return: a fresh dict, so callers can never share or mutate state.
        """
        return {
            'LexiconNames': [],
            'OutputFormat': 'ogg_vorbis',
            'SampleRate': '22050',
            'SpeechMarkTypes': [],
            'Text': text,
            'TextType': 'text',
            'VoiceId': 'Joanna',
        }

    def setUp(self):
        """important: import tts which is a relay package::

            devel/lib/python2.7/dist-packages/
            +-- tts
            |   +-- __init__.py
            +-- ...

        per http://docs.ros.org/api/catkin/html/user_guide/setup_dot_py.html:

        A relay package is a folder with an __init__.py file and nothing else.
        Importing this folder in python will execute the contents of __init__.py,
        which will in turn import the original python modules in the folder in
        the sourcespace using the python exec() function.
        """
        import tts
        self.assertIsNotNone(tts)

    @patch('tts.amazonpolly.Session')
    def test_init(self, boto3_session_class_mock):
        """Constructing ``AmazonPolly`` calls ``Session`` and asks it for the 'polly' client."""
        from tts.amazonpolly import AmazonPolly
        AmazonPolly()

        boto3_session_class_mock.assert_called()
        boto3_session_class_mock.return_value.client.assert_called_with('polly')

    @patch('tts.amazonpolly.Session')
    def test_defaults(self, boto3_session_class_mock):
        """A default-constructed ``AmazonPolly`` exposes the expected default settings."""
        from tts.amazonpolly import AmazonPolly
        polly = AmazonPolly()

        boto3_session_class_mock.assert_called()
        boto3_session_class_mock.return_value.client.assert_called_with('polly')

        self.assertEqual('text', polly.default_text_type)
        self.assertEqual('ogg_vorbis', polly.default_output_format)
        self.assertEqual('Joanna', polly.default_voice_id)
        self.assertEqual('.', polly.default_output_folder)
        self.assertEqual('output', polly.default_output_file_basename)

    @patch('tts.amazonpolly.Session')
    def test_good_synthesis_with_default_args(self, boto3_session_class_mock):
        """A successful ``synthesize`` stores the audio stream in a file and reports type and metadata."""
        boto3_session_obj_mock = MagicMock()
        boto3_polly_obj_mock = MagicMock()
        boto3_polly_response_mock = MagicMock()
        audio_stream_mock = MagicMock()
        fake_audio_stream_data = 'I am audio.'
        fake_audio_content_type = 'super tts'
        fake_boto3_polly_response_metadata = {'foo': 'bar'}

        boto3_session_class_mock.return_value = boto3_session_obj_mock
        boto3_session_obj_mock.client.return_value = boto3_polly_obj_mock
        boto3_polly_obj_mock.synthesize_speech.return_value = boto3_polly_response_mock
        audio_stream_mock.read.return_value = fake_audio_stream_data
        # Make the response mock answer `in` and `[]` like the dict below.
        d = {
            'AudioStream': audio_stream_mock,
            'ContentType': fake_audio_content_type,
            'ResponseMetadata': fake_boto3_polly_response_metadata
        }
        boto3_polly_response_mock.__contains__.side_effect = d.__contains__
        boto3_polly_response_mock.__getitem__.side_effect = d.__getitem__

        from tts.amazonpolly import AmazonPolly
        polly_under_test = AmazonPolly()

        boto3_session_class_mock.assert_called()
        boto3_session_obj_mock.client.assert_called_with('polly')

        res = polly_under_test.synthesize(text='hello')

        boto3_polly_obj_mock.synthesize_speech.assert_called_with(
            **self._default_synthesize_speech_kwargs('hello'))

        from tts.srv import PollyResponse
        self.assertIsInstance(res, PollyResponse)

        import json
        j = json.loads(res.result)
        # Read through a context manager so the handle is closed even if an
        # assertion below fails.
        with open(j['Audio File']) as audio_file:
            observed_audio_file_content = audio_file.read()
        self.assertEqual(fake_audio_stream_data, observed_audio_file_content)

        self.assertEqual(fake_audio_content_type, j['Audio Type'])
        self.assertEqual(str(fake_boto3_polly_response_metadata), j['Amazon Polly Response Metadata'])

    @patch('tts.amazonpolly.Session')
    def test_polly_raises(self, boto3_session_class_mock):
        """When ``synthesize_speech`` raises, ``synthesize`` still returns a response describing the error."""
        boto3_session_obj_mock = MagicMock()
        boto3_polly_obj_mock = MagicMock()

        boto3_session_class_mock.return_value = boto3_session_obj_mock
        boto3_session_obj_mock.client.return_value = boto3_polly_obj_mock
        # The client call itself fails, so no response object is ever produced.
        boto3_polly_obj_mock.synthesize_speech.side_effect = RuntimeError('Amazon Polly Exception')

        from tts.amazonpolly import AmazonPolly
        polly_under_test = AmazonPolly()

        boto3_session_class_mock.assert_called()
        boto3_session_obj_mock.client.assert_called_with('polly')

        res = polly_under_test.synthesize(text='hello')

        boto3_polly_obj_mock.synthesize_speech.assert_called_with(
            **self._default_synthesize_speech_kwargs('hello'))

        from tts.srv import PollyResponse
        self.assertIsInstance(res, PollyResponse)

        import json
        j = json.loads(res.result)
        self.assertIn('Exception', j)
        self.assertIn('Traceback', j)

    @patch('tts.amazonpolly.AmazonPolly')
    def test_cli(self, amazon_polly_class_mock):
        """``amazonpolly.main()`` with ``-n polly-node`` starts the node under that name."""
        import sys
        with patch.object(sys, 'argv', ['polly_node.py', '-n', 'polly-node']):
            from tts import amazonpolly
            amazonpolly.main()
            amazon_polly_class_mock.assert_called()
            amazon_polly_class_mock.return_value.start.assert_called_with(node_name='polly-node',
                                                                          service_name='polly')


if __name__ == '__main__':
    import rosunit
    rosunit.unitrun('tts', 'unittest-polly', TestPolly)
#!/usr/bin/env python

# Copyright (c) 2018, Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#  http://aws.amazon.com/apache2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

from __future__ import print_function

from mock import patch, MagicMock  # python2 uses backport of unittest.mock(docs.python.org/3/library/unittest.mock.html)
import unittest


class TestSynthesizer(unittest.TestCase):
    """Unit tests for ``tts.synthesizer``.

    Tests that exercise synthesis patch ``tts.amazonpolly.AmazonPolly``; the
    command-line tests patch ``tts.synthesizer.SpeechSynthesizer`` and only
    check how ``main()`` constructs and starts it.
    """

    def setUp(self):
        """important: import tts which is a relay package::

            devel/lib/python2.7/dist-packages/
            +-- tts
            |   +-- __init__.py
            +-- ...

        per http://docs.ros.org/api/catkin/html/user_guide/setup_dot_py.html:

        A relay package is a folder with an __init__.py file and nothing else.
        Importing this folder in python will execute the contents of __init__.py,
        which will in turn import the original python modules in the folder in
        the sourcespace using the python exec() function.
        """
        import tts
        self.assertIsNotNone(tts)

    def test_init(self):
        """A default-constructed ``SpeechSynthesizer`` uses 'text' as its default text type."""
        from tts.synthesizer import SpeechSynthesizer
        speech_synthesizer = SpeechSynthesizer()
        self.assertEqual('text', speech_synthesizer.default_text_type)

    @patch('tts.amazonpolly.AmazonPolly')
    def test_good_synthesis_with_mostly_default_args_using_polly_lib(self, polly_class_mock):
        """Request metadata is merged with the defaults and forwarded to ``AmazonPolly.synthesize``."""
        polly_obj_mock = MagicMock()
        polly_class_mock.return_value = polly_obj_mock

        test_text = 'hello'
        test_metadata = '''
            {
                "output_path": "/tmp/test"
            }
        '''
        expected_polly_synthesize_args = {
            'output_format': 'ogg_vorbis',
            'voice_id': 'Joanna',
            'sample_rate': '22050',
            'text_type': 'text',
            'text': test_text,
            'output_path': "/tmp/test"
        }

        from tts.synthesizer import SpeechSynthesizer
        from tts.srv import SynthesizerRequest
        speech_synthesizer = SpeechSynthesizer(engine='POLLY_LIBRARY')
        request = SynthesizerRequest(text=test_text, metadata=test_metadata)
        response = speech_synthesizer._node_request_handler(request)

        polly_class_mock.assert_called()
        polly_obj_mock.synthesize.assert_called_with(**expected_polly_synthesize_args)

        self.assertEqual(response.result, polly_obj_mock.synthesize.return_value.result)

    @patch('tts.amazonpolly.AmazonPolly')
    def test_synthesis_with_bad_metadata_using_polly_lib(self, polly_class_mock):
        """Metadata that is not JSON yields a result string starting with 'Exception: '."""
        polly_obj_mock = MagicMock()
        polly_class_mock.return_value = polly_obj_mock

        test_text = 'hello'
        test_metadata = '''I am no JSON'''

        from tts.synthesizer import SpeechSynthesizer
        from tts.srv import SynthesizerRequest
        speech_synthesizer = SpeechSynthesizer(engine='POLLY_LIBRARY')
        request = SynthesizerRequest(text=test_text, metadata=test_metadata)
        response = speech_synthesizer._node_request_handler(request)

        self.assertTrue(response.result.startswith('Exception: '))

    @patch('tts.amazonpolly.AmazonPolly')
    def test_bad_engine(self, polly_class_mock):
        """An unknown engine name makes the constructor raise ``BadEngineError``."""
        polly_obj_mock = MagicMock()
        polly_class_mock.return_value = polly_obj_mock

        from tts.synthesizer import SpeechSynthesizer
        # assertRaises fails the test if nothing is raised, which covers the
        # "no exception at all" case the manual try/except had to check by hand.
        with self.assertRaises(SpeechSynthesizer.BadEngineError):
            SpeechSynthesizer(engine='NON-EXIST ENGINE')

    def test_cli_help_message(self):
        """Running ``synthesizer_node.py -h`` prints text that starts with 'Usage: '."""
        import os
        import subprocess
        import sys
        source_file_dir = os.path.dirname(os.path.abspath(__file__))
        synthesizer_path = os.path.join(source_file_dir, '..', 'scripts', 'synthesizer_node.py')
        # Launch with the interpreter running this test rather than whatever
        # "python" resolves to on PATH (it may be missing or a different version).
        interpreter = sys.executable or 'python'
        # universal_newlines=True returns text on both Python 2 and 3; without
        # it Python 3 returns bytes and str(bytes) would start with "b'".
        o = subprocess.check_output([interpreter, synthesizer_path, '-h'], universal_newlines=True)
        self.assertTrue(o.startswith('Usage: '))

    @patch('tts.synthesizer.SpeechSynthesizer')
    def test_cli_engine_dispatching_1(self, speech_synthesizer_class_mock):
        """With no arguments, ``main()`` selects the POLLY_SERVICE engine and the default names."""
        import sys
        with patch.object(sys, 'argv', ['synthesizer_node.py']):
            import tts.synthesizer
            tts.synthesizer.main()
            speech_synthesizer_class_mock.assert_called_with(engine='POLLY_SERVICE', polly_service_name='polly')
            speech_synthesizer_class_mock.return_value.start.assert_called_with(node_name='synthesizer_node',
                                                                                service_name='synthesizer')

    @patch('tts.synthesizer.SpeechSynthesizer')
    def test_cli_engine_dispatching_2(self, speech_synthesizer_class_mock):
        """``-e POLLY_LIBRARY`` makes ``main()`` construct the synthesizer with that engine only."""
        import sys
        with patch.object(sys, 'argv', ['synthesizer_node.py', '-e', 'POLLY_LIBRARY']):
            from tts import synthesizer
            synthesizer.main()
            speech_synthesizer_class_mock.assert_called_with(engine='POLLY_LIBRARY')
            speech_synthesizer_class_mock.return_value.start.assert_called()

    @patch('tts.synthesizer.SpeechSynthesizer')
    def test_cli_engine_dispatching_3(self, speech_synthesizer_class_mock):
        """``-p apolly`` makes ``main()`` pass that name as ``polly_service_name``."""
        import sys
        with patch.object(sys, 'argv', ['synthesizer_node.py', '-p', 'apolly']):
            from tts import synthesizer
            synthesizer.main()
            speech_synthesizer_class_mock.assert_called_with(engine='POLLY_SERVICE', polly_service_name='apolly')
            speech_synthesizer_class_mock.return_value.start.assert_called()


if __name__ == '__main__':
    import rosunit
    rosunit.unitrun('tts', 'unittest-synthesizer', TestSynthesizer)
7.416273.502273.502273.502273.502273.502273.502279.589279.589279.589279.589279.589279.589285.675285.675285.675285.675285.675285.675291.761291.761291.761291.761291.761291.761297.847297.847297.847297.847297.847297.847303.942303.942303.942303.942303.942303.942303.942310.029310.029310.029310.029310.029310.029316.116316.116316.116316.116316.116316.116322.208322.208322.208322.208322.208322.208328.293328.293328.293328.293328.293328.293334.383334.383334.383334.383334.383334.383340.468340.468340.468340.468340.468340.468346.552346.552346.552346.552346.552346.552352.638352.638352.638352.638352.638352.638358.724358.724358.724358.724358.724358.724364.809364.809364.809364.809364.809364.809370.898370.898370.898370.898370.898370.898376.990376.990376.990376.990376.990376.990376.990383.080383.080383.080383.080383.080383.080389.165389.165389.165389.165389.165389.165395.249395.249395.249395.249395.249395.249401.333401.333CPU %timeoad Average (1min)Load Average (5min)Load Average (15min) \ No newline at end of file diff --git a/wiki/images/memory.svg b/wiki/images/memory.svg new file mode 100644 index 0000000..da9f3fd --- /dev/null +++ b/wiki/images/memory.svg @@ -0,0 +1,4 @@ + +Memory 
(MB)2102102202202302302402402502502602602702702802802902903003003103103203203303303403403503503603600.0000.0000.0000.0005.0375.0375.0375.0375.03710.06710.06710.06710.06710.06715.09615.09615.09615.09615.09620.12620.12620.12620.12620.12625.15625.15625.15625.15625.15630.18630.18630.18630.18630.18635.21935.21935.21935.21935.21940.24840.24840.24840.24840.24845.27845.27845.27845.27845.27850.30850.30850.30850.30850.30855.33855.33855.33855.33855.33860.37060.37060.37060.37060.37065.40365.40365.40365.40365.40370.42170.42170.42170.42170.42175.45475.45475.45475.45475.45480.47380.47380.47380.47380.47385.49085.49085.49085.49085.49090.51390.51390.51390.51390.51395.54595.54595.54595.54595.545100.569100.569100.569100.569100.569105.587105.587105.587105.587105.587110.602110.602110.602110.602110.602115.619115.619115.619115.619115.619120.635120.635120.635120.635120.635125.655125.655125.655125.655125.655130.671130.671130.671130.671130.671135.688135.688135.688135.688135.688140.722140.722140.722140.722140.722145.738145.738145.738145.738145.738150.754150.754150.754150.754150.754155.771155.771155.771155.771155.771160.789160.789160.789160.789160.789165.819165.819165.819165.819165.819170.837170.837170.837170.837170.837175.853175.853175.853175.853175.853180.883180.883180.883180.883180.883185.913185.913185.913185.913185.913190.943190.943190.943190.943190.943195.974195.974195.974195.974195.974200.991200.991200.991200.991200.991200.991206.023206.023206.023206.023206.023211.054211.054211.054211.054211.054216.084216.084216.084216.084216.084221.115221.115221.115221.115221.115226.146226.146226.146226.146226.146231.176231.176231.176231.176231.176236.192236.192236.192236.192236.192241.221241.221241.221241.221241.221246.252246.252246.252246.252246.252251.282251.282251.282251.282251.282256.313256.313256.313256.313256.313261.343261.343261.343261.343261.343266.373266.373266.373266.373266.373271.403271.403271.403271.403271.403276.433276.433276.433276.433276.433281.464281.464281.464281.464281.464286.494286.49
4286.494286.494286.494291.525291.525291.525291.525291.525296.556296.556296.556296.556296.556301.586301.586301.586301.586301.586306.617306.617306.617306.617306.617311.647311.647311.647311.647311.647316.677316.677316.677316.677316.677321.708321.708321.708321.708321.708326.738326.738326.738326.738326.738331.768331.768331.768331.768331.768336.799336.799336.799336.799336.799341.830341.830341.830341.830341.830346.860346.860346.860346.860346.860351.891351.891351.891351.891351.891356.907356.907356.907356.907356.907361.937361.937361.937361.937361.937366.967366.967366.967366.967366.967371.998371.998371.998371.998371.998371.998377.039377.039377.039377.039377.039382.068382.068382.068382.068382.068387.098387.098387.098387.098387.098392.129392.129392.129392.129392.129397.160397.160397.160397.160397.160402.190402.190Memory (MB)timesed MemoryFree Memory \ No newline at end of file