# Worms speechbank synthesis example with LLMs and TTS

This is a brief example of using Vertex AI's new Generative AI capabilities to generate text and speech for making the game Worms 2: Armageddon more dynamic.


## Data preprocessing


In [1]:
import IPython.display as ipd


In [2]:
from pathlib import Path

import pandas as pd

# Read the wiki dump with an explicit encoding: the speechbank contains
# non-ASCII lines (Polish, German, Portuguese, ...), so relying on the
# platform's default encoding can garble or fail to decode them.
raw = Path("data.txt").read_text(encoding="utf-8")

# One Series entry per line of wiki markup.
rows = pd.Series(raw.splitlines())

# Fixed seed so the preview is reproducible across runs.
rows.sample(n=10, random_state=17)

636     Robot accent. First added in Worms 2 (?), and ...
39                                                       
2202           *Dostin! Do you copy? Over! (Come On Then)
3003    *Vorräte! / Vorräte sind da. / Die könnten das...
1858                                  * That hurt! (Ouch)
540                            * Take cover! (Take cover)
830                      *We are the champions (Victory) 
2529          *It is a noble sacrifice I make! (Kamikaze)
957                           * You disgust me! (Traitor)
45                                                __TOC__
dtype: object

In [3]:
# Peek at the first 20 raw lines of the wiki text.
rows.head(20)


0     '''FSpeech'''/'''Voices'''/'''Speechbanks''' (...
1                                                      
2                                      == Speech Key ==
3        * Representation - When a worm start his turn.
4     *Hello / Wait - Plays when no controls have be...
5     *Come On Then - Plays when [[Turn Time]] is do...
6     *Hurry - Plays when [[Turn Time]] is down to 5...
7     *Watch This - Plays while a variable power wea...
8     *Panic - One of several voice files that play ...
9                                            **Run Away
10                                     **Nooo - Falling
11    **Grenade - Panic file specifically for enemy ...
12    **Take Cover - When an enemy calls an [[Air St...
13    *Stupid / Oi Nutter- Plays when a enemy worm d...
14    *Traitor - Plays when a worm damages teammate(...
15             *Hurt - Enemy Worm after being attacked.
16                    *Missed - Enemy missed an attack.
17                               *Oops - Attack 

In [4]:
# Peek at the last 20 raw lines of the wiki text.
rows.tail(20)


3005    *Langweilig. / Das war alles? (No attacks duri...
3006                          *Bye bye. / Oh nein. (Dies)
3007                               *Gewinner! (Team wins)
3008          *Zahnräder in Bewegung! (Crafting a weapon)
3009    *Spezialapparat eingesetzt! (Activating specia...
3010                                                     
3011                                           ==Trivia==
3012    *One of Soul Man's lines, "Like a stacks machi...
3013    *The quote "Say hello to my little friend!" fr...
3014    *The phrases "Shoryuken" and "Hadouken" are re...
3015    *In ''Worms 2: Armageddon'', the phrase "Fire ...
3016    *The phrase from the Horror soundbank "I'll te...
3017    *In Worms HD for the PS3, some speech banks ha...
3018                       **Double Oh Seven is named 17.
3019                      **Poor Rapper is called Rapper.
3020                        **Snitch is named Backstreet.
3021    *Worms saying the word "fatality" is likely a ...
3022    *In [[

In [5]:
# TODO Make parsing of the wiki text less hackathony. :)
# The slice skips the first 20 and the last 13 lines, which (judging by the
# head/tail previews) hold the speech-key legend and the trivia section
# rather than actual speech lines.
#
# Each remaining bullet looks like "*<speech line> (<speech key>)". The greedy
# first group means that, for lines with several parenthesised parts, only
# the last "(...)" is taken as the speech key. Named groups give the columns
# their names directly; lines that do not match are dropped.
df = (
    rows[20:-13]
    .str.extract(r"\*(?P<speech_line>.+)\((?P<speech_key>.*)\)")
    .dropna()
    # Remove the padding left around the captured text (e.g. "* That hurt! (Ouch)"
    # would otherwise yield " That hurt! ") so it does not leak into the prompt.
    .apply(lambda column: column.str.strip())
)

# Seeded so the preview is reproducible, like the raw sample above.
df.sample(n=10, random_state=17)

Unnamed: 0,speech_line,speech_key
1758,Voçê não me escapa! (You can't escape from me!),Just you wait!
2539,"By God sir, a marvel!",Amazing
931,Dis gettin' got close!,Fatality
2067,"Come over here, fool!",Static gun
1835,That coconut sure smell good!,Drop
3009,Spezialapparat eingesetzt!,Activating special [[Crafted Weapon]]
2929,Now I know how the rats feel,Submerged
1705,Żegnajcie! (Goodbye!),bye bye
2928,It's gonna get wet!,Grenade
615,The court orders you to pay full damages,You'll regret that


In [6]:
# Count how many speech lines were parsed for each speech key ...
lines_per_key = df.groupby("speech_key").count()

# ... keep the 25 keys with the most lines ...
most_common = lines_per_key.nlargest(25, columns="speech_line")

# ... and expose just the key names as a Series with a fresh 0..n-1 index.
speech_keys = most_common.reset_index()["speech_key"]


In [7]:
# TODO Filter out "speech keys" with only a single occurrence (or better yet: manually type out the allowed speech keys in a separate file first).
# Use a dedicated name rather than rebinding `rows`: the parsing cell expects
# `rows` to be the pandas Series of raw wiki lines, so overwriting it with a
# list of dicts would break that cell when it is re-run.
speech_examples = df.rename(
    columns={"speech_key": "input", "speech_line": "output"}
).to_dict(orient="records")

# TODO Send semistructured data directly to Vertex AI instead of a string?
# Render every example as an "input/output" pair separated by a blank line.
# A single join avoids repeatedly re-allocating the growing string.
speechbank = "".join(
    f"input: {example['input']}\noutput: {example['output']}\n\n"
    for example in speech_examples
)


In [8]:
# Read every team personality from the semicolon-separated file, then draw
# a single row at random. The result stays a one-row DataFrame.
all_personalities = pd.read_csv("personalities.csv", sep=";")
personality = all_personalities.sample(n=1)
personality

Unnamed: 0,Team Name,Personality Traits,Team Description
0,The Scandi-Statisticians,Equipped with the sharpest analytical minds f...,"The Scandi-Statisticians are logical, methodi..."


In [9]:
# Instruction block placed at the top of the prompt sent to the text model.
# (Fixes the "Your are" typo in the second sentence of the prompt.)
task_description = """
You are a game developer for Worms.
You are responsible for the speech lines of the worms.
Given a game event as input, and personality traits and team description as context, you should output a funny speech line with a lot of colorful personality based on the traits.
Focus a lot on the personality traits and only use the speechbank examples as general inspiration (don't follow them too closely).
"""

## Run text generation model to get speech lines


In [10]:
import vertexai
from vertexai.language_models import TextGenerationModel

vertexai.init(project="genai-nordics23sto-2304", location="us-central1")

# Sampling settings passed to the text model on every prediction.
parameters = {"temperature": 0.9, "max_output_tokens": 64, "top_p": 0.8, "top_k": 40}
model = TextGenerationModel.from_pretrained("text-bison")

# Select a random speech key from the included ones.
speech_key = speech_keys.sample().iloc[0]

# `personality` is a one-row DataFrame, so `personality["Team Name"]` is a
# Series. Formatting a Series into the prompt embeds its repr (index label,
# "Name: ..., dtype: object" footer, and long text cut off with "...").
# Pull the single row out so the prompt receives the full plain strings.
team = personality.iloc[0]

# Construct prompt.
# NOTE: only the first 1000 characters of the speechbank are included, so
# the last example may be cut off mid-line.
prompt = f"""
Task: {task_description}

Team name: {team['Team Name']}

Team description: {team['Team Description']}

Personality traits: {team['Personality Traits']}

Speechbank examples: {speechbank[:1000]}

input: {speech_key}\noutput:
"""

# Run model.
text_response = model.predict(prompt, **parameters)
f"{speech_key}: {text_response.text}"


'Fatality: The end is coming! But at least the statistics will be correct!'

## Run text-to-speech (TTS) model to get audio


In [11]:
from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient()

# Voice and output format for the synthesized speech.
voice = texttospeech.VoiceSelectionParams(language_code="en-US", name="en-US-Studio-M")
audio_config = texttospeech.AudioConfig(
    speaking_rate=1.0,
    audio_encoding=texttospeech.AudioEncoding.LINEAR16,
)

# The generated speech line is the text to be spoken.
input_text = texttospeech.SynthesisInput(text=text_response.text)

# Same request as before, passed as individual keyword arguments
# instead of a request dict.
audio_response = client.synthesize_speech(
    input=input_text,
    voice=voice,
    audio_config=audio_config,
)

# Show the line and an auto-playing audio widget for it.
ipd.display(
    text_response.text,
    ipd.Audio(audio_response.audio_content, autoplay=True),
)


'The end is coming! But at least the statistics will be correct!'

## Bonus: pitch-shifted audio for more Worms-esque voices with librosa


In [12]:
import io

import librosa as lr

# Sample rate (Hz) used both for decoding and for playback below.
sample_rate = 16000

# Decode the synthesized audio bytes into a waveform at `sample_rate`.
with io.BytesIO(audio_response.audio_content) as audio_buffer:
    decoded = lr.load(audio_buffer, sr=sample_rate)

# `lr.load` returns (samples, sample rate); shift the samples up by 12 steps
# (one octave with librosa's default of 12 bins per octave).
waveform = lr.effects.pitch_shift(decoded[0], sr=sample_rate, n_steps=12)

ipd.display(
    text_response.text,
    ipd.Audio(waveform, rate=sample_rate, autoplay=True),
)


'The end is coming! But at least the statistics will be correct!'