# zlib pydub audio tokenizer (ver. 0.5)

***

Powered by tegridy-tools: https://github.com/asigalov61/tegridy-tools

***

#### Project Los Angeles

#### Tegridy Code 2024

***

# (SETUP ENVIRONMENT)

In [None]:
# @title Install dependencies
# Install (or upgrade to) the latest pydub, which the tokenizer cells import.
!pip install pydub -U
# Download the sample WAV file into the current working directory.
# NOTE(review): the tokenizer cells read it from '/content/seed-melody.wav', so
# this presumably expects /content to be the working directory (e.g. Colab) — confirm.
!wget https://github.com/asigalov61/tegridy-tools/raw/main/tegridy-tools/seed-melody.wav

# (TOKENIZE AND DETOKENIZE)

## (ZLIB and PYDUB)

In [None]:
# @title zlib compressed pydub audio tokenizer
# Tokenizes a WAV file into per-chunk, zlib-compressed byte sequences, then
# reverses the process and plays back the (lossy) reconstruction.
import numpy as np

from pydub import AudioSegment
from pydub.utils import make_chunks

import zlib

from IPython.display import display, Audio

#===============================================================================

wav_audio_file = '/content/seed-melody.wav'

# Single source of truth for the output file, so the export and the playback
# below always refer to the same location.
restored_audio_file = '/content/louder_restored_audio.wav'

number_of_audio_channels = 1 # Mono(1) or Stereo(2)
audio_sample_rate = 14000 # In Hz
audio_sample_width = 2 # In bytes
chunk_length_ms = 1000  # chunk length in milliseconds
downsampling_factor = 1 # Downsampling value
tokens_range = 32 # Encoding tokens range

#===============================================================================

# Load the audio file
myaudio = AudioSegment.from_file(wav_audio_file, 'wav')

# Convert the audio to the configured channel count, frame rate and sample width
myaudio = myaudio.set_channels(number_of_audio_channels)
myaudio = myaudio.set_frame_rate(audio_sample_rate)
myaudio = myaudio.set_sample_width(audio_sample_width)

# Split the audio into chunks of chunk_length_ms milliseconds
chunks = make_chunks(myaudio, chunk_length_ms)

# Convert each chunk to a numpy array, keeping every downsampling_factor-th sample
samples = [np.array(chunk.get_array_of_samples())[::downsampling_factor] for chunk in chunks]

# Normalize each chunk independently to the range 0..tokens_range.
# The per-chunk min/max are not stored, so the original amplitude scale is
# not recovered on the way back (the reconstruction is lossy by design).
normalized_samples = [np.interp(s, (s.min(), s.max()), (0, tokens_range)) for s in samples]

# Convert normalized samples into integer tokens (fractional part is truncated)
tokens = [np.asarray(s, dtype=np.int16) for s in normalized_samples]

# Serialize each chunk's int16 tokens to raw bytes, zlib-compress them and
# expose the compressed stream as a list of byte values (0-255).
# The tokens are already int16, so no second dtype conversion is needed.
compressed_tokens = [list(zlib.compress(t.tobytes())) for t in tokens]

seq_lens = [len(t) for t in compressed_tokens]

print('=' * 70)
print('Min/Max seq_len:', min(seq_lens), '/', max(seq_lens))
print('=' * 70)

#===============================================================================

# Decompress each sequence back into its int16 token array.
# NOTE: the int16 dtype matches audio_sample_width = 2 bytes.
restored_samples = [np.frombuffer(zlib.decompress(bytes(t)), dtype=np.int16) for t in compressed_tokens]

# Upsample by repeating every sample downsampling_factor times
upsampled_samples = [np.repeat(s, downsampling_factor) for s in restored_samples]

# Concatenate all chunks
restored_samples_concat = np.concatenate(upsampled_samples)

# Create a new AudioSegment instance from the restored samples
audio = AudioSegment(
    data=restored_samples_concat.tobytes(),  # convert numpy array to bytes
    sample_width=myaudio.sample_width,  # same width as the converted input
    frame_rate=myaudio.frame_rate,  # same frame rate as the converted input
    channels=myaudio.channels  # same channel count as the converted input
)

# Increase the volume by 50 dB: the restored samples only span
# 0..tokens_range, which is very quiet at the configured sample width.
louder_audio = audio.apply_gain(50)

# Export the louder audio to a new file
louder_audio.export(restored_audio_file, format='wav')

# Play back exactly the file that was just written
display(Audio(filename=restored_audio_file, rate=audio_sample_rate))

## (PURE PYDUB)

In [None]:
# @title pydub-only tokenizer without zlib compression
# Normalizes the whole audio file to integer tokens in 0..tokens_range,
# splits the tokens into fixed-size chunks, then reverses the normalization
# and plays back the (lossy) reconstruction.
import numpy as np

from pydub import AudioSegment
from pydub.utils import make_chunks

from IPython.display import display, Audio

#===============================================================================

wav_audio_file = '/content/seed-melody.wav'

# Single source of truth for the output file, so the export and the playback
# below always refer to the same location.
restored_audio_file = '/content/louder_restored_audio.wav'

number_of_audio_channels = 1 # Mono(1) or Stereo(2)
audio_sample_rate = 16000 # In Hz
audio_sample_width = 2 # In bytes
chunk_length_ms = 16  # chunk length (see the note at the make_chunks call)
tokens_range = 129 # Encoding tokens range

#===============================================================================

# Load the audio file
myaudio = AudioSegment.from_file(wav_audio_file, 'wav')

# Convert the audio to the configured channel count, frame rate and sample width
myaudio = myaudio.set_channels(number_of_audio_channels)
myaudio = myaudio.set_frame_rate(audio_sample_rate)
myaudio = myaudio.set_sample_width(audio_sample_width)

myaudio_array = np.array(myaudio.get_array_of_samples())

# Global min/max are kept so the normalization can be reversed below
audio_min, audio_max = myaudio_array.min(), myaudio_array.max()

# Normalize the whole signal to the range 0..tokens_range
normalized_audio = np.interp(myaudio_array, (audio_min, audio_max), (0, tokens_range))

# Chunk the normalized samples.
# NOTE(review): make_chunks receives a NumPy array here, not an AudioSegment,
# so it slices by element count — chunk_length_ms effectively means
# "samples per chunk" in this cell. Confirm the intended unit.
chunks = make_chunks(normalized_audio, chunk_length_ms)

# Convert normalized samples into integer tokens (fractional part is truncated)
tokens = [s.astype(np.int16) for s in chunks]

seq_lens = [len(t) for t in tokens]

print('=' * 70)
print('Min/Max seq_len:', min(seq_lens), '/', max(seq_lens))
print('=' * 70)

#===============================================================================

# Reverse normalization chunk by chunk. Handling every chunk on its own keeps
# the final (possibly shorter) chunk instead of discarding it, and also works
# when the audio fits into a single chunk.
restored_audio_array = [np.interp(t, (0, tokens_range), (audio_min, audio_max)) for t in tokens]

# Concatenate all chunks and convert back to 16-bit samples
restored_samples_concat = np.concatenate(restored_audio_array).astype(np.int16)

# Create a new AudioSegment instance from the restored samples
audio = AudioSegment(
    data=restored_samples_concat.tobytes(),  # convert numpy array to bytes
    sample_width=myaudio.sample_width,  # same width as the converted input
    frame_rate=myaudio.frame_rate,  # same frame rate as the converted input
    channels=myaudio.channels  # same channel count as the converted input
)

# Export the restored audio to a new file (no gain is applied in this cell)
audio.export(restored_audio_file, format='wav')

# Play back exactly the file that was just written
display(Audio(filename=restored_audio_file, rate=audio_sample_rate))

# Congrats! You did it :)