# Text-to-Speech Model Using Orpheus TTS
## Instructions to Run the Code
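If you intend to fine-tune the model, make sure you have both the transcript file and the audio files ready, and update the file paths in the data-loading cell to point to your dataset. The transcript must follow the format specified in the GitHub repository: the loader in this notebook reads every other line as `<file_id><TAB><transcript>` and expects the matching audio at `wav/<file_id>.WAV`.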

If you already have the model parameters and only intend to run inference, you can execute all cells except the training cell. The training cell is labeled with the comment:
```
# Do not run this if you already have the model parameters and only want to perform inference.
```

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
  !pip install unsloth
else:
  # Do this only in Colab notebooks! Otherwise use pip install unsloth
  !pip install -- no-deps bitsandbytes accelerate xformers == 0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
  !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
  !pip install -- no-deps unsloth
!pip install snac

In [None]:
# Mount Google Drive: the dataset paths and the saved adapter directory used
# later in this notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# restart the google colab session after running this cell
# NOTE(review): the setup cell at the top of the notebook already installs
# unsloth; confirm whether this second install is still required.
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.3.17 (from unsloth)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.19-py3-none-any.whl.metadata (9.9 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting protobuf<4.0.0 (from unsloth)
  Downloa

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings, kept as module-level names so they are easy to tweak.
dtype = None          # None -> let the loader choose the precision
load_in_4bit = True   # load the bitsandbytes 4-bit quantised weights

# Fetch the 4-bit Orpheus checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit",
    max_seq_length=2048,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/5.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

In [None]:
# Wrap the base model with LoRA adapters on the attention (q/k/v/o) and MLP
# (gate/up/down) projections. Note that this rebinds `model`, so re-running
# the cell without reloading the base model wraps it a second time.
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# Locations of the transcript file and the WAV folder on the mounted Drive
# (adjust these two paths to point at your own dataset).
TRANSCRIPT_PATH = "/content/drive/MyDrive/MyTTSData/000010.TXT"
WAV_DIR = "/content/drive/MyDrive/MyTTSData/wav"

# "utf-8-sig" drops a leading byte-order mark if the file has one.
with open(TRANSCRIPT_PATH, "r", encoding="utf-8-sig") as f:
    lines = f.readlines()

data = {"audio": [], "text": []}

# Each record spans two lines; only the first one ("<file_id>\t<transcript>")
# is used, the second (duplicate transcript) is skipped by the step of 2.
for raw_line in lines[::2]:
    line = raw_line.strip()
    if not line:
        continue  # Skip empty lines
    parts = line.split("\t")
    if len(parts) < 2:
        continue  # Skip lines that don't have both an ID and a transcript
    file_id, transcript = parts[0], parts[1]
    # The audio file is named after the record's ID
    data["audio"].append(f"{WAV_DIR}/{file_id}.WAV")
    data["text"].append(transcript)

# Show a short summary instead of dumping every path and transcript.
print(f"Loaded {len(data['text'])} utterances from {TRANSCRIPT_PATH}")
for audio_path, transcript in list(zip(data["audio"], data["text"]))[:3]:
    print(f"{audio_path} -> {transcript}")


/content
{'audio': ['/content/drive/MyDrive/MyTTSData/wav/000010001.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010002.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010003.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010005.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010006.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010007.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010008.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010009.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010010.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010011.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010012.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010013.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010014.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010015.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010016.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010018.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010019.WAV', '/content/drive/MyDrive/MyTTSData/wav/000010020.WAV', '/conten

In [None]:

from datasets import load_dataset, Dataset, Audio
# Build the dataset from the parsed transcript lists and declare "audio" as an
# Audio feature at 16 kHz. (A hosted dataset could be used instead, e.g.
# load_dataset("MrDragonFox/Elise", split = "train").)
dataset = Dataset.from_dict(data).cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Sanity check of the schema: "audio" should be an Audio feature, "text" a string.
print(dataset.features)


{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'text': Value(dtype='string', id=None)}


In [None]:
# NOTE(review): this repeats the schema check from the previous cell verbatim;
# the duplicate cell can be deleted.
print(dataset.features)


{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'text': Value(dtype='string', id=None)}


In [None]:
#@title Tokenisation Function

import locale
import torchaudio.transforms as T
import os
import torch
from snac import SNAC
# Make the preferred-encoding lookup always report UTF-8.
locale.getpreferredencoding = lambda: "UTF-8"

# Sampling rate of the decoded clips, read from the first example.
ds_sample_rate = dataset[0]["audio"]["sampling_rate"]

# SNAC codec (24 kHz variant) placed on the GPU for encoding.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
def tokenise_audio(waveform):
  """Encode one waveform into the flat list of SNAC token ids used for training.

  Args:
    waveform: NumPy array of audio samples at `ds_sample_rate` Hz
      (assumed to be a 1-D mono signal, i.e. the decoded `audio["array"]`
      -- TODO confirm for multi-channel data).

  Returns:
    list[int]: seven token ids per frame of `codes[0]`, interleaved as
    [c0, c1a, c2a, c2b, c1b, c2c, c2d]. Every id is shifted by 128266 plus
    `slot * 4096`, where `slot` is the position (0-6) inside the frame.
  """
  waveform = torch.from_numpy(waveform).unsqueeze(0)
  waveform = waveform.to(dtype=torch.float32)
  # SNAC is the 24 kHz model, so bring the clip to 24 kHz first.
  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
  waveform = resample_transform(waveform)

  # Add the batch dimension and move to the GPU where snac_model lives.
  waveform = waveform.unsqueeze(0).to("cuda")

  #generate the codes from snac
  with torch.inference_mode():
    codes = snac_model.encode(waveform)

  # Copy each codebook level to the host once. Calling `.item()` on single
  # GPU elements (seven times per frame) forces a device sync per call.
  level_0, level_1, level_2 = (codes[k][0].tolist() for k in range(3))

  base = 128266
  all_codes = []
  for i in range(len(level_0)):
    all_codes.extend((
      level_0[i] + base,
      level_1[2*i] + base + 4096,
      level_2[4*i] + base + (2*4096),
      level_2[(4*i)+1] + base + (3*4096),
      level_1[(2*i)+1] + base + (4*4096),
      level_2[(4*i)+2] + base + (5*4096),
      level_2[(4*i)+3] + base + (6*4096),
    ))
  return all_codes


def add_codes(example):
  """Attach a "codes_list" entry holding the SNAC tokens of the example's audio.

  The entry is None when the example has no usable audio array or when
  tokenisation raises; in the latter case the error is printed and the row
  is left for the later filter step to drop.

  Args:
    example: dict-like dataset row, optionally carrying an "audio" mapping.

  Returns:
    The same `example` object with "codes_list" set.
  """
  tokens = None

  try:
    audio_entry = example.get("audio")
    has_waveform = bool(audio_entry) and "array" in audio_entry
    if has_waveform:
      tokens = tokenise_audio(audio_entry["array"])
  except Exception as e:
    # Best effort: report the failure and leave `tokens` as None.
    print(f"Skipping row due to error: {e}")

  example["codes_list"] = tokens
  return example

# Tokenise every clip; the raw "audio" column is dropped once encoded.
dataset = dataset.map(add_codes, remove_columns=["audio"])

# Token-id layout. Control tokens sit directly above the text vocabulary.
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009

start_of_speech, end_of_speech = tokeniser_length + 1, tokeniser_length + 2
start_of_human, end_of_human = tokeniser_length + 3, tokeniser_length + 4
start_of_ai, end_of_ai = tokeniser_length + 5, tokeniser_length + 6
pad_token = tokeniser_length + 7

# Equals 128266, the base offset that tokenise_audio adds to every SNAC code.
audio_tokens_start = tokeniser_length + 10

# Drop rows whose tokenisation failed (None) or produced no codes at all.
dataset = dataset.filter(lambda row: row["codes_list"] is not None)
dataset = dataset.filter(lambda row: len(row["codes_list"]) > 0)

def remove_duplicate_frames(example):
  """Drop 7-token frames whose first code repeats that of the last kept frame.

  The first frame is always kept. Each later frame is compared, by its first
  token only, against the most recently *kept* frame, so a run of frames with
  the same leading code collapses to its first member.

  Args:
    example: dict-like row whose "codes_list" is a flat list of token ids,
      seven per frame.

  Returns:
    The same `example` object with "codes_list" replaced by the filtered list.

  Raises:
    ValueError: if the length of "codes_list" is not a multiple of 7.
  """
  vals = example["codes_list"]
  if len(vals) % 7 != 0:
    raise ValueError("Input list length must be divisible by 7")

  result = vals[:7]

  for i in range(7, len(vals), 7):
    # result[-7] is the leading code of the most recently kept frame.
    if vals[i] != result[-7]:
      result.extend(vals[i:i+7])

  example["codes_list"] = result

  return example

# Collapse repeated frames in every example's code list.
dataset = dataset.map(remove_duplicate_frames)


# Informational banner only: it describes how create_input_ids (defined below)
# formats the text prompt for single- vs multi-speaker data.
tok_info = ''' *** HERE you can modify the text prompt
If you are training a multi-speaker model (e.g., canopylabs/orpheus-3b-0.1-ft),
ensure that the dataset includes a "source" field and format the input accordingly:
- Single-speaker: f"{example['text']}"
- Multi-speaker: f"{example['source']}: {example['text']}"
'''
print(tok_info)

def create_input_ids(example):
  """Assemble the full training sequence for one example.

  Layout of the produced ids:
    [start_of_human] text... [end_of_text] [end_of_human]
    [start_of_ai] [start_of_speech] audio codes... [end_of_speech] [end_of_ai]

  Args:
    example: row with "text", "codes_list" and, for multi-speaker data,
      an optional "source" that is prefixed to the text as "source: text".

  Returns:
    The same `example` with "text_tokens", "input_ids", "labels" (identical to
    the input ids) and an all-ones "attention_mask" added.
  """
  # Prefix the speaker name only when the dataset provides one.
  if "source" in example:
    prompt_text = f"{example['source']}: {example['text']}"
  else:
    prompt_text = example["text"]

  prompt_ids = tokenizer.encode(prompt_text, add_special_tokens=True)
  prompt_ids.append(end_of_text)
  example["text_tokens"] = prompt_ids

  sequence = [
    start_of_human,
    *prompt_ids,
    end_of_human,
    start_of_ai,
    start_of_speech,
    *example["codes_list"],
    end_of_speech,
    end_of_ai,
  ]

  example["input_ids"] = sequence
  example["labels"] = sequence
  example["attention_mask"] = [1] * len(sequence)

  return example

# Build the model inputs; the raw text and code columns are no longer needed.
dataset = dataset.map(create_input_ids, remove_columns=["text", "codes_list"])

# Strip any remaining column that is not one of the three training columns.
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = []
for col in dataset.column_names:
  if col not in columns_to_keep:
    columns_to_remove.append(col)

dataset = dataset.remove_columns(columns_to_remove)

config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/79.5M [00:00<?, ?B/s]



Map:   0%|          | 0/353 [00:00<?, ? examples/s]

Filter:   0%|          | 0/353 [00:00<?, ? examples/s]

Filter:   0%|          | 0/353 [00:00<?, ? examples/s]

Map:   0%|          | 0/353 [00:00<?, ? examples/s]

 *** HERE you can modify the text prompt
If you are training a multi-speaker model (e.g., canopylabs/orpheus-3b-0.1-ft),
ensure that the dataset includes a "source" field and format the input accordingly:
- Single-speaker: f"{example['text']}"
- Multi-speaker: f"{example['source']}: {example['text']}"



Map:   0%|          | 0/353 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer,DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Configure the Hugging Face Trainer for the LoRA fine-tune.
trainer = Trainer(
  model = model,
  train_dataset = dataset,
  args = TrainingArguments(
    # Effective batch size = 1 x 4 accumulation steps = 4 sequences per update.
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # Training length is set in optimiser steps. A positive `max_steps`
    # overrides `num_train_epochs`, so the previous `num_train_epochs = 3`
    # had no effect and was dropped. To train by epochs instead, remove
    # `max_steps` and set `num_train_epochs`.
    max_steps = 2000,
    learning_rate = 2e-4,
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    fp16= not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none", # "none" disables experiment trackers; set e.g. "wandb" to enable one
  ),
)

In [None]:
# Do not run this if you already have the model parameters and only want to perform inference.
# Runs the fine-tune configured by `trainer`, then writes the adapter and the
# tokenizer files to Drive so the inference cells can reload them.
trainer_stats = trainer.train()

# NOTE(review): the inference cell hard-codes this same path as ADAPTER_DIR;
# keep the two in sync when changing it.
adapter_dir = "/content/drive/MyDrive/TTSModelParameter/peft_adapter_2000steps"
model.save_pretrained(adapter_dir)      # saves the adapter config + adapter weights (weight file name depends on the installed PEFT version)
tokenizer.save_pretrained(adapter_dir)  # lets you reload with the same tokenizer


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 353 | Num Epochs = 23 | Total steps = 2,000
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 97,255,424/3,000,000,000 (3.24% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,5.4472
2,5.6658
3,5.5623
4,5.5155
5,5.6069
6,5.3163
7,5.316
8,5.2452
9,5.3519
10,5.1243


('/content/drive/MyDrive/TTSModelParameter/peft_adapter_2000steps/tokenizer_config.json',
 '/content/drive/MyDrive/TTSModelParameter/peft_adapter_2000steps/special_tokens_map.json',
 '/content/drive/MyDrive/TTSModelParameter/peft_adapter_2000steps/tokenizer.json')

In [None]:
# Base checkpoint to load, and the directory holding the adapter saved by the training cell.
BASE_MODEL   = "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit"
ADAPTER_DIR  = "/content/drive/MyDrive/TTSModelParameter/peft_adapter_2000steps"        # change to your saved model path

import torch, torchaudio
from unsloth import FastLanguageModel
from snac   import SNAC
from IPython.display import Audio, display

# Reload the 4-bit base model, attach the saved adapter, switch to inference mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=2048,
    dtype=None,  # auto fp16 / bf16
    load_in_4bit=True,
)
model.load_adapter(ADAPTER_DIR)
FastLanguageModel.for_inference(model)

# Remember where the weights live so inputs and the codec go to the same device.
model_device = next(model.parameters()).device
print(f"Model lives on → {model_device}")

snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model = snac_model.to(model_device)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model lives on → cuda:0


In [None]:
# Text to synthesise; each entry becomes one row of the generation batch.
prompts = [
    # "Bruh I asked three times! ... three times! ... and they still forgot!!! How hard could it be to just listen?",
    "oh my god! I feel so tired!",
]

# special tokens
# These mirror the constants defined in the tokenisation cell, where
# tokeniser_length = 128256.
START_HUMAN  = 128259  # tokeniser_length + 3
END_TEXT     = 128009  # end_of_text
END_HUMAN    = 128260  # tokeniser_length + 4
START_SPEECH = 128257  # tokeniser_length + 1
END_SPEECH   = 128258  # tokeniser_length + 2
CODES_OFFSET = 128266  # tokeniser_length + 10, base offset added to every SNAC code

def build_inputs(prompt: str):
    """Tokenise `prompt` and frame it as a human turn.

    Returns a (1, seq_len) int64 tensor on `model_device` laid out as
    [START_HUMAN] + prompt tokens + [END_TEXT, END_HUMAN].
    """
    text_ids = tokenizer.encode(prompt, add_special_tokens=True)
    framed = [START_HUMAN, *text_ids, END_TEXT, END_HUMAN]
    row = torch.tensor(framed, dtype=torch.int64, device=model_device)
    return row.unsqueeze(0)

# Prompts usually tokenise to different lengths, and torch.cat cannot stack
# ragged rows. Left-pad every row to the longest prompt and pass a matching
# attention mask so the padding is ignored. With a single prompt this is a
# no-op: zero padding and an all-ones mask.
PAD_TOKEN = 128263  # tokeniser_length + 7, the `pad_token` id from the tokenisation cell

prompt_rows = [build_inputs(p) for p in prompts]
longest = max(row.shape[1] for row in prompt_rows)

padded_rows, mask_rows = [], []
for row in prompt_rows:
    n_pad = longest - row.shape[1]
    # (n_pad, 0) pads the last dimension on the left only.
    padded_rows.append(torch.nn.functional.pad(row, (n_pad, 0), value=PAD_TOKEN))
    mask_rows.append(torch.nn.functional.pad(torch.ones_like(row), (n_pad, 0), value=0))

input_ids = torch.cat(padded_rows)
attention_mask = torch.cat(mask_rows)

# Sample audio tokens; generation for a row stops once END_SPEECH is emitted.
gen_ids = model.generate(
    input_ids          = input_ids,
    attention_mask     = attention_mask,
    max_new_tokens     = 600,
    do_sample          = True,
    temperature        = 0.6,
    top_p              = 0.95,
    repetition_penalty = 1.1,
    eos_token_id       = END_SPEECH,
    pad_token_id       = END_SPEECH,
)

def extract_codes(row: torch.Tensor):
    """Pull the audio codes out of one generated token row.

    Takes everything after the last START_SPEECH token, or the whole row if
    the model never emitted one (previously an IndexError). END_SPEECH and any
    other id below CODES_OFFSET (prompt text, control and padding tokens) are
    dropped, because they would turn into negative codes. CODES_OFFSET is then
    subtracted and the result truncated to whole 7-code frames.

    Args:
        row: 1-D tensor of token ids (prompt followed by generated tokens).

    Returns:
        list[int]: offset-free codes whose length is a multiple of 7
        (possibly empty).
    """
    start_positions = (row == START_SPEECH).nonzero(as_tuple=True)[0]
    begin = int(start_positions[-1]) + 1 if start_positions.numel() > 0 else 0

    tail = row[begin:]
    is_audio = (tail != END_SPEECH) & (tail >= CODES_OFFSET)
    codes = tail[is_audio] - CODES_OFFSET
    return codes[: (codes.numel() // 7) * 7].tolist()

def redistribute_codes(code_list):
    """Undo the 7-way interleave and decode the codes to a waveform with SNAC.

    Within each frame, slot 0 goes to level 1, slots 1 and 4 to level 2, and
    slots 2, 3, 5 and 6 to level 3. Slot k has `k * 4096` subtracted first.
    Trailing codes that do not fill a whole frame are ignored.

    Args:
        code_list: flat list of codes, already shifted down by the base offset.

    Returns:
        NumPy array with the decoded audio, squeezed and moved to the CPU.
    """
    n_frames = len(code_list) // 7
    frames = [code_list[k:k + 7] for k in range(0, n_frames * 7, 7)]

    level_1 = [f[0] for f in frames]
    level_2 = [v for f in frames for v in (f[1] - 4096, f[4] - 4*4096)]
    level_3 = [
        v
        for f in frames
        for v in (f[2] - 2*4096, f[3] - 3*4096, f[5] - 5*4096, f[6] - 6*4096)
    ]

    # One (1, n) tensor per codebook level, on the same device as the codec.
    codes = [
        torch.tensor(level, device=model_device).unsqueeze(0)
        for level in (level_1, level_2, level_3)
    ]
    with torch.no_grad():
        wav = snac_model.decode(codes)
    return wav.squeeze().cpu().numpy()

# Decode each generated row back to audio and show a player under its prompt.
for prompt, row in zip(prompts, gen_ids):
    print(prompt)
    frame_codes = extract_codes(row)
    clip = redistribute_codes(frame_codes)
    display(Audio(clip, rate=24000))

oh my god! I feel so tired!
