<a href="https://colab.research.google.com/github/alexledd/So-VITS-SVC-Notebook/blob/main/so-vits-svc_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Before training

This program saves the last 3 generations of models to Google Drive. Since 1 generation of models is >1GB, you should have at least 3GB of free space in Google Drive. If you do not have such free space, it is recommended to create another Google Account.

Training requires more than 10GB of VRAM (a T4 should be enough). Inference needs considerably less VRAM.

**Note: be cautious with your file/folder names; preferably use names without spaces!**

**Also note that playing audio directly in Colab can cause the runtime to restart. To avoid this, download the file manually, or move it into /content/drive/MyDrive and play it from Google Drive instead.**

In [None]:
#@title NVIDIA SMI (GPU Check)
# Print the NVIDIA status table so you can check which GPU (if any) is
# attached to this runtime before starting training or inference.
!nvidia-smi


# Dependencies & Mount Gdrive
Restart runtime after everything is installed


In [None]:
#@title Mount GDrive
from google.colab import drive
# Mount Google Drive at /content/drive. Later cells read and write
# drive/MyDrive/so-vits-svc-fork, so this cell has to run first.
# force_remount=True requests a fresh mount even if one already exists.
drive.mount('/content/drive', force_remount=True)

In [None]:
#@title Audio editor dependencies

!pip install yt_dlp
!pip install ffmpeg
!mkdir youtubeaudio
!python3 -m pip install -U demucs
!python3 -m pip install pydub

In [None]:
#@title SVC dependencies

!python -m pip install -U pip wheel
%pip install -U ipython
%pip install -U so-vits-svc-fork
!mkdir drive/MyDrive/so-vits-svc-fork
#@markdown pip may fail to resolve dependencies and raise ERROR, but it can be ignored.
#@markdown You need to restart the runtime after running this cell! (MUST!)


# Downloader
This cell downloads files from the internet; each URL must point directly to the file.

In [None]:
#@title Downloader
#@markdown The default downloads folder is in "/content/downloaded"
file_url = "https://huggingface.co/lexmill/alex-id_en/resolve/main/Alex_3200.pth" #@param {type:"string"}
file_url2 = "https://huggingface.co/lexmill/alex-id_en/raw/main/config.json" #@param {type:"string"}

!mkdir downloaded
!wget -N {file_url} -P downloaded/
!wget -N {file_url2} -P downloaded/

In [None]:
#@title YouTube Audio Downloader (WAV Output)
from __future__ import unicode_literals
import yt_dlp
import ffmpeg
import sys

url = "" #@param {type:"string"}

# yt-dlp settings: fetch the best available audio stream and let the
# FFmpegExtractAudio post-processor convert it to WAV.
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }],
    "outtmpl": 'youtubeaudio/audio',  # this is where you can edit how you'd like the filenames to be formatted
}


def download_from_url(url, options=None):
    """Download the audio of ``url`` with yt-dlp.

    Args:
        url: Address of the video/audio page to download.
        options: Optional yt-dlp option dict; defaults to the cell-level
            ``ydl_opts``.

    Raises:
        ValueError: If ``url`` is empty or only whitespace.
    """
    cleaned_url = (url or "").strip()
    if not cleaned_url:
        raise ValueError("No URL given: fill in the `url` field before running this cell.")
    # The downloader is created here instead of relying on a global `ydl`
    # that only exists while an outer `with` block happens to be active.
    with yt_dlp.YoutubeDL(ydl_opts if options is None else options) as ydl:
        ydl.download([cleaned_url])


download_from_url(url)


In [None]:
#@title Unzip Tool
ZIP_PATH = "" #@param {type:"string"}
FOLDER_NAME = "" #@param {type:"string"}

!unzip {ZIP_PATH} -d {FOLDER_NAME}

# Audio Editor
This set is for audio editing

In [None]:
#@title Convert to Waveform (.WAV)
#@markdown remove the file extension (.mp3;m4a) in input section. default output is in "/content/converted"
FFMPEG_INPUT = "" #@param {type:"string"}
FILE_EXT = "" #@param {type:"string"}
OUT = "" #@param {type:"string"}

!mkdir converted
!ffmpeg -i {FFMPEG_INPUT}.{FILE_EXT} -acodec pcm_s16le /content/converted/{OUT}.wav

In [None]:
#@title Demuxer (Separate Vocal and Background)
import subprocess
AUDIO_INPUT = "" #@param {type:"string"}

if not AUDIO_INPUT.strip():
    raise ValueError("AUDIO_INPUT is empty: enter the path of the audio file to separate.")

# Build the argument list directly instead of splitting a formatted string:
# splitting on whitespace would break any path that contains spaces.
command = ["demucs", "--two-stems=vocals", AUDIO_INPUT]
result = subprocess.run(command, stdout=subprocess.PIPE)
print(result.stdout.decode())

# Surface failures that would otherwise go unnoticed in the cell output.
if result.returncode != 0:
    print(f"demucs exited with status {result.returncode}")

In [None]:
#@title Analyzing Audio Volume
ANLZ_INPUT = "" #@param {type:"string"}

!ffmpeg -i {ANLZ_INPUT} -filter:a volumedetect -f null /dev/null!

In [None]:
#@title Volume Manipulation
VM_INPUT = "" #@param {type:"string"}
#@markdown Value can be in "1.5" (150% Increase) or in "10dB" (10dB Increase)
VM_VALUE = "" #@param {type:"string"}
#@markdown Output filename; In /content/volume_changed
VM_OUTPUT = "" #@param {type:"string"}

!mkdir volume_changed
!ffmpeg -i {VM_INPUT} -filter:a "volume={VM_VALUE}" -c:a pcm_s16le /content/volume_changed/{VM_OUTPUT}.volume.wav

In [None]:
#@title Audio Normalization
#@markdown * Audio Normalization input; this cell will also convert audio file to waveform.
AN_INPUT = "" #@param {type:"string"}
#@markdown * Target loudness; type just the value in dB (ex. "-6")
TARGET_LDNS = "-6" #@param {type:"string"}
#@markdown * The default Loudness Range is 11dB
RANGE_LDNS = "11" #@param {type:"string"}
#@markdown * The default value is -1.5dB
TRUE_PEAK = "-1.5" #@param {type:"string"}
#@markdown * Output filename; in /content/normalized
AN_OUTPUT = "" #@param {type:"string"}

!mkdir normalized
!ffmpeg -i {AN_INPUT} -af loudnorm=I={TARGET_LDNS}:LRA={RANGE_LDNS}:TP={TRUE_PEAK} -c:a pcm_s16le /content/normalized/{AN_OUTPUT}.normalized.wav



In [None]:
#@title Combine
from pydub import AudioSegment
!mkdir combined

AUDIO_01 = "" #@param {type:"string"}
AUDIO_02 = "" #@param {type:"string"}
DisplayAudio_Combined = False #@param {type:"boolean"}

sound1 = AudioSegment.from_file(AUDIO_01)
sound2 = AudioSegment.from_file(AUDIO_02)

combined = sound1.overlay(sound2)

combined.export("/content/combined/audio.combined.wav", format='wav')

def DisplayAudioResult():
    display(Audio(f"/content/combined/audio.combined.wav"))

if DisplayAudio_Combined :
  DisplayAudioResult()

# Training
This set is for training an SVC model

In [None]:
#@title Make dataset directory
# Create the dataset_raw/ folder relative to the current working directory.
# -p makes this a no-op when the folder already exists.
!mkdir -p "dataset_raw"

# Uncomment the lines below to delete the raw dataset folder and the
# dataset/44k folder, e.g. to start over with a different dataset.
#!rm -r "dataset_raw"
#!rm -r "dataset/44k"

In [None]:
#@title Copy your dataset
#@markdown **We assume that your dataset is in your Google Drive's `so-vits-svc-fork/dataset/(speaker_name)` directory.**
DATASET_NAME = "" #@param {type: "string"}
!cp -R /content/drive/MyDrive/so-vits-svc-fork/dataset/{DATASET_NAME}/ -t "dataset_raw/"

In [None]:
#@title Automatic preprocessing
# Run the so-vits-svc-fork resampling step.
# NOTE(review): presumably reads dataset_raw/ and writes dataset/44k/ —
# confirm against the svc CLI documentation.
!svc pre-resample

In [None]:
#@title Pre-Config for new dataset
# Run the so-vits-svc-fork configuration step for the new dataset.
# NOTE(review): the "Copy configs file" cell expects configs/44k/config.json
# to exist afterwards, presumably produced by this command — confirm.
!svc pre-config

In [None]:
#@title Copy configs file
!cp configs/44k/config.json drive/MyDrive/so-vits-svc-fork

In [None]:
#@title  Training Method
#@markdown The default is Dio
# F0_METHOD is chosen from the dropdown below and passed to `svc pre-hubert`
# through its -fm option.
F0_METHOD = "dio" #@param ["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]
!svc pre-hubert -fm {F0_METHOD}

In [None]:
#@title Training
# Load the TensorBoard notebook extension and point it at the log folder on
# Google Drive, then start training with the same folder as --model-path.
%load_ext tensorboard
%tensorboard --logdir drive/MyDrive/so-vits-svc-fork/logs/44k
!svc train --model-path drive/MyDrive/so-vits-svc-fork/logs/44k

In [None]:
#@title Training Cluster Model
# Train the cluster model and write it to kmeans.pt inside the Google Drive
# log folder used by the training cell.
!svc train-cluster --output-path drive/MyDrive/so-vits-svc-fork/logs/44k/kmeans.pt

# Inference
This set is for using the SVC model for conversion

In [None]:
#@title **INFERENCE**
#@markdown #INFERING USING PRE/TRAINED SVC MODEL
#@markdown * remove **".wav"** on AUDIO
from IPython.display import Audio

AUDIO = "" #@param {type:"string"}
MODEL = "" #@param {type:"string"}
CONFIG = "" #@param {type:"string"}
#@markdown * Change according to your model's voice pitch. 12 = 1 Octave | -12 = -1 Octave.
#@markdown * Higher pitch audio to Lower pitch Model usually use -12 to -24; Vice Versa
PITCH = -12 #@param {type:"integer"}
#@markdown * Options, or leave it by default
Auto_Predict = False #@param {type:"boolean"}
Pitch_Bypass = False #@param {type:"boolean"}
DisplayAudio_Infer = False #@param {type:"boolean"}

def Auto_PredictFalse():
  if Pitch_Bypass:
    !svc infer {AUDIO}.wav -c {CONFIG} -m {MODEL} -na
  else:
    !svc infer {AUDIO}.wav -c {CONFIG} -m {MODEL} -na -t {PITCH}

def Auto_PredictTrue():
  if Pitch_Bypass:
    !svc infer {AUDIO}.wav -c {CONFIG} -m {MODEL}
  else:
    !svc infer {AUDIO}.wav -c {CONFIG} -m {MODEL} -t {PITCH}

if Auto_Predict:
    Auto_PredictTrue()
else:
    Auto_PredictFalse()

#@markdown Displaying audio can restart the runtime sometimes
if DisplayAudio_Infer :
  display(Audio(f"{AUDIO}.out.wav"))