# RVC Algorithm

```
┌─────────────────────────────────────────────────────┐
│ INPUT: TTS Audio + .pth model + .index file         │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 1: Content Extraction                          │
│ Model: ContentVec/HuBERT                            │
│ Input: Audio waveform                               │
│ Output: Content features (what is said)             │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 2: Feature Retrieval (THE MAGIC)               │
│ Tool: FAISS Index                                   │
│ Input: Content features                             │
│ Output: Retrieved features (target voice style)     │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 3: Pitch Extraction                            │
│ Model: RMVPE/CREPE/FCPE                             │
│ Input: Audio waveform                               │
│ Output: Pitch values (melody)                       │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 4: Protection Blending                         │
│ Mix original + retrieved features                   │
│ Preserves emotion and character                     │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 5: Feature Upsampling                          │
│ Interpolate 50Hz → 100Hz                            │
│ Makes output smoother                               │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 6: Audio Generation                            │
│ Model: Generator (.pth file)                        │
│ Input: Features + Pitch                             │
│ Output: Final audio waveform                        │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ OUTPUT: Voice-converted audio file                  │
└─────────────────────────────────────────────────────┘
```

In [28]:
import os
import soundfile as sf
import sounddevice as sd
import numpy as np
import random
import time
import librosa

In [29]:
# Paths to the sample audio clips, relative to the current working directory.
# The file names suggest clip length (short/long) and channel layout
# (mono/stereo) — not verified here.
sample_short_mono = "./sample-short_mono.mp3"
sample_long_mono = "./sample-long_mono.mp3"
sample_short_stereo = "./sample-short_stereo.mp3"

### 1. Content Extraction

In [34]:
def get_soundinfo(file):
    """Print metadata and derived timing figures for an audio file.

    Two sections are printed:

    * the report object returned by ``sf.info(file)``;
    * a duration computed as ``len(samples) / sample_rate`` and a sample
      rate recomputed from that duration. The second figure is a sanity
      check: it is algebraically the file's own sample rate, up to
      floating-point rounding.

    Args:
        file: Audio file to inspect; passed unchanged to ``sf.read`` and
            ``sf.info``.

    Returns:
        None. All results are written to stdout.
    """
    sound_array, sample_rate = sf.read(file)
    # len() counts entries along the first axis of the decoded array.
    sound_array_length = len(sound_array)

    # information about the sound
    print("\n[SOUNDFILE LIBRARY INFORMATION]")
    info = sf.info(file)
    print(info)

    print("\n[CALCULATION BASED INFORMATION]")

    # true duration
    true_duration = sound_array_length / sample_rate
    print(f"Calculated duration (array length / sample rate): {true_duration} s")

    # sample rate (data per second)
    if true_duration > 0:
        calculated_sample_rate = sound_array_length / true_duration
        print(f"Calculated sample rate (array length / duration): {calculated_sample_rate} Hz")
    else:
        # A file that decodes to zero frames has zero duration; dividing by
        # it would raise ZeroDivisionError, so report it as undefined.
        print("Calculated sample rate (array length / duration): undefined (no audio frames)")
    print()

def get_arrayinfo(sound_array, heading, preview_count=10):
    """Print a preview and summary statistics for a NumPy audio array.

    Args:
        sound_array: NumPy array to describe. 1-D arrays are reported as
            'mono'; anything with more dimensions is reported as 'stereo'.
        heading: Title printed in the banner above the report.
        preview_count: Maximum number of leading entries (along the first
            axis) to show in the preview. Defaults to 10.

    Returns:
        None. All results are written to stdout.
    """
    print()
    print(f"[=== {heading} ===]")

    print("\n[SOUND ARRAY INFORMATION]\n")

    # Derive the label from the slice that is actually printed so the count
    # can never disagree with the data shown. Slicing acts on the first axis,
    # so for multi-dimensional arrays the entries are rows, not scalars.
    preview = sound_array[:preview_count]
    unit = "values" if sound_array.ndim == 1 else "rows"
    print(f"First {len(preview)} {unit}\n{preview}\n")

    print(f"Sound array size: {sound_array.size}")
    print(f"Sound array shape: {sound_array.shape}")
    print(f"Sound array dtype: {sound_array.dtype}")

    if sound_array.size:
        print(f"Sound array min: {sound_array.min()}")
        print(f"Sound array max: {sound_array.max()}")
        print(f"Sound array mean: {sound_array.mean()}")
        print(f"Sound array std: {sound_array.std()}")
    else:
        # min()/max() raise ValueError on an empty array and mean()/std()
        # yield nan with a warning, so report the statistics as unavailable.
        print("Sound array min/max/mean/std: n/a (empty array)")

    print(f"Sound type '{'mono' if sound_array.ndim == 1 else 'stereo'}'")
    print()

def load_audio(_file, _sample_rate, **kwargs):
    """Load an audio file and print diagnostics for several derived arrays.

    This is a walkthrough helper: it reads the file, then prints a report
    (via ``get_arrayinfo``) for the original array, its reversed and
    transposed forms, a channel-swapped and a mono mix-down version (only
    for multi-channel input), and a resampled version.

    Args:
        _file: Path to the audio file. Surrounding spaces, double quotes
            and newlines are stripped before use.
        _sample_rate: Currently unused; accepted for interface
            compatibility.
        **kwargs: Currently unused; accepted for interface compatibility.

    Returns:
        None. Results are written to stdout. Any exception raised along the
        way (including a missing file) is caught and reported as a single
        "Error loading audio: ..." line rather than propagated.
    """
    try:
        # file path cleanup (e.g. a path pasted with quotes or a newline)
        file = _file.strip(' "\n')
        if not os.path.isfile(file):
            raise FileNotFoundError(f"File not found: {file}")

        # get details of the file
        get_soundinfo(file)

        # load audio array and sample rate using soundfile
        sound_array, sr = sf.read(file)
        get_arrayinfo(sound_array, "Original Array")

        # reverse array (flips the order along the first axis)
        get_arrayinfo(sound_array[::-1], "Reversed Array")

        # transpose array
        #
        # Ex: original array (3, 2) -> transposed array (2, 3)
        #
        #     array = np.array([[1, 2],
        #                       [3, 4],
        #                       [5, 6]])
        #     array.shape    # (3, 2)
        #     array.T.shape  # (2, 3)
        #     array.T        # [[1, 3, 5],
        #                    #  [2, 4, 6]]
        #
        # For stereo audio this swaps rows and columns; it does NOT swap
        # the left and right channels.
        t_sound_array = sound_array.T
        get_arrayinfo(t_sound_array, "Transposed Array")

        if len(sound_array.shape) > 1:

            # swapped channels array: column 0 is treated as left and
            # column 1 as right; rebuild the array with the two exchanged.
            right_channel = sound_array[:, 1]
            left_channel = sound_array[:, 0]
            swapped_channels_array = np.array([right_channel, left_channel]).T
            get_arrayinfo(swapped_channels_array, "Swapped Channels Array")

            # monophonic array
            #
            # How mean(axis=...) collapses a 2-D array:
            #
            #     array = np.array([[1, 2, 3],
            #                       [4, 5, 6]])   # shape (2, 3)
            #
            #     array.mean(axis=0)  # [2.5, 3.5, 4.5] - mean of each column
            #     array.mean(axis=1)  # [2., 5.]        - mean of each row
            #     array.mean()        # 3.5             - mean of everything
            #
            # Shape reminder: (2,) is a flat line of numbers [a, b], while
            # (1, 2) is a table with one row [[a, b]].
            #
            # Each row here holds one value per channel, so averaging over
            # axis=1 mixes the channels down to a single 1-D signal.
            mono_array = sound_array.mean(axis=1)
            get_arrayinfo(mono_array, "Mono Array")

        # resampled array
        #
        # Goal: change the sample rate while keeping the same duration
        # (duration = samples / sample_rate). librosa.resample is used
        # instead of naive slicing such as sound_array[::step], which needs
        # an integer step.
        #
        # NOTE(review): the original comment called this "downscaled" and
        # sketched sr / 2.5, but the code multiplies by 2.5, i.e. it
        # upsamples. Behavior is kept as-is; confirm the intent.
        new_sample_rate = int(sr * 2.5)
        # The array is laid out as (frames, channels), so time runs along
        # axis 0. librosa.resample defaults to the last axis, which for
        # stereo input would resample across the channels instead of time.
        resampled = librosa.resample(
            sound_array, orig_sr=sr, target_sr=new_sample_rate, axis=0
        )
        get_arrayinfo(resampled, "Resampled Array")

    except Exception as e:
        # Deliberate best-effort: report the failure, don't propagate it.
        print(f"Error loading audio: {e}")

# Run the walkthrough on the stereo sample. The second argument (144) is bound
# to `_sample_rate`, which load_audio does not read.
load_audio(sample_short_stereo, 144)


[SOUNDFILE LIBRARY INFORMATION]
./sample-short_stereo.mp3
samplerate: 32000 Hz
channels: 2
duration: 16.743 s
format: MPEG-1/2 Audio [MP3]
subtype: MPEG Layer III [MPEG_LAYER_III]

[CALCULATION BASED INFORMATION]
Calculated duration (array length / sample rate): 16.74315625 s
Calculated sample rate (array length / duration): 32000.000000000004 Hz


[=== Original Array ===]

[SOUND ARRAY INFORMATION]

First 100 values
[[-4.46783270e-05 -5.31793776e-05]
 [-1.82155884e-04 -1.28738204e-04]
 [-1.31948633e-04 -2.01720934e-04]
 [ 4.34060406e-04  3.00895306e-04]
 [ 4.78800393e-05  2.49827397e-04]
 [-2.31990853e-04 -1.11741698e-04]
 [ 3.63951782e-04 -2.16669854e-04]
 [-1.36714865e-04  2.37334767e-04]
 [-3.90714471e-04 -1.25635765e-04]
 [ 4.07321204e-04 -3.47461173e-04]]

Sound array size: 1071562
Sound array shape: (535781, 2)
Sound array dtype: float64
Sound array min: -0.48401427268981934
Sound array max: 0.40086814761161804
Sound array mean: 3.973177252144369e-05
Sound array std: 0.06008486