# RVC Algorithm

```
┌─────────────────────────────────────────────────────┐
│ INPUT: TTS Audio + .pth model + .index file         │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 1: Content Extraction                          │
│ Model: ContentVec/HuBERT                            │
│ Input: Audio waveform                               │
│ Output: Content features (what is said)             │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 2: Feature Retrieval (THE MAGIC)               │
│ Tool: FAISS Index                                   │
│ Input: Content features                             │
│ Output: Retrieved features (target voice style)     │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 3: Pitch Extraction                            │
│ Model: RMVPE/CREPE/FCPE                             │
│ Input: Audio waveform                               │
│ Output: Pitch values (melody)                       │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 4: Protection Blending                         │
│ Mix original + retrieved features                   │
│ Preserves emotion and character                     │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 5: Feature Upsampling                          │
│ Interpolate 50Hz → 100Hz                            │
│ Makes output smoother                               │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ STEP 6: Audio Generation                            │
│ Model: Generator (.pth file)                        │
│ Input: Features + Pitch                             │
│ Output: Final audio waveform                        │
└─────────────────────────────────────────────────────┘
                        ↓
┌─────────────────────────────────────────────────────┐
│ OUTPUT: Voice-converted audio file                  │
└─────────────────────────────────────────────────────┘
```

In [93]:
import os
import soundfile as sf
import sounddevice as sd
from playsound import playsound
import numpy as np

In [94]:
# audio samples
sample_short_mono = "./sample-short_mono.mp3"
sample_long_mono = "./sample-long_mono.mp3"
sample_short_stereo = "./sample-short_stereo.mp3"

### 1. Content Extraction

In [95]:
def get_details(file):
    sound_array, sample_rate = sf.read(file)
    sound_array_length = len(sound_array)

    # information about the sound
    print("\n[SOUNDFILE LIBRARY INFORMATION]")
    info = sf.info(file)
    print(info)

    print("\n[CALCULATION BASED INFORMATION]")

    # true duration
    true_duration = sound_array_length / sample_rate
    print(f"Calculated duration (array length / sample rate): {true_duration} s")

    # sample rate (data per second)
    calculated_sample_rate = sound_array_length / true_duration
    print(f"Calculated sample rate (array length / duration): {calculated_sample_rate} Hz")

    print("\n[SOUND ARRAY INFORMATION]")

    print(f"Sound array size: {sound_array.size}")
    print(f"Sound array shape: {sound_array.shape}")
    print(f"Sound array dtype: {sound_array.dtype}")
    print(f"Sound array min: {sound_array.min()}")
    print(f"Sound array max: {sound_array.max()}")
    print(f"Sound array mean: {sound_array.mean()}")
    print(f"Sound array std: {sound_array.std()}")

    print(f"Sound type '{'mono' if len(sound_array.shape) == 1  else 'stereo'}'")

def load_audio(file, sample_rate, **kwargs):
    try:
        # file path cleanup
        file = file.strip(' "\n')
        if not os.path.isfile(file):
            raise FileNotFoundError(f"File not found: {file}")

        # get details of the file
        get_details(file)

        # load audio array and sample rate using soundfile
        sound_array, sample_rate = sf.read(file)

        print(sound_array.T)




    except Exception as e:
        print(f"Error loading audio: {e}")

load_audio(sample_short_stereo, 144)


[SOUNDFILE LIBRARY INFORMATION]
./sample-short_stereo.mp3
samplerate: 32000 Hz
channels: 2
duration: 16.743 s
format: MPEG-1/2 Audio [MP3]
subtype: MPEG Layer III [MPEG_LAYER_III]

[CALCULATION BASED INFORMATION]
Calculated duration (array length / sample rate): 16.74315625 s
Calculated sample rate (array length / duration): 32000.000000000004 Hz

[SOUND ARRAY INFORMATION]
Sound array size: 1071562
Sound array shape: (535781, 2)
Sound array dtype: float64
Sound array min: -0.48401427268981934
Sound array max: 0.40086814761161804
Sound array mean: 3.973177252144369e-05
Sound array std: 0.0600848674963194
Sound type 'stereo'
[[-4.46783270e-05 -1.82155884e-04 -1.31948633e-04 ...  2.19651866e-07
  -4.52506129e-06 -9.01522640e-07]
 [-5.31793776e-05 -1.28738204e-04 -2.01720934e-04 ... -1.96986530e-05
  -1.50217074e-05 -3.69524150e-06]]
