# Convert an RVC model (only v2 models are supported)

In [None]:
import torch
from hf_rvc import SynthesizerTrnMs256NSFsidConfig
from transformers import VitsConfig
def openModel(file):
    """Load the RVC checkpoint stored in ``models/<file>/model.pth``.

    Args:
        file: Name of the sub-directory of ``models`` that holds ``model.pth``.

    Returns:
        A 7-tuple, in this order:
        the ``SynthesizerTrnMs256NSFsidConfig`` built from the checkpoint's
        ``"config"`` entry, a ``VitsConfig`` with fixed hyper-parameters,
        the checkpoint's ``"weight"`` entry, the last element of the
        ``"config"`` entry (returned as the output sampling rate), the path
        of ``model.pth``, the raw ``"config"`` entry itself, and the path
        ``models/<file>/metadata.json``.
    """
    from pathlib import Path

    model_root = Path("models") / file
    checkpoint_path = model_root / "model.pth"
    metadata_path = model_root / "metadata.json"

    # weights_only=True restricts unpickling to tensors and plain containers.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    rvc_weights = checkpoint["weight"]
    rvc_args = checkpoint["config"]

    # The stored config is passed positionally to the RVC config class.
    rvc_config = SynthesizerTrnMs256NSFsidConfig(*rvc_args)
    sampling_rate = rvc_args[-1]

    vits_kwargs = {
        "hidden_size": 192,
        "hidden_dropout": 0.0,
        "ffn_dim": 768,
        "num_attention_heads": 2,
        "ffn_kernel_size": 3,
        "layerdrop": 0.0,
        "window_size": 10,
        "ffn_dropout": 0.0,
        "prior_encoder_num_flows": 4,
        "prior_encoder_num_wavenet_layers": 3,
        "speaker_embedding_size": 256,
    }
    hf_vits_config = VitsConfig(**vits_kwargs)

    return (
        rvc_config,
        hf_vits_config,
        rvc_weights,
        sampling_rate,
        checkpoint_path,
        rvc_args,
        metadata_path,
    )

In [None]:
from typing import Dict
from hf_rvc import SynthesizerTrnMs_HfVits
def convert_rvc_to_hf_vits(rvc_state_dict: Dict[str, torch.Tensor],
                           config: SynthesizerTrnMs256NSFsidConfig,
                           vits_config: VitsConfig):
    """
    Build a SynthesizerTrnMs_HfVits model and load remapped RVC weights into it.

    The RVC parameter names are renamed to the names used by the hybrid model;
    the only tensor that is reshaped is each attention projection weight,
    whose trailing dimension is squeezed away.

    Args:
        rvc_state_dict (Dict[str, torch.Tensor]): The state_dict from the original RVC model checkpoint.
        config (SynthesizerTrnMs256NSFsidConfig): RVC model configuration; ``n_layers`` gives the
            number of text-encoder layers to map.
        vits_config (VitsConfig): VITS configuration; ``prior_encoder_num_flows`` and
            ``prior_encoder_num_wavenet_layers`` give the number of flows and WaveNet layers to map.

    Returns:
        The newly created ``SynthesizerTrnMs_HfVits`` instance after ``load_state_dict``
        (called with ``strict=False``; missing/unexpected keys are only printed).

    Raises:
        KeyError: If ``rvc_state_dict`` lacks one of the expected RVC keys.
    """
    new_state_dict = {}
    n_layers = config.n_layers
    n_flows = vits_config.prior_encoder_num_flows
    n_wavenet_layers = vits_config.prior_encoder_num_wavenet_layers
    target_model = SynthesizerTrnMs_HfVits(config, vits_config)

    def copy_weight_normed(dst: str, src: str) -> None:
        # RVC stores weight_g / weight_v; the target expects them under
        # parametrizations.weight.original0 / original1.
        new_state_dict[f'{dst}.bias'] = rvc_state_dict[f'{src}.bias']
        new_state_dict[f'{dst}.parametrizations.weight.original0'] = rvc_state_dict[f'{src}.weight_g']
        new_state_dict[f'{dst}.parametrizations.weight.original1'] = rvc_state_dict[f'{src}.weight_v']

    # --- Text Encoder Mapping (enc_p.encoder) ---
    print("Mapping Text Encoder weights...")
    attention_projections = (
        ('q_proj', 'conv_q'),
        ('k_proj', 'conv_k'),
        ('v_proj', 'conv_v'),
        ('out_proj', 'conv_o'),
    )
    layer_norms = (
        ('layer_norm', 'norm_layers_1'),
        ('final_layer_norm', 'norm_layers_2'),
    )
    for i in range(n_layers):
        # NOTE: 'vistEncoder' is the spelling the mapping has always targeted; keep it.
        dst = f'vistEncoder.layers.{i}'
        src_attn = f'enc_p.encoder.attn_layers.{i}'

        new_state_dict[f'{dst}.attention.emb_rel_k'] = rvc_state_dict[f'{src_attn}.emb_rel_k']
        new_state_dict[f'{dst}.attention.emb_rel_v'] = rvc_state_dict[f'{src_attn}.emb_rel_v']
        for hf_name, rvc_name in attention_projections:
            # Drop the trailing dimension of the RVC conv weight for the *_proj target.
            new_state_dict[f'{dst}.attention.{hf_name}.weight'] = rvc_state_dict[f'{src_attn}.{rvc_name}.weight'].squeeze(-1)
            new_state_dict[f'{dst}.attention.{hf_name}.bias'] = rvc_state_dict[f'{src_attn}.{rvc_name}.bias']
        for hf_name, rvc_name in layer_norms:
            new_state_dict[f'{dst}.{hf_name}.weight'] = rvc_state_dict[f'enc_p.encoder.{rvc_name}.{i}.gamma']
            new_state_dict[f'{dst}.{hf_name}.bias'] = rvc_state_dict[f'enc_p.encoder.{rvc_name}.{i}.beta']
        for conv in ('conv_1', 'conv_2'):
            for param in ('weight', 'bias'):
                new_state_dict[f'{dst}.feed_forward.{conv}.{param}'] = rvc_state_dict[f'enc_p.encoder.ffn_layers.{i}.{conv}.{param}']

    # --- Flow Mapping (flow) ---
    print("Mapping Flow weights...")
    for i in range(n_flows):
        # Target flow i is read from RVC flow 2*i (0, 2, 4, 6, ...).
        # NOTE(review): presumably the odd RVC entries carry no parameters — confirm.
        rvc_idx = 2 * i
        prefix = f'vitsFlow.flows.{i}'
        rvc_prefix = f'flow.flows.{rvc_idx}.enc'

        new_state_dict[f'{prefix}.conv_pre.weight'] = rvc_state_dict[f'flow.flows.{rvc_idx}.pre.weight']
        new_state_dict[f'{prefix}.conv_pre.bias'] = rvc_state_dict[f'flow.flows.{rvc_idx}.pre.bias']
        new_state_dict[f'{prefix}.conv_post.weight'] = rvc_state_dict[f'flow.flows.{rvc_idx}.post.weight']
        new_state_dict[f'{prefix}.conv_post.bias'] = rvc_state_dict[f'flow.flows.{rvc_idx}.post.bias']
        for j in range(n_wavenet_layers):
            copy_weight_normed(f'{prefix}.wavenet.in_layers.{j}', f'{rvc_prefix}.in_layers.{j}')
            copy_weight_normed(f'{prefix}.wavenet.res_skip_layers.{j}', f'{rvc_prefix}.res_skip_layers.{j}')
        copy_weight_normed(f'{prefix}.wavenet.cond_layer', f'{rvc_prefix}.cond_layer')

    # --- Other RVC-specific layers ---
    print("Mapping remaining RVC-specific layers...")
    new_state_dict['phoneme_embedding.weight'] = rvc_state_dict['enc_p.emb_phone.weight']
    new_state_dict['phoneme_embedding.bias'] = rvc_state_dict['enc_p.emb_phone.bias']
    new_state_dict['emb_pitch.weight'] = rvc_state_dict['enc_p.emb_pitch.weight']
    new_state_dict['emb_g.weight'] = rvc_state_dict['emb_g.weight']
    new_state_dict['proj.weight'] = rvc_state_dict['enc_p.proj.weight']
    new_state_dict['proj.bias'] = rvc_state_dict['enc_p.proj.bias']

    # --- Decoder (dec) ---
    print("Mapping Decoder weights...")
    decoder_prefix = 'dec.'
    for key, value in rvc_state_dict.items():
        if key.startswith(decoder_prefix):
            # Swap only the leading prefix; a later 'dec.' inside the key must stay untouched.
            new_state_dict['hybrid_decoder.' + key[len(decoder_prefix):]] = value

    # --- Load the state dict ---
    print("\nLoading state dictionary...")
    # strict=False: mismatches are reported below instead of raising.
    missing_keys, unexpected_keys = target_model.load_state_dict(new_state_dict, strict=False)

    if unexpected_keys:
        print(f"\nWarning: Unexpected keys in state_dict: {unexpected_keys}")
    if missing_keys:
        print(f"\nWarning: Missing keys in state_dict: {missing_keys}")

    print("\nWeight transfer complete!")
    return target_model

In [None]:
# Load the checkpoint from models/<name>/model.pth. "your favorite model" is a
# placeholder: replace it with the name of the sub-directory inside "models".
# Note: despite their names, `cpt` receives the checkpoint's "config" entry and
# `cptPath` the path models/<name>/metadata.json (see openModel's return value).
config, vitsConfig, weight, output_sampling_rate, savePath, cpt, cptPath = openModel("your favorite model")

In [None]:
# Create the hybrid model and load the renamed RVC weights into it.
model = convert_rvc_to_hf_vits(weight, config, vitsConfig)

In [None]:
# WARNING: savePath is the same models/<name>/model.pth that openModel loaded,
# so this overwrites the original RVC checkpoint with the converted state dict.
torch.save(model.state_dict(),savePath)

In [None]:
import json
# Write the checkpoint's "config" entry (held in `cpt`) to metadata.json.
# ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open(cptPath, "w", encoding="utf-8") as f:
    json.dump(cpt, f, indent=4, ensure_ascii=False)

# Export HuBERT
# Please download hubert_base.pt from the original repository

In [None]:
# Run the transformers HuBERT conversion script on the local "hubert_base.pt"
# file, with "models/hubert_base_hf" as the output location.
# NOTE(review): this script presumably needs fairseq installed to read the
# original checkpoint — confirm before running.
from transformers.models.hubert.convert_hubert_original_pytorch_checkpoint_to_pytorch import  convert_hubert_checkpoint
convert_hubert_checkpoint("hubert_base.pt", "models/hubert_base_hf", is_finetuned=False)