# Barkify: an unofficial repo for training a 'bark'-like generative model



In [None]:
from glob import glob
from tqdm import tqdm
import os
import shutil
import json
import soundfile as sf
import subprocess
import numpy as np
import re

import random
from IPython.display import Audio
import IPython.display as iply

from pqdm.processes import pqdm
from pqdm.threads import pqdm as pqdmT

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

import torch
import torch.nn.functional as F

# Root of the working directory. Later cells build paths by plain string
# concatenation, so the value is stored with a trailing slash.
start_path = "../work_env" + "/"

# GPU ids handed to the workers: ids 0-3, each listed three times.
cuda_devices = list(range(4)) * 3
# One job per entry of cuda_devices.
NJOB = len(cuda_devices)

In [None]:
def run_multiprocess(x):
    """Run the generated worker script ``tmp/temp.py`` as one blocking job.

    Args:
        x: ``(DEVICE, IJOB)`` pair. ``DEVICE`` is exported to the child as
            ``CUDA_VISIBLE_DEVICES``; ``IJOB`` is passed as the script's only
            command-line argument and names the log file ``tmp/tmp_<IJOB>``.

    Returns:
        The child's exit status, so callers can detect failed jobs.
    """
    DEVICE, IJOB = x
    tmp_dir = os.path.join(start_path, "tmp")
    # An argument list (no shell) keeps paths containing spaces or quotes intact.
    cmd = ["nohup", "python", os.path.join(tmp_dir, "temp.py"), str(IJOB)]
    env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(DEVICE))
    # stdout and stderr both go to the per-job log, truncated on every run.
    with open(os.path.join(tmp_dir, "tmp_" + str(IJOB)), "w") as log:
        return subprocess.call(cmd, stdout=log, stderr=subprocess.STDOUT, env=env)
    
def write_tmp(_script):
    """Save ``_script`` as ``tmp/temp.py`` under ``start_path``, replacing any previous content."""
    target = os.path.join(start_path, "tmp", "temp.py")
    with open(target, "w") as handle:
        handle.write(_script)

In [None]:
# Put all source wavs into the raw/ folder first; this cell converts each of
# them to a 24 kHz mono copy under wavs/.
os.makedirs(start_path + "wavs", exist_ok=True)

def run(x):
    """Convert one wav from raw/ into a 24 kHz mono wav under wavs/ via ffmpeg.

    Args:
        x: Path of the source file (an entry of the raw/*.wav glob).

    Returns:
        The ``subprocess.CompletedProcess`` of the ffmpeg call; a non-zero
        ``returncode`` marks a failed conversion (ffmpeg itself is silenced
        with ``-loglevel panic``).
    """
    # Derive the output from the basename so only the directory changes, even
    # if some other component of the path happens to contain "/raw/".
    name = os.path.join(start_path, "wavs", os.path.basename(x))
    # Argument list instead of a shell string: file names with spaces or
    # quotes reach ffmpeg unchanged.
    return subprocess.run(
        ["ffmpeg", "-hide_banner", "-loglevel", "panic", "-y",
         "-i", x, "-ac", "1", "-ar", "24000", name]
    )
wavs_name = glob(os.path.join(start_path, "raw/*.wav"))
res = pqdm(wavs_name, run, n_jobs=128)

In [None]:
# Convert every wav under raw/ to a 16 kHz mono copy under wavs16k/.
os.makedirs(start_path + "wavs16k", exist_ok=True)

def run(x):
    """Convert one wav from raw/ into a 16 kHz mono wav under wavs16k/ via ffmpeg.

    Args:
        x: Path of the source file (an entry of the raw/*.wav glob).

    Returns:
        The ``subprocess.CompletedProcess`` of the ffmpeg call; a non-zero
        ``returncode`` marks a failed conversion (ffmpeg itself is silenced
        with ``-loglevel panic``).
    """
    # Derive the output from the basename so only the directory changes, even
    # if some other component of the path happens to contain "/raw/".
    name = os.path.join(start_path, "wavs16k", os.path.basename(x))
    # Argument list instead of a shell string: file names with spaces or
    # quotes reach ffmpeg unchanged.
    return subprocess.run(
        ["ffmpeg", "-hide_banner", "-loglevel", "panic", "-y",
         "-i", x, "-ac", "1", "-ar", "16000", name]
    )
wavs_name = glob(os.path.join(start_path, "raw/*.wav"))
res = pqdm(wavs_name, run, n_jobs=128)

## Fetch features from wav2vec2-xlsr and encodec
根据[meta的论文](https://arxiv.org/pdf/2105.11084.pdf),w2v中15-18层都取得了不错的结果.<br>
在我们的实验中证实,w2v2-xlsr中第15层与bark的semantic idx的相关性最高.

在[audioLM](https://arxiv.org/pdf/2209.03143.pdf)中,使用了w2v-bert的7层作为特征.在w2v中,相关性也非常高.<br>
我们使用第15层作为实验的特征层.

另外,我们测试了bark代码中,coarse2fine的部分.我们发现,coarse和fine均从encodec中直接得到.<br>
因此,如果没有特殊需求,不建议重新训练.

1. Extract the w2v2 hidden states.
2. Cluster them with k-means (code adapted from fairseq).
3. Dump the cluster indices to numpy files.
4. Extract the discrete indices from encodec.

In [None]:
LAYER = 15 # use 15th layer.
Hubert = False # use hubert feature or use w2v2-xlsr feature

clusters = 2048 # semantic idx nums.
dtype = 32 if clusters > 65535 else 16

percent = 0.1 # use 10% datas for clustering. 
n_init = 10 # when this is larger than 10, some error may occur.

params = dict( # params for clusting. 
        init='k-means++', max_iter=100, batch_size=10000, 
        tol=0, max_no_improvement=100, n_init=n_init, reassignment_ratio=0,
        compute_labels=False, verbose=100
    )
params['n_clusters'] = clusters

### Fetch semantic hiddens


In [None]:
# Adjust the wavs16k files so that the hubert/w2v2 semantic indices come out
# at 50 Hz rather than 49.9 Hz.
def process_audio(path):
    """Rewrite the wav at ``path`` in place with a length of 640 * k + 160 samples.

    ``k`` is the original sample count floor-divided by 640. Audio beyond the
    target length is cut off at the end; a shorter file is zero-padded at the
    front. The sample rate read from the file is written back unchanged.
    """
    samples, rate = sf.read(path)
    target_len = 160 + 640 * (samples.shape[0] // 640)
    samples = samples[:target_len]
    missing = target_len - samples.shape[0]
    samples = np.pad(samples, (missing, 0))
    sf.write(path, samples, rate)

wavs_name = glob(start_path + "/wavs16k/*.wav")
res = pqdmT(wavs_name, process_audio, n_jobs=32)

In [None]:
# Build the worker script that dumps w2v2/hubert hidden states to feats/.
os.makedirs(start_path + "tmp", exist_ok=True)
os.makedirs(start_path + "feats", exist_ok=True)

# First part: only loads the model and processor. It is run once on its own
# (device 0, job 0) before the full script is assembled.
_script =f'''
from transformers import AutoProcessor, AutoModelForPreTraining, AutoModel
if {Hubert}:
    model = AutoModel.from_pretrained("TencentGameMate/chinese-hubert-large")
else:
    model = AutoModelForPreTraining.from_pretrained("facebook/wav2vec2-large-xlsr-53")
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
print("downloaded!")
'''
write_tmp(_script) # download it.
run_multiprocess((0, 0))

# Second part: every worker lists wavs16k/ by itself and processes slice
# number sys.argv[1]. The listing is sorted because glob() returns files in
# arbitrary order and all workers must slice the very same list.
_script += f'''
import os
import sys
import torch
import torch.nn.functional as F
import numpy as np
import soundfile as sf

from tqdm import tqdm
from glob import glob

device = 'cuda:0'
model = model.to(device)

start_path = '{start_path}'
NJOB={NJOB}
meta = sorted(glob(start_path+'/wavs16k/*.wav'))
slice_len = (len(meta) + NJOB - 1) // NJOB
meta = meta[int(sys.argv[1])*slice_len : (int(sys.argv[1])+1)*slice_len]

for _dir in tqdm(meta):
    audio, fs = sf.read(_dir)
    assert fs == 16000
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    for i in inputs:
        inputs[i] = inputs[i].cuda()
    with torch.no_grad():
        hidden = model(**inputs, output_hidden_states=True)['hidden_states'][{LAYER} - 1]
        hidden = F.layer_norm(hidden, hidden.shape)
    np.save(_dir.replace("wavs16k","feats").replace(".wav",""), hidden[0].cpu().numpy())

print("Finish!")
'''

In [None]:
# Save the assembled worker script, then run it once for every
# (GPU id, job index) pair, NJOB jobs at a time.
write_tmp(_script)
job_args = list(zip(cuda_devices, range(NJOB)))
res = pqdm(job_args, run_multiprocess, n_jobs=NJOB)

###  Clustering

In [None]:
# Basically from: https://github.com/facebookresearch/fairseq/tree/main/examples/hubert/simple_kmeans
# TODO: It seems pretty slow. Maybe try faiss or conduct PCA before clustering?

os.makedirs(start_path + "tmp", exist_ok=True)
os.makedirs(start_path + "assets", exist_ok=True)

# Script that fits MiniBatchKMeans on a random `percent` share of the dumped
# feature files and stores the model as assets/km_model.joblib.
_script = f'''
import os
import sys
import numpy as np
import random

from tqdm import tqdm
from glob import glob
import joblib

from sklearn.cluster import MiniBatchKMeans

params = {params}
kmeans = MiniBatchKMeans(**params)

start_path = '{start_path}'
meta = glob(start_path+'/feats/*.npy')
random.shuffle(meta)
meta = meta[ : int(len(meta)*{percent})]
meta = np.concatenate(
    [np.load(i) for i in meta], axis = 0
)
print("concated.")

kmeans.fit(meta)
joblib.dump(kmeans, start_path + "assets/km_model.joblib")

inertia = -kmeans.score(meta) / len(meta)
print("total inertia: %.5f" % inertia)
print("Finish!")
'''

In [None]:
# Save the clustering script as tmp/temp.py.
write_tmp(_script)

# Without a thread limit, some error may occur, hence the two *_NUM_THREADS
# variables in front of the command.
# NOTE(review): because of the leading `echo`, this line only prints the
# command; it does not start the clustering. Presumably the printed line is
# meant to be run by hand in a terminal - confirm, or drop `echo` to run it here.
!echo OPENBLAS_NUM_THREADS=16 OMP_NUM_THREADS=16 python {start_path + '/tmp/temp.py'}

### Infer semantic indices


In [None]:
os.makedirs(start_path + "tmp", exist_ok=True)
os.makedirs(start_path + "semantic_idx", exist_ok=True)

# Worker script: assigns every frame of each feats/*.npy file to its nearest
# k-means centre and saves the indices under semantic_idx/.
# Every worker lists feats/ by itself and takes slice number sys.argv[1]; the
# listing is sorted because glob() returns files in arbitrary order and all
# workers must slice the very same list.
_script = f'''
import os
import sys
import numpy as np
import random

from tqdm import tqdm
from glob import glob
import joblib

class ApplyKmeans(object):
    def __init__(self, km_path):
        self.km_model = joblib.load(km_path)
        self.C_np = self.km_model.cluster_centers_.transpose()
        self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)

    def __call__(self, x):
        dist = (
            (x ** 2).sum(1, keepdims=True)
            - 2 * np.matmul(x, self.C_np)
            + self.Cnorm_np
        )
        return np.argmin(dist, axis=1)
        

start_path = '{start_path}'
NJOB={NJOB}
meta = sorted(glob(start_path+'/feats/*.npy'))
slice_len = (len(meta) + NJOB - 1) // NJOB
meta = meta[int(sys.argv[1])*slice_len : (int(sys.argv[1])+1)*slice_len]

apply_kmeans = ApplyKmeans(start_path + '/assets/km_model.joblib')

for _dir in tqdm(meta):
    _idxs = apply_kmeans(np.load(_dir)).astype(np.int{dtype})
    np.save(_dir.replace("feats","semantic_idx"), _idxs)
    
print("Finish!")
'''

In [None]:
# Save the assembled worker script, then run it once for every
# (GPU id, job index) pair, NJOB jobs at a time.
write_tmp(_script)
job_args = list(zip(cuda_devices, range(NJOB)))
res = pqdm(job_args, run_multiprocess, n_jobs=NJOB)

### Fetch discrete indices from encodec


In [None]:
# Build the worker script that dumps encodec code indices to encodec_idx/.
os.makedirs(start_path + "tmp", exist_ok=True)
os.makedirs(start_path + "encodec_idx", exist_ok=True)

# First part: only constructs the 24 kHz encodec model. It is run once on its
# own (device 0, job 0) before the full script is assembled.
_script ='''
from encodec import EncodecModel
from encodec.utils import convert_audio

model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(6.0)
print("downloaded!")
'''
write_tmp(_script) # download it.
run_multiprocess((0, 0))

# Second part: every worker lists wavs/ by itself and processes slice number
# sys.argv[1]. The listing is sorted because glob() returns files in arbitrary
# order and all workers must slice the very same list.
_script += f'''
import os
import sys
import torch
import numpy as np
import torchaudio

from tqdm import tqdm
from glob import glob

device = 'cuda:0'
model = model.to(device)

start_path = '{start_path}'
NJOB={NJOB}
meta = sorted(glob(start_path+'/wavs/*.wav'))
slice_len = (len(meta) + NJOB - 1) // NJOB
meta = meta[int(sys.argv[1])*slice_len : (int(sys.argv[1])+1)*slice_len]

for _dir in tqdm(meta):
    wav, sr = torchaudio.load(_dir)
    wav = wav[:, :wav.shape[-1] - int(160*1.5)]
    # wav = convert_audio(wav, sr, model.sample_rate, model.channels)
    wav = wav.unsqueeze(0).cuda()
    with torch.no_grad():
        encoded_frames = model.encode(wav)[0][0][0]
    np.save(_dir.replace("wavs","encodec_idx").replace(".wav",""), encoded_frames.cpu().numpy().astype(np.int16))

print("Finish!")
'''

In [None]:
# Save the assembled worker script, then run it once for every
# (GPU id, job index) pair, NJOB jobs at a time.
write_tmp(_script)
job_args = list(zip(cuda_devices, range(NJOB)))
res = pqdm(job_args, run_multiprocess, n_jobs=NJOB)

## Prepare dataset

In [None]:
# Minimum-length filter, currently disabled:
# min_length = 2 # at least 2 seconds.
# _min_length = int(np.floor(min_length * 50))

# Number of metadata rows held out for meta/eval.json.
for_eval = 128

# Directory that holds metadata.csv and receives train.json / eval.json.
os.makedirs(os.path.join(start_path, "meta"), exist_ok=True)

In [None]:
import pandas as pd
import json

def _write_jsonl(path, rows):
    """Write one JSON object per line for each row of ``rows``.

    Each object is ``{"name": <column 0> + ".npy", "text": <column 1>}``.
    """
    with open(path, "w") as f:
        # writelines() gets an iterable of complete lines; handing it a single
        # string would make it iterate character by character.
        f.writelines(
            json.dumps({"name": row[0] + ".npy", "text": row[1]}) + "\n"
            for row in rows
        )

# metadata.csv is '|'-separated without a header row; rows with missing
# fields are dropped and the rest is shuffled in place.
datas = pd.read_csv(start_path+"meta/metadata.csv",sep="|", header=None)
datas = datas.dropna()
datas = datas.values
np.random.shuffle(datas)

# The last `for_eval` shuffled rows form the eval split, the rest the train split.
_write_jsonl(start_path + "meta/train.json", datas[:-for_eval])
_write_jsonl(start_path + "meta/eval.json", datas[-for_eval:])