In [1]:

import nemo.collections.asr as nemo_asr
import os
import numpy as np
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [6]:

TOKEN_OFFSET = 100


In [7]:

if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
    autocast = torch.cuda.amp.autocast
else:
    @contextlib.contextmanager
    def autocast():
        yield

def load_model(model_path):
    asr_model = nemo_asr.models.EncDecCTCModelBPE.restore_from(model_path)
    asr_model.eval()
    asr_model.to(device='cuda')
    return asr_model

def transcribe(wav_file, asr_model,logprobs=False):
   
    if type(wav_file) != list:
        wav_file = [wav_file]
    
    with autocast():
        with torch.no_grad():
                return asr_model.transcribe(wav_file)#, logprobs=logprobs)

In [8]:
asr_model_kan=load_model('Models/Kannada.nemo')

[NeMo I 2024-07-17 23:51:47 mixins:173] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2024-07-17 23:51:47 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /root/ekstep/nemo_exp/vakyansh-nemo-experimentation/data/kannada/kannada_train_manifest.json
    sample_rate: 16000
    batch_size: 24
    shuffle: true
    num_workers: 16
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 30
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: 8
    
[NeMo W 2024-07-17 23:51:47 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /root/

[NeMo I 2024-07-17 23:51:47 features:305] PADDING: 0
[NeMo I 2024-07-17 23:51:48 save_restore_connector:272] Model EncDecCTCModelBPE was successfully restored from /home/abhyuday/Desktop/Mini_Project/Models/Kannada.nemo.


In [9]:
transcribe("/home/abhyuday/Desktop/Mini_Project/kannada_audio1.wav",asr_model_kan)

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing: 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]


['ಅದು ದೊಡ್ಡ ಪ್ರಶ್ನೆಯಾಗಿದೆ ಪ್ರತಿಯೊಬ್ಬರು ತಮ್ಮ ಜೀವನಕ್ಕೆ ಏನು ಅರ್ಥ ನೀಡುತ್ತದೆ ಎಂಬುದನ್ನು ನಿರ್ಧರಿಸುತ್ತಾರೆ']

ASR DONE!!!

In [10]:

from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer.IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

BATCH_SIZE = 10
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = "4-bit"

In [11]:
def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig == None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text
        generated_tokens = tokenizer.batch_decode(generated_tokens.detach().cpu().tolist(), src=False)

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

        del inputs
        torch.cuda.empty_cache()

    return translations

In [12]:
indic_en_ckpt_dir =  "ai4bharat/indictrans2-indic-en-dist-200M" # "ai4bharat/indictrans2-indic-en-1B" 
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, "indic-en", quantization=quantization)
ip = IndicProcessor(inference=True)
en_indic_ckpt_dir =   "ai4bharat/indictrans2-en-indic-dist-200M"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, "en-indic", quantization)



    The official Tokenizer is available on HF and can be used as follows:
    ```
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    ```
      tokenizer = IndicTransTokenizer(direction=direction)
    


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /ai4bharat/indictrans2-indic-en-dist-200M/resolve/main/config.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /ai4bharat/indictrans2-indic-en-dist-200M/resolve/main/configuration_indictrans.py HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /ai4bharat/indictrans2-indic-en-dist-200M/resolve/main/modeling_indictrans.py HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /ai4bharat/indictrans2-indic-en-dist-200M/resolve/main/model.safetensors HTTP/11" 404 0
DEBUG:bitsandbytes.cextension:Loading bitsandbytes native library from: /home/abhyuday/miniconda3/envs/miniproj1/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /ai4bharat/indictrans2-indic-en-dist-200M/resolve/main/generatio

    The official Tokenizer is available on HF and can be used as follows:
    ```
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    ```
      tokenizer = IndicTransTokenizer(direction=direction)
    


DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /ai4bharat/indictrans2-en-indic-dist-200M/resolve/main/config.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /ai4bharat/indictrans2-en-indic-dist-200M/resolve/main/configuration_indictrans.py HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /ai4bharat/indictrans2-en-indic-dist-200M/resolve/main/modeling_indictrans.py HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /ai4bharat/indictrans2-en-indic-dist-200M/resolve/main/generation_config.json HTTP/11" 200 0


In [13]:
src_lang,tgt_lang="kan_Knda","eng_Latn"

In [14]:
import google.generativeai as genai

In [15]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY='AIzaSyCC-9zRe_CNF_E4UKksTjXQ9bVUvtsMrVA'
prompt="Give very short answers as u are a voice assistant \n"
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-1.5-flash')

In [16]:
# inp=str(model.generate_content(prompt+transcribe("/home/abhyuday/Desktop/Mini_Project/kannada_audio1.wav",asr_model_kan)[0]).text).replace("**","").replace('*',"").replace(".","").split(".")
# #eng_Latn","kan_Knda"
# hi_translations = "".join(batch_translate(inp, tgt_lang,src_lang,en_indic_model , en_indic_tokenizer, ip))
# hi_translations

In [17]:
inp=str(batch_translate(transcribe("/home/abhyuday/Desktop/Mini_Project/kannadatest.mp4",asr_model_kan),src_lang,tgt_lang,indic_en_model, indic_en_tokenizer, ip)[0])
inp=prompt+" "+inp
out=str(model.generate_content(inp).text).replace("**","").replace('*',"").replace(".","").split(".")
out=batch_translate(out,tgt_lang,src_lang,en_indic_model,en_indic_tokenizer,ip)
out

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', '/home/abhyuday/Desktop/Mini_Project/kannadatest.mp4', '-acodec', 'pcm_s16le', '-vn', '-f', 'wav', '-'])
DEBUG:numba.core.byteflow:bytecode dump:
>          0	NOP(arg=None, lineno=1141)
           2	LOAD_FAST(arg=0, lineno=1144)
           4	LOAD_CONST(arg=1, lineno=1144)
           6	BINARY_SUBSCR(arg=None, lineno=1144)
           8	STORE_FAST(arg=3, lineno=1144)
          10	LOAD_FAST(arg=1, lineno=1145)
          12	UNARY_NEGATIVE(arg=None, lineno=1145)
          14	LOAD_FAST(arg=3, lineno=1145)
          16	DUP_TOP(arg=None, lineno=1145)
          18	ROT_THREE(arg=None, lineno=1145)
          20	COMPARE_OP(arg=1, lineno=1145)
          22	POP_JUMP_IF_FALSE(arg=17, lineno=1145)
          24	LOAD_FAST(arg=1, lineno=1145)
          26	COMPARE_OP(arg=1, lineno=1145)
          28	POP_JUMP_IF_FALSE(arg=21, lineno=1145)
          30	JUMP_FORWARD(arg=2, lineno=1145)
>         32	POP_TOP(arg=None, lineno=1145)
          34	JUMP_FO

Transcribing: 100%|██████████| 1/1 [00:00<00:00,  1.31it/s]


['ಸಿದ್ಧರಾಮಯ್ಯ']

In [2]:
import os
import subprocess
import locale
locale.getpreferredencoding = lambda: "UTF-8"

def download(lang, tgt_dir="./"):
  lang_fn, lang_dir = os.path.join(tgt_dir, lang+'.tar.gz'), os.path.join(tgt_dir, lang)
  cmd = ";".join([
        f"wget https://dl.fbaipublicfiles.com/mms/tts/{lang}.tar.gz -O {lang_fn}",
        f"tar zxvf {lang_fn}"
  ])
  print(f"Download model for language: {lang}")
  subprocess.check_output(cmd, shell=True)
  print(f"Model checkpoints in {lang_dir}: {os.listdir(lang_dir)}")
  return lang_dir

LANG = "eng"
ckpt_dir = download(LANG)

Download model for language: eng


--2024-07-18 01:48:26--  https://dl.fbaipublicfiles.com/mms/tts/eng.tar.gz
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 52.84.205.56, 52.84.205.30, 52.84.205.95, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|52.84.205.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 134859962 (129M) [application/x-tar]
Saving to: ‘./eng.tar.gz’

     0K .......... .......... .......... .......... ..........  0% 1.73M 74s
    50K .......... .......... .......... .......... ..........  0% 2.78M 60s
   100K .......... .......... .......... .......... ..........  0% 12.8M 43s
   150K .......... .......... .......... .......... ..........  0% 1.68M 52s
   200K .......... .......... .......... .......... ..........  0% 37.1M 42s
   250K .......... .......... .......... .......... ..........  0% 9.14M 37s
   300K .......... .......... .......... .......... ..........  0% 7.04M 35s
   350K

Model checkpoints in ./eng: ['G_100000.pth', 'config.json', 'vocab.txt']


In [3]:

from IPython.display import Audio
import os
import re
import glob
import json
import tempfile
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import numpy as np
import vits.commons as commons
import vits.utils as utils
import argparse
import subprocess
from vits.data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from vits.models import SynthesizerTrn
from scipy.io.wavfile import write

def preprocess_char(text, lang=None):
    """
    Special treatement of characters in certain languages
    """
    print(lang)
    if lang == 'ron':
        text = text.replace("ț", "ţ")
    return text

class TextMapper(object):
    def __init__(self, vocab_file):
        self.symbols = [x.replace("\n", "") for x in open(vocab_file, encoding="utf-8").readlines()]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
        Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
        Returns:
        List of integers corresponding to the symbols in the text
        '''
        sequence = []
        clean_text = text.strip()
        for symbol in clean_text:
            symbol_id = self._symbol_to_id[symbol]
            sequence += [symbol_id]
        return sequence

    def uromanize(self, text, uroman_pl):
        iso = "xxx"
        with tempfile.NamedTemporaryFile() as tf, \
             tempfile.NamedTemporaryFile() as tf2:
            with open(tf.name, "w") as f:
                f.write("\n".join([text]))
            cmd = f"perl " + uroman_pl
            cmd += f" -l {iso} "
            cmd +=  f" < {tf.name} > {tf2.name}"
            os.system(cmd)
            outtexts = []
            with open(tf2.name) as f:
                for line in f:
                    line =  re.sub(r"\s+", " ", line).strip()
                    outtexts.append(line)
            outtext = outtexts[0]
        return outtext

    def get_text(self, text, hps):
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        val_chars = self._symbol_to_id
        txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
        print(f"text after filtering OOV: {txt_filt}")
        return txt_filt

def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
    txt = preprocess_char(txt, lang=lang)
    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
    if is_uroman:
        with tempfile.TemporaryDirectory() as tmp_dir:
            if uroman_dir is None:
                cmd = f"git clone git@github.com:isi-nlp/uroman.git {tmp_dir}"
                print(cmd)
                subprocess.check_output(cmd, shell=True)
                uroman_dir = tmp_dir
            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
            print(f"uromanize")
            txt = text_mapper.uromanize(txt, uroman_pl)
            print(f"uroman text: {txt}")
    txt = txt.lower()
    txt = text_mapper.filter_oov(txt)
    return txt

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Run inference with {device}")
vocab_file = f"{ckpt_dir}/vocab.txt"
config_file = f"{ckpt_dir}/config.json"
assert os.path.isfile(config_file), f"{config_file} doesn't exist"
hps = utils.get_hparams_from_file(config_file)
text_mapper = TextMapper(vocab_file)
net_g = SynthesizerTrn(
    len(text_mapper.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
net_g.to(device)
_ = net_g.eval()

g_pth = f"{ckpt_dir}/G_100000.pth"
print(f"load {g_pth}")

_ = utils.load_checkpoint(g_pth, net_g, None)

DEBUG:numba.core.byteflow:bytecode dump:
>          0	NOP(arg=None, lineno=1039)
           2	LOAD_FAST(arg=0, lineno=1042)
           4	LOAD_CONST(arg=1, lineno=1042)
           6	BINARY_SUBSCR(arg=None, lineno=1042)
           8	LOAD_FAST(arg=0, lineno=1042)
          10	LOAD_CONST(arg=2, lineno=1042)
          12	BINARY_SUBSCR(arg=None, lineno=1042)
          14	COMPARE_OP(arg=4, lineno=1042)
          16	LOAD_FAST(arg=0, lineno=1042)
          18	LOAD_CONST(arg=1, lineno=1042)
          20	BINARY_SUBSCR(arg=None, lineno=1042)
          22	LOAD_FAST(arg=0, lineno=1042)
          24	LOAD_CONST(arg=3, lineno=1042)
          26	BINARY_SUBSCR(arg=None, lineno=1042)
          28	COMPARE_OP(arg=5, lineno=1042)
          30	BINARY_AND(arg=None, lineno=1042)
          32	RETURN_VALUE(arg=None, lineno=1042)
DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=0 nstack_initial=0)])
DEBUG:numba.core.byteflow:stack: []
DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=0 nstack_

    


Run inference with cuda
load ./kan/G_100000.pth
INFO:root:Loaded checkpoint './kan/G_100000.pth' (iteration 7693)


In [19]:

def generate_audio(txt):
    print(f"text: {txt}")
    txt = preprocess_text(txt, text_mapper, hps, lang=LANG)
    stn_tst = text_mapper.get_text(txt, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        hyp = net_g.infer(
            x_tst, x_tst_lengths, noise_scale=.667,
            noise_scale_w=0.8, length_scale=1.0
        )[0][0,0].cpu().float().numpy()

    print(f"Generated audio") 
    return Audio(hyp, rate=hps.data.sampling_rate)

In [20]:
generate_audio(out[0])

text: ಸಿದ್ಧರಾಮಯ್ಯ
kan
text after filtering OOV: ಸಿದ್ಧರಾಮಯ್ಯ
Generated audio


In [26]:
inp=str(batch_translate(transcribe("/home/abhyuday/Desktop/Mini_Project/test_kann1.mp4",asr_model_kan),src_lang,tgt_lang,indic_en_model, indic_en_tokenizer, ip)[0])
inp=prompt+" "+inp #
out=str(model.generate_content(inp).text).replace("**","").replace('*',"").replace(".","").split(".")
out=batch_translate(out,tgt_lang,src_lang,en_indic_model,en_indic_tokenizer,ip)
generate_audio(out[0])

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-i', '/home/abhyuday/Desktop/Mini_Project/test_kann1.mp4', '-acodec', 'pcm_s16le', '-vn', '-f', 'wav', '-'])


Transcribing: 100%|██████████| 1/1 [00:00<00:00,  3.49it/s]


text: ಅಲಿಬಾಬಾ ಎಂಬ ಯುವಕನು ನಲವತ್ತು ಕಳ್ಳರಿಂದ ರಕ್ಷಿಸಲ್ಪಟ್ಟ ನಿಧಿಯಿಂದ ತುಂಬಿದ ಗುಪ್ತ ಗುಹೆಯನ್ನು ಕಂಡುಕೊಳ್ಳುತ್ತಾನೆ, ಅವನು ತೆರೆದ ಎಳ್ಳಿನ ಗುಪ್ತಪದವನ್ನು ಬಳಸಿ ಒಳಗೆ ಪ್ರವೇಶಿಸಿ ಸಂಪತ್ತನ್ನು ಕದಿಯುತ್ತಾನೆ, ಆದರೆ ಕಳ್ಳರು ಅವನನ್ನು ಹಿಡಿಯುತ್ತಾರೆ.
kan
text after filtering OOV: ಅಲಿಬಾಬಾ ಎಂಬ ಯುವಕನು ನಲವತ್ತು ಕಳ್ಳರಿಂದ ರಕ್ಷಿಸಲ್ಪಟ್ಟ ನಿಧಿಯಿಂದ ತುಂಬಿದ ಗುಪ್ತ ಗುಹೆಯನ್ನು ಕಂಡುಕೊಳ್ಳುತ್ತಾನೆ ಅವನು ತೆರೆದ ಎಳ್ಳಿನ ಗುಪ್ತಪದವನ್ನು ಬಳಸಿ ಒಳಗೆ ಪ್ರವೇಶಿಸಿ ಸಂಪತ್ತನ್ನು ಕದಿಯುತ್ತಾನೆ ಆದರೆ ಕಳ್ಳರು ಅವನನ್ನು ಹಿಡಿಯುತ್ತಾರೆ
Generated audio
