In [2]:
!pip -q install git+https://github.com/NVIDIA/NeMo.git@r1.23.0#egg=nemo_toolkit[asr]
!pip show huggingface-hub
!pip install huggingface-hub==0.23.2

[33mDEPRECATION: git+https://github.com/NVIDIA/NeMo.git@r1.23.0#egg=nemo_toolkit[asr] contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mName: huggingface-hub
Version: 0.26.2
Summary: Client library to download and publish models, datasets and other repos on the huggingface.co hub
Home-page: https://github.com/huggingface/huggingface_hub
Author: Hugging Face, Inc.
Author-email: julien@huggingface.co
License: Apache
Location: /opt/conda/lib/python3.10/site-packages
Requires: filelock, fsspec, packaging, pyyaml, requests, tqdm, typing-extensions
Required-by: accelerate, datasets, nemo_toolkit, timm, tokenizers, transformers
Collecting huggingface-hub==0.23.2
  Downloading huggingface_hub-0.23.2-py3-none-any.whl.metadata (12 kB)
Downloading huggingface_hub-0.23.2-py3-none-any.whl (40

In [3]:
# pyannote.audio provides the speaker-diarization `Pipeline` imported further down.
!pip install -q pyannote.audio

In [4]:
# NOTE(review): moviepy is not imported by any cell visible in this notebook -
# confirm it is still needed before keeping this install.
!pip install -q moviepy

In [5]:
# silero-vad supplies load_silero_vad / read_audio / get_speech_timestamps,
# which transcribe_segments uses for voice-activity detection.
!pip install -q silero-vad

In [5]:
from pyannote.audio import Pipeline
from nemo.collections.asr.models import EncDecMultiTaskModel
import torch
import csv
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
from pydub import AudioSegment
import tempfile

# Function 1: Audio to Diarization
def audio_to_diarization(audio_path, diarization_file):
    """Diarize an audio file with pyannote and save the speaker turns as CSV.

    Args:
        audio_path: Path of the audio file handed to the pyannote pipeline.
        diarization_file: Destination CSV path; it is written with the columns
            ``start``, ``end`` and ``speaker``.

    Returns:
        A list of dicts, one per speaker turn, with the keys ``start``, ``end``
        (taken from the turn's ``start``/``end`` attributes) and ``speaker``
        (formatted as ``"Speaker <label>"``).

    Raises:
        RuntimeError: If the pretrained pipeline could not be loaded.
    """
    import os

    # Read the access token from the environment so the secret does not have to
    # be pasted into the notebook; the literal placeholder remains the fallback
    # for users who edit it in place.
    hf_token = (
        os.environ.get("HUGGINGFACE_ACCESS_TOKEN")
        or os.environ.get("HF_TOKEN")
        or "YOUR_HUGGINGFACE_ACCESS_TOKEN"
    )
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=hf_token)
    if pipeline is None:
        # Fail with an actionable message instead of an AttributeError on `.to`.
        raise RuntimeError(
            "Could not load 'pyannote/speaker-diarization-3.1'; check the "
            "Hugging Face access token and that the model terms were accepted.")

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pipeline.to(device)

    diarization = pipeline(audio_path)
    diarization_result = [
        {'start': turn.start, 'end': turn.end, 'speaker': f"Speaker {speaker}"}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]

    # newline='' lets the csv module control line endings itself.
    with open(diarization_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['start', 'end', 'speaker'])
        writer.writeheader()
        writer.writerows(diarization_result)

    return diarization_result

# Function 2: Load Diarization
def load_diarization(diarization_file):
    """Load diarization rows from a CSV file.

    Args:
        diarization_file: Path of a CSV file with a header row (for example the
            file written by ``audio_to_diarization``).

    Returns:
        A list with one dict per data row, keyed by the header names. All
        values are strings, exactly as ``csv.DictReader`` yields them.
    """
    # newline='' is what the csv module expects for file objects: it keeps
    # line breaks embedded in quoted fields intact and mirrors how the file
    # is written in audio_to_diarization.
    with open(diarization_file, 'r', newline='') as f:
        return list(csv.DictReader(f))

# Function to cut audio into segments based on start and end
def cut_audio(audio, start_ms, end_ms):
    """Return the part of ``audio`` selected by the slice ``start_ms:end_ms``.

    The callers in this notebook pass a pydub ``AudioSegment`` together with
    millisecond offsets, but any object that supports slicing works.
    """
    window = slice(start_ms, end_ms)
    return audio[window]

# Function to transcribe audio segments
def transcribe_segments(audio_path, diarization_result):
    """Split each diarized speaker turn into speech chunks with Silero VAD.

    For every entry of ``diarization_result`` the matching span is cut out of
    the MP3 at ``audio_path``, exported as WAV and run through Silero VAD. Each
    detected speech chunk becomes one output row. The ASR call is still
    disabled, so every row carries the placeholder text ``"dummy"``.

    Args:
        audio_path: Path of the MP3 recording (decoded with
            ``AudioSegment.from_mp3``).
        diarization_result: Iterable of mappings with the keys ``start``,
            ``end`` (seconds, anything ``float()`` accepts) and ``speaker``.

    Returns:
        A list of dicts with the keys ``start`` and ``end`` (the turn start
        plus the chunk offset reported by the VAD), ``speaker`` and ``text``.
    """
    import os

    vad_model = load_silero_vad()

    # The Canary model is configured here so that the ASR call below can be
    # switched on without further setup.
    canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
    decode_cfg = canary_model.cfg.decoding
    decode_cfg.beam.beam_size = 1
    canary_model.change_decoding_strategy(decode_cfg)

    audio = AudioSegment.from_mp3(audio_path)

    transcriptions = []
    # All intermediate WAV files live in one scratch directory that is removed
    # on exit. Previously every segment and every chunk left an empty temp file
    # plus a ".wav" sibling behind.
    with tempfile.TemporaryDirectory() as work_dir:
        segment_path = os.path.join(work_dir, "segment.wav")
        chunk_audio_path = os.path.join(work_dir, "chunk.wav")

        for segment in diarization_result:
            start = float(segment['start'])
            end = float(segment['end'])
            speaker = segment['speaker']

            # pydub slices by milliseconds.
            cut_audio_segment = cut_audio(audio, int(start * 1000), int(end * 1000))
            # export() returns the open output handle; close it explicitly.
            cut_audio_segment.export(segment_path, format="wav").close()

            wav = read_audio(segment_path)
            speech_timestamps = get_speech_timestamps(
                wav,
                vad_model,
                # min_speech_duration_ms=4000,
                max_speech_duration_s=30,
                return_seconds=True  # Return speech timestamps in seconds
            )
            print(speech_timestamps)

            for timestamp in speech_timestamps:
                segment_start, segment_end = timestamp['start'], timestamp['end']

                # VAD offsets are relative to the start of the speaker turn.
                audio_chunk = cut_audio(
                    cut_audio_segment,
                    int(segment_start * 1000),
                    int(segment_end * 1000))
                audio_chunk.export(chunk_audio_path, format="wav").close()

                # TODO: replace the placeholder text with the ASR result, e.g.
                # canary_model.transcribe(paths2audio_files=[chunk_audio_path], batch_size=1)
                new_data = {
                    'start': start + segment_start,
                    'end': start + segment_end,
                    'speaker': speaker,
                    'text': "dummy"
                }
                print(new_data)
                transcriptions.append(new_data)

    return transcriptions

In [5]:
# NOTE: `diars` is only assigned by the cell that calls load_diarization();
# running this cell before that one raises NameError.
print(diars)

NameError: name 'diars' is not defined

In [6]:
# Inputs come from the attached Kaggle dataset: a previously exported
# diarization CSV and the MP3 recording it describes.
diarization_file = "/kaggle/input/anehba/diarization (1).csv"
audio_file = "/kaggle/input/anehba/berita_bule.mp3"
# Rows are dicts of strings; transcribe_segments converts start/end with float().
diars = load_diarization(diarization_file)
# Splits every speaker turn into VAD speech chunks (text is still a placeholder).
transcriptions = transcribe_segments(audio_file, diars)

canary-1b.nemo:   0%|          | 0.00/4.07G [00:00<?, ?B/s]

[NeMo I 2024-12-18 18:44:03 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-12-18 18:44:03 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-12-18 18:44:03 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-12-18 18:44:03 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-12-18 18:44:03 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-12-18 18:44:03 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-12-18 18:44:03 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-12-18 18:44:03 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-12-18 18:44:03 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-12-18 18:44:03 features:289] PADDING: 0


      return torch.load(model_weights, map_location='cpu')
    


[NeMo I 2024-12-18 18:44:19 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.
[NeMo I 2024-12-18 18:44:19 aed_multitask_models:214] Changed decoding strategy to 
    strategy: beam
    compute_hypothesis_token_set: false
    preserve_alignments: null
    compute_langs: false
    beam:
      beam_size: 1
      search_type: default
      len_pen: 1.0
      max_generation_delta: 20
      return_best_hypothesis: true
      preserve_alignments: false
    temperature: 1.0
    
[{'start': 0.1, 'end': 14.2}, {'start': 14.4, 'end': 23.0}, {'start': 23.2, 'end': 24.9}, {'start': 25.3, 'end': 32.0}, {'start': 32.2, 'end': 37.2}, {'start': 37.4, 'end': 42.0}, {'start': 42.1, 'end': 44.5}, {'start': 44.8, 'end': 49.2}, {'start': 49.3, 'end': 52.4}, {'start': 52.7, 'end': 61.7}, {'start': 61.8, 'end': 64.4}, {'start': 64.7, 'end': 66.4}, {'star

In [9]:
import pandas as pd

# List of dictionaries produced by transcribe_segments
data = transcriptions
# Convert to DataFrame. The columns are pinned explicitly so that the CSV gets
# a header row even when `data` is empty; a header-less file cannot be parsed
# by pd.read_csv later on.
df = pd.DataFrame(data, columns=['start', 'end', 'speaker', 'text'])

# Save to CSV
df.to_csv('output.csv', index=False)


In [9]:
from nemo.collections.asr.models import EncDecMultiTaskModel
from pydub import AudioSegment

# Module-level Canary ASR model; transcribe_audio() reads this global.
model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

def transcribe_audio(audio_file_path):
    """Transcribe one audio file with the module-level Canary ``model``.

    The decoding strategy is reset to beam size 1 on every call before the
    file is passed to ``model.transcribe``.

    Args:
        audio_file_path: Path of the audio file to transcribe.

    Returns:
        Whatever ``model.transcribe`` returns for the single-file batch.
    """
    decoding = model.cfg.decoding
    decoding.beam.beam_size = 1
    model.change_decoding_strategy(decoding)

    # Transcribe audio
    return model.transcribe(
        paths2audio_files=[audio_file_path],
        batch_size=16,
    )

def crop_audio(input_file, start_time, end_time, output_file):
    """Cut ``[start_time, end_time]`` out of an audio file and save the result.

    Args:
        input_file: Path of the source audio, decoded with
            ``AudioSegment.from_file``.
        start_time: Start of the crop in seconds.
        end_time: End of the crop in seconds.
        output_file: Destination path. The export format follows its
            extension when that is ``wav``, ``mp3``, ``ogg`` or ``flac``;
            any other extension falls back to MP3 (the previous fixed
            behaviour).

    Returns:
        ``output_file``, unchanged, for convenient chaining.
    """
    import os

    # Previously the data was always MP3, so a ".wav" target ended up holding
    # MP3 bytes. Match the container to the file name instead.
    extension = os.path.splitext(str(output_file))[1].lstrip(".").lower()
    export_format = extension if extension in ("wav", "mp3", "ogg", "flac") else "mp3"

    audio = AudioSegment.from_file(input_file)
    # pydub slices by milliseconds.
    cropped_audio = audio[start_time * 1000:end_time * 1000]
    # export() returns the open output handle; close it explicitly.
    cropped_audio.export(output_file, format=export_format).close()
    return output_file

[NeMo I 2024-12-18 19:38:48 mixins:196] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2024-12-18 19:38:48 mixins:330] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2024-12-18 19:38:48 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-12-18 19:38:48 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-12-18 19:38:48 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-12-18 19:38:48 mixins:330] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2024-12-18 19:38:48 aggregate_tokenizer:72] Aggregate vocab size: 4128


[NeMo W 2024-12-18 19:38:48 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2024-12-18 19:38:48 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2024-12-18 19:38:48 features:289] PADDING: 0
[NeMo I 2024-12-18 19:39:03 save_restore_connector:249] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


In [1]:
import pandas as pd

# Despite its name, `output_file` is the DataFrame of speech chunks written earlier.
output_file = pd.read_csv("/kaggle/working/output.csv")

# Every row is cropped from the same recording, so set the path once.
input_file = "/kaggle/input/anehba/berita_bule.mp3"

for index, row in output_file.iterrows():
    start_time = row['start']
    end_time = row['end']
    # Use an .mp3 name so the extension agrees with the MP3 data in the file
    # (the previous ".wav" name did not).
    cropped_file = f"/kaggle/working/cropped_audio_{index}.mp3"

    cropped_audio_file = crop_audio(input_file, start_time, end_time, cropped_file)

    result = transcribe_audio(cropped_audio_file)
    print(result)

    # transcribe_audio returns a list; an empty result is recorded as a failure.
    output_file.at[index, 'text'] = result[0] if result else "Transcription failed"

output_file.to_csv("/kaggle/working/output_updated.csv", index=False)
print("Transcription completed and CSV updated.")

NameError: name 'crop_audio' is not defined

In [66]:
from pydub import AudioSegment

def crop_audio(input_file, start_time, end_time, output_file):
    """Cut ``[start_time, end_time]`` out of an audio file and save the result.

    Args:
        input_file: Path of the source audio, decoded with
            ``AudioSegment.from_file``.
        start_time: Start of the crop in seconds.
        end_time: End of the crop in seconds.
        output_file: Destination path. The export format follows its
            extension when that is ``wav``, ``mp3``, ``ogg`` or ``flac``;
            any other extension falls back to MP3 (the previous fixed
            behaviour).

    Returns:
        ``output_file``, unchanged, for convenient chaining.
    """
    import os

    # Previously the data was always MP3, so a ".wav" target ended up holding
    # MP3 bytes. Match the container to the file name instead.
    extension = os.path.splitext(str(output_file))[1].lstrip(".").lower()
    export_format = extension if extension in ("wav", "mp3", "ogg", "flac") else "mp3"

    audio = AudioSegment.from_file(input_file)
    # pydub slices by milliseconds.
    cropped_audio = audio[start_time * 1000:end_time * 1000]
    # export() returns the open output handle; close it explicitly.
    cropped_audio.export(output_file, format=export_format).close()
    return output_file

# Spot-check: crop a single 0.6 s window from the recording and transcribe it.
input_file = "/kaggle/input/anehba/berita_bule.mp3"
start_time = 444.29784375 # Start time in seconds
end_time = 444.89784375 # End time in seconds
output_file = "/kaggle/working/cropped_audio.mp3"

cropped_audio_file = crop_audio(input_file, start_time, end_time, output_file)
# transcribe_audio must already be defined by an earlier cell.
result = transcribe_audio(cropped_audio_file)
print(result)

[NeMo I 2024-12-18 20:30:33 aed_multitask_models:214] Changed decoding strategy to 
    strategy: beam
    compute_hypothesis_token_set: false
    preserve_alignments: null
    compute_langs: false
    beam:
      beam_size: 1
      search_type: default
      len_pen: 1.0
      max_generation_delta: 20
      return_best_hypothesis: true
      preserve_alignments: false
    temperature: 1.0
    


Transcribing: 0it [00:00, ?it/s]

      with torch.cuda.amp.autocast(enabled=False):
    


['Tiple A.']
