### Test Noise Classifier

In [1]:
import json
from glob import glob
from pathlib import Path

import pytest
from speechline.classifiers import Wav2Vec2Classifier
from speechline.config import Config, SegmenterConfig, TranscriberConfig
from speechline.run import Runner
from speechline.segmenters import SilenceSegmenter
from speechline.transcribers import Wav2Vec2Transcriber, WhisperTranscriber
from speechline.utils.dataset import format_audio_dataset, prepare_dataframe
from speechline.utils.io import export_transcripts_json
from pydub import AudioSegment
from speechline.classifiers import DistilAstNoiseClassifier


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# End-to-end check of the noise-classification step:
#   1. transcribe every audio file found under `datadir`,
#   2. export the predicted offsets to a JSON file next to each audio file and
#      verify the round trip,
#   3. chunk the audio, insert silence tags and run the noise classifier on
#      the resulting segments.
datadir = "/home/BookBot/speechline/tests/test_ml"
# Passed to `chunk_audio_segments` as its output directory argument.
tmpdir = "/home/BookBot/speechline/tests/test_ml_output"

model_checkpoint = "bookbot/wav2vec2-ljspeech-gruut"
noise_classifier_checkpoint = "bookbot/distil-ast-audioset"

transcriber = Wav2Vec2Transcriber(model_checkpoint)
df = prepare_dataframe(datadir)
dataset = format_audio_dataset(df, sampling_rate=transcriber.sampling_rate)
transcriptions = transcriber.predict(dataset, return_timestamps="char")

# Second pass with `output_offsets=True`; these per-file offsets drive the
# export and segmentation below.
output_offsets = transcriber.predict(
    dataset, return_timestamps="char", output_offsets=True
)

segmenter = SilenceSegmenter()
# The classifier does not depend on the loop variables, so build it once
# instead of once per audio file. This also guarantees `classifier` is defined
# for later cells even when `df` contains no rows.
classifier = DistilAstNoiseClassifier(noise_classifier_checkpoint)

segments = []
for audio_path, offsets in zip(df["audio"], output_offsets):
    json_path = Path(audio_path).with_suffix(".json")
    export_transcripts_json(json_path, offsets)
    assert json_path.exists()
    # Read the export back through a context manager so the file handle is
    # closed deterministically rather than leaked.
    with open(json_path) as json_file:
        assert json.load(json_file) == offsets

    segment = segmenter.chunk_audio_segments(
        audio_path,
        tmpdir,
        offsets,
        minimum_chunk_duration=0.7,
        silence_duration=0.3,
    )

    segment = segmenter.insert_silence_tag(segment, 0.2)
    segment = segmenter.classify_noise(segment, classifier, audio_path)
    segments.append(segment)
print(segments)



                                                                                      

[[[{'start_time': 0.0, 'end_time': 0.04, 'text': 'h'}, {'start_time': 0.14, 'end_time': 0.2, 'text': 'h'}, {'start_time': 0.24, 'end_time': 0.28, 'text': 'ɚ'}, {'start_time': 0.42, 'end_time': 0.44, 'text': 'i'}, {'start_time': 0.5, 'end_time': 0.54, 'text': 'd'}, {'start_time': 0.64, 'end_time': 0.66, 'text': 'ʌ'}, {'start_time': 0.7, 'end_time': 0.74, 'text': 'm'}, {'start_time': 0.78, 'end_time': 0.82, 'text': 'b'}, {'start_time': 0.84, 'end_time': 0.9, 'text': 'ɹ'}, {'start_time': 0.92, 'end_time': 0.94, 'text': 'ɛ'}, {'start_time': 1.0, 'end_time': 1.04, 'text': 'l'}, {'start_time': 1.08, 'end_time': 1.12, 'text': 'ə'}, {'start_time': 1.12, 'end_time': 1.36, 'text': '<SIL>'}, {'start_time': 1.36, 'end_time': 1.38, 'text': 'ɪ'}, {'start_time': 1.54, 'end_time': 1.58, 'text': 'z'}, {'start_time': 1.58, 'end_time': 1.62, 'text': 'd͡ʒ'}, {'start_time': 1.62, 'end_time': 1.66, 'text': 'ʌ'}, {'start_time': 1.72, 'end_time': 1.76, 'text': 's'}, {'start_time': 1.78, 'end_time': 1.82, 'tex



In [131]:
from pydub import AudioSegment

# Experiment: classify audio that was decoded in memory with pydub instead of
# being loaded from its path by the dataset's Audio feature.

# Window bounds in seconds; only the commented-out slicing below uses them.
start_time = 0.0
end_time = 10.0

example_wav_path = "/home/BookBot/speechline/examples/--aE2O5G5WE.wav"

audio = AudioSegment.from_file(example_wav_path)
# Optional: restrict the clip to the window above (pydub slices in ms).
# audio = audio[start_time*1000:end_time*1000]
raw_samples = audio.get_array_of_samples()
print(type(raw_samples))

# `pydub_to_np` is a helper defined outside this cell; its two results are
# used below as the sample array and the sampling rate.
audio_array, sr = pydub_to_np(audio)

# Reference dataset built from the file path.
audio_target_dataset = Dataset.from_dict(
    {"audio": [example_wav_path]}
).cast_column("audio", Audio())

# Dataset built from the in-memory samples.
decoded_record = {"path": None, "array": audio_array, "sampling_rate": sr}
audio_dataset = Dataset.from_dict(
    {"audio": [decoded_record]}
).cast_column("audio", Audio())

# Alternatives that were tried and left disabled:
# audio_bytes = audio.raw_data
# audio_dataset = Dataset.from_dict({"audio": [{'path': None, 'bytes': audio_bytes}]}).cast_column("audio", Audio())
# print(audio_target_dataset[0]['audio'])
# print(audio_dataset[0]['audio'])
# ast(filepath[0], top_k=None)

# Kept as the final expression so the notebook displays the prediction.
classifier.predict(audio_dataset, threshold=0.2)

<class 'array.array'>


                                                                        

[[{'label': 'Speech', 'score': 0.8289180397987366},
  {'label': 'Music', 'score': 0.3479023575782776}]]