##DESIGN
"""
Preprocessing

  Convert To Mono
  Audio Enhancement (optional)
  Label transliteration
  Remove punctuation
  Augmentation
  Label copying
  Necessary folder arrangement to load data and feed into wav2vec2


Data
  Folder1
    audio_1
    audio_2
    ...
    transcript    
  Folder2
    audio_1
    audio_2
    ...
    transcript
  ...

DataFolderPath
->
Preprocessor
(walks data folder)
searches each folder for its transcript file; when one is found, creates a transliterated transcript file / generates a transliterated pandas row for it.
Each audio file is augmented (the list of augmentations is provided) and every augmented copy keeps the label of its original.
->
Transliteration using dictionary
->
Augmentation
->
Label Copy
->
Folder format
"""


##SETUP

In [None]:
!pip install torchcodec

In [None]:
!pip install g2p_en

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install nepali-num2word

In [None]:
!git clone https://github.com/adkta/nepali_arabic_num_to_word.git

In [None]:
!rm -r /content/transliteration

In [None]:
!git clone https://github.com/adkta/transliteration.git

In [None]:
import sys
print(sys.path)

In [None]:
import torch
import torchaudio
from typing import Union, Optional
from pathlib import Path
from transliteration.transliterator import Transliterator
from transliteration.transliterators import RomanToDevaTransliterator
import pandas as pd
import re
from collections.abc import Generator

In [None]:
from transliteration.transliterator import TranslitDict

##LOGGING CONFIGURATION

In [None]:
import logging

# Module-wide logger for the preprocessing notebook; records go to a file so that
# long runs do not flood the cell output.
LOG_LEVEL = logging.INFO
_log_path = '/content/preprocessing.log'
logger = logging.getLogger("preprocessing")
logger.setLevel(LOG_LEVEL)

# logging.getLogger returns the same logger object every time this cell is run, so
# without this cleanup each re-run would attach one more FileHandler and every
# record would be written once per attached handler.
for _stale in [h for h in logger.handlers if isinstance(h, logging.FileHandler)]:
    logger.removeHandler(_stale)
    _stale.close()

# utf-8 so that non-ASCII file names / labels can be logged regardless of the
# platform's default encoding.
_handler = logging.FileHandler(_log_path, encoding='utf-8')
_handler.setLevel(LOG_LEVEL)
_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(_handler)

##PREPROCESSING CLASSES

In [None]:
class Utils:
    """Stateless file-system helpers shared by the preprocessing classes."""

    @staticmethod
    def get_matching_files(data_fol: Path, file_pattern: str) -> Generator[Path, None, None]:
        """
        Lazily yield the direct children of ``data_fol`` whose path matches ``file_pattern``.

        :param data_fol: Path Folder whose immediate entries are inspected (no recursion)
        :param file_pattern: str Regular expression searched in each entry's full POSIX path
        :return: Generator of the matching entries, in ``iterdir`` order
        """
        matcher = re.compile(file_pattern)
        yield from (
            entry
            for entry in data_fol.iterdir()
            if matcher.search(entry.as_posix()) is not None
        )

In [None]:
import torchaudio
import torch

class AudioAugmentor:
    """
    Waveform-level augmentations built on torchaudio.

    The methods index waveforms as 2-D tensors: ``shape[0]`` is treated as the channel
    axis and ``shape[1]`` as the frame axis.  Constructing an instance downloads a room
    impulse response (RIR) and a babble-noise clip from the torchaudio tutorial assets,
    so it needs network access the first time.
    """

    def __init__(self) -> None:
        self.download_assets()

    def download_assets(self) -> None:
        """Download and load the RIR and background-noise clips used by the augmentations."""
        rir_loc = torchaudio.utils._download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
        noise_loc = torchaudio.utils._download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
        self.rir, self.rir_sample_rate = torchaudio.load(rir_loc)
        self.noise, self.noise_sample_rate = torchaudio.load(noise_loc)
        # Scale the noise clip to unit L2 norm.
        self.noise = self.noise/torch.linalg.vector_norm(self.noise)

    def resample_assets(self, new_sample_rate: int) -> None:
        """
        Resample the stored RIR and noise clips to ``new_sample_rate``.

        The stored sample rates are updated afterwards, so calling this again (with the
        same or another rate) always converts from the rate the clips really have.

        :param new_sample_rate: int Target sample rate for both assets
        """
        if self.rir_sample_rate != new_sample_rate:
            self.rir = self.resample(self.rir, self.rir_sample_rate, new_sample_rate)
            self.rir_sample_rate = new_sample_rate
        if self.noise_sample_rate != new_sample_rate:
            self.noise = self.resample(self.noise, self.noise_sample_rate, new_sample_rate)
            self.noise_sample_rate = new_sample_rate

    def frame_count(self, waveform: torch.Tensor) -> int:
        """Return the size of the waveform's second axis (its number of frames)."""
        return waveform.shape[1]

    def adjust_noise_frame_count_for_add(self, waveform: torch.Tensor, noise: torch.Tensor) -> torch.Tensor:
        """
        Noise adjustment for addition to signal

        Longer noise is truncated; shorter noise is tiled end-to-end and then truncated,
        so the result always has exactly as many frames as ``waveform``.

        :param waveform: torch.Tensor Signal the noise will be added to
        :param noise: torch.Tensor Noise clip to fit to the signal
        :return: torch.Tensor Noise with the signal's frame count
        :raises ValueError: if ``noise`` has no frames but ``waveform`` does
        """
        noise_frame_count = self.frame_count(noise)
        signal_frame_count = self.frame_count(waveform)
        if noise_frame_count == signal_frame_count:
            return noise
        if noise_frame_count > signal_frame_count:
            return noise[:, :signal_frame_count]
        if noise_frame_count == 0:
            raise ValueError("noise has no frames and cannot be tiled to the signal length")
        quo, rem = divmod(signal_frame_count, noise_frame_count)
        # quo whole copies followed by the first rem frames of one more copy
        return torch.cat([noise] * quo + [noise[:, :rem]], dim=1)

    def validate_audio_data(self, audio_data: Union[str, Path, torch.Tensor]) -> torch.Tensor:
        """
        Normalise the accepted audio inputs to a waveform tensor.

        A tensor is returned unchanged.  A ``str`` or ``Path`` must point to an existing
        file, which is loaded with ``torchaudio.load``; only the waveform is returned
        (the file's sample rate is discarded).

        :param audio_data: Union[str, Path, torch.Tensor] Waveform or path to an audio file
        :return: torch.Tensor The waveform
        :raises FileNotFoundError: if a path is given that is not an existing file
        :raises TypeError: if ``audio_data`` is none of the accepted types
        """
        if isinstance(audio_data, torch.Tensor):
            return audio_data
        if isinstance(audio_data, (str, Path)):
            audio_path = Path(audio_data)
            if not audio_path.is_file():
                raise FileNotFoundError(f"audio_data does not point to an audio file: {audio_path}")
            waveform, _ = torchaudio.load(audio_path)
            return waveform
        raise TypeError("audio_data must be a torch.Tensor, str path to an audio file or Path object to an audio file")

    def convert_to_mono(self, audio_tensor: torch.Tensor) -> torch.Tensor:
        """Average all channels into one; a single-channel tensor is returned unchanged."""
        if audio_tensor.shape[0] == 1:
            return audio_tensor
        return torch.mean(audio_tensor, dim=0, keepdim=True)

    def resample(self, waveform: torch.Tensor, orig_sample_rate: int, new_sample_rate: int) -> torch.Tensor:
        """Resample ``waveform`` from ``orig_sample_rate`` to ``new_sample_rate``."""
        return torchaudio.transforms.Resample(orig_freq=orig_sample_rate, new_freq=new_sample_rate)(waveform)

    def shift_pitch(self, audio_data: torch.Tensor, sample_rate: int, n_steps: int = 2) -> torch.Tensor:
        """
        Shift the pitch of ``audio_data``.

        :param audio_data: torch.Tensor Waveform to shift
        :param sample_rate: int Sample rate of ``audio_data``
        :param n_steps: int Number of steps to shift by, passed to ``torchaudio.transforms.PitchShift``
        :return: torch.Tensor Pitch-shifted waveform
        """
        return torchaudio.transforms.PitchShift(sample_rate = sample_rate, n_steps = n_steps)(audio_data)

    def add_background_crowd(self, audio_data: torch.Tensor, snr: int) -> torch.Tensor:
        """
        Mix the stored babble-noise clip into ``audio_data`` at the given SNR.

        The stored noise is used at whatever sample rate it currently has; call
        ``resample_assets`` first so that it matches the audio.

        :param audio_data: torch.Tensor Waveform to add noise to
        :param snr: int Signal-to-noise ratio handed to ``torchaudio.transforms.AddNoise``
        :return: torch.Tensor Noisy waveform
        """
        adj_resampled_noise = self.adjust_noise_frame_count_for_add(waveform = audio_data, noise = self.noise)
        add_noise = torchaudio.transforms.AddNoise()
        noisy_audio = add_noise(waveform = audio_data, noise = adj_resampled_noise, snr = torch.tensor([snr]))
        return noisy_audio

    def add_white_noise(self, audio_data: torch.Tensor, snr: int, variance: float) -> torch.Tensor:
        """
        Mix Gaussian white noise into ``audio_data`` at the given SNR.

        :param audio_data: torch.Tensor Waveform to add noise to
        :param snr: int Signal-to-noise ratio handed to ``torchaudio.functional.add_noise``
        :param variance: float Factor the standard-normal samples are multiplied by before mixing.
            NOTE(review): despite the name it acts as a scale (standard deviation), and
            add_noise is documented to rescale the noise to the requested SNR, so this
            value presumably has no audible effect - confirm.
        :return: torch.Tensor Noisy waveform
        """
        white_noise = torch.randn_like(audio_data) * variance
        white_noisy_audio = torchaudio.functional.add_noise(audio_data, white_noise,snr=torch.tensor([snr]))
        return white_noisy_audio

    def add_room_reverb(self, audio_data: torch.Tensor) -> torch.Tensor:
        """
        Convolve ``audio_data`` with the stored room impulse response.

        NOTE(review): the RIR is used exactly as loaded (not trimmed to the impulse and
        not normalised), and fftconvolve is called with its default mode, so the result
        is presumably longer than the input - confirm this is intended.
        """
        return torchaudio.functional.fftconvolve(audio_data, self.rir)

    def perturb_speed(self, audio_data: torch.Tensor, sample_rate: int, factors: list[float]) -> torch.Tensor:
        """
        Change the playback speed of ``audio_data``.

        :param audio_data: torch.Tensor Waveform to perturb
        :param sample_rate: int Sample rate of ``audio_data``
        :param factors: list[float] Candidate speed factors passed to ``torchaudio.transforms.SpeedPerturbation``
        :return: torch.Tensor Speed-perturbed waveform (the lengths returned by torchaudio are dropped)
        """
        speed_perturb = torchaudio.transforms.SpeedPerturbation(orig_freq= sample_rate, factors = factors)
        perturbed_audio, _ = speed_perturb(waveform = audio_data)
        return perturbed_audio

    def apply_low_pass_filter(self, audio_data: torch.Tensor, sample_rate: int, cutoff_hz: int = 1000) -> torch.Tensor:
        """
        Low-pass filter ``audio_data`` through the ``lowpass`` effect of ``torchaudio.io.AudioEffector``.

        NOTE(review): ``torchaudio.io.AudioEffector`` belongs to torchaudio's FFmpeg
        integration; confirm it is available in the installed torchaudio version.

        :param audio_data: torch.Tensor Waveform to filter
        :param sample_rate: int Sample rate of ``audio_data``
        :param cutoff_hz: int Value of the effect's ``frequency`` option
        :return: torch.Tensor Filtered waveform, in the same axis order as the input
        """
        req_effect = f"lowpass=frequency={cutoff_hz}"
        effector = torchaudio.io.AudioEffector(effect=req_effect)
        # The effector is fed the transposed waveform, chunks are concatenated along
        # their first axis, and the result is transposed back to the input's axis order.
        filtered_audio_list = list(effector.stream(waveform=audio_data.T, sample_rate = sample_rate, frames_per_chunk = 48000))
        filtered_audio = torch.cat(filtered_audio_list, dim=0).T
        return filtered_audio



In [None]:
from typing import IO
from transliteration.transliterator import Transliterator


class AudioPreprocessor:
    """
    Flattens a tree of "transcript file + numbered audio files" folders into one output folder.

    Audio identifiers in the transcript file will only be searched non-recursively in the folder housing the transcript file.
    Currently the preprocessor expects the following format in the transcript file.
    Each line corresponds to audio_<audio_number>.<audio_ext>. Eg. line 1 would be for audio_1.mp3, line 2 for audio_2.mp3 and so on.
    Blank lines are ignored, and the pairing is positional: the n-th non-blank line is paired with the n-th
    audio file when the audio files are sorted by <audio_number>.
    A future TODO may process the file in format: audio_id<delimiter>label (which is the output format)
    """

    audio_ext_pattern = r"\.mp3$|\.wav$|\.opus$"
    DEFAULT_RESAMPLE_RATE = 16000
    TKNZR_PATTERN = re.compile(TranslitDict.PUNCT_SPACE_REGEX)

    def __init__(
        self,
        data_folder: str,
        out_folder: str,
        skip: Union[tuple[str],None] = None,
        transcr_name:tuple[str] = ("transcript.txt",),
        out_transcr_name: str = "transcript.txt",
        transliterator: Optional[Transliterator] = None,
        save_aud: bool = True,
        augmentor: Optional[AudioAugmentor] = None,
        mono: bool = True,
        resample_rate: Optional[int] = DEFAULT_RESAMPLE_RATE,
        augment: bool = False
        ) -> None:
        """
        :param data_folder: str Input data folder
        :param out_folder: str Output folder
        :param skip: Union[tuple[str], None] Names of directory entries to skip while walking the input tree
        :param transcr_name: tuple[str] Names (glob patterns) that qualify as transcript files
        :param out_transcr_name: str Name of the transcript file written into the output folder
        :param transliterator: Optional[Transliterator] Transliterator to use. If not specified will not transliterate labels
        :param save_aud: bool Write audio files. If False only the output transcript is produced
        :param augmentor: Optional[AudioAugmentor] Augmentor to use. If not specified, will not augment the audio
        :param mono: bool Convert to mono channel
        :param resample_rate: Optional[int] Resample rate. If falsy the audio keeps its sample rate
        :param augment: bool Also produce the augmented variants of every audio (needs an augmentor)
        """
        self.data_folder = Path(data_folder)
        self.out_folder = Path(out_folder)
        self.skip = skip
        self.transcr_name = transcr_name
        self.out_transcr_name = out_transcr_name
        self.transliterator = transliterator
        self.save_aud = save_aud
        self.augmentor = augmentor
        self.mono = mono
        self.resample_rate = resample_rate
        self.augment = augment

    def is_audio(self, path: Path) -> bool:
        """Return True when ``path`` is a file whose name matches ``audio_ext_pattern``."""
        return path.is_file() and re.search(AudioPreprocessor.audio_ext_pattern, path.name) is not None

    def get_transcr_file(self, folder: Path) -> Optional[Path]:
        """
        Find the transcript file of ``folder`` (non-recursive).

        :param folder: Path Folder to search
        :return: Optional[Path] The transcript file, or None when the folder has none
        :raises AssertionError: if more than one distinct file matches ``transcr_name``
        """
        transcript_files: list[Path] = []
        for fn in self.transcr_name:
            logger.info(f"Searching for {fn} in {folder}")
            for match in folder.glob(fn):
                # Two patterns may match the same file; that is not an ambiguity.
                if match not in transcript_files:
                    transcript_files.append(match)

        assert len(transcript_files) < 2, "More than one transcript file"
        return transcript_files[0] if transcript_files else None

    def get_audio_files(self, folder: Path) -> Generator[Path, None, None]:
        """Yield the direct children of ``folder`` whose path matches ``audio_ext_pattern``."""
        return Utils.get_matching_files(data_fol = folder, file_pattern = AudioPreprocessor.audio_ext_pattern)

    def get_transcripts(self, transcript_file: Path) -> Generator[str, None, None]:
        """
        Yield the labels of ``transcript_file``, one per non-blank line.

        Lines are stripped, and transliterated when a transliterator is configured.
        The file is read as UTF-8, the same encoding the output transcript is written in.
        """
        with open(transcript_file, 'r', encoding = 'utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if self.transliterator:
                    line = self.transliterate_line(line)
                yield line

    def transliterate_line(self, line: str) -> str:
        """
        Transliterate ``line`` word by word.

        The line is split with ``TKNZR_PATTERN``; each token the transliterator accepts
        (``for_transliteration``) is replaced by its transliteration, and the tokens are
        joined with single spaces.  Empty tokens produced by the split are dropped so the
        result carries no leading, trailing or doubled spaces.
        """
        tl_words = []
        for word in AudioPreprocessor.TKNZR_PATTERN.split(line):
            if not word:
                continue
            if self.transliterator.for_transliteration(word):
                word = self.transliterator.translit(word)
            tl_words.append(word)
        return " ".join(tl_words)

    @staticmethod
    def _audio_number(audio_path: Path) -> Optional[int]:
        """Return N for a file named ``<prefix>_<N>.<ext>``, or None when the name has another shape."""
        parts = audio_path.stem.split('_')
        if len(parts) != 2:
            return None
        try:
            return int(parts[1])
        except ValueError:
            return None

    def get_audio_label_dict(self, folder: Path) -> Optional[dict[Path, str]]:
        """
        Pair the numbered audio files of ``folder`` with the lines of its transcript file.

        Audio files are sorted by their number and paired positionally with the labels.
        Files that do not look like ``<prefix>_<number>.<ext>`` are ignored.  When the two
        counts differ a warning is logged and the surplus audio files / labels are left out.

        :param folder: Path Folder holding a transcript file and its audio files
        :return: Optional[dict[Path, str]] audio path -> label, or None when the folder has no transcript file
        """
        transcript_file = self.get_transcr_file(folder)
        if not transcript_file:
            return None

        numbered_audio: list[tuple[int, Path]] = []
        for audio_path in self.get_audio_files(folder):
            number = self._audio_number(audio_path)
            if number is None:
                logger.debug(f"Ignoring {audio_path}: not named <prefix>_<number>.<ext>")
                continue
            numbered_audio.append((number, audio_path))
        numbered_audio.sort(key = lambda pair: pair[0])
        sorted_audio_path: list[Path] = [audio_path for _, audio_path in numbered_audio]

        transcripts: list[str] = list(self.get_transcripts(transcript_file))
        if len(sorted_audio_path) != len(transcripts):
            # zip() below stops at the shorter side; make the mismatch visible instead of silent.
            logger.warning(
                f"{folder}: {len(sorted_audio_path)} numbered audio files but {len(transcripts)} transcript lines; "
                "only the leading pairs are used and labels may be misaligned"
            )
        audio_label_dict: dict[Path, str] = dict(zip(sorted_audio_path, transcripts))
        return audio_label_dict

    def write_to_transcript(self, audio_path: Path, label: str, out_transcr_file: IO, ext: bool = False) -> None:
        """
        Append one ``<audio id>\\t<label>`` row to the output transcript.

        :param audio_path: Path Audio file the row is for
        :param label: str Label of the audio
        :param out_transcr_file: IO Open output transcript
        :param ext: bool Use the file name with its extension as the id instead of the stem
        """
        audio_id = audio_path.name if ext else audio_path.stem
        logger.debug(f"Writing to {out_transcr_file}")
        out_transcr_file.write(f"{audio_id}\t{label}\n")

    def _register_variant(self, audio_path: Path, out_path: Path, tag: str, label: str, out_transcr_file: IO) -> Optional[Path]:
        """
        Write the transcript row of one output variant and tell the caller where to save its audio.

        The output file is named ``<source folder stem>_<tag>_<source file name>`` (``<tag>_`` is
        omitted when ``tag`` is empty).

        :return: Optional[Path] The output audio path, or None when ``save_aud`` is False
        """
        prefix = audio_path.parent.stem
        file_name = f"{prefix}_{tag}_{audio_path.name}" if tag else f"{prefix}_{audio_path.name}"
        variant_path = out_path / file_name
        self.write_to_transcript(variant_path, label, out_transcr_file, ext = False)
        return variant_path if self.save_aud else None

    def save_original(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        """Write the transcript row of the (mono / resampled) original and, if ``save_aud``, its audio."""
        og_path = self._register_variant(audio_path, out_path, "", label, out_transcr_file)
        if og_path is None:
            return
        torchaudio.save(uri = og_path, src = audio, sample_rate = sample_rate)

    def save_pitch_shifted(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        """Write the transcript row of the pitch-shifted variant (tag ``ps``) and, if ``save_aud``, its audio."""
        ps_path = self._register_variant(audio_path, out_path, "ps", label, out_transcr_file)
        if ps_path is None:
            return
        pitch_shifted_audio = self.augmentor.shift_pitch(audio, sample_rate)
        torchaudio.save(uri= ps_path, src = pitch_shifted_audio, sample_rate = sample_rate )

    def save_noisy(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, snr: int, label: str, out_transcr_file: IO) -> None:
        """Write the transcript row of the background-noise variant (tag ``n``) and, if ``save_aud``, its audio."""
        n_path = self._register_variant(audio_path, out_path, "n", label, out_transcr_file)
        if n_path is None:
            return
        noisy_audio = self.augmentor.add_background_crowd(audio, snr = snr)
        torchaudio.save(uri= n_path, src = noisy_audio, sample_rate = sample_rate )

    def save_white_noisy(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, snr: int, variance: float,  label: str, out_transcr_file: IO) -> None:
        """Write the transcript row of the white-noise variant (tag ``wn``) and, if ``save_aud``, its audio."""
        wn_path = self._register_variant(audio_path, out_path, "wn", label, out_transcr_file)
        if wn_path is None:
            return
        white_noisy_audio = self.augmentor.add_white_noise(audio, snr = snr, variance = variance)
        torchaudio.save(uri= wn_path, src = white_noisy_audio, sample_rate = sample_rate )

    def save_room_reverbed(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        """Write the transcript row of the room-reverb variant (tag ``rr``) and, if ``save_aud``, its audio."""
        rr_path = self._register_variant(audio_path, out_path, "rr", label, out_transcr_file)
        if rr_path is None:
            return
        room_reverbed_audio = self.augmentor.add_room_reverb(audio)
        torchaudio.save(uri= rr_path, src = room_reverbed_audio, sample_rate = sample_rate )

    def save_speed_perturbed(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        """Write the transcript row of the speed-perturbed variant (tag ``sp``) and, if ``save_aud``, its audio."""
        sp_path = self._register_variant(audio_path, out_path, "sp", label, out_transcr_file)
        if sp_path is None:
            return
        speed_perturbed_audio = self.augmentor.perturb_speed(audio, sample_rate, factors = [0.9, 0.95, 1.05, 1.1])
        torchaudio.save(uri= sp_path, src = speed_perturbed_audio, sample_rate = sample_rate )

    def save_low_passed(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        """Write the transcript row of the low-pass variant (tag ``lpf``) and, if ``save_aud``, its audio."""
        lpf_path = self._register_variant(audio_path, out_path, "lpf", label, out_transcr_file)
        if lpf_path is None:
            return
        # Filter only once we know the result will be saved.
        low_passed_audio = self.augmentor.apply_low_pass_filter(audio, sample_rate)
        torchaudio.save(uri= lpf_path, src = low_passed_audio, sample_rate = sample_rate )

    def write_transcript_headers(self, out_transcr_file: Optional[IO] = None) -> None:
        """
        Write the two ``TranslitDict.DEFAULT_HEADERS`` column names as a tab-separated header row.

        :param out_transcr_file: Optional[IO] Open output transcript to write the header to
        :raises ValueError: if no file is given
        """
        if out_transcr_file is None:
            raise ValueError("out_transcr_file is required to write the transcript headers")
        header_1, header_2 = TranslitDict.DEFAULT_HEADERS
        out_transcr_file.write(f"{header_1}\t{header_2}\n")

    def preprocess(self) -> None:
        """
        Run the whole pipeline: create the output folder, then walk ``data_folder`` and write
        the output transcript (and audio, if ``save_aud``).

        No header row is written to the output transcript.

        :raises FileExistsError: if the output folder already exists (earlier results are never mixed with new ones)
        """
        if self.augment and not self.augmentor:
            logger.warning("augment=True but no augmentor was given; only the original audio will be processed")

        if self.augmentor and self.resample_rate:
            self.augmentor.resample_assets(self.resample_rate)

        self.out_folder.mkdir(parents = True)

        out_transcr_path = self.out_folder / Path(self.out_transcr_name)
        with open(out_transcr_path, mode = 'w', encoding = 'utf-8') as out_transcr_file:
            self.preprocess_dir(self.data_folder, out_transcr_file)

    def _load_audio(self, audio_path: Path) -> tuple[Optional[torch.Tensor], Optional[int]]:
        """
        Load ``audio_path`` and apply the configured mono conversion and resampling.

        Nothing is loaded in transcript-only mode (``save_aud`` False), because the audio would
        never be written; ``(None, None)`` is returned instead.  Mono conversion and resampling go
        through the augmentor when there is one and through torch / torchaudio directly otherwise,
        so they also work without an augmentor.
        """
        if not self.save_aud:
            return None, None

        audio, sample_rate = torchaudio.load(audio_path)

        if self.mono:
            if self.augmentor:
                audio = self.augmentor.convert_to_mono(audio)
            elif audio.shape[0] > 1:
                audio = torch.mean(audio, dim=0, keepdim=True)

        if self.resample_rate:
            if self.augmentor:
                audio = self.augmentor.resample(audio, sample_rate, self.resample_rate)
            else:
                audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.resample_rate)(audio)
            sample_rate = self.resample_rate

        return audio, sample_rate

    def preprocess_dir(self, folder: Path, out_transcr_file: IO) -> None:
        """
        Process ``folder``: first every sub-folder (recursively), then the folder's own audio files.

        :param folder: Path Folder to process
        :param out_transcr_file: IO Open output transcript that rows are appended to
        """
        # If there are nested folders go inside and handle separately as a unit (of transcript file and audio files)
        for path in folder.iterdir():
            if self.skip and path.name in self.skip: #directory to be skipped
                continue

            if path.is_dir():
                self.preprocess_dir(path, out_transcr_file)

        # the presence of a transcript file tells us that this is a concerned directory (an audio-transcript unit)
        audio_label_dict = self.get_audio_label_dict(folder)

        if not audio_label_dict:
            return

        out_path = self.out_folder
        do_augment = bool(self.augment and self.augmentor)
        for audio_path, label in audio_label_dict.items():
            audio, sample_rate = self._load_audio(audio_path)

            self.save_original(audio_path, out_path, audio, sample_rate, label, out_transcr_file)

            if not do_augment:
                continue

            self.save_pitch_shifted(audio_path, out_path, audio, sample_rate, label, out_transcr_file)
            self.save_noisy(audio_path, out_path, audio, sample_rate, 20, label, out_transcr_file)
            self.save_white_noisy(audio_path, out_path, audio, sample_rate, 20, 0.01, label, out_transcr_file)
            self.save_room_reverbed(audio_path, out_path, audio, sample_rate, label, out_transcr_file)
            self.save_speed_perturbed(audio_path, out_path, audio, sample_rate, label, out_transcr_file)
            self.save_low_passed(audio_path, out_path, audio, sample_rate, label, out_transcr_file)





##COPY TEST FOLDER AND RUN TESTS

###CS MANUAL DATASET

In [None]:
!rm -r /content/data
!mkdir /content/data

In [None]:
!ls /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/

In [None]:
!cp -r /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/* /content/data/
# %cd /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/
# !cp -r SG_Uniq_Poet SG_Palesha_Goverdhan SS_Dr_Jaya /content/data/
# %cd ~

In [None]:
!cp '/content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/Transliteration Dictionary/Roman_Devanagari_Translit_Dict.json' /content/

In [None]:
translitr = RomanToDevaTransliterator(translit_dict='/content/Roman_Devanagari_Translit_Dict.json')

In [None]:
!rm -r /content/data_out

####AUGMENT AND TRANSLITERATE AND SAVE AUDIO

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = translitr, augmentor = AudioAugmentor(), augment = True)
preprocessor.preprocess(); #To include extensions in file ids in transcript file see method write_to_transcript(). Perhaps this feature should be included as an attribute in AudioPreprocessor class.


####WRITE TRANSCRIPTS IN PROPER FORMAT WITHOUT TRANSLITERATION (AUDIO_ID TRANSCRIPT). NO AUDIO SAVING

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", augmentor = AudioAugmentor(), save_aud=False)
preprocessor.preprocess()

####TRANSLITERATE AND AUGMENT BUT DON'T SAVE AUDIO
This only creates the transcript file. The audio is not actually augmented, but each augmented variant's transcript (identical to the original's) is still written to the transcript file under the augmented audio's id.

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = translitr, augmentor = AudioAugmentor(), augment = True,  save_aud=False)
preprocessor.preprocess()

####AUGMENT BUT DON'T SAVE AUDIO NO TRANSLITERATION

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = None, augmentor = AudioAugmentor(), augment = True,  save_aud=False)
preprocessor.preprocess()

In [None]:
!cat /content/data_out/transcript.txt | wc -l


In [None]:
!cp /content/data_out/transcript.txt /content/drive/MyDrive/MSICE/native_augmented_transcript.txt

####SAVE TO DRIVE

In [None]:
!zip -r /content/data_out.zip /content/data_out
!cp /content/data_out.zip /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual_Augmented.zip

###BHATTA DATASET RESAMPLING, CONVERSION TO MONO

In [None]:
!rm -r /content/Bhatta_Normalized_DataSet/
!mkdir --parents /content/Bhatta_Normalized_DataSet/bharat1/ /content/Bhatta_Normalized_DataSet/ganNeshsir3/

In [None]:
!cp -r /content/drive/MyDrive/MSICE/Pure_Nepali_Labelled_Speech_Data/Dataset/Train/SECTEC /content/

In [None]:
!ls -l /content/SECTEC

In [None]:
import torchaudio
from pathlib import Path
import re

def normalize_bhatta_dataset(data_fol:str, out_fol:str, mono: bool, new_sample_rate: int, audio_ext_pattern: str = r"\.wav$") -> None:
    """
    Copy the audio files of ``data_fol`` into ``out_fol``, optionally as mono and resampled.

    Only the direct children of ``data_fol`` are considered.  The samples are really
    resampled to ``new_sample_rate``; merely saving the untouched samples under a new
    rate would change the speed and pitch of the audio.

    :param data_fol: str Folder holding the source audio files
    :param out_fol: str Folder the normalized files are written to (created if missing)
    :param mono: bool Average all channels into one
    :param new_sample_rate: int Target sample rate. If falsy the audio keeps its sample rate
    :param audio_ext_pattern: str Regular expression a file name must match to be processed
    """
    data_fol = Path(data_fol)
    out_fol = Path(out_fol)
    out_fol.mkdir(parents=True, exist_ok=True)

    #AUDIO
    ext_matcher = re.compile(audio_ext_pattern)
    for path in data_fol.iterdir():
        if not path.is_file() or not ext_matcher.search(path.name):
            continue
        audio, sample_rate = torchaudio.load(path)
        if mono and audio.shape[0] > 1:
            # channel average, keeping the channel axis
            audio = audio.mean(dim=0, keepdim=True)
        if new_sample_rate and new_sample_rate != sample_rate:
            audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)(audio)
            sample_rate = new_sample_rate
        torchaudio.save(uri = f"{out_fol}/{path.name}", src = audio, sample_rate=sample_rate)



In [None]:
normalize_bhatta_dataset(data_fol = "/content/SECTEC/bharat1", out_fol = "/content/Bhatta_Normalized_DataSet/bharat1/", mono = True, new_sample_rate = 16000)
!cp /content/SECTEC/bharat1/SECTEC_bharat1.trans.txt /content/Bhatta_Normalized_DataSet/bharat1/

In [None]:
!cat /content/Bhatta_Normalized_DataSet/bharat1/SECTEC_bharat1.trans.txt | wc -l
!ls /content/Bhatta_Normalized_DataSet/bharat1/ | grep 'wav' | wc -l

In [None]:
normalize_bhatta_dataset(data_fol = "/content/SECTEC/ganNeshsir3", out_fol = "/content/Bhatta_Normalized_DataSet/ganNeshsir3", mono = True, new_sample_rate = 16000)
!cp /content/SECTEC/ganNeshsir3/SECTEC_ganNeshsir3.trans.txt /content/Bhatta_Normalized_DataSet/ganNeshsir3/

In [None]:
!cat /content/Bhatta_Normalized_DataSet/ganNeshsir3/SECTEC_ganNeshsir3.trans.txt | wc -l
!ls /content/Bhatta_Normalized_DataSet/ganNeshsir3/ | grep 'wav' | wc -l

In [None]:
!cp Bhatta_Normalized_DataSet.zip /content/drive/MyDrive/MSICE/

In [None]:
!ls -l /content/drive/MyDrive/MSICE

###CS PREDETERMINED

In [None]:
!rm -r data/
!rm -r data_out/

In [None]:
!cp -r /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Predetermined/ /content/data/

####NO AUGMENTATION, DON'T SAVE AUDIO, NO TRANSLITERATION

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = None, augmentor = AudioAugmentor(), augment = False,  save_aud=False)
preprocessor.preprocess()

In [None]:
!head -20 /content/data_out/transcript.txt

###CS PREDETERMINED 2

In [None]:
!rm -r /content/data_out/
# !rm -r /content/data/

In [None]:
!ls -l /content/drive/MyDrive/MSICE/

In [None]:
!cp -r /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Predetermined_2/ /content/data/

####NO TRANSLITERATION, NO AUGMENTATION, SAVE AUDIO

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transcr_name = ('transcription.txt',), transliterator = None, augmentor = AudioAugmentor(), augment = False,  save_aud=True)
preprocessor.preprocess()

In [None]:
!head -40 /content/data_out/transcript.txt

In [None]:
# %cd /content/data_out/
# !zip English_Nepali_CS_Data_Predetermined_2.zip *
# %cd /content

In [None]:
!cp /content/data_out/transcript.txt /content/drive/MyDrive/MSICE/pt_2_native_transcript.txt

###SLR54 SUBSET

In [None]:
!rm -r /content/data_out/
!rm -r /content/data/

In [None]:
!cp /content/drive/MyDrive/MSICE/SLR54_Subset.zip /content/

In [None]:
!mkdir --parents /content/data/SLR54_Subset/

In [None]:
!unzip /content/SLR54_Subset.zip -d /content/data/SLR54_Subset/

In [None]:
!head -10 /content/data/SLR54_Subset/transcription.txt

In [None]:
!mkdir /content/data_out/

In [None]:
import torchaudio
from pathlib import Path
import re

def normalize_dataset(data_fol:str, out_fol:str, mono: bool, new_sample_rate: int, audio_ext_pattern: str = r"\.flac$", out_ext: str = ".mp3") -> None:
    """
    Convert the audio files of ``data_fol`` into ``out_fol``, optionally as mono and resampled.

    Only the direct children of ``data_fol`` are considered.  Each file is saved as
    ``<stem><out_ext>``.  The samples are really resampled to ``new_sample_rate``; merely
    saving the untouched samples under a new rate would change the speed and pitch of
    the audio.

    :param data_fol: str Folder holding the source audio files
    :param out_fol: str Folder the normalized files are written to (created if missing)
    :param mono: bool Average all channels into one
    :param new_sample_rate: int Target sample rate. If falsy the audio keeps its sample rate
    :param audio_ext_pattern: str Regular expression a file name must match to be processed
    :param out_ext: str Extension (with the leading dot) of the written files
    """
    data_fol = Path(data_fol)
    out_fol = Path(out_fol)
    out_fol.mkdir(parents=True, exist_ok=True)

    #AUDIO
    ext_matcher = re.compile(audio_ext_pattern)
    for path in data_fol.iterdir():
        if not path.is_file() or not ext_matcher.search(path.name):
            continue
        audio, sample_rate = torchaudio.load(path)
        if mono and audio.shape[0] > 1:
            # channel average, keeping the channel axis
            audio = audio.mean(dim=0, keepdim=True)
        if new_sample_rate and new_sample_rate != sample_rate:
            audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)(audio)
            sample_rate = new_sample_rate
        torchaudio.save(uri = f"{out_fol}/{path.stem}{out_ext}", src = audio, sample_rate=sample_rate)


In [None]:
normalize_dataset(data_fol = '/content/data/SLR54_Subset/',out_fol = '/content/data_out/', mono = True, new_sample_rate = 16000)

In [None]:
!ls -l /content/data_out/*|  wc -l

In [None]:
!wc -l /content/data/SLR54_Subset/transcription.txt

In [None]:
%cd /content/data_out/

In [None]:
!zip SLR54_Subset_Normalized.zip *mp3

In [None]:
%cd /content/

In [None]:
!cp /content/data_out/SLR54_Subset_Normalized.zip /content/drive/MyDrive/MSICE/

In [None]:
!cp /content/data/SLR54_Subset/transcription.txt /content/drive/MyDrive/MSICE/slr54_transcript.txt

In [None]:
!rm /content/drive/MyDrive/MSICE/SLR54_Subset.zip

In [None]:
!ls -l /content/drive/MyDrive/MSICE/

###LOKNATH KOIRALA CHEKHOV DATASET

In [None]:
!rm -r data_out/
!rm -r data/

In [None]:
!mkdir /content/data/

In [None]:
!ls -hl /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/Loknath_Koirala_About_Love_Chekhov/

In [None]:
!cp -r /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/Loknath_Koirala_About_Love_Chekhov /content/data/Loknath_Koirala_About_Love_Chekhov

In [None]:
!ls -hl /content/data/Loknath_Koirala_About_Love_Chekhov/

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = None, augmentor = AudioAugmentor(), augment = False,  save_aud=True)
preprocessor.preprocess()

In [None]:
!ls -hl /content/data_out/

In [None]:
!tail -10 /content/data_out/transcript.txt

In [None]:
%cd /content/data_out/

In [None]:
!zip LK_About_Love_Chekhov.zip *

In [None]:
%cd /content/

In [None]:
!cp /content/data_out/LK_About_Love_Chekhov.zip /content/drive/MyDrive/MSICE/

In [None]:
!cp /content/data_out/transcript.txt /content/

In [None]:
!cp /content/transcript.txt '/content/drive/My Drive/MSICE/Loknath_Koirala_About_Love_Chekhov_transcript.txt'