##DESIGN
"""
Preprocessing

  Convert To Mono
  Audio Enhancement (optional)
  Label transliteration
  Remove punctuation
  Augmentation
  Label copying
  Necessary folder arrangement to load data and feed into wav2vec2


Data
  Folder1
    audio_1
    audio_2
    ...
    transcript    
  Folder2
    audio_1
    audio_2
    ...
    transcript
  ...

DataFolderPath
->
Preprocessor
(walks data folder)
searches for transcript file, finds file and creates transliterated transcript file/ generates a pandas transliterated row for the file.
Each audio is augmented (list of augmentation is provided) corresponding label
->
Transliteration using dictionary
->
Augmentation
->
Label Copy
->
Folder format
"""


##SETUP

In [1]:
!pip install torchcodec

Collecting torchcodec
  Downloading torchcodec-0.9.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading torchcodec-0.9.0-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.9.0


In [2]:
!pip install g2p_en

Collecting g2p_en
  Downloading g2p_en-2.1.0-py3-none-any.whl.metadata (4.5 kB)
Collecting distance>=0.1.3 (from g2p_en)
  Downloading Distance-0.1.3.tar.gz (180 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading g2p_en-2.1.0-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: distance
  Building wheel for distance (setup.py) ... [?25l[?25hdone
  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16256 sha256=1c303b7eb64304f323e639ed76ca1beef6202cdc86eda84b24caf84ac9ab80f9
  Stored in directory: /root/.cache/pip/wheels/24/a8/58/407063d8e5c1d4dd6594c99d12baa0108570b56a92325587dd
Successfully built distance
Installing collected packages: distance, g2p_en
Successfully installed distance-

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install nepali-num2word

Collecting nepali-num2word
  Downloading nepali_num2word-0.2.3-py3-none-any.whl.metadata (12 kB)
Downloading nepali_num2word-0.2.3-py3-none-any.whl (14 kB)
Installing collected packages: nepali-num2word
Successfully installed nepali-num2word-0.2.3


In [5]:
!git clone https://github.com/adkta/nepali_arabic_num_to_word.git

Cloning into 'nepali_arabic_num_to_word'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 18 (delta 5), reused 7 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (18/18), 4.97 KiB | 1.24 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [6]:
!rm -r /content/transliteration

rm: cannot remove '/content/transliteration': No such file or directory


In [7]:
!git clone https://github.com/adkta/transliteration.git

Cloning into 'transliteration'...
remote: Enumerating objects: 221, done.[K
remote: Total 221 (delta 0), reused 0 (delta 0), pack-reused 221 (from 1)[K
Receiving objects: 100% (221/221), 48.44 KiB | 4.04 MiB/s, done.
Resolving deltas: 100% (118/118), done.


In [8]:
import sys
print(sys.path)

['/content', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.12/dist-packages/IPython/extensions', '/root/.ipython']


In [9]:
import torch
import torchaudio
from typing import Union, Optional
from pathlib import Path
from transliteration.transliterator import Transliterator
from transliteration.transliterators import RomanToDevaTransliterator
import pandas as pd
import re
from collections.abc import Generator

  label.strip('\s,"?.|')
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


In [10]:
from transliteration.transliterator import TranslitDict

##LOGGING CONFIGURATION

In [11]:
import logging

LOG_LEVEL = logging.INFO
_log_path = '/content/preprocessing.log'
logger = logging.getLogger("preprocessing")
logger.setLevel(LOG_LEVEL)
_handler = logging.FileHandler(_log_path)
_handler.setLevel(LOG_LEVEL)
_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(_handler)

##PREPROCESSING CLASSES

In [12]:
class Utils:
    @staticmethod
    def get_matching_files(data_fol: Path, file_pattern: str) -> Generator[Path, None, None]:
        file_pattern = re.compile(file_pattern)
        for f in data_fol.iterdir():
            if file_pattern.search(f.as_posix()):
                yield f

In [13]:
import torchaudio
import torch

class AudioAugmentor:
    def __init__(self) -> None:
        self.download_assets()

    def download_assets(self) -> None:
        rir_loc = torchaudio.utils._download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
        noise_loc = torchaudio.utils._download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
        self.rir, self.rir_sample_rate = torchaudio.load(rir_loc)
        self.noise, self.noise_sample_rate = torchaudio.load(noise_loc)
        self.noise = self.noise/torch.linalg.vector_norm(self.noise)

    def resample_assets(self, new_sample_rate: int) -> None:
        self.rir = torchaudio.transforms.Resample(orig_freq=self.rir_sample_rate, new_freq=new_sample_rate)(self.rir)
        self.noise = torchaudio.transforms.Resample(orig_freq=self.noise_sample_rate, new_freq=new_sample_rate)(self.noise)

    def frame_count(self, waveform: torch.Tensor) -> int:
        return waveform.shape[1]

    def adjust_noise_frame_count_for_add(self, waveform: torch.Tensor, noise: torch.Tensor) -> torch.Tensor:
        """
        Noise adjustment for addition to signal
        """
        noise_frame_count = self.frame_count(noise)
        signal_frame_count = self.frame_count(waveform)
        if noise_frame_count == signal_frame_count:
            pass
        elif noise_frame_count > signal_frame_count:
            noise = noise[:, :signal_frame_count]
        else:
            quo, rem= divmod(signal_frame_count, noise_frame_count)
            repeated_noise_list=[]
            for _ in range(quo):
                repeated_noise_list.append(noise)
            repeated_noise_list.append(noise[:, :rem])
            noise = torch.cat(repeated_noise_list, dim = 1)
        return noise

    def validate_audio_data(self, audio_data: Union[str, Path, torch.Tensor]) -> torch.Tensor:
        assert audio_data.is_file(), "audio_data must be a torch.Tensor, str path to an audio file or Path object to an audio file"

    def convert_to_mono(self, audio_tensor: torch.Tensor) -> torch.Tensor:
        if audio_tensor.shape[0] == 1:
            return audio_tensor
        return torch.mean(audio_tensor, dim=0, keepdim=True)

    def resample(self, waveform: torch.Tensor, orig_sample_rate: int, new_sample_rate: int) -> torch.Tensor:
        return torchaudio.transforms.Resample(orig_freq=orig_sample_rate, new_freq=new_sample_rate)(waveform)

    def shift_pitch(self, audio_data: torch.Tensor, sample_rate: int) -> torch.Tensor:
        return torchaudio.transforms.PitchShift(sample_rate = sample_rate, n_steps = 2)(audio_data)

    def add_background_crowd(self, audio_data: torch.Tensor, snr: int) -> torch.Tensor:
        adj_resampled_noise = self.adjust_noise_frame_count_for_add(waveform = audio_data, noise = self.noise)
        add_noise = torchaudio.transforms.AddNoise()
        noisy_audio = add_noise(waveform = audio_data, noise = adj_resampled_noise, snr = torch.tensor([snr]))
        return noisy_audio

    def add_white_noise(self, audio_data: torch.Tensor, snr: int, variance: int) -> torch.Tensor:
        white_noise = torch.randn_like(audio_data) * variance
        white_noisy_audio = torchaudio.functional.add_noise(audio_data, white_noise,snr=torch.tensor([snr]))
        return white_noisy_audio

    def add_room_reverb(self, audio_data: torch.Tensor) -> torch.Tensor:
        return torchaudio.functional.fftconvolve(audio_data, self.rir)

    def perturb_speed(self, audio_data: torch.Tensor, sample_rate: int, factors: list[float]) -> torch.Tensor:
        speed_perturb = torchaudio.transforms.SpeedPerturbation(orig_freq= sample_rate, factors = factors)
        perturbed_audio, _ = speed_perturb(waveform = audio_data)
        return perturbed_audio

    def apply_low_pass_filter(self, audio_data: torch.Tensor, sample_rate: int) -> torch.Tensor:
        req_effect = "lowpass=frequency=1000"
        effector = torchaudio.io.AudioEffector(effect=req_effect)
        filtered_audio_list = []
        for chunk in effector.stream(waveform=audio_data.T, sample_rate = sample_rate, frames_per_chunk = 48000):
            filtered_audio_list.append(chunk)
        filtered_audio = torch.cat(filtered_audio_list, dim=0).T
        return filtered_audio



In [73]:
from typing import IO
from transliteration.transliterator import Transliterator


class AudioPreprocessor:
    """
    Audio identifiers in the transcript file will only be searched non-recursively in the folder housing the transcript file.
    Currently the preprocessor expects the following format in the transcript file.
    Each line corresponds to audio_<audio_number>.<audio_ext>. Eg. line 1 would be for audio_1.mp3, line 2 for audio_2.mp3 and so on.
    A future to do may process the file in format: audio_id<delimiter>label (which is the output format)
    """

    audio_ext_pattern = r"\.mp3$|\.wav$|\.opus$"
    DEFAULT_RESAMPLE_RATE = 16000
    TKNZR_PATTERN = re.compile(TranslitDict.PUNCT_SPACE_REGEX)

    def __init__(
        self,
        data_folder: str,
        out_folder: str,
        skip: Union[tuple[str],None] = None,
        transcr_name:tuple[str] = ("transcript.txt",),
        out_transcr_name: str = "transcript.txt",
        transliterator: Optional[Transliterator] = None,
        save_aud: bool = True,
        augmentor: Optional[AudioAugmentor] = None,
        mono: bool = True,
        resample_rate: Optional[int] = DEFAULT_RESAMPLE_RATE,
        augment: bool = False
        ) -> None:
        """
        :param data_folder: str Input data folder
        :param out_folder: str Output folder
        :param skip: Union[tuple[str], None] List of files/folders to skip
        :param transcr_name: str List of names that qualify as transcript files
        :param transliterator: Optional[Transliterator] Transliterator to use. If not specified will not transliterate labels
        :param augmentor: Augmentor Augmentor to use. If not specified, will not augment the audio
        :param mono: bool Convert to mono channel
        :param resample_rate: int Resample rate
        """
        self.data_folder = Path(data_folder)
        self.out_folder = Path(out_folder)
        self.skip = skip
        self.transcr_name = transcr_name
        self.out_transcr_name = out_transcr_name
        self.transliterator = transliterator
        self.save_aud = save_aud
        self.augmentor = augmentor
        self.mono = mono
        self.resample_rate = resample_rate
        self.augment = augment

    def is_audio(self, path: Path) -> bool:
        return path.is_file() and path.name.endswith(AudioPreprocessor.audio_ext)

    def get_transcr_file(self, folder: Path) -> Optional[Path]:
        transcript_files = []
        for fn in self.transcr_name:
            logger.info(f"Searching for {fn} in {folder}")
            if transcript_file := list(folder.glob(fn)):
                transcript_files.extend(transcript_file)

        assert len(transcript_files) < 2, "More than one transcript file"
        return transcript_files[0] if transcript_files else None

    def get_audio_files(self, folder: Path) -> Generator[Path, None, None]:
        return Utils.get_matching_files(data_fol = folder, file_pattern = AudioPreprocessor.audio_ext_pattern)

    def get_transcripts(self, transcript_file: Path) -> Generator[str, None, None]:
        punctuations = ',"?.|'
        spaces = '\s\n\r\f'
        removable = punctuations + spaces
        with open(transcript_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                  continue
                if self.transliterator:
                  line = self.transliterate_line(line)
                line = line.replace(removable, '')
                yield line

    def transliterate_line(self, line: str) -> str:
        words = AudioPreprocessor.TKNZR_PATTERN.split(line)
        tl_words = []
        for word in words:
            if self.transliterator.for_transliteration(word):
                word = self.transliterator.translit(word)
            tl_words.append(word)
        return " ".join(tl_words)

    def get_audio_label_dict(self, folder: Path) -> Optional[dict[Path, str]]:
        transcript_file = self.get_transcr_file(folder)
        if not transcript_file:
            return None
        numbered_audio_filter = lambda x: len(x.stem.split('_')) == 2
        audio_num = lambda x: int(x.stem.split('_')[1])
        sorted_audio_path: list[Path] = sorted(filter(numbered_audio_filter, self.get_audio_files(folder)), key = audio_num)
        transcripts: Generator[str, None, None] = self.get_transcripts(transcript_file)
        audio_label_dict:dict[Path, str] = dict(zip(sorted_audio_path, transcripts))
        return audio_label_dict

    def write_to_transcript(self, audio_path: Path, label: str, out_transcr_file: IO, ext: bool = False) -> None:
        audio_id = audio_path.stem #additional assignment for code readability. Con: requires 2 assignment statements if ext = True.
        if ext:
            audio_id = audio_path.name
        logger.debug(f"Writing to {out_transcr_file}")
        out_transcr_file.write(f"{audio_id}\t{label}\n")

    def save_original(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        og_path = out_path / f"{audio_path.parent.stem}_{audio_path.name}"
        self.write_to_transcript(og_path, label, out_transcr_file, ext = False)
        if not self.save_aud:
            return
        torchaudio.save(uri = og_path, src = audio, sample_rate = sample_rate)

    def save_pitch_shifted(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        ps_path = out_path / f"{audio_path.parent.stem}_ps_{audio_path.name}"
        self.write_to_transcript(ps_path, label, out_transcr_file, ext = False)
        if not self.save_aud:
            return
        pitch_shifted_audio = self.augmentor.shift_pitch(audio, sample_rate)
        torchaudio.save(uri= ps_path, src = pitch_shifted_audio, sample_rate = sample_rate )

    def save_noisy(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, snr: int, label: str, out_transcr_file: IO) -> None:
        n_path = out_path / f"{audio_path.parent.stem}_n_{audio_path.name}"
        self.write_to_transcript(n_path, label, out_transcr_file, ext = False)
        if not self.save_aud:
            return
        noisy_audio = self.augmentor.add_background_crowd(audio, snr = snr)
        torchaudio.save(uri= n_path, src = noisy_audio, sample_rate = sample_rate )

    def save_white_noisy(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, snr: int, variance: int,  label: str, out_transcr_file: IO) -> None:
        wn_path = out_path / f"{audio_path.parent.stem}_wn_{audio_path.name}"
        self.write_to_transcript(wn_path, label, out_transcr_file, ext = False)
        if not self.save_aud:
            return
        white_noisy_audio = self.augmentor.add_white_noise(audio, snr = snr, variance = variance)
        torchaudio.save(uri= wn_path, src = white_noisy_audio, sample_rate = sample_rate )

    def save_room_reverbed(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        rr_path = out_path / f"{audio_path.parent.stem}_rr_{audio_path.name}"
        self.write_to_transcript(rr_path, label, out_transcr_file, ext = False)
        if not self.save_aud:
            return
        room_reverbed_audio = self.augmentor.add_room_reverb(audio)
        torchaudio.save(uri= rr_path, src = room_reverbed_audio, sample_rate = sample_rate )

    def save_speed_perturbed(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        sp_path = out_path / f"{audio_path.parent.stem}_sp_{audio_path.name}"
        self.write_to_transcript(sp_path, label, out_transcr_file, ext = False)
        if not self.save_aud:
            return
        speed_perturbed_audio = self.augmentor.perturb_speed(audio, sample_rate, factors = [0.9, 0.95, 1.05, 1.1])
        torchaudio.save(uri= sp_path, src = speed_perturbed_audio, sample_rate = sample_rate )

    def save_low_passed(self, audio_path: Path, out_path: Path, audio: torch.Tensor, sample_rate: int, label: str, out_transcr_file: IO) -> None:
        lpf_path = out_path / f"{audio_path.parent.stem}_lpf_{audio_path.name}"
        low_passed_audio = self.augmentor.apply_low_pass_filter(audio, sample_rate)
        self.write_to_transcript(lpf_path, label, out_transcr_file, ext = False)
        if not self.save_aud:
            return
        torchaudio.save(uri= lpf_path, src = low_passed_audio, sample_rate = sample_rate )

    def write_transcript_headers(self)->None:
        header_1, header_2 = TranslitDict.DEFAULT_HEADERS
        out_transcr_file.write(f"{header_1}\t{header_2}\n")


    def preprocess(self) -> None:
        if self.augmentor and self.resample_rate:
            self.augmentor.resample_assets(self.resample_rate)

        self.out_folder.mkdir()

        out_transcr_path = self.out_folder / Path(self.out_transcr_name)
        with open(out_transcr_path, mode = 'w', encoding = 'utf-8') as out_transcr_file:
            self.preprocess_dir(self.data_folder, out_transcr_file)

    def preprocess_dir(self, folder: Path, out_transcr_file: IO) -> None:
        # If there are nested folders go inside and handle separately as a unit (of transcript file and audio files)
        for path in folder.iterdir():
            if self.skip and path.name in self.skip: #directory to be skipped
                continue

            if path.is_dir():
                self.preprocess_dir(path, out_transcr_file)
                continue

        # the presence of a transcript file tells us that this is a concerned directory (an audio-transcript unit)
        audio_label_dict = self.get_audio_label_dict(folder)

        if not audio_label_dict:
            return

        out_path = self.out_folder
        #self.write_transcript_headers()
        for audio_path in audio_label_dict.keys():

            label = audio_label_dict[audio_path]
            audio, sample_rate = torchaudio.load(audio_path)

            if not self.augmentor:
                continue

            if self.mono:
                audio = self.augmentor.convert_to_mono(audio)

            if self.resample_rate:
                audio = self.augmentor.resample(audio, sample_rate, self.resample_rate)
                sample_rate = self.resample_rate

            self.save_original(audio_path, out_path, audio, sample_rate, label, out_transcr_file)

            if not self.augment:
                continue

            self.save_pitch_shifted(audio_path, out_path, audio, sample_rate, label, out_transcr_file)
            self.save_noisy(audio_path, out_path, audio, sample_rate, 20, label, out_transcr_file)
            self.save_white_noisy(audio_path, out_path, audio, sample_rate, 20, 0.01, label, out_transcr_file)
            self.save_room_reverbed(audio_path, out_path, audio, sample_rate, label, out_transcr_file)
            self.save_speed_perturbed(audio_path, out_path, audio, sample_rate, label, out_transcr_file)
            self.save_low_passed(audio_path, out_path, audio, sample_rate, label, out_transcr_file)





  spaces = '\s\n\r\f'


##COPY TEST FOLDER AND RUN TESTS

###CS MANUAL DATASET

In [None]:
!rm -r /content/data
!mkdir /content/data

rm: cannot remove '/content/data': No such file or directory


In [None]:
!ls /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/

 Doers_1				      SS_Dr_Dilip_Sharma
 Doers_Accidental_Banker		      SS_Dr_Gehendra_Purush_Dhakal
 Doers_Hivelaya				      SS_Dr_Gehendra_Purush_Dhakal_2
 Doers_Pahadi_Foods_Prashant_Ghimire_1	      SS_Dr_Jaya
 Doers_Pahadi_Foods_Prashant_Ghimire_2	      SS_Dr_Sanjay_Sharma
 Doers_Prasanna_Dhungel_Market_Intelligence   SS_Dr_Sanjay_Sharma_2
 Doers_Prof_Samrachana			     'SS_Mahabir Paudyal'
 PP_Iih					      SS_Munu_Adhikari
 PP_Swarnim_Wagle			      SS_Pradip_Raj_Giri
 PP_Swarnim_Wagle_2			      SS_Rupeshwor_Gaur_Das
 SG_Palesha_Goverdhan			      SS_Sanjog_Koirala
 SG_Palesha_Goverdhan_2			      SS_Sanjog_Koirala_2
 SG_Uniq_Poet				      SS_Siddhant_Acharya
 SS_Amod_Nath_Pyakuryal			      SS_Skanda_Gautam
 SS_Asheem_Basnyat			      SS_Sudheer_Sharma
 SS_Bibhusan_Bista			      SS_Sudheer_Sharma_2
 SS_Chanira_Bajracharya			      SS_Suresh_Bhattarai
 SS_Chiran_Jung_Thapa			      SS_Suresh_Dhakal
 SS_Dipak_Gyawali			     'Transliteration Dictionary'


In [None]:
!cp -r /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/* /content/data/
# %cd /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/
# !cp -r SG_Uniq_Poet SG_Palesha_Goverdhan SS_Dr_Jaya /content/data/
# %cd ~

In [None]:
!cp '/content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/Transliteration Dictionary/Roman_Devanagari_Translit_Dict.json' /content/

In [None]:
translitr = RomanToDevaTransliterator(translit_dict='/content/Roman_Devanagari_Translit_Dict.json')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [None]:
!rm -r /content/data_out

####AUGMENT AND TRANSLITERATE AND SAVE AUDIO

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = translitr, augmentor = AudioAugmentor(), augment = True)
preprocessor.preprocess(); #To include extensions in file ids in transcript file see method write_to_transcript(). Perhaps this feature should be included as an attribute in AudioPreprocessor class.


####WRITE TRANSCRIPTS IN PROPER FORMAT WITHOUT TRANSLITERATION (AUDIO_ID TRANSCRIPT). NO AUDIO SAVING

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", augmentor = AudioAugmentor(), save_aud=False)
preprocessor.preprocess()

  rir_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
  noise_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Dr_Gehendra_Purush_Dhakal
INFO:preprocessing:Searching for transcript.txt in /content/data/PP_Swarnim_Wagle
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Chanira_Bajracharya
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Pradip_Raj_Giri
INFO:preprocessing:Searching for transcript.txt in /content/data/Doers_1
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Chiran_Jung_Thapa
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Dr_Sanjay_Sharma_2
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Skanda_Gautam
INFO:preprocessing:Searching for transcript.txt in /content/data/Doers_Prof_Samrachana
INFO:p

####TRANSLITERATE AND AUGMENT BUT DON'T SAVE AUDIO
This will only create transcript file. Will not actually augment the audio but the augmented audio's transcript (same as the original) are duplicated in the transcript file with proper labels (i.e. augmented audio ids)

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = translitr, augmentor = AudioAugmentor(), augment = True,  save_aud=False)
preprocessor.preprocess()

  rir_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
  noise_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Dr_Gehendra_Purush_Dhakal
INFO:preprocessing:Searching for transcript.txt in /content/data/PP_Swarnim_Wagle
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Chanira_Bajracharya
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Pradip_Raj_Giri
INFO:preprocessing:Searching for transcript.txt in /content/data/Doers_1
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Chiran_Jung_Thapa
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Dr_Sanjay_Sharma_2
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Skanda_Gautam
INFO:preprocessing:Searching

####AUGMENT BUT DON'T SAVE AUDIO NO TRANSLITERATION

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = None, augmentor = AudioAugmentor(), augment = True,  save_aud=False)
preprocessor.preprocess()

  rir_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
100%|██████████| 31.3k/31.3k [00:00<00:00, 27.0MB/s]
  noise_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
100%|██████████| 78.2k/78.2k [00:00<00:00, 77.8MB/s]
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Dr_Sanjay_Sharma_2
  effector = torchaudio.io.AudioEffector(effect=req_effect)
  self.writer = StreamWriter(self.buffer, format=muxer)
  reader = StreamReader(src, format=muxer, option=option)
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Sanjog_Koirala
INFO:preprocessing:Searching for transcript.txt in /content/data/SS_Suresh_Bhattarai
INFO:preprocessing:Searching for transcript.txt in /content/data/SG_Palesha_Goverdhan_2
INFO:preprocessing:Searching for transcript.txt in /content/data/Doers_1
INFO:pre

In [None]:
!cat /content/data_out/transcript.txt | wc -l


4151


In [None]:
!cp /content/data_out/transcript.txt /content/drive/MyDrive/MSICE/native_augmented_transcript.txt

####SAVE TO DRIVE

In [None]:
!zip -r /content/data_out.zip /content/data_out
!cp /content/data_out.zip /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual_Augmented.zip

###BHATTA DATASET RESAMPLING, CONVERSION TO MONO

In [None]:
!rm -r /content/Bhatta_Normalized_DataSet/
!mkdir --parents /content/Bhatta_Normalized_DataSet/bharat1/ /content/Bhatta_Normalized_DataSet/ganNeshsir3/

In [None]:
!cp -r /content/drive/MyDrive/MSICE/Pure_Nepali_Labelled_Speech_Data/Dataset/Train/SECTEC /content/

In [None]:
!ls -l /content/SECTEC

total 8
dr-x------ 2 root root 4096 Sep  6 11:27 bharat1
dr-x------ 2 root root 4096 Sep  6 11:27 ganNeshsir3


In [None]:
import torchaudio
from pathlib import Path
import re

def normalize_bhatta_dataset(data_fol:str, out_fol:str, mono: bool, new_sample_rate: int) -> None:
    data_fol = Path(data_fol)
    out_fol = Path(out_fol)

    #AUDIO
    audio_ext_pattern = re.compile(r"\.wav$")
    aug = AudioAugmentor()
    for path in data_fol.iterdir():
        if not path.is_file() or not audio_ext_pattern.search(path.name):
            continue
        audio, sample_rate = torchaudio.load(path)
        if mono:
            audio = aug.convert_to_mono(audio_tensor = audio)
        if new_sample_rate:
            sample_rate = new_sample_rate
        torchaudio.save(uri = f"{out_fol}/{path.name}", src = audio, sample_rate=sample_rate)



In [None]:
normalize_bhatta_dataset(data_fol = "/content/SECTEC/bharat1", out_fol = "/content/Bhatta_Normalized_DataSet/bharat1/", mono = True, new_sample_rate = 16000)
!cp /content/SECTEC/bharat1/SECTEC_bharat1.trans.txt /content/Bhatta_Normalized_DataSet/bharat1/

  rir_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
  noise_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)


In [None]:
!cat /content/Bhatta_Normalized_DataSet/bharat1/SECTEC_bharat1.trans.txt | wc -l
!ls /content/Bhatta_Normalized_DataSet/bharat1/ | grep 'wav' | wc -l

87


In [None]:
normalize_bhatta_dataset(data_fol = "/content/SECTEC/ganNeshsir3", out_fol = "/content/Bhatta_Normalized_DataSet/ganNeshsir3", mono = True, new_sample_rate = 16000)
!cp /content/SECTEC/ganNeshsir3/SECTEC_ganNeshsir3.trans.txt /content/Bhatta_Normalized_DataSet/ganNeshsir3/

  rir_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
  noise_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)


cp: cannot stat '/content/SECTEC/bharat1/SECTEC_ganNeshsir3.trans.txt': No such file or directory


In [None]:
!cat /content/Bhatta_Normalized_DataSet/ganNeshsir3/SECTEC_ganNeshsir3.trans.txt | wc -l
!ls /content/Bhatta_Normalized_DataSet/ganNeshsir3/ | grep 'wav' | wc -l

100
100


In [None]:
!cp Bhatta_Normalized_DataSet.zip /content/drive/MyDrive/MSICE/

In [None]:
!ls -l /content/drive/MyDrive/MSICE

total 531524
-rw------- 1 root root 155996529 Sep 23  2024 '074BCT audio dataset-20240923T044153Z-001.zip'
drwx------ 2 root root      4096 Dec  7  2024  Articles
-rw------- 1 root root  96846716 Sep  6 12:07  Bhatta_Normalized_DataSet.zip
drwx------ 2 root root      4096 Aug 11 09:23 'Dependency Corrections'
drwx------ 2 root root      4096 Jun 23 05:57  English_Nepali_CS_Data_Manual
-rw------- 1 root root  81637102 Sep  6 08:19  English_Nepali_CS_Data_Manual_Augmented_WO_ext.zip
-rw------- 1 root root  81637876 Sep  6 08:19  English_Nepali_CS_Data_Manual_Augmented.zip
drwx------ 2 root root      4096 Dec  4  2024 'English To Nepali Transliteration Dictionary'
drwx------ 2 root root      4096 Oct 14  2024 'Óbuda University '
drwx------ 2 root root      4096 Jan 30  2025  Predetermined_CS_Texts
drwx------ 2 root root      4096 Jun 23 05:59  Pure_Nepali_Labelled_Speech_Data
-rw------- 1 root root 128120317 Jun 23 14:49  real_time_test-20240823T071018Z-001.zip
drwx------ 2 root root    

###CS PREDETERMINED

In [None]:
!rm -r data_out/

In [None]:
!cp -r /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Predetermined/ /content/data/

####AUGMENT DON'T SAVE AUDIO NO TRANSLITERATION

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = None, augmentor = AudioAugmentor(), augment = False,  save_aud=False)
preprocessor.preprocess()

  rir_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
  noise_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
INFO:preprocessing:Searching for transcript.txt in /content/data/PT_Ashish_Devkota
INFO:preprocessing:Searching for transcript.txt in /content/data


In [None]:
!head -20 /content/data_out/transcript.txt

PT_Ashish_Devkota_audio_1	सरकारी school हरुमा पढ्नेहरु back नलागिकनै continue regular college level मा pass गर्दै गईराको छ
PT_Ashish_Devkota_audio_2	private को students हरुले pass गर्न सकेन भन्न खोजेको होइन
PT_Ashish_Devkota_audio_3	young generations हरुको लागि English एकदम important छ
PT_Ashish_Devkota_audio_4	Even नेपाली subject पढाउने teachers हरु पनि motivation create गर्न बिचमा अंग्रेजी शब्दहरु प्रयोग गर्नुहुन्छ
PT_Ashish_Devkota_audio_5	दिपिका जी तपाई inspiration प्रेरणाको source हुनुहुन्छ
PT_Ashish_Devkota_audio_6	सरकार even prime minister पनि विकासको लागि budget थुप्रै छ भन्छ
PT_Ashish_Devkota_audio_7	तर NIST को plus two chain बिस्तारै अलि खस्किँदै जस्तो लागेको छ मलाई त
PT_Ashish_Devkota_audio_8	MG Sir त बिन्दास अहिले त total transformation हुनुभाछ नि पहिले त अलि strict हुनुहुन्थ्यो
PT_Ashish_Devkota_audio_9	Dubai को लागि भनेर, Bahrain को लागि त vacancy नै छैन
PT_Ashish_Devkota_audio_10	Clients हरुले पुरै website visit नगरेसम्म हाम्रो कामको बारे थाहा हुँदैन
PT_Ashish_Devkota_au

###CS PREDETERMINED 2

In [74]:
!rm -r /content/data_out/
# !rm -r /content/data/

In [35]:
!ls -l /content/drive/MyDrive/MSICE/

total 1068855
-rw------- 1 root root 155996529 Sep 23  2024 '074BCT audio dataset-20240923T044153Z-001.zip'
drwx------ 2 root root      4096 Dec  7  2024  Articles
-rw------- 1 root root   4321076 Dec  6 08:27  automated_cs_asr_native_numless_punctless_transcript.txt
-rw------- 1 root root  96846716 Nov  4 19:17  Bhatta_Normalized_DataSet.zip
-rw------- 1 root root 371563302 Dec  5 18:34  Bijaya_Khanal_Automated_CS_ASR_Dataset.zip
-rw------- 1 root root    129462 Sep 19 09:47 'Changes transcript.txt'
drwx------ 2 root root      4096 Oct 30 08:40  CM_Nep_Eng_Fine-tuned_Results
drwx------ 2 root root      4096 Nov 17 12:44  CM_Text_Corpus
drwx------ 2 root root      4096 Aug 11 09:23 'Dependency Corrections'
drwx------ 2 root root      4096 Nov  4 06:40  English_Nepali_CS_Audio_PT
drwx------ 2 root root      4096 Dec  1 19:18  English_Nepali_CS_Data_Manual
-rw------- 1 root root  81637102 Sep  6 08:19  English_Nepali_CS_Data_Manual_Augmented_WO_ext.zip
-rw------- 1 root root  81637876 Se

In [28]:
!cp -r /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Predetermined_2/ /content/data/

####NO TRANSLITERATION, NO AUGMENTATION, SAVE AUDIO

In [75]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transcr_name = ('transcription.txt',), transliterator = None, augmentor = AudioAugmentor(), augment = False,  save_aud=True)
preprocessor.preprocess()

INFO:preprocessing:Searching for transcription.txt in /content/data/PT_2_Shreena_Chhetri
INFO:preprocessing:Searching for transcription.txt in /content/data/PT_2_Abinash_Thapa
INFO:preprocessing:Searching for transcription.txt in /content/data/PT_2_Sachin_Kumar_Yadav
INFO:preprocessing:Searching for transcription.txt in /content/data


In [77]:
!tail -40 /content/data_out/transcript.txt

PT_2_Sachin_Kumar_Yadav_audio_2	Osho ले यो disseminating को sector मा बढी काम गर्नुभो
PT_2_Sachin_Kumar_Yadav_audio_3	at least यो भनेर बुझाउन नसक्ला तर यसरी प्राप्त हुन सक्छ
PT_2_Sachin_Kumar_Yadav_audio_4	म चाँहि अब exam prepare गरिराखेको सम्पूर्ण students लाई best of luck भन्न चाहन्छु
PT_2_Sachin_Kumar_Yadav_audio_5	अब revision गर्दा CDC ले publish गरेको question solution हेर्न लगाउने
PT_2_Sachin_Kumar_Yadav_audio_6	त्यस्तै government school हरुले English medium मा publish गरेको problem हरु हेर्न लगाउने एक पटक
PT_2_Sachin_Kumar_Yadav_audio_7	अब सबै question हरु solve गर्न भ्याउँदैन एकदम shortcut मा formula बनाएर तयार गर्ने
PT_2_Sachin_Kumar_Yadav_audio_8	हामीले त्यो time लाई meet गर्न उँहालाई जुन time suit हुन्छ
PT_2_Sachin_Kumar_Yadav_audio_9	भक्तपुरमा meeting सकेर यहाँ meeting attend गर्न आउँछौँ
PT_2_Sachin_Kumar_Yadav_audio_10	त्यो चाँहि process मा छ most probably
PT_2_Sachin_Kumar_Yadav_audio_11	चाँडै नै equipment procure गर्ने plan छ हाम्रो
PT_2_Sachin_Kumar_Yadav_audio_12	certi

In [36]:
# %cd /content/data_out/
# !zip English_Nepali_CS_Data_Predetermined_2.zip *
# %cd /content

/content/data_out


In [78]:
!cp /content/data_out/transcript.txt /content/drive/MyDrive/MSICE/pt_2_native_transcript.txt

###SLR54 SUBSET

In [None]:
!rm -r /content/data_out/
!rm -r /content/data/

rm: cannot remove '/content/data_out/': No such file or directory


In [None]:
!cp /content/drive/MyDrive/MSICE/SLR54_Subset.zip /content/

In [None]:
!mkdir --parents /content/data/SLR54_Subset/

In [None]:
!unzip /content/SLR54_Subset.zip -d /content/data/SLR54_Subset/

In [None]:
!head -10 /content/data/SLR54_Subset/transcription.txt

5e766d189e	०२३ सम्म
3ed2cee22b	०३५ देखि भूमिगत
96cf842c07	०३५ देखि विद्यार्थी
a7f5ba44bb	०३६ देखि नै राजनीतिमा लाग्नुभएका गुरुङ ०४२ मा अखिल छैठौँको अध्यक्ष हुनुभएको थियो।
5a358087ff	०४६ सालको जनआन्दोलनपछि
59288e2a7d	०४९मा नेपाल उत्पीडित
5728b3aa68	०५० देखि जिल्ला
ada907ea4a	०५२ सालपछि २०६३
38979e32bc	०५२ सालमा भूमिगत
9cf28af199	०५७ चैत १७ गते


In [None]:
!mkdir /content/data_out/

In [None]:
import torchaudio
from pathlib import Path
import re

def normalize_dataset(data_fol:str, out_fol:str, mono: bool, new_sample_rate: int) -> None:
    data_fol = Path(data_fol)
    out_fol = Path(out_fol)

    #AUDIO
    audio_ext_pattern = re.compile(r"\.flac$")
    aug = AudioAugmentor()
    for path in data_fol.iterdir():
        if not path.is_file() or not audio_ext_pattern.search(path.name):
            continue
        audio, sample_rate = torchaudio.load(path)
        if mono:
            audio = aug.convert_to_mono(audio_tensor = audio)
        if new_sample_rate:
            sample_rate = new_sample_rate
        torchaudio.save(uri = f"{out_fol}/{path.stem}.mp3", src = audio, sample_rate=sample_rate)


In [None]:
normalize_dataset(data_fol = '/content/data/SLR54_Subset/',out_fol = '/content/data_out/', mono = True, new_sample_rate = 16000)

  rir_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
  noise_loc = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")


In [None]:
!ls -l /content/data_out/*|  wc -l

39250


In [None]:
!wc -l /content/data/SLR54_Subset/transcription.txt

39250 /content/data/SLR54_Subset/transcription.txt


In [None]:
%cd /content/data_out/

/content/data_out


In [None]:
!zip SLR54_Subset_Normalized.zip *mp3

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: a8101e5dc7.mp3 (deflated 4%)
  adding: a8103341ab.mp3 (deflated 4%)
  adding: a810745298.mp3 (deflated 4%)
  adding: a810bc4b03.mp3 (deflated 3%)
  adding: a810e5816e.mp3 (deflated 4%)
  adding: a811840bbf.mp3 (deflated 4%)
  adding: a811a8d7ad.mp3 (deflated 4%)
  adding: a811e470cb.mp3 (deflated 3%)
  adding: a811f1ec42.mp3 (deflated 3%)
  adding: a81250cc14.mp3 (deflated 4%)
  adding: a8126683bf.mp3 (deflated 4%)
  adding: a813636b04.mp3 (deflated 3%)
  adding: a81396aef4.mp3 (deflated 3%)
  adding: a813ee3fec.mp3 (deflated 4%)
  adding: a8142db7a9.mp3 (deflated 3%)
  adding: a8143297d9.mp3 (deflated 5%)
  adding: a8146423c4.mp3 (deflated 4%)
  adding: a814723dc3.mp3 (deflated 4%)
  adding: a815481893.mp3 (deflated 4%)
  adding: a815634bb1.mp3 (deflated 4%)
  adding: a81632f200.mp3 (deflated 5%)
  adding: a8165603f3.mp3 (deflated 3%)
  adding: a81700ebf2.mp3 (deflated 3%)
  adding: a817ab0b38.mp3 (deflated 4%)

In [None]:
%cd /content/

/content


In [None]:
!cp /content/data_out/SLR54_Subset_Normalized.zip /content/drive/MyDrive/MSICE/

In [None]:
!cp /content/data/SLR54_Subset/transcription.txt /content/drive/MyDrive/MSICE/slr54_transcript.txt

In [None]:
!rm /content/drive/MyDrive/MSICE/SLR54_Subset.zip

In [None]:
!ls -l /content/drive/MyDrive/MSICE/

total 956061
-rw------- 1 root root 155996529 Sep 23  2024 '074BCT audio dataset-20240923T044153Z-001.zip'
drwx------ 2 root root      4096 Dec  7  2024  Articles
-rw------- 1 root root  96846716 Nov  4 19:17  Bhatta_Normalized_DataSet.zip
-rw------- 1 root root    129462 Sep 19 09:47 'Changes transcript.txt'
drwx------ 2 root root      4096 Oct 30 08:40  CM_Nep_Eng_Fine-tuned_Results
drwx------ 2 root root      4096 Nov 17 12:44  CM_Text_Corpus
drwx------ 2 root root      4096 Aug 11 09:23 'Dependency Corrections'
drwx------ 2 root root      4096 Nov  4 06:40  English_Nepali_CS_Audio_PT
drwx------ 2 root root      4096 Nov  7 03:53  English_Nepali_CS_Data_Manual
-rw------- 1 root root  81637102 Sep  6 08:19  English_Nepali_CS_Data_Manual_Augmented_WO_ext.zip
-rw------- 1 root root  81637876 Sep  6 08:19  English_Nepali_CS_Data_Manual_Augmented.zip
drwx------ 2 root root      4096 Nov  7 03:59  English_Nepali_CS_Data_Predetermined
-rw------- 1 root root   5077070 Nov  4 16:00  English_

###LOKNATH KOIRALA CHEKHOV DATASET

In [None]:
!rm -r data_out/
!rm -r data/

rm: cannot remove 'data_out/': No such file or directory


In [None]:
!mkdir /content/data/

In [None]:
!ls -hl /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/Loknath_Koirala_About_Love_Chekhov/

total 2.9M
-rw------- 1 root root  21K Nov 30 11:01 audio_10.mp3
-rw------- 1 root root  38K Nov 30 11:01 audio_11.mp3
-rw------- 1 root root  70K Nov 30 11:01 audio_12.mp3
-rw------- 1 root root  49K Nov 30 11:00 audio_13.mp3
-rw------- 1 root root  24K Nov 30 11:00 audio_14.mp3
-rw------- 1 root root  22K Nov 30 11:00 audio_15.mp3
-rw------- 1 root root  69K Nov 30 11:00 audio_16.mp3
-rw------- 1 root root  55K Nov 30 11:00 audio_17.mp3
-rw------- 1 root root  49K Nov 30 10:59 audio_18.mp3
-rw------- 1 root root  34K Nov 30 10:59 audio_19.mp3
-rw------- 1 root root  32K Nov 30 10:46 audio_1.mp3
-rw------- 1 root root  32K Nov 30 10:59 audio_20.mp3
-rw------- 1 root root  42K Nov 30 10:59 audio_21.mp3
-rw------- 1 root root  57K Nov 30 10:59 audio_22.mp3
-rw------- 1 root root  66K Nov 30 10:58 audio_23.mp3
-rw------- 1 root root  43K Nov 30 10:58 audio_24.mp3
-rw------- 1 root root  59K Nov 30 10:58 audio_25.mp3
-rw------- 1 root root  37K Nov 30 10:58 audio_26.mp3
-rw------- 1 root 

In [None]:
!cp -r /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/Loknath_Koirala_About_Love_Chekhov /content/data/Loknath_Koirala_About_Love_Chekhov

In [None]:
!ls -hl /content/data/Loknath_Koirala_About_Love_Chekhov/

total 3.0M
-rw------- 1 root root  21K Dec  6 17:11 audio_10.mp3
-rw------- 1 root root  38K Dec  6 17:11 audio_11.mp3
-rw------- 1 root root  70K Dec  6 17:11 audio_12.mp3
-rw------- 1 root root  49K Dec  6 17:11 audio_13.mp3
-rw------- 1 root root  24K Dec  6 17:11 audio_14.mp3
-rw------- 1 root root  22K Dec  6 17:11 audio_15.mp3
-rw------- 1 root root  69K Dec  6 17:11 audio_16.mp3
-rw------- 1 root root  55K Dec  6 17:11 audio_17.mp3
-rw------- 1 root root  49K Dec  6 17:11 audio_18.mp3
-rw------- 1 root root  34K Dec  6 17:11 audio_19.mp3
-rw------- 1 root root  32K Dec  6 17:11 audio_1.mp3
-rw------- 1 root root  32K Dec  6 17:11 audio_20.mp3
-rw------- 1 root root  42K Dec  6 17:11 audio_21.mp3
-rw------- 1 root root  57K Dec  6 17:11 audio_22.mp3
-rw------- 1 root root  66K Dec  6 17:11 audio_23.mp3
-rw------- 1 root root  43K Dec  6 17:11 audio_24.mp3
-rw------- 1 root root  59K Dec  6 17:11 audio_25.mp3
-rw------- 1 root root  37K Dec  6 17:11 audio_26.mp3
-rw------- 1 root 

In [None]:
preprocessor = AudioPreprocessor(data_folder = "/content/data/", out_folder = "/content/data_out/", transliterator = None, augmentor = AudioAugmentor(), augment = False,  save_aud=True)
preprocessor.preprocess()

INFO:preprocessing:Searching for transcript.txt in /content/data/Loknath_Koirala_About_Love_Chekhov
INFO:preprocessing:Searching for transcript.txt in /content/data


In [None]:
!ls -hl /content/data_out/

total 1.6M
-rw-r--r-- 1 root root  11K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_10.mp3
-rw-r--r-- 1 root root  19K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_11.mp3
-rw-r--r-- 1 root root  36K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_12.mp3
-rw-r--r-- 1 root root  25K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_13.mp3
-rw-r--r-- 1 root root  13K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_14.mp3
-rw-r--r-- 1 root root  11K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_15.mp3
-rw-r--r-- 1 root root  35K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_16.mp3
-rw-r--r-- 1 root root  28K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_17.mp3
-rw-r--r-- 1 root root  25K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_18.mp3
-rw-r--r-- 1 root root  17K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_19.mp3
-rw-r--r-- 1 root root  17K Dec  6 17:12 Loknath_Koirala_About_Love_Chekhov_audio_1.mp3
-rw-r--r-- 

In [None]:
!head -10 /content/data_out/transcript.txt

Loknath_Koirala_About_Love_Chekhov_audio_1	Welcome to the video series dedicated to class 12 affiliated to Nepal Examination Board
Loknath_Koirala_About_Love_Chekhov_audio_2	Today our concern remains the short story about love by a Russian playwright and short story writer Anton Pavlovich Chekhov
Loknath_Koirala_About_Love_Chekhov_audio_3	As usual we are initiating our journey with the introduction of the author
Loknath_Koirala_About_Love_Chekhov_audio_4	Chekhov lived from 29th January 1860 to 15th July 1904
Loknath_Koirala_About_Love_Chekhov_audio_5	In a letter to a friend and editor Alexei Suvarin Chekhov wrote
Loknath_Koirala_About_Love_Chekhov_audio_6	Medicine is my lawful wife and literature is my mistress. 
Loknath_Koirala_About_Love_Chekhov_audio_7	When I get tired with the one I spend the night with the other
Loknath_Koirala_About_Love_Chekhov_audio_8	Chekhov was not only a writer but also a medical doctor 
Loknath_Koirala_About_Love_Chekhov_audio_9	and he practiced medicine un

In [None]:
%cd /content/data_out/

/content/data_out


In [None]:
!zip LK_About_Love_Chekhov.zip *

  adding: Loknath_Koirala_About_Love_Chekhov_audio_10.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_11.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_12.mp3 (deflated 2%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_13.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_14.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_15.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_16.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_17.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_18.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_19.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_1.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_20.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_21.mp3 (deflated 3%)
  adding: Loknath_Koirala_About_Love_Chekhov_audio_22.mp3 (deflat

In [None]:
%cd /content/

/content


In [None]:
!cp /content/data_out/LK_About_Love_Chekhov.zip /content/drive/MyDrive/MSICE/

In [None]:
!cp /content/data_out/transcript.txt /content/

In [None]:
!cp /content/transcript.txt '/content/drive/My Drive/MSICE/Loknath_Koirala_About_Love_Chekhov_transcript.txt'