## SETUP

In [None]:
# Mount Google Drive at /content/drive/ so later cells can copy files from and to MyDrive.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Start from a clean checkout. -f keeps rm from erroring when the directory
# does not exist yet (e.g. on a fresh runtime).
!rm -rf /content/transliteration
!git clone https://github.com/adkta/transliteration.git

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install g2p_en

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install nepali-num2word

In [None]:
# Remove any previous checkout first so re-running this cell does not fail with
# "destination path already exists" and always picks up the current repository state.
!rm -rf nepali_arabic_num_to_word
!git clone https://github.com/adkta/nepali_arabic_num_to_word.git

## GENERAL TRANSFORM LOGIC

In [None]:
from typing import Callable

import re
from transliteration.transliterator import TranslitDict


def transform_transcript(input_path: str, output_path: str, transform_fn: Callable[[str], str]) -> None:
    """Rewrite a transcript file, applying ``transform_fn`` to every word of each label.

    Each input row has the form ``<audio name><TAB><label>``. The label is split
    with ``TranslitDict.PUNCT_SPACE_REGEX``, empty tokens are discarded, every
    remaining token is passed through ``transform_fn`` and the results are joined
    with single spaces. One row ``<audio name><TAB><transformed label>`` is written
    to ``output_path`` for every non-blank input row.

    Blank rows are skipped. A row without a tab (for example an audio name whose
    label is empty) is kept with an empty label instead of raising.

    Args:
        input_path: Path of the transcript to read.
        output_path: Path of the transcript to write (overwritten if present).
        transform_fn: Function mapping one word to its replacement string.
    """
    tknzr_pattern = re.compile(TranslitDict.PUNCT_SPACE_REGEX)
    # Explicit UTF-8 so reading/writing does not depend on the platform's
    # default encoding.
    with open(input_path, mode='r', encoding='utf-8') as transcr_in, \
            open(output_path, mode='w', encoding='utf-8') as transcr_out:
        for line in transcr_in:
            line = line.strip()
            if not line:
                continue
            # partition() splits on the first tab only, so neither a missing tab
            # nor additional tabs inside the label break the unpacking.
            audio_nm, _, label = line.partition('\t')
            transformed_words = [
                transform_fn(word)
                for word in tknzr_pattern.split(label)
                if word
            ]
            print(f"{audio_nm}\t{' '.join(transformed_words)}", file=transcr_out)

## ORIGINAL NUMLESS TRANSCRIPT

1. Input: the original native transcript, `/content/transcript.txt`.
2. Output: the unpunctuated, numberless native transcript, `/content/native_numless_punctless_transcript.txt`.

That is, all numerals are written out in words.

In [None]:
# Copy the original transcript from Drive into /content/ for local processing.
!cp '/content/drive/MyDrive/MSICE/transcript.txt' /content/

In [None]:
from nepali_arabic_num_to_word.nepali_arabic_num_to_word import numeral_type, get_word_from_numeral

def num_2_word(word: str) -> str:
    """Return ``word`` spelled out in words if it is a numeral, else ``word`` unchanged.

    ``numeral_type(word)`` classifies the token; when it reports a numeral type,
    the token is converted with ``get_word_from_numeral(word, num_type)``.

    NOTE(review): this assumes ``numeral_type`` returns a falsy value for tokens
    that are not numerals - confirm against nepali_arabic_num_to_word.

    Args:
        word: A single token from a transcript label.

    Returns:
        The converted numeral, or the original token when it is not a numeral.
    """
    num_type = numeral_type(word)
    # Test the classification result, not the `numeral_type` function itself:
    # a function object is always truthy, which sent every word to the converter.
    if num_type:
        return get_word_from_numeral(word, num_type)
    return word

In [None]:
# Spell out every numeral in the original transcript.
transform_transcript(
    input_path="/content/transcript.txt",
    output_path="/content/native_numless_punctless_transcript.txt",
    transform_fn=num_2_word,
)

In [None]:
# Preview the result. Use the absolute path the file was written to, so the
# preview does not depend on the current working directory.
!head -20 /content/native_numless_punctless_transcript.txt

## CREATE REDUCTION DICTIONARY (COMBINES TRANSLITERATION AND REDUCTION)

TASK 1: Create a reduction dictionary.
- Input: the numberless native transcript file.
- Output: a reduction dictionary.

TASK 2: Create a reduced transcript file.
- Input: the numberless native transcript file and the reduction dictionary (loaded from its saved file where possible, so the mapping can be reused).
- Output: the reduced transcript file.

In [None]:
# !cp '/content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/Transliteration Dictionary/Roman_Devanagari_Translit_Dict.json' /content/

In [None]:
# Write the first 4 rows of the numberless transcript to a small sample file.
# NOTE(review): create_reduc_dict below is called on the full transcript, not on
# this sample - confirm whether the sample file is still needed.
!head -4 /content/native_numless_punctless_transcript.txt > /content/test_transcript.txt

In [None]:
# Preview the sample file.
!head -4 /content/test_transcript.txt

In [None]:
from transliteration.transliterator import TranslitDict
from transliteration.reducers import get_reduced_devanagari_word
from transliteration.transliterators import RomanToDevaTransliterator
from tqdm import tqdm

import re

def create_reduc_dict(input_path: str, output_path: str) -> None:
    """Build a tab-separated ``<word><TAB><reduced word>`` file from a transcript.

    Each input row has the form ``<audio name><TAB><label>``; only the label is
    used. The label is split with ``TranslitDict.PUNCT_SPACE_REGEX``. For every
    distinct non-empty word:

    1. if ``translitr.for_transliteration(word)`` is truthy, the word is replaced
       by ``translitr.translit(word)`` (presumably Roman -> Devanagari, going by
       the transliterator's name - confirm);
    2. the result is passed to ``get_reduced_devanagari_word``;
    3. a row is written unless the reduced form is empty or equal to the
       original word.

    Each distinct word is processed and written at most once, so frequent words
    are not re-transliterated and do not produce duplicate rows. Blank input rows
    are skipped, and rows without a tab are treated as having an empty label.

    Args:
        input_path: Path of the numberless transcript to read.
        output_path: Path of the dictionary file to write (overwritten if present).
    """
    tknzr_pattern = re.compile(TranslitDict.PUNCT_SPACE_REGEX)
    translitr = RomanToDevaTransliterator()
    seen_words = set()
    # Explicit UTF-8 so reading/writing does not depend on the platform's
    # default encoding.
    with open(input_path, mode='r', encoding='utf-8') as transcr_in, \
            open(output_path, mode='w', encoding='utf-8') as dict_out:
        for line in tqdm(transcr_in):
            line = line.strip()
            if not line:
                continue
            # Split on the first tab only; the audio name is not needed here.
            _, _, label = line.partition('\t')
            for orig_word in tknzr_pattern.split(label):
                if not orig_word or orig_word in seen_words:
                    continue
                seen_words.add(orig_word)

                candidate = orig_word
                if translitr.for_transliteration(candidate):
                    candidate = translitr.translit(candidate)
                reduced_word = get_reduced_devanagari_word(candidate)

                # Nothing to record when reduction yields nothing or is a no-op.
                if not reduced_word or reduced_word == orig_word:
                    continue

                print(f"{orig_word}\t{reduced_word}", file=dict_out)

In [None]:
# Build the reduction dictionary from the full numberless transcript.
create_reduc_dict(
    input_path="/content/native_numless_punctless_transcript.txt",
    output_path="/content/test_reduc_dict.dat",
)

In [None]:
# Load the tab-separated dictionary file written by create_reduc_dict.
test_reduc_dict = TranslitDict.load(
    src_path='/content/test_reduc_dict.dat',
    delimiter='\t',
)

In [None]:
# Export the loaded dictionary to /content/Nep_Eng_Code-Mixed_Reduct_Dict.json.
test_reduc_dict.export('/content/Nep_Eng_Code-Mixed_Reduct_Dict.json')

In [None]:
# Save the exported dictionary to Drive so it can be reused in later sessions.
!cp  /content/Nep_Eng_Code-Mixed_Reduct_Dict.json '/content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/Transliteration Dictionary/'

## CREATE REDUCED TRANSCRIPT FILE

1. Inputs: the numberless native transcript file and the reduction dictionary.
2. Output: the reduced transcript file.

In [None]:
# !cp /content/transliteration/reducers.py /content/

In [None]:
# Fetch the saved reduction dictionary from Drive into /content/.
!cp /content/drive/MyDrive/MSICE/English_Nepali_CS_Data_Manual/Transliteration\ Dictionary/Nep_Eng_Code-Mixed_Reduct_Dict.json /content/

In [None]:
# Location of the reduction dictionary used by reduce().
_REDUCT_DICT_PATH = '/content/Nep_Eng_Code-Mixed_Reduct_Dict.json'
# Reducer shared by all reduce() calls; created on first use. Re-running this
# cell resets it, so a regenerated dictionary file is picked up again.
_reducer = None


def reduce(word: str) -> str:
    """Return the dictionary-based reduced form of ``word``, or ``word`` itself.

    The word is looked up with ``DevanagariReducer.translit_using_dict``; when
    that returns a falsy value the original word is returned unchanged.

    NOTE(review): ``DevanagariReducer`` is not imported in any cell visible in
    this notebook - confirm it is imported before this cell is run.

    Args:
        word: A single token from a transcript label.

    Returns:
        The reduced word if the lookup yields one, otherwise ``word``.
    """
    global _reducer
    # Build the reducer once instead of constructing a new one for every word.
    if _reducer is None:
        _reducer = DevanagariReducer(translit_dict=_REDUCT_DICT_PATH)
    reduced_word = _reducer.translit_using_dict(word)
    return reduced_word if reduced_word else word

In [None]:
# Replace every word of the numberless transcript with its reduced form.
transform_transcript(
    input_path="/content/native_numless_punctless_transcript.txt",
    output_path="/content/reduced_transcript.txt",
    transform_fn=reduce,
)