In [1]:
# Install the packages this notebook imports (shell escape into pip).
# NOTE(review): only numpy is version-pinned; presumably the pin exists for
# compatibility with one of the other packages -- confirm, and consider pinning
# the remaining packages so a fresh run resolves the same versions.
!pip install pandas numpy==1.26.4 torchaudio regex fasttext




In [2]:
# Download the fastText language-identification model file (lid.176.bin) into the
# working directory; a later cell loads it with fasttext.load_model("lid.176.bin").
# -q suppresses wget's progress output.
!wget -q https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin


In [3]:
import pandas as pd
import numpy as np
import regex as re
import unicodedata
import torchaudio
import fasttext


In [12]:
# Load the language-ID model once at module level; predict_language_fasttext
# reads this global on every call. The path is relative, so the model file must
# be in the current working directory.
ft_model = fasttext.load_model("lid.176.bin")

def predict_language_fasttext(text):
    """Return the top language label predicted by the global ``ft_model``.

    Args:
        text: Candidate text. Anything that is not a non-blank string is
            treated as undetectable.

    Returns:
        The first predicted label with its ``__label__`` prefix removed, or
        ``"unknown"`` when ``text`` is not a string or is blank.
    """
    has_content = isinstance(text, str) and text.strip() != ""
    if not has_content:
        return "unknown"

    # Newlines are flattened to spaces before the text is handed to the model.
    single_line = text.strip().replace("\n", " ")
    result = ft_model.predict(single_line)
    top_label = result[0][0]
    return top_label.replace("__label__", "")


In [13]:
# Hindi word for each ASCII digit; passed to expand_digits when lang == 'hi'.
DIGIT_MAP_HI = {
    '0': 'शून्य',
    '1': 'एक',
    '2': 'दो',
    '3': 'तीन',
    '4': 'चार',
    '5': 'पांच',
    '6': 'छह',
    '7': 'सात',
    '8': 'आठ',
    '9': 'नौ',
}


In [14]:
# Punctuation characters that survive normalization. The literal is written
# without separators so the set contains punctuation only: building it from a
# space-separated string would also add " " to the set. Whitespace is accepted
# separately by the ch.isspace() check in the character filters.
ALLOWED_PUNCTUATION = set(".,?!'-:;")
# Bracketed non-verbal event markers that are kept in transcriptions.
ALLOWED_NON_VERBAL_TOKENS = ["[laugh]", "[breath]", "[cough]", "[pause]", "[noise]"]

def expand_digits(text, digit_map):
    """Spell out the digits in ``text`` using ``digit_map``.

    Every run of consecutive digits is expanded digit by digit and the
    resulting words are joined with single spaces, so "25" becomes
    "<two> <five>" instead of the two words fused together. Decimal digits
    from other scripts (for example Devanagari "२") are looked up through
    their ASCII equivalent when ``digit_map`` has no entry for the character
    itself.

    Args:
        text: String to process.
        digit_map: Mapping from single digit characters to replacement words.

    Returns:
        ``text`` with mapped digits replaced. A digit with no mapping is kept
        as-is (still space-separated from its neighbours inside a run).
    """
    def _spell_digit(ch):
        # Prefer an exact entry, then fall back to the digit's ASCII form.
        if ch in digit_map:
            return digit_map[ch]
        value = unicodedata.decimal(ch, None)
        if value is None:
            return ch
        return digit_map.get(str(value), ch)

    def _spell_run(match):
        return " ".join(_spell_digit(ch) for ch in match.group(0))

    return re.sub(r'\d+', _spell_run, text)

def preserve_non_verbal_tokens(text):
    """Surround each known non-verbal token with spaces.

    Every occurrence of an entry of ``ALLOWED_NON_VERBAL_TOKENS`` is replaced
    by the same token with one space added on each side, so the token is
    separated from adjacent characters.

    Args:
        text: String to process.

    Returns:
        The padded string.
    """
    padded = text
    for marker in ALLOWED_NON_VERBAL_TOKENS:
        spaced_marker = " " + marker + " "
        padded = padded.replace(marker, spaced_marker)
    return padded

def normalize_text(text, lang='hi'):
    """Normalize a raw transcription string.

    Steps, in order: pad non-verbal tokens with spaces, apply Unicode NFC,
    replace disallowed characters with spaces, expand digits to words for
    Hindi, lowercase for English, then collapse whitespace runs.

    Args:
        text: Raw transcription. Non-strings and blank strings yield ''.
        lang: Language code; 'hi' triggers digit expansion with
            ``DIGIT_MAP_HI`` and 'en' triggers lowercasing.

    Returns:
        The normalized string ('' for missing or blank input).
    """
    if not isinstance(text, str) or text.strip() == '':
        return ''

    text = preserve_non_verbal_tokens(text)
    text = unicodedata.normalize("NFC", text)

    # Keep letters (L), numbers (N) and combining marks (M), plus whitespace,
    # allowed punctuation and the brackets used by non-verbal tokens.
    # str.isalnum() is False for combining marks such as Devanagari vowel
    # signs, so filtering on it would turn them into spaces and split Hindi
    # words apart; the Unicode general category is used instead.
    text = ''.join(
        ch if (
            unicodedata.category(ch)[0] in ("L", "N", "M")
            or ch.isspace()
            or ch in ALLOWED_PUNCTUATION
            or ch in "[]"
        ) else " "
        for ch in text
    )

    if lang == 'hi':
        text = expand_digits(text, DIGIT_MAP_HI)

    if lang == 'en':
        text = text.lower()

    # Collapse whitespace (including NBSP and zero-width joiners) to one space.
    text = re.sub(r"[\s\u200c\u200d\u00A0]+", " ", text).strip()
    return text


In [15]:
def validate_audio_stub(audio_path, duration_sec, max_duration_sec=15.0):
    """Check an utterance's audio path and duration without touching the audio.

    Args:
        audio_path: Expected to be a string starting with ``"s3://"``.
        duration_sec: Clip duration; anything ``float()`` accepts.
        max_duration_sec: Longest accepted duration in seconds (default 15.0).

    Returns:
        ``(True, "")`` when both checks pass, otherwise ``(False, reason)``
        where reason is ``"InvalidAudioPath"``, ``"InvalidDuration"`` or
        ``"DurationTooLong"``.
    """
    import math  # function-scope import: the notebook's import cell has no math

    if not isinstance(audio_path, str) or not audio_path.startswith("s3://"):
        return False, "InvalidAudioPath"

    try:
        duration = float(duration_sec)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as an invalid duration; other
        # errors are no longer swallowed by a bare except.
        return False, "InvalidDuration"

    # A missing CSV value arrives as NaN; every comparison with NaN is False,
    # so without this check a missing duration would pass the length limit.
    if math.isnan(duration):
        return False, "InvalidDuration"

    if duration > max_duration_sec:
        return False, "DurationTooLong"
    return True, ""


In [16]:
def process_metadata(input_csv_path, output_dir="."):
    """Validate and normalize an utterance metadata CSV and split it in two.

    Each row is checked with ``validate_audio_stub`` and its
    ``transcription_raw`` value is normalized with ``normalize_text``. Rows
    with at least one rejection reason go to ``rejected.csv`` (all input
    columns plus ``reason``, reasons joined with ';'); the rest go to
    ``train_ready.csv`` with the normalized text in ``transcription`` and a
    ``lang_warning`` column that is set when the detected language differs
    from the row's ``language``.

    Rejection reasons: those returned by ``validate_audio_stub``,
    ``MissingTranscription`` and ``EmptyAfterNormalization`` (the
    transcription was present but normalized to an empty string, so it would
    otherwise be exported with blank text).

    Args:
        input_csv_path: Path of the metadata CSV to read.
        output_dir: Directory for the two output files; created if missing.

    Returns:
        None. Writes the two CSV files and prints a short summary.
    """
    from pathlib import Path  # function-scope import: not in the import cell

    clean_columns = [
        "utterance_id", "audio_path", "language", "speaker_id", "gender",
        "transcription", "duration_sec", "noise_level_db", "collection_source",
        "quality_flag", "lang_warning",
    ]

    df = pd.read_csv(input_csv_path)

    # Fixed column lists keep the header row in the output files even when
    # one of the two partitions ends up empty.
    rejected_columns = list(df.columns)
    if "reason" not in rejected_columns:
        rejected_columns.append("reason")

    clean_rows = []
    rejected_rows = []

    for _, row in df.iterrows():
        reasons = []
        lang_warning = ""

        audio_ok, audio_reason = validate_audio_stub(row.get("audio_path"), row.get("duration_sec"))
        if not audio_ok:
            reasons.append(audio_reason)

        raw_text = row.get("transcription_raw", "")
        if pd.isna(raw_text) or str(raw_text).strip() == "":
            reasons.append("MissingTranscription")
            normalized_text = ""
        else:
            normalized_text = normalize_text(raw_text, row['language'])
            if normalized_text == "":
                # Nothing usable survived normalization (or the value was not
                # a string); do not export a blank transcription as clean.
                reasons.append("EmptyAfterNormalization")

        # Language detection runs on the raw text; a mismatch is a warning
        # on the row, not a rejection reason.
        if isinstance(raw_text, str) and raw_text.strip() != "":
            predicted_lang = predict_language_fasttext(raw_text)
            if predicted_lang != row['language']:
                lang_warning = f"Detected:{predicted_lang}"

        if reasons:
            rejected_rows.append({**row.to_dict(), "reason": ";".join(reasons)})
        else:
            clean_rows.append({
                "utterance_id": row["utterance_id"],
                "audio_path": row["audio_path"],
                "language": row["language"],
                "speaker_id": row["speaker_id"],
                "gender": row["gender"],
                "transcription": normalized_text,
                "duration_sec": row["duration_sec"],
                "noise_level_db": row["noise_level_db"],
                "collection_source": row["collection_source"],
                "quality_flag": row["quality_flag"],
                "lang_warning": lang_warning
            })

    train_ready = pd.DataFrame(clean_rows, columns=clean_columns)
    rejected = pd.DataFrame(rejected_rows, columns=rejected_columns)

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    train_ready.to_csv(out_dir / "train_ready.csv", index=False)
    rejected.to_csv(out_dir / "rejected.csv", index=False)

    print(f" Clean rows: {len(train_ready)}")
    print(f" Rejected rows: {len(rejected)}")
    if not rejected.empty:
        print(" Sample rejections:\n", rejected[['utterance_id', 'reason']].head())


In [57]:
# Colab-only cell: prompts for a file upload from the browser.
# NOTE(review): the recorded output of this cell shows the file being saved as
# "utterances_metadata (1).csv", while the pipeline call below reads
# "/content/utterances_metadata.csv" -- confirm the pipeline is reading the
# intended upload and not an older copy.
from google.colab import files
uploaded = files.upload()


Saving utterances_metadata.csv to utterances_metadata (1).csv


In [17]:
# Run the pipeline on the metadata CSV. output_dir is left at its default ("."),
# so train_ready.csv and rejected.csv are written to the current working directory.
process_metadata("/content/utterances_metadata.csv")


 Clean rows: 1886
 Rejected rows: 114
 Sample rejections:
   utterance_id                reason
0     utt_0019  MissingTranscription
1     utt_0030  MissingTranscription
2     utt_0072  MissingTranscription
3     utt_0093  MissingTranscription
4     utt_0147  MissingTranscription


In [19]:
# Colab-only cell: send the two CSVs written by process_metadata to the browser.
# The relative names match the files produced with the default output_dir (".").
from google.colab import files

files.download("train_ready.csv")
files.download("rejected.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [25]:
def keep_allowed_characters(text):
    """Replace every disallowed character in ``text`` with a single space.

    A character is kept when its Unicode general category starts with L
    (letter), N (number) or M (mark, e.g. matras), when it is whitespace,
    when it is in ``ALLOWED_PUNCTUATION``, or when it is one of the token
    brackets ``[`` / ``]``.

    Args:
        text: String to filter.

    Returns:
        A string of the same length with disallowed characters blanked out.
    """
    filtered = []
    for symbol in text:
        major_class = unicodedata.category(symbol)[0]
        if major_class in ["L", "N", "M"] or symbol.isspace():
            filtered.append(symbol)
        elif symbol in ALLOWED_PUNCTUATION or symbol in "[]":
            filtered.append(symbol)
        else:
            filtered.append(" ")
    return ''.join(filtered)


In [26]:
def normalize_text(text, lang='hi'):
    """Normalize a raw transcription string.

    Pads non-verbal tokens, applies Unicode NFC, blanks out disallowed
    characters via ``keep_allowed_characters``, expands digits for Hindi,
    lowercases for English and finally collapses whitespace runs.

    Args:
        text: Raw transcription. Non-strings and blank strings yield ''.
        lang: Language code; 'hi' expands digits with ``DIGIT_MAP_HI`` and
            'en' lowercases the text.

    Returns:
        The normalized string ('' for missing or blank input).
    """
    if not isinstance(text, str):
        return ''
    if text.strip() == '':
        return ''

    cleaned = keep_allowed_characters(
        unicodedata.normalize("NFC", preserve_non_verbal_tokens(text))
    )

    if lang == 'hi':
        cleaned = expand_digits(cleaned, DIGIT_MAP_HI)
    if lang == 'en':
        cleaned = cleaned.lower()

    # Whitespace runs (including NBSP and zero-width joiners) become one space.
    collapsed = re.sub(r"[\s\u200c\u200d\u00A0]+", " ", cleaned)
    return collapsed.strip()


In [28]:
def test_normalize_text():
    """Check ``normalize_text`` against one Hindi and one English example.

    Raises:
        AssertionError: If any example does not normalize to its expected text.
    """
    # (raw input, language code, expected normalized output)
    examples = (
        ("मैं 2 स्कूल जा रहा हूँ!", "hi", "मैं दो स्कूल जा रहा हूँ!"),
        ("HELLO! 2 SCHOOL. [laugh]", "en", "hello! 2 school. [laugh]"),
    )
    for raw, lang_code, expected in examples:
        assert normalize_text(raw, lang_code) == expected
    print(" All normalization tests passed!")
