In [1]:
# read raw json data
import json
import pathlib
import random

dataset_root = pathlib.Path("../dataset")
raw_data_path = dataset_root / "neyshekar_export.json"

with open(raw_data_path, "r") as f:
    raw_data = json.load(f)

# print the number of entries in the raw data
print(f"Number of entries in raw data: {len(raw_data)}")

# shuffle the raw data
random.shuffle(raw_data)

# go through each record and add and id starting from 0
shuffled_data = []
for i, entry in enumerate(raw_data):
    entry["id"] = i
    shuffled_data.append(entry)

raw_data = shuffled_data

Number of entries in raw data: 20020


In [4]:
# go through each entry and download the voice files
import requests
import os
import tqdm

output_dir = dataset_root / "downloaded_voices"
os.makedirs(output_dir, exist_ok=True)

for entry in tqdm.tqdm(raw_data):
    voice_url = entry.get("audio_url")
    voice_id = entry.get("voice_id")
    if voice_url:
        # extract file extension
        file_extension = voice_url.split(".")[-1].split("?")[0]
        filename = f"{voice_id}.{file_extension}"
        output_path = output_dir / filename

        if output_path.exists():
            continue

        # download the file
        response = requests.get(voice_url)
        if response.status_code == 200:
            with open(output_path, "wb") as f:
                f.write(response.content)

100%|██████████| 20020/20020 [00:00<00:00, 182743.63it/s]


In [5]:
# now convert all downloaded files to wav format
# Format      : WAV (PCM)
# Sample Rate : 16,000 Hz
# Bit Depth   : 16-bit
# Channels    : Mono

import pydub

export_dir = dataset_root / "export"
converted_dir = export_dir / "audio"
raw_converted_dir = export_dir / "raw_audio"
os.makedirs(converted_dir, exist_ok=True)
os.makedirs(raw_converted_dir, exist_ok=True)

for audio_file in tqdm.tqdm(output_dir.iterdir()):
    if audio_file.suffix.lower() not in [".wav",".mp4", ".mp3", ".ogg", ".flac", ".m4a", ".webm"]:
        continue

    # fine the corresponding id in the raw data
    entry = next((e for e in raw_data if e["voice_id"] == audio_file.stem), None)
    if entry is None:
        continue

    file_id = entry["id"]
    
    # file id padded to 6 digits
    file_id = str(file_id).zfill(6)

    output_path = converted_dir / f"neyshekar_{file_id}.wav"
    raw_output_path = raw_converted_dir / f"neyshekar_{file_id}.wav"

    if not raw_output_path.exists():
        audio = pydub.AudioSegment.from_file(audio_file)
        audio = audio.set_sample_width(2)
        audio.export(raw_output_path, format="wav", codec="pcm_s16le", parameters=["-map_metadata", "-1"])

    if not output_path.exists():
        audio = pydub.AudioSegment.from_file(audio_file)
        audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
        audio.export(output_path, format="wav", codec="pcm_s16le", parameters=["-map_metadata", "-1"])

20020it [1:28:37,  3.76it/s]


In [6]:
from shekar import Normalizer

# output a json file with these fields:
# id: sequential id starting from 0
# audio: filename of the wav file in the audio directory
# duration: duration of the audio file in seconds
# text: transcription text

nomalizer = Normalizer()
audio_freq = 16000

final_data = []
 
for _, entry in tqdm.tqdm(enumerate(raw_data)):
    
    id = entry.get("id")
    file_id = str(id).zfill(6)
    voice_id = f"neyshekar_{file_id}"

    text = entry.get("text_content", "").strip()
    # normalize with shekar 
    text = nomalizer(text)

    audio_path = converted_dir / f"{voice_id}.wav"

    if not audio_path.exists() or len(text) == 0:
        continue

    audio = pydub.AudioSegment.from_file(audio_path)
    
    final_data.append({
        "id": id,
        "audio": f"{voice_id}.wav",
        "text": text,
        "duration": audio.duration_seconds,
    })

final_output_path = dataset_root / "dataset.json"
with open(final_output_path, "w") as f:
    json.dump(final_data, f, indent=4, ensure_ascii=False)


[0;93m2026-01-14 23:56:14.412193340 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card0/device/vendor"[m
20020it [00:02, 8741.18it/s]


In [7]:
# now print some statistics about the final data
# these are the metrics we care about:
# total number of samples
# total duration in hours, average duration
# total number of tokens in the dataset
# vocab size
# histogram of durations
# histogram of named entities in the text

from shekar import WordTokenizer
tokenizer = WordTokenizer()

total_duration = sum([entry["duration"] for entry in final_data])
total_samples = len(final_data)
all_text = " ".join([entry["text"] for entry in final_data])
tokens = list(tokenizer.tokenize(all_text))
total_tokens = len(tokens)
vocab = set(tokens)
print(f"Total samples: {total_samples}")
print(f"Total duration (hours): {total_duration / 3600:.2f}")
print(f"Average clip duration (seconds): {total_duration / total_samples:.2f}")
print(f"Total tokens: {total_tokens}")
print(f"Vocab size: {len(vocab)}")

Total samples: 20020
Total duration (hours): 29.08
Average clip duration (seconds): 5.23
Total tokens: 208472
Vocab size: 20853


In [8]:
import matplotlib.pyplot as plt

# histogram of durations
durations = [entry["duration"] for entry in final_data]
max_duration = max(durations)
plt.hist(durations, bins=int(max_duration), range=(0, int(max_duration)))
# each bin represents duration range of 1 second
plt.xlabel("Duration (seconds)")
plt.ylabel("Number of samples")
plt.title("Histogram of Clip Durations")
plt.savefig(dataset_root / "duration_histogram.png")
plt.clf()
plt.show()

<Figure size 640x480 with 0 Axes>

In [9]:
from shekar import NER

ner = NER()

# histogram of named entities in the text
all_entities = []
for entry in tqdm.tqdm(final_data):
    entities = ner(entry["text"])
    all_entities.extend(entities)

# count occurrences of each entity type

all_entities = [entity for text, entity in all_entities]

entity_counts = {}
for tag in all_entities:
    entity_counts[tag] = entity_counts.get(tag, 0) + 1

# plot histogram of entity types
plt.figure(figsize=(10, 5))
plt.bar(entity_counts.keys(), entity_counts.values())
plt.xlabel("Entity Types")
plt.ylabel("Count")
plt.title("Histogram of Entity Types")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(dataset_root / "entity_histogram.png")
plt.clf()
plt.show()


100%|██████████| 20020/20020 [1:15:39<00:00,  4.41it/s]


<Figure size 1000x500 with 0 Axes>

In [10]:
# number of entities per entity type

entity_type_counts = {}
for entity in all_entities:
    entity_type_counts[entity] = entity_type_counts.get(entity, 0) + 1
print("Entity type counts:")
for entity_type, count in entity_type_counts.items():
    print(f"{entity_type}: {count}")
    

Entity type counts:
ORG: 965
DAT: 1913
LOC: 2421
PER: 998
EVE: 139
