# Tarteel-ML Dataset Exploration

This notebook walks through cloning the repository, downloading the dataset, exploring metadata and audio, visualizing features, and saving artifacts.

In [None]:
# 1. Setup: Clone Repo and Install Dependencies
!git clone https://github.com/Tarteel-io/Tarteel-ML.git
%cd Tarteel-ML
!pip install -r requirements.txt

In [None]:
# 2. Download Tarteel Dataset
# NOTE(review): `scripts.download_data` is presumably provided by the cloned
# repository, so the setup cell has to run first -- confirm.
from scripts.download_data import download_tarteel_data

# Target directory for the download; later cells read files under `data/`.
DATA_DIR = 'data'
download_tarteel_data(output_dir=DATA_DIR)

In [None]:
# 3. Inspect Metadata
import pandas as pd

# Load the metadata table into `meta` (reused by later cells) and preview
# its first ten rows as the cell output.
metadata_csv = 'data/metadata.csv'
meta = pd.read_csv(metadata_csv)
meta.iloc[:10]

## Quick Stats

In [None]:
# Summary statistics of the `duration_sec` column of the metadata table
duration_stats = meta['duration_sec'].describe()
duration_stats

In [None]:
# 4. Audio Playback & Duration Distribution
import IPython.display as ipd

# Build an inline player for one sample recording; leaving the object as the
# cell's last expression is what renders the player.
sample_clip = ipd.Audio('data/audio/reciter_A/001.wav')
sample_clip

In [None]:
import matplotlib.pyplot as plt

# Histogram of the `duration_sec` column, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(meta['duration_sec'], bins=50)
ax.set_xlabel('Duration (sec)')
ax.set_ylabel('Count')
ax.set_title('Audio Duration Distribution')
plt.show()

## 5. Spectrogram Visualization

In [None]:
import librosa, librosa.display
import numpy as np
import matplotlib.pyplot as plt

# Load one recording at 16 kHz; `y` (waveform) and `sr` (sample rate) stay
# available to later cells.
path = 'data/audio/reciter_A/001.wav'
y, sr = librosa.load(path, sr=16000)

# 128-band mel power spectrogram, converted to dB relative to its peak value
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
S_db = librosa.power_to_db(S, ref=np.max)

# Draw on an explicit Axes and hand the returned image to the colorbar,
# instead of relying on pyplot's "current image" state.
fig, ax = plt.subplots(figsize=(10, 4))
img = librosa.display.specshow(S_db, sr=sr, y_axis='mel', x_axis='time', ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set_title('Mel-Spectrogram of 001.wav')
fig.tight_layout()
plt.show()

## 6. Segment-Level Preview
Load transcript JSON and show a few segments alongside short audio snippets.

In [None]:
import json

# JSON text is UTF-8; pass the encoding explicitly so the transcript text is
# decoded the same way regardless of the platform's default locale encoding.
with open('data/transcripts/reciter_A.json', encoding='utf-8') as f:
    transcripts = json.load(f)

# Preview up to the first 5 segments (the `break` below stops after the first)
for seg in transcripts[:5]:
    start, end = seg['start_time'], seg['end_time']
    print(f"Verse {seg['verse_id']}: {seg['text']} ({start}-{end}s)")
    # Play only this segment rather than the whole recording: slice the
    # waveform `y` (001.wav loaded at `sr` Hz in the spectrogram cell) by the
    # segment bounds. Passing the file path here would embed the entire file
    # and ignore `rate`.
    # NOTE(review): assumes start_time/end_time are numeric seconds measured
    # from the start of 001.wav -- confirm against the transcript schema.
    snippet = y[int(start * sr):int(end * sr)]
    display(ipd.Audio(snippet, rate=sr, autoplay=False, normalize=True))
    break  # remove break to iterate further

## 7. Save Sample Metadata
Export a random subset for reporting.

In [None]:
# Draw a reproducible random subset (fixed seed) of the metadata for reporting.
# Cap the size at the table length: sampling without replacement raises
# ValueError when more rows are requested than the table contains.
sample_size = min(10, len(meta))
sample = meta.sample(sample_size, random_state=42)

# Write the subset without the index column.
sample.to_csv('sample_metadata.csv', index=False)
print("Sample metadata saved to sample_metadata.csv")

## Next Steps
- Extend to multiple reciters
- Extract features (MFCC, pitch)
- Build an initial ASR or classification baseline