# Speech-to-Text: Ma'lumotlarni O'rganish

Bu notebook Parquet fayldagi ma'lumotlarni o'rganish uchun mo'ljallangan.

In [None]:
# Import the libraries used throughout the notebook
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence all warnings from here on so cell output stays compact
warnings.filterwarnings('ignore')

# Load the project configuration
import sys
# Extend the import search path with the parent directory before importing
# `config` (presumably config.py lives one level above the notebook — confirm)
sys.path.append('..')
from config import RAW_DATA_DIR, DataConfig

print("✓ Kutubxonalar yuklandi")

## 1. Parquet Faylni Yuklash

In [None]:
# Build the Parquet file location from the project configuration and load it
parquet_path = RAW_DATA_DIR / DataConfig.PARQUET_FILE

print(f"Fayl yo'li: {parquet_path}")
file_found = parquet_path.exists()
print(f"Fayl mavjudmi: {file_found}")

if not file_found:
    # Nothing is loaded in this case, so `df` is not defined by this cell
    print("\n❌ Parquet fayl topilmadi!")
else:
    df = pd.read_parquet(parquet_path)
    print("\n✓ Ma'lumotlar yuklandi")
    row_total = len(df)
    print(f"Qatorlar soni: {row_total}")
    column_names = list(df.columns)
    print(f"Ustunlar: {column_names}")

## 2. Ma'lumotlar Strukturasi

In [None]:
# Column/dtype overview followed by a preview of the first rows
rule = "=" * 60

print(rule, "DATAFRAME MA'LUMOTLARI", rule, sep="\n")
df.info()

print("\n" + rule, "BIRINCHI 5 TA QATOR", rule, sep="\n")
# Bare expression kept as the final statement of the cell
df.head(5)

## 3. Matn Tahlili

In [None]:
# Name of the transcript column, taken from the project configuration
text_column = DataConfig.TEXT_COLUMN

# Per-row character count and whitespace-separated word count, stored on df
transcripts = df[text_column]
df['text_length'] = transcripts.str.len()
df['word_count'] = transcripts.str.split().str.len()

divider = "=" * 60
print(divider, "MATN STATISTIKASI", divider, sep="\n")

# Summary statistics for each derived column, under its own heading
for heading, column in (
    ("\nMatn uzunligi (belgilar):", 'text_length'),
    ("\nSo'zlar soni:", 'word_count'),
):
    print(heading)
    print(df[column].describe())

In [None]:
# Side-by-side histograms: transcript length in characters and in words
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

panels = [
    # (df column, panel title, x-axis label, extra hist() keyword arguments)
    ('text_length', 'Matn Uzunligi Taqsimoti (Belgilar)', 'Belgilar soni', {}),
    ('word_count', "So'zlar Soni Taqsimoti", "So'zlar soni", {'color': 'green'}),
]

for ax, (column, title, xlabel, hist_kwargs) in zip(axes, panels):
    ax.hist(df[column], bins=50, edgecolor='black', **hist_kwargs)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Chastota')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Audio Tahlili

In [None]:
# Name of the audio column, taken from the project configuration
audio_column = DataConfig.AUDIO_COLUMN

rule = "=" * 60
print(rule, "AUDIO MA'LUMOTLARI", rule, sep="\n")

# Inspect the first row to find out how the audio is stored
sample_audio = df[audio_column].iloc[0]
sample_type = type(sample_audio)
print(f"\nAudio turi: {sample_type}")

# Recognized storage types and the message reported for each
storage_messages = (
    (str, "Audio fayl yo'li sifatida saqlangan"),
    (bytes, "Audio bytes sifatida saqlangan"),
)
for expected_type, message in storage_messages:
    if isinstance(sample_audio, expected_type):
        print(message)
        break
else:
    # No recognized type matched
    print(f"Noma'lum format: {sample_type}")

In [None]:
# Load a single audio clip and visualize it
def _load_audio(source, target_sr):
    """Decode *source* into a mono waveform at *target_sr*.

    Returns the ``(audio, sr)`` tuple produced by ``librosa.load``, or
    ``None`` when *source* is not a supported type.
    """
    import io

    # Dict-valued cells: assumes the {"bytes": ..., "path": ...} layout that
    # Hugging Face audio columns use — TODO confirm against the actual dataset.
    if isinstance(source, dict):
        source = source.get("bytes") or source.get("path")

    if isinstance(source, bytes):
        # librosa.load accepts file-like objects, so encoded bytes take the
        # same mono + resample path as files on disk.
        source = io.BytesIO(source)
    elif not isinstance(source, str):
        return None

    return librosa.load(source, sr=target_sr)


def load_and_visualize_audio(audio_path_or_bytes, index=0, target_sr=16000):
    """
    Load one audio clip and plot its waveform, spectrogram and mel spectrogram.

    Args:
        audio_path_or_bytes: A file path (str), encoded audio (bytes), or a
            dict carrying one of those under "path" / "bytes".
        index: Sample number shown in the waveform title.
        target_sr: Sampling rate passed to ``librosa.load`` for every input
            kind, so paths and bytes are analysed at the same rate.

    Returns:
        None. Figures are shown and a short summary is printed. Unsupported
        inputs and any error raised while loading or plotting are reported
        with a printed message instead of an exception.
    """
    try:
        loaded = _load_audio(audio_path_or_bytes, target_sr)
        if loaded is None:
            print(f"Noma'lum audio format: {type(audio_path_or_bytes)}")
            return
        audio, sr = loaded

        fig, axes = plt.subplots(3, 1, figsize=(15, 10))

        # 1. Waveform
        librosa.display.waveshow(audio, sr=sr, ax=axes[0])
        axes[0].set_title(f'Audio Waveform (Sample #{index})', fontsize=14)
        axes[0].set_xlabel('Vaqt (s)')
        axes[0].set_ylabel('Amplituda')

        # 2. Spectrogram: STFT magnitudes are amplitudes, hence amplitude_to_db
        D = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
        img = librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='hz', ax=axes[1])
        axes[1].set_title('Spectrogram', fontsize=14)
        fig.colorbar(img, ax=axes[1], format='%+2.0f dB')

        # 3. Mel spectrogram: melspectrogram returns power values by default,
        # so power_to_db is the matching conversion (amplitude_to_db would
        # double the dB scale).
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        img2 = librosa.display.specshow(mel_spec_db, sr=sr, x_axis='time', y_axis='mel', ax=axes[2])
        axes[2].set_title('Mel Spectrogram', fontsize=14)
        fig.colorbar(img2, ax=axes[2], format='%+2.0f dB')

        plt.tight_layout()
        plt.show()

        # Clip summary
        print(f"\nAudio ma'lumotlari:")
        print(f"  Uzunligi: {len(audio)/sr:.2f} soniya")
        print(f"  Sample rate: {sr} Hz")
        print(f"  Samples: {len(audio)}")

    except Exception as e:
        # Deliberate best-effort boundary: a clip that cannot be decoded or
        # plotted is reported without aborting the exploration.
        print(f"Xato: {e}")

# Plot the audio stored in the first row of the dataset
print("Birinchi audio fayl vizualizatsiyasi:\n")
first_clip = df[audio_column].iloc[0]
load_and_visualize_audio(first_clip, index=0)

## 5. Audio va Matn Munosabati

In [None]:
# Show the first few samples together with their transcript statistics
print("=" * 60)
print("AUDIO VA MATN MISOLLAR")
print("=" * 60)

# At most five rows; head() also copes with shorter or empty frames
preview = df.head(5)
preview_rows = zip(preview[text_column], preview['text_length'], preview['word_count'])

for number, (text, n_chars, n_words) in enumerate(preview_rows, start=1):
    if isinstance(text, str):
        # Add the ellipsis only when the transcript was actually cut
        shown = (text[:100] + "...") if len(text) > 100 else text
    else:
        # Missing transcript (None/NaN): print it as-is rather than failing on the slice
        shown = text
    print(f"\nSample #{number}:")
    print(f"  Matn: {shown}")
    print(f"  Matn uzunligi: {n_chars} belgi")
    print(f"  So'zlar soni: {n_words}")

## 6. Ma'lumotlar Sifati Tekshiruvi

In [None]:
rule = "=" * 60
print(rule, "MA'LUMOTLAR SIFATI", rule, sep="\n")

# Missing values per column
print("\nBo'sh qiymatlar:")
null_counts = df.isnull().sum()
print(null_counts)

# Fully duplicated rows
duplicate_total = df.duplicated().sum()
print(f"\nDublikat qatorlar: {duplicate_total}")

# Transcripts that are empty once surrounding whitespace is removed
is_blank = df[text_column].str.strip().eq('')
empty_texts = df.loc[is_blank]
print(f"Bo'sh matnlar: {len(empty_texts)}")

word_counts = df['word_count']

# Very short transcripts (fewer than 3 words)
short_texts = df.loc[word_counts.lt(3)]
print(f"Juda qisqa matnlar (< 3 so'z): {len(short_texts)}")

# Very long transcripts (more than 100 words)
long_texts = df.loc[word_counts.gt(100)]
print(f"Juda uzun matnlar (> 100 so'z): {len(long_texts)}")

## 7. Xulosa va Tavsiyalar

In [None]:
print("=" * 60)
print("XULOSA")
print("=" * 60)

total_samples = len(df)

# A transcript that is empty after stripping has zero words, so it also
# satisfies the "< 3 words" filter. Combine both conditions into one mask so
# that such rows are counted as problematic only once.
is_empty = df[text_column].str.strip() == ''
is_short = df['word_count'] < 3
empty_count = int(is_empty.sum())
short_count = int(is_short.sum())
problem_samples = int((is_empty | is_short).sum())
clean_samples = total_samples - problem_samples

# Guard the percentages against an empty dataset (division by zero)
clean_pct = clean_samples / total_samples * 100 if total_samples else 0.0
problem_pct = problem_samples / total_samples * 100 if total_samples else 0.0

print(f"\n📊 Umumiy ma'lumotlar:")
print(f"  • Jami samples: {total_samples}")
print(f"  • Toza samples: {clean_samples} ({clean_pct:.1f}%)")
print(f"  • Muammoli samples: {problem_samples} ({problem_pct:.1f}%)")

print(f"\n✅ TAVSIYALAR:")

if empty_count > 0:
    print(f"  • Bo'sh matnlarni tozalang: {empty_count} ta")

if short_count > 0:
    print(f"  • Juda qisqa matnlarni olib tashlang: {short_count} ta")

# Scan for duplicates once and reuse the count
duplicate_count = int(df.duplicated().sum())
if duplicate_count > 0:
    print(f"  • Dublikatlarni olib tashlang: {duplicate_count} ta")

# The mean is NaN for an empty dataset; both comparisons are then False
avg_text_length = df['text_length'].mean()
if avg_text_length < 20:
    print(f"  • O'rtacha matn juda qisqa: {avg_text_length:.0f} belgi")
elif avg_text_length > 500:
    print(f"  • O'rtacha matn juda uzun: {avg_text_length:.0f} belgi")

print(f"\n✓ Ma'lumotlar tahlili tugadi!")