# 📊 Hindi Disfluency Restoration - Exploratory Data Analysis

This notebook analyzes the disfluency patterns in Hindi speech transcripts to guide our restoration strategy.

**Sections:**
1. Data Loading & Overview
2. Disfluency Frequency Analysis
3. Position Analysis (Where do disfluencies occur?)
4. Count per Sample
5. Consecutive Disfluencies
6. Text Length Analysis
7. Context Analysis (Before/After words)
8. Train vs Test Comparison
9. Audio Analysis
10. Summary & Recommendations

---

## 1️⃣ Setup & Data Loading

In [None]:
# =============================================================================
# IMPORTS
# =============================================================================

import os
import re
import pickle
import unicodedata
from collections import Counter
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
from tqdm.auto import tqdm

# Plotting style: set once, globally, so every figure in the notebook shares the
# same grid theme, default size, font size and colour palette.
# NOTE(review): 'seaborn-v0_8-whitegrid' is the style name used by newer
# matplotlib releases — confirm the runtime version provides it.
# NOTE(review): later plots use Devanagari words as tick labels; verify that the
# active font can render them.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 5)  # default (width, height) in inches
plt.rcParams['font.size'] = 11
sns.set_palette('husl')

print("‚úÖ Imports complete")

In [None]:
# =============================================================================
# CONFIGURATION
# =============================================================================

# Data paths (Kaggle). Every file read by this notebook lives under INPUT_DIR;
# audio clips are looked up later as AUDIO_DIR / "<id>.wav".
INPUT_DIR = Path("/kaggle/input/nppe-2-automatic-disfluency-restoration")
AUDIO_DIR = INPUT_DIR / "downloaded_audios"

print(f"üìÅ Input directory: {INPUT_DIR}")
print(f"üéµ Audio directory: {AUDIO_DIR}")

In [None]:
# =============================================================================
# LOAD DATASETS
# =============================================================================

# Load CSVs.
# Later cells read the 'id' and 'transcript' columns of train_df / test_df and
# the 'disfluency' column of disf_df.
train_df = pd.read_csv(INPUT_DIR / "train.csv")
test_df = pd.read_csv(INPUT_DIR / "test.csv")
disf_df = pd.read_csv(INPUT_DIR / "unique_disfluencies.csv")

# Display summary: row counts first, then the column names of both splits.
print("=" * 50)
print("üìä DATASET OVERVIEW")
print("=" * 50)
print(f"\nüîπ Train samples:       {len(train_df):,}")
print(f"üîπ Test samples:        {len(test_df):,}")
print(f"üîπ Unique disfluencies: {len(disf_df):,}")

print("\nüìã Train columns:", list(train_df.columns))
print("üìã Test columns: ", list(test_df.columns))

In [None]:
# =============================================================================
# BUILD DISFLUENCY SET (with Unicode variants)
# =============================================================================
# The same Hindi word can be stored as several different code-point sequences
# (composed vs. decomposed forms, anusvara vs. chandrabindu). Every variant is
# added so that plain set membership catches all spellings.

# Devanagari literals are written as \u escapes so they cannot be damaged by a
# mis-decoded save of the notebook; the intended glyph is given in each comment.
ANUSVARA = '\u0902'      # ं
CHANDRABINDU = '\u0901'  # ँ

# Skip missing rows (a NaN would make unicodedata.normalize raise TypeError) and
# blank rows (an empty string in the set would match punctuation-only tokens).
csv_entries = disf_df['disfluency'].dropna().astype(str).str.strip()
DISFLUENCY_SET = {entry for entry in csv_entries if entry}

# Add Unicode variants of every CSV entry
variants = set()
for d in DISFLUENCY_SET:
    variants.add(unicodedata.normalize('NFC', d))   # Composed form
    variants.add(unicodedata.normalize('NFD', d))   # Decomposed form
    variants.add(unicodedata.normalize('NFKC', d))  # Compatibility composed
    variants.add(d.replace(ANUSVARA, CHANDRABINDU))  # Anusvara -> Chandrabindu
    variants.add(d.replace(CHANDRABINDU, ANUSVARA))  # Chandrabindu -> Anusvara

DISFLUENCY_SET.update(variants)

# Add common fillers that might be missing from the CSV.
# NOTE(review): the last three entries are also ordinary Hindi words, so they
# will inflate the counts reported below — confirm this is intended.
COMMON_FILLERS = {
    '\u0939\u092e\u094d\u092e',  # हम्म  (hmm)
    '\u0939\u093e\u0902',        # हां   (haan, anusvara)
    '\u0939\u093e\u0901',        # हाँ   (haan, chandrabindu)
    '\u0909\u092e\u094d\u092e',  # उम्म  (umm)
    '\u0905\u092e\u094d\u092e',  # अम्म  (amm)
    '\u0939',                    # ह    (ha)
    '\u0905\u0902',              # अं   (an)
    '\u0924\u094b',              # तो   (to)
    '\u0935\u094b',              # वो   (vo)
    '\u092e\u0924\u0932\u092c',  # मतलब (matlab)
}
DISFLUENCY_SET.update(COMMON_FILLERS)

print(f"‚úÖ Total disfluencies (with variants): {len(DISFLUENCY_SET)}")
# sorted() keeps the preview stable across kernel restarts (set order is not).
print(f"\nüìù Sample disfluencies: {sorted(DISFLUENCY_SET)[:10]}")

## 🛠️ Helper Functions

In [None]:
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def extract_disfluencies(text, disfluency_set=None):
    """
    Find all disfluencies in a transcript.

    The transcript is split on whitespace; each token has surrounding
    punctuation stripped and is then matched against the disfluency vocabulary,
    both as-is and in lower-cased NFC-normalised form.

    Args:
        text: Transcript string. Missing values (None / NaN) yield [].
        disfluency_set: Optional collection of disfluency strings to match
            against. Defaults to the notebook-level DISFLUENCY_SET.

    Returns:
        List of {'disfluency': word, 'position': index, 'total_words': n},
        one entry per matching token, in transcript order. 'disfluency' is the
        normalised token, 'position' is its 0-based index among the
        whitespace tokens and 'total_words' is the number of such tokens.
    """
    if pd.isna(text):
        return []

    vocab = DISFLUENCY_SET if disfluency_set is None else disfluency_set

    # Danda (U+0964) and double danda (U+0965) are written as escapes so the
    # characters survive a mis-decoded save of the notebook.
    strip_chars = '\u0964\u0965,;:!?\'"-'

    words = str(text).split()
    total_words = len(words)
    found = []

    for i, word in enumerate(words):
        # Clean punctuation
        clean = word.strip(strip_chars)
        if not clean:
            # Token was punctuation only; it still counts toward total_words
            # but must never match, even if '' slipped into the vocabulary.
            continue

        # Normalize for matching
        norm = unicodedata.normalize('NFC', clean.lower())

        if norm in vocab or clean in vocab:
            found.append({
                'disfluency': norm,
                'position': i,
                'total_words': total_words
            })

    return found


def find_consecutive_runs(disfluencies):
    """
    Measure stretches of back-to-back disfluencies.

    Takes the hit dicts of one transcript (each carrying a 'position' key),
    orders the positions and reports the length of every stretch in which each
    position is exactly one greater than the previous one.

    Returns a list of stretch lengths; only stretches of two or more are
    reported, so isolated disfluencies contribute nothing.
    """
    if not disfluencies:
        return []

    ordered = sorted(hit['position'] for hit in disfluencies)
    count = len(ordered)

    run_lengths = []
    run_start = 0  # index in `ordered` where the current stretch began

    # Walk one step past the end so the final stretch is closed by the same
    # branch that closes every other stretch.
    for idx in range(1, count + 1):
        stretch_continues = idx < count and ordered[idx] == ordered[idx - 1] + 1
        if stretch_continues:
            continue

        span = idx - run_start
        if span > 1:
            run_lengths.append(span)
        run_start = idx

    return run_lengths


def get_context(text, disfluencies):
    """
    Collect the neighbouring tokens of every disfluency hit.

    `text` is split on whitespace and, for each hit dict (keys 'disfluency' and
    'position'), the token immediately before and after that position is looked
    up. A hit on the first token gets '<START>' as its predecessor; a hit on
    the last token gets '<END>' as its successor.

    Returns a list of {'disfluency', 'before', 'after'} dicts in the same order
    as `disfluencies`, or [] when the text is missing or there are no hits.
    """
    if pd.isna(text) or not disfluencies:
        return []

    tokens = str(text).split()
    last_index = len(tokens) - 1

    return [
        {
            'disfluency': hit['disfluency'],
            'before': tokens[idx - 1] if idx > 0 else '<START>',
            'after': tokens[idx + 1] if idx < last_index else '<END>',
        }
        for hit in disfluencies
        for idx in (hit['position'],)  # bind the position once per hit
    ]


print("‚úÖ Helper functions defined")

## 2️⃣ Disfluency Frequency Analysis

Which disfluencies are most common? This helps us prioritize which ones to focus on.

In [None]:
# =============================================================================
# EXTRACT ALL DISFLUENCIES FROM TRAINING DATA
# =============================================================================

print("üîÑ Extracting disfluencies from training data...")

# Apply extraction to all transcripts: one list of hit dicts per row, plus the
# number of hits in that row.
train_df['disfluencies'] = train_df['transcript'].apply(extract_disfluencies)
train_df['disf_count'] = train_df['disfluencies'].apply(len)

# Flatten all disfluencies into one list
all_disfluencies = [hit for hits in train_df['disfluencies'] for hit in hits]

# Convert to DataFrame for analysis. The columns are named explicitly so the
# frame keeps its schema even when no disfluency was found at all; otherwise
# every later `disf_analysis['disfluency']` lookup would raise KeyError.
disf_analysis = pd.DataFrame(
    all_disfluencies,
    columns=['disfluency', 'position', 'total_words'],
)

print(f"‚úÖ Found {len(disf_analysis):,} total disfluency occurrences")
print(f"   in {len(train_df):,} transcripts")

In [None]:
# =============================================================================
# FREQUENCY ANALYSIS PLOT
# =============================================================================

# Occurrences per distinct disfluency. disf_freq is reused by the position
# analysis and summary cells further down.
disf_freq = disf_analysis['disfluency'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Absolute counts
disf_freq.head(15).plot(kind='barh', ax=axes[0], color='steelblue', edgecolor='black')
axes[0].set_title('üîπ Top 15 Most Common Disfluencies', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Count')
axes[0].invert_yaxis()  # Highest at top

# Plot 2: Percentage of total (share of ALL occurrences, not just the top 15)
disf_pct = (disf_freq.head(15) / len(disf_analysis) * 100)
disf_pct.plot(kind='barh', ax=axes[1], color='darkorange', edgecolor='black')
axes[1].set_title('üîπ Top 15 Disfluencies (% of Total)', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Percentage (%)')
axes[1].invert_yaxis()

plt.tight_layout()
plt.show()

# Key insight: share of all occurrences covered by the five most frequent
# disfluencies. top5_coverage is printed again in the summary cell.
top5_coverage = disf_freq.head(5).sum() / len(disf_analysis) * 100
print(f"\nüìä KEY INSIGHT: Top 5 disfluencies cover {top5_coverage:.1f}% of all occurrences!")
print(f"   Top 5: {disf_freq.head(5).index.tolist()}")

## 3️⃣ Position Analysis

Where in the sentence do disfluencies occur? This helps us set position priors.

In [None]:
# =============================================================================
# POSITION ANALYSIS
# =============================================================================

# Calculate relative position (0 = start, 1 = end)
# NOTE: position is a 0-based token index divided by the token count, so the
# largest possible value is (n - 1) / n; the ratio never actually reaches 1.
# clip(lower=1) guards the division against a zero token count.
disf_analysis['relative_position'] = disf_analysis['position'] / disf_analysis['total_words'].clip(lower=1)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Position distribution histogram
axes[0].hist(disf_analysis['relative_position'], bins=20, edgecolor='black', alpha=0.7, color='teal')
axes[0].set_title('üîπ Disfluency Position Distribution', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Relative Position (0=start, 1=end)')
axes[0].set_ylabel('Count')
axes[0].axvline(x=0.25, color='red', linestyle='--', linewidth=2, label='First Quarter')
axes[0].legend()

# Plot 2: Mean position by disfluency type (restricted to the 8 most frequent)
top_disfs = disf_freq.head(8).index.tolist()
position_by_type = disf_analysis[disf_analysis['disfluency'].isin(top_disfs)].groupby('disfluency')['relative_position'].mean()
position_by_type.sort_values().plot(kind='barh', ax=axes[1], color='purple', edgecolor='black')
axes[1].set_title('üîπ Mean Position by Disfluency Type', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Mean Relative Position')
axes[1].axvline(x=0.5, color='red', linestyle='--', alpha=0.5, label='Middle')
axes[1].legend()

plt.tight_layout()
plt.show()

# Statistics: both are percentages of all disfluency occurrences and are
# printed again in the summary cell.
first_quarter = (disf_analysis['relative_position'] < 0.25).sum() / len(disf_analysis) * 100
at_start = (disf_analysis['position'] == 0).sum() / len(disf_analysis) * 100

print(f"\nüìä KEY INSIGHTS:")
print(f"   ‚Ä¢ {first_quarter:.1f}% of disfluencies occur in the first quarter")
print(f"   ‚Ä¢ {at_start:.1f}% of disfluencies are at sentence start")
print(f"   ‚üπ Use position prior biased toward start!")

## 4️⃣ Disfluency Count per Sample

How many disfluencies does each transcript have?

In [None]:
# =============================================================================
# COUNT PER SAMPLE
# =============================================================================

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Histogram of counts
# NOTE: the bin edges are fixed at 0..14, so transcripts with more than 14
# disfluencies are not drawn, and counts 13 and 14 share the last bin.
train_df['disf_count'].hist(bins=range(0, 15), ax=axes[0], edgecolor='black', alpha=0.7, color='coral')
axes[0].set_title('üîπ Disfluencies per Transcript', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Number of Disfluencies')
axes[0].set_ylabel('Number of Transcripts')
axes[0].set_xticks(range(0, 15))

# Plot 2: Cumulative percentage of transcripts having at most k disfluencies
counts = train_df['disf_count'].value_counts().sort_index()
cumsum = counts.cumsum() / len(train_df) * 100
cumsum.plot(ax=axes[1], marker='o', color='purple', linewidth=2)
axes[1].set_title('üîπ Cumulative % of Samples', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Max Disfluencies per Sample')
axes[1].set_ylabel('Cumulative %')
axes[1].axhline(y=90, color='red', linestyle='--', label='90% coverage')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Statistics
print(f"\nüìä STATISTICS:")
print(f"   ‚Ä¢ Mean disfluencies/sample: {train_df['disf_count'].mean():.2f}")
print(f"   ‚Ä¢ Median: {train_df['disf_count'].median():.0f}")
print(f"   ‚Ä¢ Max: {train_df['disf_count'].max()}")
print(f"   ‚Ä¢ Samples with 0 disfluencies: {(train_df['disf_count'] == 0).sum()} ({(train_df['disf_count'] == 0).mean()*100:.1f}%)")

## 5️⃣ Consecutive Disfluencies

Do disfluencies appear in runs (e.g., "हम्म हां उम्म")?

In [None]:
# =============================================================================
# CONSECUTIVE DISFLUENCIES
# =============================================================================

# Find consecutive runs in each transcript.
# find_consecutive_runs only reports runs of length >= 2, so max_consecutive is
# either 0 (no run in that transcript) or at least 2.
train_df['consecutive_runs'] = train_df['disfluencies'].apply(find_consecutive_runs)
train_df['max_consecutive'] = train_df['consecutive_runs'].apply(lambda x: max(x) if x else 0)

# Flatten all runs
all_runs = [run for runs in train_df['consecutive_runs'] for run in runs]

print(f"üìä CONSECUTIVE DISFLUENCY ANALYSIS:")
print(f"   ‚Ä¢ Samples with consecutive disfluencies: {(train_df['max_consecutive'] > 0).sum()}")
print(f"   ‚Ä¢ Max consecutive in single sample: {train_df['max_consecutive'].max()}")

# Only plot when at least one run exists.
if all_runs:
    print(f"   ‚Ä¢ Mean run length: {np.mean(all_runs):.2f}")
    
    plt.figure(figsize=(8, 4))
    pd.Series(all_runs).value_counts().sort_index().plot(kind='bar', color='coral', edgecolor='black')
    plt.title('üîπ Consecutive Disfluency Run Lengths', fontsize=12, fontweight='bold')
    plt.xlabel('Run Length')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.show()
else:
    print("   ‚ö†Ô∏è No consecutive disfluencies found")

## 6️⃣ Text Length Analysis

Does transcript length correlate with disfluency count?

In [None]:
# =============================================================================
# TEXT LENGTH ANALYSIS
# =============================================================================

# Whitespace-token count per transcript. Missing transcripts count as 0 words.
# astype(str) mirrors the str(text).split() tokenisation in
# extract_disfluencies and keeps non-string cells from raising.
train_df['word_count'] = (
    train_df['transcript'].fillna('').astype(str).str.split().str.len()
)
mean_word_count = train_df['word_count'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Word count distribution
train_df['word_count'].hist(bins=50, ax=axes[0], edgecolor='black', alpha=0.7, color='skyblue')
axes[0].set_title('üîπ Word Count Distribution', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Number of Words')
axes[0].set_ylabel('Count')
axes[0].axvline(x=mean_word_count, color='red', linestyle='--', label=f'Mean: {mean_word_count:.1f}')
axes[0].legend()

# Plot 2: Disfluencies vs Word count (scatter)
axes[1].scatter(train_df['word_count'], train_df['disf_count'], alpha=0.3, s=15, color='steelblue')
axes[1].set_title('üîπ Disfluencies vs Word Count', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Word Count')
axes[1].set_ylabel('Disfluency Count')

# Add trend line: degree-1 least-squares fit, z = [slope, intercept]
z = np.polyfit(train_df['word_count'], train_df['disf_count'], 1)
p = np.poly1d(z)
x_line = np.linspace(0, train_df['word_count'].max(), 100)
axes[1].plot(x_line, p(x_line), 'r--', linewidth=2, label=f'Trend: {z[0]:.3f}x + {z[1]:.1f}')
axes[1].legend()

plt.tight_layout()
plt.show()

# Correlation (Series.corr defaults to Pearson)
corr = train_df['word_count'].corr(train_df['disf_count'])

# Describe strength by |r| and report the sign separately, so that a strong
# correlation is not labelled "Moderate" and a strong negative one is not
# labelled "Weak".
if pd.isna(corr):
    corr_label = 'Undefined'
else:
    magnitude = abs(corr)
    if magnitude > 0.7:
        strength = 'Strong'
    elif magnitude > 0.3:
        strength = 'Moderate'
    else:
        strength = 'Weak'
    direction = 'positive' if corr > 0 else 'negative'
    corr_label = f"{strength} {direction}"

print(f"\nüìä CORRELATION:")
print(f"   ‚Ä¢ Word count ‚Üî Disfluency count: r = {corr:.3f}")
print(f"   ‚üπ {corr_label} correlation")

## 7️⃣ Context Analysis

What words typically appear before/after disfluencies?

In [None]:
# =============================================================================
# CONTEXT ANALYSIS
# =============================================================================

print("üîÑ Extracting context for each disfluency...")

# One {'disfluency', 'before', 'after'} record per hit, across all transcripts.
# Neighbour words are the raw whitespace tokens returned by get_context.
all_contexts = []
for _, row in train_df.iterrows():
    all_contexts.extend(get_context(row['transcript'], row['disfluencies']))

context_df = pd.DataFrame(all_contexts)

if len(context_df) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Words BEFORE disfluencies (the '<START>' placeholder is excluded)
    before_words = context_df[context_df['before'] != '<START>']['before'].value_counts().head(15)
    before_words.plot(kind='barh', ax=axes[0], color='lightcoral', edgecolor='black')
    axes[0].set_title('üîπ Most Common Words BEFORE Disfluencies', fontsize=12, fontweight='bold')
    axes[0].set_xlabel('Count')
    axes[0].invert_yaxis()
    
    # Words AFTER disfluencies (the '<END>' placeholder is excluded)
    after_words = context_df[context_df['after'] != '<END>']['after'].value_counts().head(15)
    after_words.plot(kind='barh', ax=axes[1], color='lightgreen', edgecolor='black')
    axes[1].set_title('üîπ Most Common Words AFTER Disfluencies', fontsize=12, fontweight='bold')
    axes[1].set_xlabel('Count')
    axes[1].invert_yaxis()
    
    plt.tight_layout()
    plt.show()
    
    # Statistics: share of hits whose predecessor is the '<START>' placeholder
    start_pct = (context_df['before'] == '<START>').sum() / len(context_df) * 100
    print(f"\nüìä {start_pct:.1f}% of disfluencies occur at sentence start")
else:
    print("‚ö†Ô∏è No context data available")

## 8️⃣ Train vs Test Comparison

Are train and test distributions similar?

In [None]:
# =============================================================================
# TRAIN VS TEST DISTRIBUTION
# =============================================================================

# Same whitespace-token count as used for the training set; astype(str) keeps
# non-string cells from raising. Relies on train_df['word_count'] from the
# text-length cell.
test_df['word_count'] = (
    test_df['transcript'].fillna('').astype(str).str.split().str.len()
)

# Use ONE set of bin edges for both splits. With independent bins=50 calls each
# histogram gets its own edges and bar widths, so the overlaid bars cannot be
# compared bin by bin.
max_words = max(train_df['word_count'].max(), test_df['word_count'].max(), 1)
shared_bins = np.linspace(0, max_words, 51)  # 51 edges -> 50 bins

fig, ax = plt.subplots(figsize=(10, 5))

train_df['word_count'].hist(bins=shared_bins, alpha=0.5, label='Train', ax=ax, density=True, color='steelblue')
test_df['word_count'].hist(bins=shared_bins, alpha=0.5, label='Test', ax=ax, density=True, color='coral')

ax.set_title('üîπ Word Count Distribution: Train vs Test', fontsize=12, fontweight='bold')
ax.set_xlabel('Word Count')
ax.set_ylabel('Density')
ax.legend(fontsize=11)

plt.show()

print(f"\nüìä COMPARISON:")
print(f"   Train - Mean: {train_df['word_count'].mean():.1f}, Median: {train_df['word_count'].median():.0f}")
print(f"   Test  - Mean: {test_df['word_count'].mean():.1f}, Median: {test_df['word_count'].median():.0f}")

## 9️⃣ Audio Analysis

In [None]:
# =============================================================================
# AUDIO FILE ANALYSIS
# =============================================================================

AUDIO_SAMPLE_SIZE = 50   # maximum number of clips whose duration is measured
AUDIO_SAMPLE_SEED = 42   # fixed seed so Restart & Run All picks the same clips

# Check which audio files exist (clips are looked up as AUDIO_DIR/<id>.wav)
train_df['audio_exists'] = train_df['id'].apply(lambda x: (AUDIO_DIR / f"{x}.wav").exists())
n_available = int(train_df['audio_exists'].sum())

print(f"üìä AUDIO AVAILABILITY:")
print(f"   ‚Ä¢ Audio available: {train_df['audio_exists'].sum()}/{len(train_df)} ({train_df['audio_exists'].mean()*100:.1f}%)")

# Sample audio duration analysis (if audio exists)
if n_available > 0:
    sample = train_df[train_df['audio_exists']].sample(
        min(AUDIO_SAMPLE_SIZE, n_available),
        random_state=AUDIO_SAMPLE_SEED,
    )
    durations = []
    failed = []  # (id, error) for clips that could not be read

    print("\nüîÑ Analyzing sample audio durations...")
    for _, row in tqdm(sample.iterrows(), total=len(sample)):
        # Only the audio read is guarded. A missing 'word_count' column (the
        # text-length cell was not run) must surface as an error rather than
        # silently producing an empty result.
        try:
            duration = librosa.get_duration(path=str(AUDIO_DIR / f"{row['id']}.wav"))
        except Exception as e:
            failed.append((row['id'], repr(e)))
            continue

        durations.append({
            'id': row['id'],
            'duration': duration,
            'word_count': row['word_count']
        })

    # Report unreadable clips instead of dropping them silently.
    if failed:
        print(f"\n   [warn] Could not read {len(failed)} of {len(sample)} audio files; "
              f"first failure: id={failed[0][0]} -> {failed[0][1]}")

    if durations:
        dur_df = pd.DataFrame(durations)
        total_duration = dur_df['duration'].sum()
        # Pooled rate: total words over total seconds of the sampled clips.
        speaking_rate = (
            dur_df['word_count'].sum() / total_duration
            if total_duration > 0 else float('nan')
        )

        print(f"\nüìä AUDIO STATISTICS (sample of {len(durations)}):")
        print(f"   ‚Ä¢ Mean duration: {dur_df['duration'].mean():.1f} seconds")
        print(f"   ‚Ä¢ Speaking rate: {speaking_rate:.1f} words/second")
else:
    print("\n‚ö†Ô∏è No audio files found")

## 🔟 Summary & Recommendations

In [None]:
# =============================================================================
# SUMMARY & RECOMMENDATIONS
# =============================================================================
# Re-prints figures computed in earlier cells: disf_analysis and disf_freq
# (frequency analysis), top5_coverage (frequency plot), first_quarter and
# at_start (position analysis). Those cells must have run first.

print("=" * 60)
print("üìà KEY STATISTICS SUMMARY")
print("=" * 60)

print(f"\nüîπ Total disfluencies in training: {len(disf_analysis):,}")
print(f"üîπ Mean disfluencies per sample:   {train_df['disf_count'].mean():.2f}")
print(f"üîπ Top 5 disfluencies cover:       {top5_coverage:.1f}% of all")
print(f"üîπ In first quarter of sentence:   {first_quarter:.1f}%")
print(f"üîπ At sentence start:              {at_start:.1f}%")

print("\n" + "=" * 60)
print("üí° RECOMMENDATIONS FOR PIPELINE")
print("=" * 60)

# (title, description) pairs. The text is static: it is not derived from the
# statistics above.
# NOTE(review): "limit max 4" is a hard-coded figure — confirm it against the
# run-length results.
recommendations = [
    ("POSITION PRIOR", "Bias insertions toward sentence start (first 25%)"),
    ("TOP-K FOCUS", "Prioritize top 5-10 disfluencies for higher precision"),
    ("CONSECUTIVE", "Handle back-to-back disfluencies (limit max 4)"),
    ("CONTEXT PATTERNS", "Use before/after word patterns for validation"),
    ("PER-DISFLUENCY THRESHOLDS", "Different confidence thresholds per disfluency type"),
]

# Numbered from 1 for display.
for i, (title, desc) in enumerate(recommendations, 1):
    print(f"\n{i}. {title}")
    print(f"   {desc}")

print("\n" + "=" * 60)
print(f"üìä Top 10 Disfluencies: {disf_freq.head(10).index.tolist()}")
print("=" * 60)

---

## 🔍 Data Validation

Check for data quality issues before running the pipeline.

In [None]:
# =============================================================================
# DATA VALIDATION
# =============================================================================

def validate_dataset(df, audio_dir, name="Dataset"):
    """
    Check a dataset for common data quality issues and print a report.

    Checks performed:
      * null transcripts and empty (whitespace-only) transcripts,
      * ids without a matching "<id>.wav" file in `audio_dir`,
      * duplicated ids.

    Args:
        df: DataFrame with 'id' and 'transcript' columns.
        audio_dir: Directory holding the audio clips (str or Path).
        name: Label used in the report header.

    Returns:
        List of human-readable issue strings; empty when nothing was found.

    Side effects:
        Adds (or overwrites) a boolean 'has_audio' column on `df`.
    """
    audio_dir = Path(audio_dir)  # accept plain strings as well as Path objects

    print(f"\n{'='*50}")
    print(f"üîç VALIDATING: {name}")
    print(f"{'='*50}")

    issues = []

    # Check for null/empty transcripts. Nulls are dropped before the emptiness
    # test so they are not counted in both categories.
    transcripts = df['transcript']
    null_count = transcripts.isna().sum()
    empty_count = (transcripts.dropna().astype(str).str.strip() == '').sum()

    print(f"\nüìù Transcripts:")
    print(f"   ‚Ä¢ Null: {null_count}")
    print(f"   ‚Ä¢ Empty: {empty_count}")

    if null_count > 0:
        issues.append(f"{null_count} null transcripts")
    # Empty transcripts are as unusable as null ones, so report them too
    # instead of only printing the count.
    if empty_count > 0:
        issues.append(f"{empty_count} empty transcripts")

    # Check for missing audio. astype(bool) keeps the `~` below valid even when
    # df has no rows and apply() returns a non-boolean empty Series.
    df['has_audio'] = df['id'].apply(lambda x: (audio_dir / f"{x}.wav").exists()).astype(bool)
    missing_audio = (~df['has_audio']).sum()

    print(f"\nüéµ Audio:")
    print(f"   ‚Ä¢ Missing: {missing_audio}/{len(df)}")

    if missing_audio > 0:
        issues.append(f"{missing_audio} missing audio files")

    # Check for duplicates (every repeat after the first occurrence counts)
    dup_count = df['id'].duplicated().sum()
    print(f"\nüÜî IDs:")
    print(f"   ‚Ä¢ Duplicates: {dup_count}")

    if dup_count > 0:
        issues.append(f"{dup_count} duplicate IDs")

    # Summary
    if issues:
        print(f"\n‚ö†Ô∏è Issues found: {', '.join(issues)}")
    else:
        print(f"\n‚úÖ No issues found!")

    return issues

# Run validation on both splits against the same audio directory.
# Each call prints its own report and returns the list of issue strings;
# validate_dataset also writes a 'has_audio' column onto the frame it is given.
train_issues = validate_dataset(train_df, AUDIO_DIR, "Training Set")
test_issues = validate_dataset(test_df, AUDIO_DIR, "Test Set")

print("\n" + "=" * 50)
print("‚úÖ VALIDATION COMPLETE")
print("=" * 50)