# Data Exploration

This notebook explores the SQuAD dataset to understand its structure and characteristics.


In [None]:
import sys
from pathlib import Path
import pandas as pd
import json

# Add src to path
# Make the parent of the current working directory importable so that the
# `src.*` imports below resolve. The membership check keeps the cell
# idempotent: re-running it does not append duplicate sys.path entries.
project_root = str(Path('..').resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data.dataset_loader import SquadDatasetLoader


In [None]:
# Load dataset
# Relative path to the SQuAD v1.1 CSV file; adjust it to your local layout.
data_path = '../Artifacts/SQuAD-v1.1.csv'  # Update with your path
# SquadDatasetLoader is the project loader imported from src.data.dataset_loader.
loader = SquadDatasetLoader(data_path)
# NOTE(review): the following cells call len(data) and data[0], and the analysis
# cell only runs when data is a list of dict-like examples — confirm that this
# is what load() returns for a CSV input.
data = loader.load()


In [None]:
# Explore dataset
# Report how many examples were loaded and show the first one, if any.
print(f"Number of examples: {len(data)}")
print("\nFirst example:")
# Use an explicit length check instead of `if data:` because the truth value
# of a pandas DataFrame is ambiguous and raises ValueError.
if len(data) > 0:
    if isinstance(data, pd.DataFrame):
        # data[0] on a DataFrame is a column lookup; .iloc[0] is the first row.
        print(data.iloc[0])
    else:
        print(data[0])


In [None]:
# Analyze dataset characteristics
import matplotlib.pyplot as plt
import numpy as np

# Plot side-by-side histograms of question and context lengths in characters.
# Only a non-empty list of dict-like examples (objects with .get) is handled;
# any other value of `data` skips the plot.
if isinstance(data, list) and len(data) > 0:
    # `ex.get(key) or ''` covers both a missing key and a key that is present
    # but holds None; `ex.get(key, '')` would hand that None to len() and crash.
    question_lengths = [len(ex.get('question') or '') for ex in data]
    context_lengths = [len(ex.get('context') or '') for ex in data]

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # One (values, title) pair per subplot so both panels share the same setup.
    panels = (
        (question_lengths, 'Question Length Distribution'),
        (context_lengths, 'Context Length Distribution'),
    )
    for ax, (lengths, title) in zip(axes, panels):
        ax.hist(lengths, bins=50)
        ax.set_title(title)
        ax.set_xlabel('Length (characters)')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()
