In [None]:
# Cell 1: Imports and Path Setup
from pathlib import Path
import json
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Cell 2: Load JSONL file
chunks_path = Path("../data/chunks_langchain.jsonl")

# Read the JSONL file line-by-line into a list of dicts.
# The `with` block closes the file handle as soon as parsing finishes instead
# of leaving it open until garbage collection, and whitespace-only lines
# (e.g. a trailing blank line) are skipped so json.loads never sees an empty
# string.
with chunks_path.open("r", encoding="utf-8") as chunks_file:
    chunks = [json.loads(line) for line in chunks_file if line.strip()]

# One row per parsed JSON object; columns come from the object keys.
df = pd.DataFrame(chunks)


In [None]:
# Cell 3: Peek at the leading rows, restricted to the three columns of interest
preview_columns = ["chunk_id", "source", "content"]
df[preview_columns].head()


In [None]:
# Cell 4: Chunk length stats
# Store len() of every "content" value in a new "length" column.
df["length"] = df["content"].map(len)

# Name the two summary figures before reporting them.
mean_length = df["length"].mean()
chunk_count = len(df)

print(f"📏 Average chunk size: {mean_length:.2f} characters")
print(f"🔢 Total chunks: {chunk_count}")


In [None]:
# Cell 5: Optional — histogram of chunk lengths
# Build the figure through the explicit Axes interface: an 8x4 inch canvas
# with the "length" column split into 20 bins.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(df["length"], bins=20, color="skyblue", edgecolor="black")
ax.set_title("Distribution of Chunk Lengths")
ax.set_xlabel("Characters")
ax.set_ylabel("Number of Chunks")
ax.grid(True)
plt.show()
