In [None]:
from datasets import load_dataset

# Fetch the "squad_v2" dataset via the `datasets` library.
dataset = load_dataset("squad_v2")

# Show the overall dataset object first, then the first record of the
# "train" split, each on its own line.
print(dataset, dataset["train"][0], sep="\n")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Convert dataset to pandas DataFrame
train_df = pd.DataFrame(dataset["train"])
val_df = pd.DataFrame(dataset["validation"])


def has_no_answer(answers):
    """Return True when an `answers` record contains no answer texts.

    Parameters
    ----------
    answers : mapping
        One cell of the "answers" column; must expose a "text" sequence.

    Returns
    -------
    bool
        True if the "text" sequence is empty, False otherwise.
    """
    return len(answers["text"]) == 0


# Manually create 'is_impossible' column based on answers
# (a question is flagged when its answers record has no text entries).
train_df["is_impossible"] = train_df["answers"].map(has_no_answer)
val_df["is_impossible"] = val_df["answers"].map(has_no_answer)

# Calculate length statistics.
# `.str.len()` is the vectorized pandas equivalent of `.apply(len)` for string columns.
train_df["question_length"] = train_df["question"].str.len()
train_df["context_length"] = train_df["context"].str.len()

# Summary statistics for question and context length
print("Train Question Length Stats:")
print(train_df["question_length"].describe())
print("\nTrain Context Length Stats:")
print(train_df["context_length"].describe())

# Answer availability stats: share of each `is_impossible` value, in percent
answer_stats = train_df["is_impossible"].value_counts(normalize=True) * 100
print("\nAnswer Availability (%):")
print(answer_stats)

# Visualization settings
# `sns.set` is only a legacy alias; `set_theme` is seaborn's preferred interface.
sns.set_theme(style="whitegrid")


def plot_length_histogram(lengths, title, xlabel, bins=50):
    """Draw and show a histogram of `lengths` on a new 8x5 inch figure.

    Parameters
    ----------
    lengths : pandas.Series
        Values to bin.
    title : str
        Figure title.
    xlabel : str
        Label for the x-axis (the y-axis is always "Frequency").
    bins : int, optional
        Number of histogram bins (default 50).
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(lengths, bins=bins, kde=False, ax=ax)
    ax.set(title=title, xlabel=xlabel, ylabel="Frequency")
    plt.show()


# Question length distribution (including outliers)
plot_length_histogram(
    train_df["question_length"],
    title="Question Length Distribution",
    xlabel="Question Length",
)

# Context length distribution
plot_length_histogram(
    train_df["context_length"],
    title="Context Length Distribution",
    xlabel="Context Length",
)

# Answer availability pie chart
# value_counts() orders its rows by frequency, so positional labels would
# silently swap if True ever outnumbered False. Pin the order explicitly
# (False -> "Answerable", True -> "Unanswerable") so slices, labels and colors
# always line up; fill_value=0 keeps a class that is absent from the data.
availability_counts = (
    train_df["is_impossible"].value_counts().reindex([False, True], fill_value=0)
)
plt.figure(figsize=(5, 5))
plt.pie(
    availability_counts,
    labels=["Answerable", "Unanswerable"],
    autopct="%1.1f%%",
    startangle=140,
    colors=["#4CAF50", "#FF7043"]
)
plt.title("Answer Availability Ratio")
plt.show()

# Keep only the questions whose length does not exceed the 99th percentile,
# then plot the length histogram for that subset.
q99 = train_df["question_length"].quantile(0.99)
filtered_train_df = train_df.loc[train_df["question_length"].le(q99)]

plt.figure(figsize=(8, 5))
sns.histplot(data=filtered_train_df, x="question_length", bins=50, kde=False)
# Title and axis labels are applied to the axes seaborn just drew on.
plt.gca().set(
    title="Question Length Distribution (Filtered ≤ 99th percentile)",
    xlabel="Question Length",
    ylabel="Frequency",
)
plt.show()

# List the five longest questions (unfiltered data), longest first.
print("\nTop 5 Longest Questions:")
print(
    train_df[["question", "question_length"]]
    .sort_values("question_length", ascending=False)
    .head()
)
