In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from pathlib import Path

# ---------------------------------
# Input file and report directory
# ---------------------------------
CSV_PATH = "class_10_11_12.csv"
OUT_DIR = Path("eda_reports")
# exist_ok=True: re-running the cell with the directory already present is fine.
OUT_DIR.mkdir(exist_ok=True)

# ---------------------------------
# Read the CSV into a DataFrame
# ---------------------------------
df = pd.read_csv(CSV_PATH)
print("Dataset Loaded.")
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())

# ---------------------------------
# 1. Missing values per column, largest count first
# ---------------------------------
null_flags = df.isna()
missing = null_flags.sum().sort_values(ascending=False)
print("\n=== Missing Values ===")
print(missing)

# Persist the same summary next to the other reports.
missing.to_csv(OUT_DIR.joinpath("missing_values.csv"))

# ---------------------------------
# 2. Basic stats on text lengths
# ---------------------------------
def text_len(s):
    """Count the whitespace-separated words in `s`.

    Anything that is not a `str` (for example a float NaN standing in for a
    missing cell) counts as zero words.
    """
    if not isinstance(s, str):
        return 0
    words = s.split()
    return len(words)

# Word count of each free-text column, stored as a new len_* column on df.
for source_col, length_col in (
    ("Explanation", "len_Explanation"),
    ("Question", "len_Question"),
    ("Answer", "len_Answer"),
):
    df[length_col] = df[source_col].apply(text_len)

# Summary statistics for the three length columns, printed and saved.
length_stats = df[["len_Explanation", "len_Question", "len_Answer"]].describe()
print("\n=== Length Statistics ===")
print(length_stats)

length_stats.to_csv(OUT_DIR / "length_stats.csv")

# One histogram (with KDE overlay) per length column. Each figure is saved
# to OUT_DIR and then closed, so nothing is displayed inline.
for length_col, plot_title, png_name in (
    ("len_Explanation", "Explanation Length Distribution (words)", "explanation_length_dist.png"),
    ("len_Question", "Question Length Distribution (words)", "question_length_dist.png"),
    ("len_Answer", "Answer Length Distribution (words)", "answer_length_dist.png"),
):
    plt.figure(figsize=(10, 5))
    sns.histplot(df[length_col], bins=50, kde=True)
    plt.title(plot_title)
    plt.savefig(OUT_DIR / png_name)
    plt.close()

# ---------------------------------
# 3. Duplicate checks
# ---------------------------------
def _save_repeated_rows(key_columns, csv_name):
    """Write the rows flagged by `duplicated(..., keep=False)` to OUT_DIR.

    `key_columns` is the list of columns compared; `csv_name` is the output
    file name. The DataFrame index is not written.
    """
    repeated_mask = df.duplicated(subset=key_columns, keep=False)
    df[repeated_mask].to_csv(OUT_DIR / csv_name, index=False)


# Counts are taken without keep=False, unlike the exports further down.
dupe_q = df.duplicated(subset=["Question"]).sum()
dupe_qa = df.duplicated(subset=["Question", "Answer"]).sum()

print("\n=== Duplicates ===")
print("Duplicate Questions:", dupe_q)
print("Duplicate Question+Answer pairs:", dupe_qa)

# Export the affected rows for manual inspection.
_save_repeated_rows(["Question"], "duplicate_questions.csv")
_save_repeated_rows(["Question", "Answer"], "duplicate_QA.csv")

# The same check on the Answer column alone.
dupe_a = df.duplicated(subset=["Answer"]).sum()
print("Duplicate Answers:", dupe_a)

_save_repeated_rows(["Answer"], "duplicate_answers.csv")

# Frequency of every distinct Answer, to see which ones repeat and how often.
answer_counts = df["Answer"].value_counts()
answer_counts.to_csv(OUT_DIR / "answer_duplicate_counts.csv")

print("\nTop 20 most repeated Answers:")
print(answer_counts.head(20))

# ---------------------------------
# 4. Subject distribution (raw)
# ---------------------------------
# Runs only when the dataset has a "subject" column. Prints the per-subject
# row counts, saves them as CSV, and saves a bar chart of the same counts.
if "subject" in df.columns:
    subj_counts = df["subject"].value_counts()
    print("\n=== Subject Distribution (raw) ===")
    print(subj_counts)
    subj_counts.to_csv(OUT_DIR / "subject_distribution.csv")

    plt.figure(figsize=(10,5))
    sns.barplot(x=subj_counts.index, y=subj_counts.values)
    plt.xticks(rotation=45)
    plt.title("Subject Distribution")
    # The rotated tick labels need more room than the default bottom margin
    # gives them; re-fit the layout before saving so they are not cut off in
    # the PNG.
    plt.tight_layout()
    plt.savefig(OUT_DIR / "subject_distribution.png")
    plt.close()

def _report_value_counts(column, heading, csv_name):
    """Print `heading` and the value counts of `df[column]`, save them, return them.

    The counts are written to `OUT_DIR / csv_name`.
    """
    counts = df[column].value_counts()
    print(heading)
    print(counts)
    counts.to_csv(OUT_DIR / csv_name)
    return counts


# Each section below is skipped (and its *_counts name left unbound) when the
# corresponding column is absent from the dataset.

# ---------------------------------
# 5. Grade distribution
# ---------------------------------
if "grade" in df.columns:
    grade_counts = _report_value_counts(
        "grade", "\n=== Grade Distribution ===", "grade_distribution.csv"
    )

# ---------------------------------
# 6. Difficulty distribution
# ---------------------------------
if "Difficulty" in df.columns:
    diff_counts = _report_value_counts(
        "Difficulty", "\n=== Difficulty Distribution ===", "difficulty_distribution.csv"
    )

# ---------------------------------
# 7. StudentLevel distribution
# ---------------------------------
if "StudentLevel" in df.columns:
    stu_counts = _report_value_counts(
        "StudentLevel", "\n=== Student Level Distribution ===", "student_level_distribution.csv"
    )

# ---------------------------------
# 8. QuestionType / QuestionComplexity distribution
# ---------------------------------
if "QuestionType" in df.columns:
    qt_counts = _report_value_counts(
        "QuestionType", "\n=== Question Type ===", "question_type_distribution.csv"
    )

if "QuestionComplexity" in df.columns:
    qc_counts = _report_value_counts(
        "QuestionComplexity", "\n=== Question Complexity ===", "question_complexity_distribution.csv"
    )

# ---------------------------------
# 9. Explanation Outliers (too long / too short)
# ---------------------------------
exp_lengths = df["len_Explanation"]
upper_cut = exp_lengths.quantile(0.99)
lower_cut = exp_lengths.quantile(0.01)

# Strict comparisons: rows whose length equals a threshold are not flagged.
long_outliers = df[exp_lengths.gt(upper_cut)]
short_outliers = df[exp_lengths.lt(lower_cut)]

print("\n=== Outlier Explanations ===")
print("Very Long (Top 1%):", len(long_outliers))
print("Very Short (Bottom 1%):", len(short_outliers))

# Saved with the DataFrame index included (no index=False here).
long_outliers.to_csv(OUT_DIR / "long_explanations.csv")
short_outliers.to_csv(OUT_DIR / "short_explanations.csv")

# ---------------------------------
# Wrap-up
# ---------------------------------
print("\n=== EDA Completed ===")
print("Reports saved to:", OUT_DIR)


Dataset Loaded.
Shape: (30706, 12)

Columns: ['Topic', 'Explanation', 'Question', 'Answer', 'Difficulty', 'StudentLevel', 'QuestionType', 'QuestionComplexity', 'Prerequisites', 'EstimatedTime', 'subject', 'grade']

=== Missing Values ===
Topic                 0
Explanation           0
Question              0
Answer                0
Difficulty            0
StudentLevel          0
QuestionType          0
QuestionComplexity    0
Prerequisites         0
EstimatedTime         0
subject               0
grade                 0
dtype: int64

=== Length Statistics ===
       len_Explanation  len_Question    len_Answer
count     30706.000000  30706.000000  30706.000000
mean         77.934899     14.077118     32.179997
std          23.010455      5.206792     19.562341
min           1.000000      1.000000      1.000000
25%          63.000000     11.000000     16.000000
50%          74.000000     14.000000     32.000000
75%          87.000000     17.000000     45.000000
max         308.000000    

In [3]:
from transformers import AutoTokenizer
import pandas as pd
import numpy as np

# Dataset to measure and the checkpoint whose tokenizer defines "a token".
CSV_PATH = "class_10_11_12.csv"
MODEL = "t5-base"   # change to your model/tokenizer

# Re-read the CSV so this cell starts from the file contents.
df = pd.read_csv(CSV_PATH)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    use_fast=True,
)

def tok_len(text):
    """Return how many token ids the module-level `tokenizer` yields for `text`.

    Truncation is switched off, so the count is not capped at a maximum length.
    NOTE(review): whether special tokens are included in the count depends on
    the tokenizer's `encode` defaults; confirm for the chosen MODEL.
    """
    token_ids = tokenizer.encode(text, truncation=False)
    return len(token_ids)

# Token length of every Explanation. Missing values are replaced by "" and
# every value is cast to str before being passed to tok_len.
df["expl_tok_len"] = df["Explanation"].fillna("").astype(str).map(tok_len)

# Summary statistics, with extra upper percentiles for choosing a max length.
print(df["expl_tok_len"].describe(percentiles=[.25, .5, .75, .9, .95, .98, .99]))

# Share of Explanations that fit entirely within each candidate token budget.
for cutoff in [256, 320, 384, 512]:
    pct = (df["expl_tok_len"] <= cutoff).mean() * 100
    print(f"{cutoff} tokens covers {pct:.2f}% of Explanations")

# Token cutoff that covers at least `target_pct` of the Explanations.
# interpolation="higher" returns an observed length at or above the requested
# quantile. Truncating an interpolated quantile with int() could round the
# cutoff down and leave it covering less than the target share.
target_pct = 0.98
cutoff_token = int(df["expl_tok_len"].quantile(target_pct, interpolation="higher"))
# The label is derived from target_pct so it stays correct if the target changes.
print(f"{target_pct * 100:g}% cutoff (tokens):", cutoff_token)


  from .autonotebook import tqdm as notebook_tqdm


count    30706.000000
mean       117.108904
std         40.394809
min          4.000000
25%         90.000000
50%        109.000000
75%        134.000000
90%        167.000000
95%        194.000000
98%        232.000000
99%        258.950000
max        502.000000
Name: expl_tok_len, dtype: float64
256 tokens covers 98.95% of Explanations
320 tokens covers 99.79% of Explanations
384 tokens covers 99.97% of Explanations
512 tokens covers 100.00% of Explanations
98% cutoff (tokens): 232
