# Enhanced Dataset Preparation for Study Notes Generator

We build **evaluation datasets** for summarization and Q&A, focusing on the AI/ML/scientific domain.
We retrieve papers, Q&A pairs, and definitions from reputable sources (6 different dataset types), including:

- ML-ArXiv (300)
- real StackOverflow questions with accepted answers filtered to AI/ML/Data Science (250)
- PubMedQA (200) 
- StackOverflow ML Libraries Q&A (400)
- A custom glossary that we built (115 items)

We are covering:
- Research-level summaries (ML papers)
- Applied QA in ML engineering + data science
- Scientific/clinical QA for “AI in healthcare” 

**Given all the constraints, and by using non-script HF datasets to avoid loading issues, we have a reasonably extensive and reliable collection of 1265 items in total.**


## Step 0 – Imports & Directory Setup

In [None]:

import os
import json
from datasets import load_dataset
from tqdm import tqdm

# Make sure every output folder exists before any later step writes into it.
for _folder in ("datasets/raw", "datasets/cleaned", "datasets/custom"):
    os.makedirs(_folder, exist_ok=True)

_banner_rule = "=" * 70
print(_banner_rule)
print("ENHANCED DATASET COLLECTION - AI/ML/DATA SCIENCE FOCUS")
print(_banner_rule)

# Start each per-source accumulator as its own empty list; later cells
# rebind some of these names once their data has been loaded.
cleaned_ml_arxiv, cleaned_qasper, cleaned_pubmed, cleaned_stack = [], [], [], []


  from .autonotebook import tqdm as notebook_tqdm


ENHANCED DATASET COLLECTION - AI/ML/DATA SCIENCE FOCUS


## Step 1 – ML-ArXiv Summarization Dataset (`ccdv/arxiv-summarization`)

This dataset provides Machine Learning / CS research papers with their **abstracts** as reference
summaries. We will:

1. Load a small slice of the `test` split.
2. Filter out short documents/abstracts.
3. Truncate long documents.
4. Save as `datasets/cleaned/summarization_ml_arxiv.json`.

In [None]:

print("\n 1. Loading ML-ArXiv Summarization Dataset...")

try:
    # Only the first 300 rows of the test split are requested.
    ml_arxiv = load_dataset("ccdv/arxiv-summarization", split="test[:300]")

    # NOTE(review): nothing in this cell filters by subject area, yet the
    # banner below reports "100% Machine Learning & Computer Science" --
    # confirm that the source dataset really is ML/CS-only.
    cleaned_ml_arxiv = []
    progress = tqdm(ml_arxiv, desc="Processing ML-ArXiv")
    for idx, paper in enumerate(progress):
        full_text = paper.get("article", "")
        abstract_text = paper.get("abstract", "")

        # Skip papers whose body or abstract is too short to be useful.
        if len(full_text) <= 500 or len(abstract_text) <= 50:
            continue

        record = {
            "id": f"ml_arxiv_{idx}",
            # The stored document is capped at 15000 characters ...
            "document": full_text[:15000],
            "reference_summary": abstract_text,
            "document_type": "ml_cs_paper",
            # ... while "length" records the size of the untruncated article.
            "length": len(full_text),
            "summary_length": len(abstract_text),
        }
        cleaned_ml_arxiv.append(record)

    saved_slice = cleaned_ml_arxiv[:300]
    out_path = "datasets/cleaned/summarization_ml_arxiv.json"
    with open(out_path, "w") as f:
        json.dump(saved_slice, f, indent=2)

    print(f"Saved {len(saved_slice)} ML/CS-specific papers")
    print(f"   File: {out_path}")
    print("   Domain: 100% Machine Learning & Computer Science")
except Exception as e:
    # Best-effort cell: report the failure and let the notebook continue.
    print(f"Error loading ML-ArXiv: {e}")



 1. Loading ML-ArXiv Summarization Dataset...


Generating train split: 100%|██████████| 203037/203037 [00:24<00:00, 8343.05 examples/s] 
Generating validation split: 100%|██████████| 6436/6436 [00:02<00:00, 2713.06 examples/s]
Generating test split: 100%|██████████| 6440/6440 [00:02<00:00, 2906.40 examples/s]
Processing ML-ArXiv: 100%|██████████| 300/300 [00:00<00:00, 11659.48it/s]

Saved 300 ML/CS-specific papers
   File: datasets/cleaned/summarization_ml_arxiv.json
   Domain: 100% Machine Learning & Computer Science





## Step 2 – Real StackOverflow questions with accepted answers filtered to AI/ML/Data Science topics by keyword

`eshangj/stackoverflow_q_and_a_sample` contains StackOverflow questions paired with their accepted answers. We:

1. Load the `train` split.
2. Keep only entries that have both a non-empty question and a non-empty accepted answer.
3. Keep entries whose question or answer mentions an AI/ML/Data Science keyword, stopping once 250 examples are collected.
4. Save them as `datasets/cleaned/qa_stackoverflow_ml_250.json`.


In [5]:
import os
import json
from datasets import load_dataset
from tqdm import tqdm

os.makedirs("datasets/cleaned", exist_ok=True)

print("\n 2. Loading StackOverflow AI/ML/Data Science Q&A (250 examples)...")

# Heuristic keywords for AI/ML/Data Science / Python-for-DS.
# Kept outside the try block so the filter stays defined even when the
# dataset download fails.
ml_keywords = [
    "machine learning", "deep learning", "neural network",
    "pytorch", "tensorflow", "keras", "scikit-learn", "sklearn",
    "xgboost", "lightgbm", "catboost",
    "regression", "classification", "clustering",
    "gradient boosting", "random forest",
    "data science", "data scientist",
    "pandas", "dataframe", "numpy",
    "matplotlib", "seaborn", "plotly",
    "nlp", "natural language processing",
    "transformer", "bert", "gpt",
    "time series", "forecasting",
    "cross-validation", "train_test_split",
]


def is_ml_related(text: str) -> bool:
    """Return True when *text* contains any entry of ``ml_keywords``.

    The check is a case-insensitive substring test. ``None`` and the empty
    string never match.
    """
    # NOTE(review): plain substring matching also fires inside longer words
    # (e.g. "bert" in "Robert"); tighten to word boundaries if the sample
    # turns out to contain too many off-topic questions.
    text = (text or "").lower()
    return any(kw in text for kw in ml_keywords)


try:
    # Full sample from StackOverflow: Q + accepted answer
    so_qa = load_dataset("eshangj/stackoverflow_q_and_a_sample", split="train")

    cleaned_so_ml = []
    for i, item in enumerate(tqdm(so_qa, desc="Filtering ML/DS questions")):
        q = item.get("question", "") or ""
        a = item.get("accepted_answer", "") or ""

        # Need both Q and accepted answer
        if not q.strip() or not a.strip():
            continue

        # Filter to ML/DS-ish content
        if not (is_ml_related(q) or is_ml_related(a)):
            continue

        cleaned_so_ml.append({
            "id": f"so_ml_{item.get('question_id', i)}",
            "question": q.strip(),
            "answer": a.strip(),
            "link": item.get("link", ""),
            # `or 0` covers a vote field that is present but None; without
            # it int(None) raises and the outer handler drops the whole step.
            "question_vote": int(item.get("question_vote") or 0),
            "answer_vote": int(item.get("answer_vote") or 0),
            "document_type": "stackoverflow_ml_qa"
        })

        # Stop once we have 250
        if len(cleaned_so_ml) >= 250:
            break

    out_path = "datasets/cleaned/qa_stackoverflow_ml_250.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(cleaned_so_ml, f, indent=2)

    print(f"Saved {len(cleaned_so_ml)} AI/ML/DS StackOverflow Q&A examples")
    print(f"   File: {out_path}")
    print("   Domain: Practical AI/ML/Data Science programming Q&A")

except Exception as e:
    # Best-effort cell: report the failure and let the notebook continue.
    print(f"Error loading StackOverflow ML Q&A: {e}")



 2. Loading StackOverflow AI/ML/Data Science Q&A (250 examples)...


Generating train split: 100%|██████████| 15451/15451 [00:00<00:00, 283702.63 examples/s]
Filtering ML/DS questions:   3%|▎         | 498/15451 [00:00<00:01, 13214.75it/s]

Saved 250 AI/ML/DS StackOverflow Q&A examples
   File: datasets/cleaned/qa_stackoverflow_ml_250.json
   Domain: Practical AI/ML/Data Science programming Q&A





## Step 3 – PubMedQA (Medical/Scientific Q&A)

We load `pubmed_qa` with the `pqa_labeled` config and construct a cleaned scientific Q&A dataset:

- Question: research question.
- Context: small snippet from the `contexts` field.
- Answer: `long_answer` (explanatory answer).

Saved as `datasets/cleaned/qa_pubmed.json`.

In [6]:

print("\n 3. Loading PubMedQA (Medical/Scientific Q&A)...")

try:
    # 400 rows are requested; at most 200 of the kept ones are written below.
    pubmed_dataset = load_dataset("pubmed_qa", "pqa_labeled", split="train[:400]")

    cleaned_pubmed = []
    for idx, row in enumerate(tqdm(pubmed_dataset, desc="Processing PubMedQA")):
        ctx_field = row.get('context', {})
        question_text = row.get('question', '')
        explanation = row.get('long_answer', '')
        verdict = row.get('final_decision', '')

        # Join up to the first three context passages into one snippet.
        passages = ctx_field.get('contexts', []) if isinstance(ctx_field, dict) else []
        snippet = " ".join(passages[:3]) if passages else ""

        # Keep only rows with a question and a long answer of more than 50 characters.
        if not (question_text and explanation and len(explanation) > 50):
            continue

        cleaned_pubmed.append({
            "id": f"pubmed_{idx}",
            "question": question_text,
            # Cap the snippet at 1500 characters; use a placeholder when empty.
            "context": snippet[:1500] or "See abstract",
            "answer": explanation,
            "decision": verdict,
            "type": "explanatory_scientific",
            "domain": "medical_research"
        })

    kept_pairs = cleaned_pubmed[:200]
    out_path = "datasets/cleaned/qa_pubmed.json"
    with open(out_path, "w") as f:
        json.dump(kept_pairs, f, indent=2)

    print(f"Saved {len(kept_pairs)} medical/scientific Q&A pairs")
    print(f"   File: {out_path}")
    print("   Domain: Medical research (scientific reasoning)")
except Exception as e:
    # Best-effort cell: report the failure and let the notebook continue.
    print(f"Error loading PubMedQA: {e}")



 3. Loading PubMedQA (Medical/Scientific Q&A)...


Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 150016.24 examples/s]
Processing PubMedQA: 100%|██████████| 400/400 [00:00<00:00, 17996.09it/s]

Saved 200 medical/scientific Q&A pairs
   File: datasets/cleaned/qa_pubmed.json
   Domain: Medical research (scientific reasoning)





## Step 4 – StackOverflow ML Q&A (Bonus Technical Q&A)

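We attempt to load the **StackOverflow ML Libraries Q&A** dataset (`Syed-Hasan-8503/StackOverflow-ML-Libraries`).

1. Load the first 400 rows of the `train` split.
2. Keep entries with a non-empty title (`title`), question body (`question`), and answer (`answer`).
3. Store the title as the question and the full question body as context (no truncation is applied).
4. Save as `datasets/cleaned/qa_aiml_concepts_400.json`.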

In [None]:
import os
import json
from datasets import load_dataset
from tqdm import tqdm

os.makedirs("datasets/cleaned", exist_ok=True)

print("\n4. Loading AI/ML/NLP Conceptual Q&A (StackOverflow- ML libraries Q&A)...")

cleaned_aiml_qa = []
out_path = "datasets/cleaned/qa_aiml_concepts_400.json"

# Set to True only after the dataset was loaded and fully processed, so a
# failed run never overwrites an earlier output file with an empty list.
load_succeeded = False

# --------------------------------------------------------------------
# 4. StackOverflow ML Libraries (Python/ML/Numpy/Pandas/TensorFlow/PyTorch)
#    Dataset: Syed-Hasan-8503/StackOverflow-ML-Libraries
#    Fields:
#       - title
#       - question (full body)
#       - answer  (accepted/best answer)
#       - tags, score
# --------------------------------------------------------------------
try:
    print("Loading StackOverflow ML library Q&A (Syed-Hasan-8503/StackOverflow-ML-Libraries)...")
    so_ml = load_dataset(
        "Syed-Hasan-8503/StackOverflow-ML-Libraries",
        split="train[:400]"
    )

    for j, ex in enumerate(tqdm(so_ml, desc="Processing StackOverflow ML-Libraries")):
        # `or ""` / `or 0` turn missing or None fields into safe defaults.
        title = (ex.get("title") or "").strip()
        question_body = (ex.get("question") or "").strip()
        answer_body = (ex.get("answer") or "").strip()
        tags = (ex.get("tags") or "").strip()
        score = int(ex.get("score") or 0)

        # A usable record needs a title, a question body and an answer.
        if not title or not question_body or not answer_body:
            continue

        cleaned_aiml_qa.append({
            "id": f"stackoverflow_ml_{j}",
            "source": "stackoverflow_ml_libraries",
            "question": title,          # short question summary
            "context": question_body,   # full StackOverflow question text
            "answer": answer_body,      # accepted/best answer
            "tags": tags,
            "score": score,
            "domain": "python_ml_libraries"
        })

    so_count = len(cleaned_aiml_qa)
    print(f"  ➜ Collected {so_count} Q&A from StackOverflow ML-Libraries")
    load_succeeded = True

except Exception as e:
    # Best-effort cell: report the failure and let the notebook continue.
    print(f"Error loading StackOverflow ML-Libraries: {e}")

final_subset = cleaned_aiml_qa[:400]

if load_succeeded:
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(final_subset, f, indent=2, ensure_ascii=False)

    print("Saved AI/ML conceptual Q&A dataset")
    print(f"   File     : {out_path}")
    print(f"   Total QA : {len(final_subset)}")
    print("   Source: Syed-Hasan-8503/StackOverflow-ML-Libraries (ML libraries Q&A)")
else:
    # Mirror the other steps: nothing is written when loading failed.
    print(f"Skipped writing {out_path}: dataset could not be loaded")


  print("\ Saved AI/ML conceptual Q&A dataset")



4. Loading AI/ML/NLP Conceptual Q&A (StackOverflow- ML libraries Q&A)...
Loading StackOverflow ML library Q&A (Syed-Hasan-8503/StackOverflow-ML-Libraries)...


Processing StackOverflow ML-Libraries: 100%|██████████| 400/400 [00:00<00:00, 34369.68it/s]

  ➜ Collected 400 Q&A from StackOverflow ML-Libraries
\ Saved AI/ML conceptual Q&A dataset
   File     : datasets/cleaned/qa_aiml_concepts_400.json
   Total QA : 400
   Source: Syed-Hasan-8503/StackOverflow-ML-Libraries (ML libraries Q&A)





## Step 5 – Create Dataset Summary JSON

We now construct a high-level `datasets/DATASET_SUMMARY.json` that:

- Inspects which cleaned dataset files actually exist.
- Records their file paths, counts, and intended use.
- Reports overall counts for quick reference.

In [11]:
print("\n 5. Creating Dataset Summary...")

import os
import json

# Skeleton of the summary document; per-dataset entries are added to the
# two (initially empty) groups under "datasets".
summary = dict(
    dataset_overview=dict(
        domain_focus="AI, Machine Learning, Data Science, Healthcare",
        purpose="Evaluation & benchmarking for Personalized Study Notes Generator",
    ),
    datasets=dict(
        summarization={},   # e.g., ML-ArXiv
        qa_generation={},   # PubMed + AI/ML Q&A
    ),
)


def add_dataset_if_exists(group, key, path, relevance, use_case, dtype):
    """
    Register one dataset file in the module-level ``summary`` if it exists.

    group     : "summarization" or "qa_generation"
    key       : short name like "ml_arxiv" or "pubmed_qa"
    path      : JSON file path
    relevance : free-text relevance note stored with the entry
    use_case  : free-text description of the intended use
    dtype     : type label, e.g., "ml_cs_papers" or "medical_qa"

    Prints one status line in every case and returns None. A missing file is
    skipped; any error while reading or registering the file is reported as
    a warning instead of being raised.
    """
    if not os.path.exists(path):
        print(f"Skipping {key}: file not found at {path}")
        return

    try:
        with open(path, "r", encoding="utf-8") as handle:
            records = json.load(handle)
        n_items = len(records)
        entry = {
            "file": path,
            "count": n_items,
            "relevance": relevance,
            "use_case": use_case,
            "type": dtype,
        }
        summary["datasets"][group][key] = entry
        print(f"Included {key}: {n_items} items from {path}")
    except Exception as err:
        print(f"Warning: could not read {path} for summary: {err}")


# Datasets to register, in order. Each row is
# (group, key, path, relevance, use_case, dtype).
_summary_sources = [
    # ---------------- Summarization datasets ---------------- #

    # ML-ArXiv summarization (Step 1)
    (
        "summarization",
        "ml_arxiv",
        "datasets/cleaned/summarization_ml_arxiv.json",
        "High (ML/CS research domain)",
        "Summarization of ML/CS research articles (article → abstract)",
        "ml_cs_papers",
    ),

    # ---------------- QA datasets ---------------- #

    # PubMedQA (Step 3) – AI-in-healthcare / clinical AI QA
    (
        "qa_generation",
        "pubmed_qa",
        "datasets/cleaned/qa_pubmed.json",
        "Medium–High (scientific/medical reasoning)",
        "Scientific Q&A, useful for AI-in-healthcare / medical study notes",
        "medical_scientific_qa",
    ),

    # Keyword-filtered StackOverflow ML/DS Q&A (Step 2)
    (
        "qa_generation",
        "stackoverflow_ml_250",
        "datasets/cleaned/qa_stackoverflow_ml_250.json",
        "High (practical ML/DS programming)",
        "Practical questions about ML libraries, Python, model implementation",
        "ml_programming_qa",
    ),

    # StackOverflow ML Libraries Q&A (Step 4)
    (
        "qa_generation",
        "aiml_concepts_400",
        "datasets/cleaned/qa_aiml_concepts_400.json",
        "High (ML theory + ML tooling)",
        "Conceptual ML/NLP questions and ML library Q&A",
        "conceptual_and_tooling_qa",
    ),
]

for _group, _key, _path, _relevance, _use_case, _dtype in _summary_sources:
    add_dataset_if_exists(
        group=_group,
        key=_key,
        path=_path,
        relevance=_relevance,
        use_case=_use_case,
        dtype=_dtype,
    )

# ---------------- Totals & save ---------------- #

_groups = summary["datasets"]

# Add up the "count" of every registered dataset (0 when an entry has none).
total_items = sum(
    entry.get("count", 0)
    for members in _groups.values()
    for entry in members.values()
)

_overview = summary["dataset_overview"]
_overview["total_items"] = total_items
_overview["total_datasets"] = sum(map(len, _groups.values()))

os.makedirs("datasets", exist_ok=True)
summary_path = "datasets/DATASET_SUMMARY.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

_rule = "=" * 70
print("\n" + _rule)
print("DATASET SUMMARY CREATED")
print(_rule)
print(f"Summary file: {summary_path}")
print("Included datasets:")
# One line per registered dataset, grouped in insertion order.
for group_name, members in _groups.items():
    for key, meta in members.items():
        print(f" - [{group_name}] {key}: {meta['count']} items -> {meta['file']}")



 5. Creating Dataset Summary...
Included ml_arxiv: 300 items from datasets/cleaned/summarization_ml_arxiv.json
Included pubmed_qa: 200 items from datasets/cleaned/qa_pubmed.json
Included stackoverflow_ml_250: 250 items from datasets/cleaned/qa_stackoverflow_ml_250.json
Included aiml_concepts_400: 400 items from datasets/cleaned/qa_aiml_concepts_400.json

DATASET SUMMARY CREATED
Summary file: datasets/DATASET_SUMMARY.json
Included datasets:
 - [summarization] ml_arxiv: 300 items -> datasets/cleaned/summarization_ml_arxiv.json
 - [qa_generation] pubmed_qa: 200 items -> datasets/cleaned/qa_pubmed.json
 - [qa_generation] stackoverflow_ml_250: 250 items -> datasets/cleaned/qa_stackoverflow_ml_250.json
 - [qa_generation] aiml_concepts_400: 400 items -> datasets/cleaned/qa_aiml_concepts_400.json
