## Imports & Paths

In [10]:
import os
import subprocess
from datasets import load_dataset
from huggingface_hub import snapshot_download


In [11]:
BASE_DIR = "/root/workspace/agentic-nlp-rl/data/raw"

PATHS = {
    "meld": f"{BASE_DIR}/meld",
    "mpdd": f"{BASE_DIR}/mpdd",
    "alfworld": f"{BASE_DIR}/alfworld",
    "scienceworld": f"{BASE_DIR}/scienceworld",
    "textworld": f"{BASE_DIR}/textworld",
}

for p in PATHS.values():
    os.makedirs(p, exist_ok=True)


In [22]:
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[0]  # works when running from notebooks/
DATA_ROOT = PROJECT_ROOT / "data"


## MELD (Emotion-Aware Dialogue)

In [28]:
from pathlib import Path
from datasets import load_dataset

PROJECT_ROOT = Path.cwd().parents[0]
MELD_ROOT = PROJECT_ROOT / "data" / "raw" / "meld"

train = next(MELD_ROOT.rglob("train_sent_emo.csv"))
dev   = next(MELD_ROOT.rglob("dev_sent_emo.csv"))
test  = next(MELD_ROOT.rglob("test_sent_emo.csv"))

meld = load_dataset(
    "csv",
    data_files={
        "train": str(train),
        "validation": str(dev),
        "test": str(test),
    }
)

meld.save_to_disk(PROJECT_ROOT / "data" / "processed" / "meld")
print(meld)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9989 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1109 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2610 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime'],
        num_rows: 9989
    })
    validation: Dataset({
        features: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime'],
        num_rows: 1109
    })
    test: Dataset({
        features: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime'],
        num_rows: 2610
    })
})


## Daily Dialog - Dataset

In [35]:
from pathlib import Path
from huggingface_hub import snapshot_download
from datasets import load_dataset

# Paths
PROJECT_ROOT = Path.cwd().parents[0]
RAW_DIR = PROJECT_ROOT / "data" / "raw" / "ultrafeedback"
OUT_DIR = PROJECT_ROOT / "data" / "processed" / "ultrafeedback"

RAW_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("[+] Downloading UltraFeedback (parquet)...")

snapshot_download(
    repo_id="HuggingFaceH4/ultrafeedback_binarized",
    repo_type="dataset",
    local_dir=RAW_DIR,
)

print("[+] Loading local parquet files...")

dataset = load_dataset(
    "parquet",
    data_files={
        "train": str(next(RAW_DIR.rglob("train*.parquet"))),
        "test": str(next(RAW_DIR.rglob("test*.parquet"))),
    },
)

dataset.save_to_disk(OUT_DIR)

print("[✓] UltraFeedback loaded and saved to:", OUT_DIR)
print(dataset)


[+] Downloading UltraFeedback (parquet)...


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

data/test_prefs-00000-of-00001.parquet:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

data/train_sft-00000-of-00001.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

data/train_prefs-00000-of-00001.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

data/train_gen-00000-of-00001.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

create_dataset.py: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

data/test_sft-00000-of-00001.parquet:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

data/test_gen-00000-of-00001.parquet:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

[+] Loading local parquet files...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/61135 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

[✓] UltraFeedback loaded and saved to: /root/workspace/agentic-nlp-rl/data/processed/ultrafeedback
DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 61135
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 1000
    })
})


## ALFWorld (Language + Embodied RL)

In [18]:
print("Downloading ALFWorld...")

if not os.listdir(PATHS["alfworld"]):
    subprocess.run([
        "git", "clone",
        "https://github.com/alfworld/alfworld.git",
        PATHS["alfworld"]
    ], check=True)

print("ALFWorld downloaded to", PATHS["alfworld"])


Downloading ALFWorld...


Cloning into '/root/workspace/agentic-nlp-rl/data/raw/alfworld'...


ALFWorld downloaded to /root/workspace/agentic-nlp-rl/data/raw/alfworld


## ScienceWorld (Text-Based Scientific RL)

In [15]:
print("Downloading ScienceWorld...")

if not os.listdir(PATHS["scienceworld"]):
    subprocess.run([
        "git", "clone",
        "https://github.com/allenai/scienceworld.git",
        PATHS["scienceworld"]
    ], check=True)

print("ScienceWorld downloaded to", PATHS["scienceworld"])


Downloading ScienceWorld...


Cloning into '/root/workspace/agentic-nlp-rl/data/raw/scienceworld'...


ScienceWorld downloaded to /root/workspace/agentic-nlp-rl/data/raw/scienceworld


## TextWorld (Text-Based RL Environments)

In [16]:
print("Installing TextWorld...")

subprocess.run(
    ["pip", "install", "textworld"],
    check=True
)

print("TextWorld installed")


Installing TextWorld...
TextWorld installed


[0m

## Verification

In [36]:
for name, path in DATASETS.items():
    print(f"\n{name.upper()}:")
    for root, dirs, files in os.walk(path):
        print(root)
        break



ALFWORLD:
/root/workspace/agentic-nlp-rl/data/raw/alfworld

MELD:
/root/workspace/agentic-nlp-rl/data/raw/meld

MPDD:
/root/workspace/agentic-nlp-rl/data/raw/mpdd

SCIENCEWORLD:
/root/workspace/agentic-nlp-rl/data/raw/scienceworld

TEXTWORLD:
/root/workspace/agentic-nlp-rl/data/raw/textworld


In [25]:
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[0]
meld_root = PROJECT_ROOT / "data" / "raw" / "meld"

print("MELD root:", meld_root)
for path in meld_root.rglob("*.csv"):
    print(path)


MELD root: /root/workspace/agentic-nlp-rl/data/raw/meld
