In [1]:
from datasets import load_dataset
# Hugging Face Hub identifier of the dataset loaded by this notebook.
DATASET_ID = 'universityofbucharest/laroseda'

# NOTE(review): trust_remote_code=True lets the dataset repository's loading
# script run on this machine; only keep it for sources you trust.
dataset = load_dataset(DATASET_ID, trust_remote_code=True)
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['index', 'title', 'content', 'starRating'],
        num_rows: 12000
    })
    test: Dataset({
        features: ['index', 'title', 'content', 'starRating'],
        num_rows: 3000
    })
})

In [2]:
from datasets import DatasetDict
from sklearn.model_selection import train_test_split
import random
import numpy as np
import torch

SEED = 42

# Seed every random number generator used here with the same value:
# Python's `random`, NumPy's global generator, and PyTorch (CPU + CUDA).
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

# cuDNN settings for reproducibility: no benchmark mode, deterministic mode on.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
# Fraction of the original training split that is held out for validation.
VAL_FRACTION = 0.1

train_dataset = dataset['train']
train_labels = train_dataset['starRating']

# Split row positions (not the rows themselves) so the resulting index lists
# can be passed straight to `select`; stratifying on the star rating keeps
# the class proportions comparable between the two parts.
train_idx, val_idx = train_test_split(
    range(len(train_labels)),
    test_size=VAL_FRACTION,
    stratify=train_labels,  # stratified split
    random_state=SEED       # reproducibility
)

# Sanity checks: the two index sets must partition the training rows.
assert set(train_idx).isdisjoint(val_idx), "train/validation indices overlap"
assert len(train_idx) + len(val_idx) == len(train_labels), "rows lost during split"

train_split = train_dataset.select(train_idx)
val_split = train_dataset.select(val_idx)

final_splits = DatasetDict({
    'train': train_split,
    'validation': val_split,
    'test': dataset['test']  # untouched test split
})

In [4]:
# Display the assembled splits to confirm their sizes
# (10800 train / 1200 validation / 3000 test rows in the recorded run).
final_splits

DatasetDict({
    train: Dataset({
        features: ['index', 'title', 'content', 'starRating'],
        num_rows: 10800
    })
    validation: Dataset({
        features: ['index', 'title', 'content', 'starRating'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['index', 'title', 'content', 'starRating'],
        num_rows: 3000
    })
})

In [5]:
import collections

def class_distribution(dataset, label_column='starRating'):
    """Count how many examples of each class a split contains.

    Args:
        dataset: Any object for which ``dataset[label_column]`` yields an
            iterable of hashable labels (e.g. one split of ``final_splits``).
        label_column: Name of the column holding the class labels.
            Defaults to ``'starRating'``, the label column used in this
            notebook.

    Returns:
        collections.Counter: Mapping from label value to its number of
        occurrences; empty if the column has no entries.
    """
    return collections.Counter(dataset[label_column])

# Report the label counts of every split, in train / validation / test order.
for _label, _split_key in (
    ("Train", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"{_label}:", class_distribution(final_splits[_split_key]))


Train: Counter({5: 4484, 1: 4013, 2: 1387, 4: 916})
Validation: Counter({5: 498, 1: 446, 2: 154, 4: 102})
Test: Counter({5: 1256, 1: 1102, 2: 398, 4: 244})


In [6]:
import pandas as pd
import os

def save_splits_to_csv(final_splits, save_dir,
                       columns=('index', 'title', 'content', 'starRating')):
    """Write each split to ``<save_dir>/<split_name>.csv``.

    Args:
        final_splits: Mapping of split name -> dataset. Each dataset must
            support column access via ``dataset[column_name]``.
        save_dir: Output directory; created (including parents) if it does
            not exist yet.
        columns: Names of the columns to export, in output order. Defaults
            to the four columns used in this notebook.

    Returns:
        None. One CSV file per split is written, without the DataFrame index.
    """
    os.makedirs(save_dir, exist_ok=True)
    # `split` rather than `dataset`, so the loop variable does not reuse the
    # name of the notebook-level `dataset` object.
    for split_name, split in final_splits.items():
        frame = pd.DataFrame({column: split[column] for column in columns})
        frame.to_csv(os.path.join(save_dir, f"{split_name}.csv"), index=False)

In [7]:
# Persist every split as <split_name>.csv under saved_splits/laroseda/
# (a path relative to the current working directory).
save_splits_to_csv(final_splits, save_dir="saved_splits/laroseda/")