In [1]:
from pathlib import Path

# Location of the Jester dataset archive (jester_dataset_2.zip).
JESTER_URL = "https://eigentaste.berkeley.edu/dataset/archive/jester_dataset_2.zip"

# Local directory the dataset is downloaded and extracted into.
OUTPUT_DIR = Path("/tmp") / "jester"


In [2]:
import zipfile
from shutil import copyfileobj
from urllib.request import urlopen
from pathlib import Path

def download_jester(destination_dir: Path, url: "str | None" = None):
    """
    Download and extract the Jester dataset into the specified directory.

    The archive is fetched, unpacked into ``destination_dir`` and then
    deleted, leaving only the extracted files. If ``destination_dir`` already
    exists the function assumes the dataset is present and does nothing.

    Should the download or the extraction fail (or be interrupted), the
    directory is removed again before the error propagates. Otherwise a
    half-populated directory would make every later call report
    "Already downloaded!" and skip the download.

    Args:
        destination_dir: The directory where the dataset will be downloaded.
        url: Location of the zip archive. Defaults to ``JESTER_URL`` when
            omitted. Any URL understood by ``urllib.request.urlopen`` works.

    Raises:
        Whatever ``urlopen`` or ``zipfile`` raise when the archive cannot be
        fetched or unpacked (e.g. ``urllib.error.URLError``,
        ``zipfile.BadZipFile``, ``OSError``).
    """
    # Local import keeps this function self-contained: the surrounding cell
    # only imports `copyfileobj` from shutil.
    from shutil import rmtree

    if destination_dir.exists():
        print("Already downloaded!. Nothing to do.")
        return

    source_url = JESTER_URL if url is None else url

    destination_dir.mkdir(parents=True, exist_ok=True)
    jester_zip_file = destination_dir / "jester_dataset_2.zip"

    try:
        print(f"Downloading Jester to {destination_dir}...")
        with urlopen(source_url) as stream, open(jester_zip_file, "wb") as out_file:
            copyfileobj(stream, out_file)
        print("Done!")

        print("Extracting...")
        with zipfile.ZipFile(jester_zip_file, "r") as zip_file:
            zip_file.extractall(destination_dir)

        # The archive is no longer needed once its contents are on disk.
        jester_zip_file.unlink()
    except BaseException:
        # BaseException so that a kernel interrupt (KeyboardInterrupt) is
        # cleaned up too. Existence of the directory is the only
        # "already downloaded" marker, so it must not outlive a failure.
        rmtree(destination_dir, ignore_errors=True)
        raise

    print("Done!")

# Fetch and unpack the dataset; download_jester returns early if OUTPUT_DIR already exists.
download_jester(OUTPUT_DIR)

Already downloaded!. Nothing to do.


In [3]:
from surprise import Dataset, Reader
from surprise.model_selection import KFold

# Parse the extracted ratings file with Surprise's built-in "jester" reader.
data = Dataset.load_from_file(
    OUTPUT_DIR / "jester_ratings.dat",
    reader=Reader(name="jester"),
)

# Five-way cross-validation splitter with a fixed random_state.
k_fold = KFold(random_state=42, n_splits=5)



In [4]:
from dataset.common import resolve_folds


# `k_fold` (KFold(n_splits=5, random_state=42)) is already built in the
# previous cell; reuse it instead of constructing an identical duplicate here.
folds = resolve_folds(data, k_fold)

# Keep only element 1 of every fold, which unpacks as (trainset, testset) below.
# NOTE(review): element 0 is presumably a fold index -- confirm against resolve_folds.
folds_without_index = [fold[1] for fold in folds]

# Preview the first fold only; `trainset` / `testset` stay bound afterwards.
for trainset, testset in folds_without_index[:1]:
    print("Testset:")
    print(testset[:5])
    

Testset:
[('14696', '148', 3.156), ('58862', '18', -0.094), ('41679', '143', 7.812), ('25475', '72', 9.844), ('44124', '65', 6.094)]
