# Getting started with EB-NeRD: building one- and two-hop neighbors

In [1]:
from pathlib import Path
import polars as pl

from recsys_challenge.dataset._vocab import WordVocab

from recsys_challenge.dataset.preprocess.neighbors import (build_one_hop_neighbors, build_two_hop_neighbors)

### Load dataset

In [2]:
# Root folder of the dataset variant used in this notebook; the "train" and
# "validation" behaviors files are read from sub-folders of it. The path is
# relative, so it resolves against the notebook's working directory.
PATH = Path("../data/small")
# Passed as the last argument to build_one_hop_neighbors / build_two_hop_neighbors.
# NOTE(review): presumably the folder where the neighbor files are written, and it
# may need to exist beforehand — confirm against the helpers.
OUTPUT_PATH = Path("../data/neighbors")

In [3]:
# Both splits share the same layout under PATH: <split>/behaviors.parquet.
# The generator is consumed in order, so "train" is read before "validation".
df_behaviors_train, df_behaviors_val = (
    pl.read_parquet(PATH / split / "behaviors.parquet")
    for split in ("train", "validation")
)

### Load vocab

In [4]:
# The vocab folder sits next to the dataset folder under ../data.
VOCAB_PATH = Path("../data/vocab")
# Load the article-id vocab first, then the user-id vocab, via the same loader.
# NOTE(review): if load_vocab deserializes with pickle, only point it at trusted
# files — confirm against WordVocab.
vocab_articles, vocab_user = (
    WordVocab.load_vocab(VOCAB_PATH / vocab_file)
    for vocab_file in ("articles_id_vocab.bin", "user_id_vocab.bin")
)

In [5]:
def _build_neighbors_for_split(df_behaviors, split):
    """Run the one-hop build followed by the two-hop build for a single split.

    Args:
        df_behaviors: Behaviors frame of the split, forwarded to
            ``build_one_hop_neighbors``.
        split: Split label ("train" or "validation") handed to both builders.

    Returns:
        The ``(user_one_hop, article_one_hop)`` pair produced by
        ``build_one_hop_neighbors``; the same pair is fed to
        ``build_two_hop_neighbors`` before returning.
    """
    user_one_hop, article_one_hop = build_one_hop_neighbors(
        df_behaviors, vocab_user, vocab_articles, split, OUTPUT_PATH
    )
    build_two_hop_neighbors(user_one_hop, article_one_hop, split, OUTPUT_PATH)
    return user_one_hop, article_one_hop


# Train split first, then validation — same order as the progress bars below.
train_user_one_hop, train_article_one_hop = _build_neighbors_for_split(
    df_behaviors_train, "train"
)
val_user_one_hop, val_article_one_hop = _build_neighbors_for_split(
    df_behaviors_val, "validation"
)

Building Hop-1:   0%|          | 0/232887 [00:00<?, ?it/s]

Building hop-2 user:   0%|          | 0/15143 [00:00<?, ?it/s]

Building hop-2 news:   0%|          | 0/3995 [00:00<?, ?it/s]

Building Hop-1:   0%|          | 0/244647 [00:00<?, ?it/s]

Building hop-2 user:   0%|          | 0/11658 [00:00<?, ?it/s]

Building hop-2 news:   0%|          | 0/4457 [00:00<?, ?it/s]