# Getting started with the EB-NeRD dataset

In [None]:
from pathlib import Path
import polars as pl

from recsys_challenge.utils._constants import (
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
)

from recsys_challenge.dataset.preprocess.vocab import (setup_word_embedder, build_vocab, build_word_embeddings, build_article_id_to_title)

### Load dataset

In [None]:
# Which split sub-directory (under PATH) the behaviors/history files are read from.
data_split = "train"

# Root of the input dataset and the directory the generated vocab artifacts
# are written to; both are relative to the notebook's working directory.
PATH = Path("..") / "data" / "small"
OUTPUT_PATH = Path("..") / "data" / "vocab"

In [None]:
# Lazily scan the parquet files: nothing is read from disk until `.collect()`.
# Behaviors and history live under the selected split; articles sit at the
# dataset root.
df_behaviors = pl.scan_parquet(PATH.joinpath(data_split, "behaviors.parquet"))
df_history = pl.scan_parquet(PATH.joinpath(data_split, "history.parquet"))
df_articles = pl.scan_parquet(PATH.joinpath("articles.parquet"))

### Tokenize titles

In [None]:
# The tokenizer is used right below; the embedder is kept for the
# word-embedding step later in the notebook.
tokenizer, word_embedder = setup_word_embedder()

# Add a `title_tokenized` column holding each title's tokens joined by a
# single space, then materialize the articles frame.
df_articles_tok = df_articles.with_columns(
    pl.col(DEFAULT_TITLE_COL)
    .map_elements(
        lambda title: " ".join(tokenizer(title)),
        return_dtype=pl.String,
    )
    .alias("title_tokenized")
).collect()

### Build vocabs

In [None]:
# The user-id vocab file has to be written to disk, but the returned vocab
# object is not needed in the rest of this notebook, hence the throwaway `_`.
# Select the single column on the LazyFrame *before* collecting so that only
# the user-id column is read from the parquet file instead of materializing
# the whole behaviors table.
_ = build_vocab(
    df_behaviors.select(DEFAULT_USER_COL).collect().get_column(DEFAULT_USER_COL),
    OUTPUT_PATH / "user_id_vocab.bin",
)

# Vocab over article ids; reused below when building the id -> title matrix.
articles_vocab = build_vocab(
    df_articles_tok.get_column("article_id"), OUTPUT_PATH / "articles_id_vocab.bin"
)

# Vocab built from the space-joined tokenized titles; reused below for the
# word embeddings and the id -> title matrix.
word_vocab = build_vocab(
    df_articles_tok.get_column("title_tokenized"), OUTPUT_PATH / "word_vocab.bin"
)

In [None]:
# Build the word-embedding artifact from the word vocab and the embedder;
# the target path is passed as the last argument (presumably where the
# result is saved -- confirm against `build_word_embeddings`).
build_word_embeddings(word_vocab, word_embedder, OUTPUT_PATH / "word_embeddings.npy")

# Build the article-id -> title artifact from the tokenized articles and the
# two vocabs created above.
build_article_id_to_title(
    df_articles_tok, articles_vocab, word_vocab,
    OUTPUT_PATH / "article_id_to_title.npy",
)