In [None]:
# label encoding of keyword using sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode the `keyword` column as integer class labels. Missing keywords are
# replaced by the empty string first, so they are all encoded as one class.
keyword_filled = df['keyword'].fillna('')
le = LabelEncoder()
# The original string column is overwritten with the encoded values.
df['keyword'] = le.fit_transform(keyword_filled)
# Cell output: a random sample of five encoded keywords.
df['keyword'].sample(5)

In [None]:
import spacy
# Load the spaCy pipeline; tokenizer() below uses its tokenizer component.
nlp = spacy.load('en_core_web_sm')

# Vocabulary indices of the special tokens added by build_vocab().
UNK_IDX = 0  # index of the '<unk>' entry
PAD_IDX = 1  # index of the '<pad>' entry
SOS_IDX = 2  # index of the '<sos>' entry


# Tokenizer function
def tokenizer(text):
    """Split ``text`` into a list of token strings with spaCy.

    Only the tokenizer component of the module-level ``nlp`` pipeline is
    invoked (``nlp.tokenizer``), not the full ``nlp(text)`` call.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the ``.text`` of every token, in document order.
    """
    doc = nlp.tokenizer(text)
    return [token.text for token in doc]

# Build the vocabulary
def build_vocab(texts, vocab_size=None):
    """Map every distinct token in ``texts`` to an integer index.

    Regular tokens are numbered in order of first appearance, starting at 3;
    the entries ``'<unk>'``, ``'<pad>'`` and ``'<sos>'`` are then added with
    the indices ``UNK_IDX``, ``PAD_IDX`` and ``SOS_IDX``.

    Args:
        texts: Iterable of strings; each one is split with ``tokenizer``.
        vocab_size: Optional upper bound on the total vocabulary size,
            counting the three special tokens. When given, only the first
            ``vocab_size - 3`` distinct tokens *encountered* are kept (this is
            first-seen order, not frequency order). Values below 3 leave only
            the special tokens. ``None`` keeps every token.

    Returns:
        dict mapping token string -> index, including the special tokens.
    """
    num_special = 3  # '<unk>', '<pad>', '<sos>'

    # Rank each distinct token by the order in which it is first seen.
    first_seen_rank = {}
    for text in texts:
        for token in tokenizer(text):
            if token not in first_seen_rank:
                first_seen_rank[token] = len(first_seen_rank)

    if vocab_size is not None:
        # Clamp at zero: with vocab_size < 3 a plain ``[:vocab_size - 3]``
        # slice has a negative bound, counts from the end, and would keep
        # almost every token instead of none.
        max_regular = max(vocab_size - num_special, 0)
        # Ranks are unique and already in insertion order, so filtering on
        # the rank is the same as sorting by rank and slicing.
        first_seen_rank = {
            token: rank
            for token, rank in first_seen_rank.items()
            if rank < max_regular
        }

    # Shift regular tokens past the slots reserved for the special tokens.
    word_to_idx = {
        token: rank + num_special for token, rank in first_seen_rank.items()
    }
    # Add the special tokens.
    word_to_idx['<unk>'] = UNK_IDX
    word_to_idx['<pad>'] = PAD_IDX
    word_to_idx['<sos>'] = SOS_IDX
    return word_to_idx

# Convert a text into a sequence of word indices
def text_to_indices(text, word_to_idx):
    """Tokenize ``text`` and look up each token's index in ``word_to_idx``.

    Args:
        text: The string to convert; it is split with ``tokenizer``.
        word_to_idx: Mapping from token string to integer index.

    Returns:
        A list with one index per token, in token order. Tokens missing from
        ``word_to_idx`` are mapped to ``UNK_IDX``.
    """
    # Use UNK_IDX rather than a literal 0 so the fallback stays in sync with
    # the index build_vocab() assigns to '<unk>'.
    return [word_to_idx.get(token, UNK_IDX) for token in tokenizer(text)]






# Split into training and test data (done inside preprocess() below)


In [None]:
def text_pipeline(x, word_to_idx):
    """Turn the raw text ``x`` into a list of vocabulary indices.

    Thin wrapper that forwards both arguments to ``text_to_indices``.

    Args:
        x: The text to convert.
        word_to_idx: Mapping from token string to integer index.

    Returns:
        Whatever ``text_to_indices(x, word_to_idx)`` returns.
    """
    indices = text_to_indices(x, word_to_idx)
    return indices

In [None]:
def preprocess(df, test_size=0.2, random_state=42, max_vocab_size=10000):
    """Split ``df`` into train/test parts and build a vocabulary from the train part.

    Args:
        df: DataFrame with a ``'text'`` column.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 42).
        max_vocab_size: Forwarded to ``build_vocab`` as ``vocab_size``
            (default 10000).

    Returns:
        dict with the keys:
            ``"train_df"`` / ``"test_df"``: the two splits of ``df``;
            ``"word_to_idx"``: token -> index mapping from ``build_vocab``;
            ``"idx_to_word"``: the inverse mapping, index -> token;
            ``"vocab_size"``: number of entries in ``word_to_idx``;
            ``"text_pipeline"``: the ``text_pipeline`` function itself;
            ``"special_symbols"``: dict with ``PAD_IDX``, ``SOS_IDX`` and
            ``UNK_IDX`` under the keys ``"PAD"``, ``"SOS"`` and ``"UNK"``.
    """
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # Build the vocabulary from the training texts only; test_df is not used.
    train_texts = train_df['text'].tolist()
    word_to_idx = build_vocab(train_texts, vocab_size=max_vocab_size)
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}

    return {
        "train_df": train_df,
        "test_df": test_df,
        "word_to_idx": word_to_idx,
        "idx_to_word": idx_to_word,
        # Actual size, which can be smaller than max_vocab_size.
        "vocab_size": len(word_to_idx),
        "text_pipeline": text_pipeline,
        "special_symbols": {"PAD": PAD_IDX, "SOS": SOS_IDX, "UNK": UNK_IDX},
    }