## Tokenizer

Create universal tokenizer for items/actions/countries.

In [17]:
%load_ext autoreload
%autoreload 2

In [18]:
import json
with open("../../data/IKEA/insp_feed_dict.json", "r") as file:
    if_dict = json.load(file)

In [3]:
len(if_dict)

# So there are 3600 imgs that were not shown at all

9957

In [4]:
from recommenders.utils.tokenizer import Tokenizer
tt = Tokenizer(vocabulary=["hallo", "ich", "bin", "pat"], unknown=True)
print(tt.itos_)
tt.stoi_

['hallo', 'ich', 'bin', 'pat', '<unk>', '<pad>']


{'hallo': 0, 'ich': 1, 'bin': 2, 'pat': 3, '<unk>': 4, '<pad>': 5}

In [5]:
# Extend it
tt.extend(new_vocabulary=["perter", "parker"])
tt.extend(new_vocabulary=["DDDD", "HHHHHHHH"])

print(tt.itos_)
print(tt.stoi_)
print(tt.unk_idx)
print(tt.pad_idx)

['hallo', 'ich', 'bin', 'pat', '<unk>', '<pad>', 'perter', 'parker', 'DDDD', 'HHHHHHHH']
{'hallo': 0, 'ich': 1, 'bin': 2, 'pat': 3, '<unk>': 4, '<pad>': 5, 'perter': 5, 'parker': 6, 'DDDD': 7, 'HHHHHHHH': 8}
4
5


In [6]:
# Save to file
tt.save_to_file(file_path="./example_token.json")

In [7]:
# Init form file 
tt_copy = Tokenizer.from_file(file_path="./example_token.json")

print(tt_copy.itos_)
tt_copy.stoi_

['hallo', 'ich', 'bin', 'pat', '<unk>', '<pad>', 'perter', 'parker', 'DDDD', 'HHHHHHHH']


{'hallo': 0,
 'ich': 1,
 'bin': 2,
 'pat': 3,
 '<unk>': 4,
 '<pad>': 5,
 'perter': 5,
 'parker': 6,
 'DDDD': 7,
 'HHHHHHHH': 8}

## Create Tokenizers

### Combinded Image and item tokenizer 

This is a nice approach if we want to have the same indexes for both, the input and output layer. But it is only possible if the input dim larger than the output dim, then the output dim will be the first part togehther with \<unk\>, then we get \<pad\> and the extend it by the rest.

In [13]:
import pandas as pd

# Load train dataset 
train = pd.read_csv("../../temp_data/temp_train.csv")

In [14]:
inspiration = train[train.action_type.isin(["click_inspiration", "select_content"])].item_id.unique()
items = train[~train.action_type.isin(["click_inspiration", "select_content"])].item_id.unique()

In [15]:
print(len(inspiration))
print(len(items))

6304
121106


In [None]:
from recommenders.utils.tokenizer import Tokenizer

# Tokenizer for items and images
# First add images from the training data and from the online dict 
# These will be the output layer.
item_tok = Tokenizer(list(if_dict.keys()), unknown=True)
item_tok.extend(items)

item_tok.save_to_file("../../data/IKEA/tokenizers/tokenizer_items_images.json")

In [None]:
len(item_tok.stoi_)

In [None]:
item_tok.stoi("<pad>")

In [None]:
loaded_item_tok = Tokenizer.from_file("../../data/IKEA/tokenizers/tokenizer_items_images.json")

In [None]:
loaded_item_tok.stoi("<pad>")

In [None]:
len(loaded_item_tok.stoi_)

### Seperate tokenizers for input and output

#### 1. Input

The input dims will be all product and pictures inside the whole training set. 

- Take all inspirational images (otherwise there will be unknowns in the target)
- Take only the products that are in the training set to mitigate bloating

In [8]:
import pandas as pd

# Load train dataset 
full = pd.read_csv("../../temp_data/temp_df.csv")
train = pd.read_csv("../../temp_data/temp_train.csv")

# Images in full data
inspiration = full[full.action_type.isin(["click_inspiration", "select_content"])].item_id.unique()

In [25]:
from recommenders.utils.tokenizer import Tokenizer

# Create tokenizer
items_and_imgs = list(set().union(list(inspiration), train.item_id.unique()))

input_tok = Tokenizer(items_and_imgs, unknown=True, padding=True)

input_tok.save_to_file("../../data/IKEA/tokenizers/input_tokenizer.json")

In [26]:
print(len(input_tok))
print(input_tok.unk_idx)
print(input_tok.pad_idx)
print(input_tok.stoi("<pad>"))
print(input_tok.stoi("halloooo"))

127421
127419
127420
127420
127419


In [27]:
from recommenders.utils.tokenizer import Tokenizer
loaded_item_tok = Tokenizer.from_file("../../data/IKEA/tokenizers/input_tokenizer.json")
len(loaded_item_tok.itos_)

127421

In [28]:
loaded_item_tok.stoi("19fd75f3-6716-4606-abbbe0498a52e1dd")

76729

#### 2. Output

The output dims will be all pictures inside the whole dataset including the ones from the online json to make deployment easier later on. 

In [29]:
# Get union set of the two sources
all_images = list(set().union(list(inspiration), if_dict.keys()))

print(len(inspiration))
print(len(if_dict))
print(f"Total length: {len(all_images)}")

6313
9957
Total length: 10107


In [1]:
# Create tokenizer
output_tok = Tokenizer(all_images, unknown=False, padding=True)

output_tok.save_to_file("../../data/IKEA/tokenizers/output_tokenizer.json")

NameError: name 'Tokenizer' is not defined

In [33]:
print(len(output_tok))

10107


In [34]:
loaded_item_tok = Tokenizer.from_file("../../data/IKEA/tokenizers/output_tokenizer.json")
loaded_item_tok.stoi("bd1ed584-fb37-4d0f-81ab8cc5aee4db0f")

9299

### Country tokenizer

In [35]:
market_tok = Tokenizer(train.market.unique(), unknown=True, padding=True)
market_tok.save_to_file("../../data/IKEA/tokenizers/market_tokenizer.json")

In [36]:
market_tok.stoi("qa")

0

In [37]:
market_tok.stoi("hallo")
market_tok.stoi("morning")

45