# Preparing dataset for coupon selecting
As most of the proposals for the PoC improvements involve selecting coupons in input, I have written this notebook to

In [None]:
import numpy as np
!python ../../../tools/data_load.py coupons_1

In [1]:
import pandas as pd

## How many coupon files have timestamps absent in associated content files?

In [2]:
TAG_B_COUPON = 'B-COUPON'
TAG_I_COUPON = 'I-COUPON'
TAG_UNKNOWN = 'UNKNOWN'

COL_TEXT_FULL = 'content_full'
COL_CONTENT_TEXT = 'text'
COL_TIME = 'time'
COL_IS_COUPON = 'is_coupon'
COL_VIEW_ID = 'view_id'
COL_DEPTH = 'view_depth'

In [3]:
PATHS = (
    (
        "datasets/coupons_1/dm/Kopia test_data_2024_03_07_dm_content_generic_2024-12-05T10_09_32.502568365+01_00.csv",
        "datasets/coupons_1/dm/Kopia test_data_2024_03_07_dm_coupons_2024-12-05T10_11_37.906933437+01_00.csv"
    ),
    (
        "datasets/coupons_1/lidl/Kopia test_data_2024_11_25_lidl_plus_content_generic_2024-12-05T07_39_49.726955559+01_00.csv",
        "datasets/coupons_1/lidl/Kopia test_data_2024_11_25_lidl_plus_cupons_2024-12-05T07_54_54.63740938+01_00.csv"
    ),
    (
        "datasets/coupons_1/rewe/Kopia test_data_2024_03_07_rewe_content_generic_2024-12-05T10_30_59.948177782+01_00.csv",
        "datasets/coupons_1/rewe/Kopia test_data_2024_03_07_rewe_coupons_2024-12-05T10_29_55.24829781+01_00.csv"
    ),
    (
        "datasets/coupons_1/rossmann/Kopia test_data_2024_03_07_rossmann_content_generic_2024-12-05T10_24_07.981399375+01_00.csv",
        "datasets/coupons_1/rossmann/Kopia test_data_2024_03_07_rossmann_coupons_2024-12-05T10_24_55.372924301+01_00.csv"
    )
)

PATH_PREFIX = "../../../"

In [4]:
for p_content, p_coupons in PATHS:
    coupons = pd.read_csv(PATH_PREFIX + p_coupons)
    content = pd.read_csv(PATH_PREFIX + p_content)
    if not set(coupons[COL_TIME]).issubset(set(content[COL_TIME])):
        print("coupons from " + p_content.split("/")[2] + " have times not present in content")

coupons from dm have times not present in content
coupons from rewe have times not present in content


### Finding coupons in frame from list of strings
The following code aims to select rows representing coupons' content from a frame associated with a single time moment. This is done by comparing `content_generic` file with `full_text` column  from `coupons` file. We assume that consequential rows from the content file are representation of coupon's display if their concatenated text matches any of the values from the `full_text` column. However, this intuitive approach generates some unexpected challenges mostly related to overlapping coupon candidates:
* prefixes within `full_text` column. This problem can be observed in the dataset under `coupons/lidl`. In these cases we would prefer to select the longer coupon, not its prefix.
* overlapping in general: consider `full_text[0]=[A,B]`, `full_text[1]=[B,C]` and `text` column from the `content` frame being `[A,B,C,X,A,B,X,B,C,X,A,B,C]`. Given the format of the data we currently have, there is no proper way for deducing if the screen view represented by this frame contains coupons `[0,0,1,1]` or `[1,0,1,0]`, as we are not guaranteed that the coupon order in `coupons` frame is the same as in `content`. Currently, presented script in such cases will always prefer the coupon that ended earlier, (in greedy manner if there are more than 2 overlapping coupons).<br/>
#### Solution
Proposed solution utilizes prefix tree and does not take into account the count of identical coupons in `coupons` frame. This might be a proposition for future improvements.

In [5]:
from typing import List, Tuple, Dict, Optional
from collections import namedtuple
# prefix tree utils
PTreeNode = Tuple[Dict[str, 'PTreeNode'], bool] # children: dict, is_valid_coupon: bool


def ptree_insert(root: PTreeNode, text_fields: List[str]):
    if not text_fields: return
    if text_fields[0] not in root:
        root[0][text_fields[0]] = ({}, len(text_fields) == 1)
    ptree_insert(root[0][text_fields[0]], text_fields[1:])


def build_ptree(strings: List[List[str]]) -> PTreeNode:
    root = ({}, False)
    for s in strings:
        ptree_insert(root, s)
    return root


In [6]:
import datasets
labels = datasets.ClassLabel(names=["UNKNOWN", "B-COUPON", "I-COUPON"])
lbl_unk = labels.str2int("UNKNOWN")
lbl_bc = labels.str2int("B-COUPON")
lbl_ic = labels.str2int("I-COUPON")


def annotate_frame(content_frame: pd.DataFrame, coupons_list: List[str]) -> pd.DataFrame:
    content_frame.dropna(subset=[COL_CONTENT_TEXT], inplace=True)
    content_frame.reset_index(drop=True, inplace=True)
    ptree = build_ptree([s[1:-1].split(', ') for s in coupons_list])
    is_coupon_array = []
    ptree_iters: List[List] = []
    ix = 0
    text_col = content_frame[COL_CONTENT_TEXT]
    while ix < len(text_col):
        text = text_col[ix]
        if not isinstance(text, str):
            ix += 1
            continue
        ended_iters = []
        for itr in ptree_iters:
            if text not in itr[0][0]:
                if itr[2] != -1:
                    ended_iters.append(itr)
            else:
                itr[0] = itr[0][0][text]
                if itr[0][1]:
                    itr[2]= ix
        if ended_iters:
            chosen = None
            chosen_len = 0
            for itr in ended_iters:
                if itr[2] - itr[1] + 1 > chosen_len:
                    chosen = itr
                    chosen_len = itr[2] - itr[1] + 1
            is_coupon_array += [lbl_unk] * (chosen[1] - len(is_coupon_array) - 1)
            is_coupon_array.append(lbl_bc)
            is_coupon_array += [lbl_ic] * (chosen_len - 1)
            ptree_iters.clear()
            ix = chosen[2] + 1
            continue
        if text in ptree[0]:
            ptree_iters.append([ptree[0][text], ix, -1 if not ptree[0][text][1] else ix])
        ix += 1
    is_coupon_array += [lbl_unk] * (len(content_frame) - len(is_coupon_array))
    content_frame[COL_IS_COUPON] = is_coupon_array
    return content_frame

In [86]:
import re
import json


def collapse_tree(tree: dict) -> Tuple[Optional[dict], str]:
    """removes nodes that have only one child and no text"""
    if len(tree['children']) < 2 and tree['text'] is None:
        if len(tree['children']) == 1:
            child_name, child = list(tree['children'].items())[0]
            collapsed, name = collapse_tree(child)
            if collapsed is not None:
                name = f"{child_name}.{name}" if name else child_name
            return collapsed, name
        return None, ""
    new_children = {}
    for child_name, child in tree['children'].items():
        collapsed, suffix = collapse_tree(child)
        if collapsed is not None:
            if suffix:
                new_children[f"{child_name}.{suffix}"] = collapsed
            else:
                new_children[child_name] = collapsed
    tree['children'] = new_children
    if len(tree['children']) == 0:
        del tree['children']
    return tree, ""

def timestamp_batch_to_json(batch: pd.DataFrame):
    """takes batch representing single screen content and converts it to JSON representing XML structure"""
    tree_path = []
    res = {"text": None, "children": {}, "is_coupon": False}

    def _insert_at_path(key, val):
        t = res
        for k, d in tree_path:
            t = t["children"][k]
        t["children"][key] = val

    for row in batch.iterrows():
        text_field = row[1][COL_CONTENT_TEXT]
        name = row[1][COL_VIEW_ID]
        if isinstance(name, str):
            name = name.rsplit('/')[-1]
        if not isinstance(text_field, str):
            text_field = None
        depth = row[1][COL_DEPTH]
        while len(tree_path) > 0 and tree_path[-1][1] >= depth:
            tree_path.pop(-1)
        _insert_at_path(name, {"text": text_field, "children": {}, "is_coupon": row[1][COL_IS_COUPON]})
        tree_path.append((name, depth))

    return res

def frame_to_json(frame: pd.DataFrame, coupons_frame: pd.DataFrame) -> List[dict]:
    res = []
    for t, subframe in frame.groupby(COL_TIME):
        coupons_for_time = coupons_frame[coupons_frame[COL_TIME] == t][COL_TEXT_FULL].to_list()
        subframe = annotate_frame(subframe, coupons_for_time)
        tree = timestamp_batch_to_json(subframe)
        tree = collapse_tree(tree)[0]
        if tree is not None:
            res.append(tree)
    return res

def json_to_labeled_tokens(data: List[dict], indent: Optional[int]=None) -> List[Tuple[List[str], List[int]]]:
    def _encode_tree_rec(root: dict, is_coupon) -> Tuple[List[str], List[bool]]:
        is_coupon |= root.pop('is_coupon')
        if 'children' in root:
            children = root['children']
            root['children'] = []
            string = json.dumps(root, indent=indent)
            string1, string2 = string.rsplit('[]', maxsplit=1)
            string1 += '['
            string2 = ']' + string2
            words1 = re.split(" |\n|\t", string1)
            words2 = re.split(" |\n|\t", string2)
            labels1 = [is_coupon] * len(words1)
            labels2 = [is_coupon] * len(words2)
            first_child = True
            for name, child in children.items():
                words_child, labels_child = _encode_tree_rec(child, is_coupon)
                words1.append(f'{name}:')
                labels1.append(is_coupon)
                if not first_child:
                    words1[-2] += ','
                first_child = False
                labels1 += labels_child
                words1 += words_child
            return words1 + words2, labels1 + labels2
        else:
            string = json.dumps(root, indent=indent)
            words = re.split(" |\n|\t", string)
            labels = [is_coupon] * len(words)
            return words, labels

    res = []
    for tree in data:
        tkns, lbls = _encode_tree_rec(tree, False)
        prv = lbl_unk
        lbls = [prv := (lbl_unk if not lbl else lbl_bc if prv == lbl_unk else lbl_ic) for lbl in lbls]
        res.append((tkns, lbls))
    return res

In [87]:
to_annotate = [PATHS[1], PATHS[3]]
by_time = []

save_id = 0
for (path_content, path_coupons) in to_annotate:
    content_frame = pd.read_csv(PATH_PREFIX + path_content)
    coupons_frame = pd.read_csv(PATH_PREFIX + path_coupons)
    for time, subframe in content_frame.groupby(COL_TIME):
        coupon_texts = coupons_frame[coupons_frame[COL_TIME] == time][COL_TEXT_FULL]
        subframe = annotate_frame(subframe, coupon_texts)
        by_time.append(subframe)

samples_arr = [fr[COL_CONTENT_TEXT].to_list() for fr in by_time]
labels_arr = [fr[COL_IS_COUPON].to_list() for fr in by_time]

## Dataset creation

In [71]:
for text, lbl in zip(samples_arr, labels_arr):
    print(text)
    print(lbl)

['LIDL PLUS KARTE', 'Home', 'Prospekte', 'Lidl Plus', 'Onlineshop', 'Mehr']
[0, 0, 0, 0, 0, 0]
['Jetzt noch mehr sparen!', 'Raderberg', 'Mehr', 'Offen', 'Schließt um 21:00', 'Nächsten Sonntag geschlossen', 'Beste Einkaufszeit für deine Filiale', 'Spare zum Wochenstart mit Lidl Plus', 'HIER ENTLANG', 'Exklusiv für dich', 'Jetzt entdecken', 'Rabattsammler Plus', 'Partnervorteile', '3', 'LIDL PLUS KARTE', 'Home', 'Prospekte', 'Lidl Plus', 'Onlineshop', 'Mehr']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[]
[]
['Exklusiv für dich', 'Jetzt entdecken', 'Sofortgewinne', 'Rabattsammler November', 'Mehr Infos', '5 verbleibende/r Tag/e', '81,40 €', '120 €', '200 €', '400 €', 'Nächste Stufe', 'Solevita Vitamingetränk oder W5 Spülmittel Ultra Power', 'gratis', 'Rabattsammler Plus', 'Inaktiv', 'Lidl Plus Angebote', 'Alle anzeigen', 'Alesto Selection Mandeln XXL', '24.11. – 30.11.', '-50%', '4,99', '3,45', '*', '€', '1 kg = 6.90', 'Je 500 g (Max. 24 Stück)', 'Söhnlein Brillant halbt

In [72]:
features = datasets.Features({
    "texts": datasets.Sequence(datasets.Value("string")),
    "labels": datasets.Sequence(labels)
})

ds = datasets.Dataset.from_dict(
    {
        "texts": samples_arr,
        "labels": labels_arr
    },
    features=features
)

In [73]:
from huggingface_hub import login
from huggingface_hub import HfApi

login()
api = HfApi()
api.create_repo(repo_id="zpp-murmuras/ds-coupon-selecting-v01", repo_type="dataset", private=True)

ds.push_to_hub("zpp-murmuras/ds-coupon-selecting-v01", private=True)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

HfHubHTTPError: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67a525f1-6dab751249abe9e1514c4043;2d14ab66-674b-4e9c-b462-09c879f58808)

You already created this dataset repo

# Dataset in JSON fromat

In [90]:
features = datasets.Features({
    "tokens": datasets.Sequence(datasets.Value("string")),
    "labels": datasets.Sequence(labels)
})

examples = []
for (path_content, path_coupons) in to_annotate:
    content_frame = pd.read_csv(PATH_PREFIX + path_content)
    coupons_frame = pd.read_csv(PATH_PREFIX + path_coupons)
    as_json = frame_to_json(content_frame, coupons_frame)
    sample = json_to_labeled_tokens(as_json)
    for ex in sample:
        assert len(ex[0]) == len(ex[1])
    examples += sample
