In [1]:
import json
import pprint
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Put the sibling ../snorkel directory on the import path (presumably a local
# Snorkel checkout). The membership check keeps sys.path free of duplicate
# entries when this cell is re-run.
SNORKEL_PATH = r'./../snorkel'
if SNORKEL_PATH not in sys.path:
    sys.path.append(SNORKEL_PATH)

## Task Overview

Given product search queries like 'apple watch series 3', our goal is to tag the entities with labels like 'MODELNAME', 'CATEGORY', etc.

Example: "apple watch series 3" -> "brand category modelname modelname"

## Dataset
In this tutorial, we use the <a href="https://dataturks.com/projects/Mohan/Best%20Buy%20E-commerce%20NER%20dataset">BestBuy E-Commerce Dataset</a> from DataTurks. The dataset contains actual search queries on bestbuy.com, manually labeled by human experts. There are 772 unique products and 1872 tagged entities in total.


#### Data Preprocessing
The original dataset has 7 labels (category, modelname, brand, storage, screen size, etc). If a token does not belong to one of these 7 categories, then it is not tagged. We preprocess the data by keeping only 4 labels **(CATEGORY, MODELNAME, BRAND, OTHER)**. All remaining labels, as well as untagged tokens, are mapped to **OTHER**.

### Load BestBuy Dataset

In [37]:
import pickle
# NOTE: unpickling can execute arbitrary code, so only load this file if it
# comes from a trusted source.
# The context manager closes the file handle as soon as the data is read.
with open("./data/bestbuy_data.p", "rb") as data_file:
    data = pickle.load(data_file)
# The keys of `data` are the product names (search queries).
product_names = list(data.keys())

In [53]:
# Show the annotations stored for one example query.
pprint.pprint(data['apple watch'])

# Whitespace-tokenize every product name once and reuse the result for both
# statistics. A set comprehension builds the vocabulary, so the cell needs no
# import beyond those already at the top of the notebook.
tokenized_names = [name.split() for name in data.keys()]
print('Number of tokens in the dataset', sum(len(tokens) for tokens in tokenized_names))
print('Size of the vocabulary in the dataset', len({token for tokens in tokenized_names for token in tokens}))

{'apple': 'brand', 'watch': 'category'}
Number of tokens in the dataset 1905
Size of the vocabulary in the dataset 837


In [39]:
# Integer ids of the entity classes, numbered in the order
# category, modelname, brand, other.
CATEGORY, MODELNAME, BRAND, OTHER = range(4)

## Label Model

To begin with, we will use Snorkel's Label Model to generate training labels for each token in the product names.

### Prepare dataframe for labeling functions

We construct a dataframe where each row is a token that we want to predict. The columns are:
* 'product_name': the product name that the token appears in.
* 'word_idx': the index of the token in the product name.
* 'feature': features used by the LFs.
* 'label': the entity label of the token.

For instance, the entry for 'apple watch' in the original dataset will be represented with the first 2 rows in this dataframe.


In [12]:
# Padding markers added around every product name so that the first and last
# real tokens also have a left and a right neighbour.
start_tok, end_tok = '[S]', '[E]'

# Label names; a label's position in this list is its integer id.
label_list = ['category', 'modelname', 'brand', 'other']


def to_tokens(s):
    """Split a product name on whitespace and wrap it in start/end markers."""
    return [start_tok] + s.split() + [end_tok]


def label_to_int(label):
    """Return the integer id of a label name; unknown names map to 'other'."""
    if label in label_list:
        return label_list.index(label)
    return label_list.index('other')


df_data = []
for name, annotations in data.items():
    toks = to_tokens(name)
    # Sliding window of (previous, current, next) tokens: thanks to the
    # padding, every real token is the middle element of exactly one window.
    ngrams = list(zip(toks, toks[1:], toks[2:]))
    for i, ngram in enumerate(ngrams):
        token_to_tag = ngram[1]
        # Tokens without an annotation are tagged 'other'.
        label = label_to_int(annotations.get(token_to_tag, 'other'))
        df_data.append({'product_name': name, 'word_idx': i, 'feature': ngram,
                        'label': label})
df = pd.DataFrame(df_data)
df.head()

Unnamed: 0,product_name,word_idx,feature,label
0,apple watch,0,"([S], apple, watch)",2
1,apple watch,1,"(apple, watch, [E])",0
2,ipad,0,"([S], ipad, [E])",1
3,apple watch series 3,0,"([S], apple, watch)",2
4,apple watch series 3,1,"(apple, watch, series)",0


### Split Data into Train, Dev, Test sets

In [13]:
# Two-stage split over product names: 10% is held out for test first, then
# 20% of the remainder becomes dev. Both stages use the same fixed seed.
train_products, test_products = train_test_split(
    product_names, test_size=0.1, random_state=123)
train_products, dev_products = train_test_split(
    train_products, test_size=0.2, random_state=123)

for split_message, split_products in (
    ('Number of products in train set:', train_products),
    ('Number of products in dev set:', dev_products),
    ('Number of products in test set:', test_products),
):
    print(split_message, len(split_products))

Number of products in train set: 554
Number of products in dev set: 139
Number of products in test set: 77


In this tutorial, we will assume that we don't have the labels for the training set. We have the labels for dev and test set. We will use the dev set to tune the label model, and evaluate on the test set.

In [14]:
# Columns handed to the labeling functions; the gold 'label' column is left out.
feature_columns = ['product_name', 'word_idx', 'feature']

train_df = df[df.product_name.isin(train_products)]
# Take an independent copy so that adding columns to X_train later does not
# trigger pandas' SettingWithCopyWarning.
X_train = train_df[feature_columns].copy()

dev_df = df[df.product_name.isin(dev_products)]
X_dev, y_dev = dev_df[feature_columns], dev_df['label']

test_df = df[df.product_name.isin(test_products)]
X_test, y_test = test_df[feature_columns], test_df['label']

X_train.shape, X_dev.shape, X_test.shape

((1349, 3), (354, 3), (202, 3))

### Write Labeling Functions

We can write labeling functions using the following rules.
1. Heuristics.
    - if the token is before a number, then it is likely MODELNAME. E.g. "apple watch *series* 3"
    - if the token is a number, then it is likely MODELNAME. E.g. "apple watch series *3*"
    - if the token contains some measurement units (e.g. tb, inch), then it is probably OTHER.
2. External knowledge source (a list of brand names scraped from BestBuy)
    - if the token is in the list of brand names, then it is BRAND.
    - if the token is after a brand name, it is likely MODELNAME. E.g. beats *solo3* wireless

#### 1. Heuristics

In [32]:
from snorkel.labeling import labeling_function


@labeling_function()
def lf_is_before_num(x):
    """Vote MODELNAME when the following token is numeric.

    E.g. apple watch *series* 3. ``x.feature`` is the (previous, current,
    next) token triple of the row; the function abstains (-1) otherwise.
    """
    next_token = x.feature[2]
    if next_token.isnumeric():
        return MODELNAME
    return -1

@labeling_function()
def lf_is_numeric(x):
    """Vote MODELNAME when the token itself is numeric.

    E.g. apple watch series *3*. ``x.feature`` is the (previous, current,
    next) token triple of the row; the function abstains (-1) otherwise.
    """
    current_token = x.feature[1]
    if current_token.isnumeric():
        return MODELNAME
    return -1

@labeling_function()
def lf_is_other(x):
    """Vote OTHER for tokens that look like measurements or prices.

    A token qualifies when it ends with a measurement unit ('inch', 'gb',
    'tb') or starts or ends with '$'; the function abstains (-1) otherwise.
    """
    token = x.feature[1]
    has_unit_suffix = token.endswith(('inch', 'gb', 'tb', '$'))
    if has_unit_suffix or token.startswith('$'):
        return OTHER
    return -1

#### 2. External knowledge source

In [33]:
from bs4 import BeautifulSoup
import codecs

# Use BeautifulSoup to scrape the list of brand names from a saved copy of
# the BestBuy brands page. The context manager closes the file once its
# contents have been parsed.
with open(r"data/NameBrands_BestBuy.html", 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'lxml')
brand_list_div = soup.find_all('div', {"class": "alphabetical-list"})[0]
# Each brand name is read from a link's 'data-lid' attribute and lower-cased;
# links without that attribute are skipped instead of raising on None.
brand_names = [
    url.get('data-lid').lower()
    for url in brand_list_div.find_all('a')
    if url.get('data-lid')
]


@labeling_function(resources={'brand_names': brand_names})
def lf_is_brand(x, brand_names):
    """Vote BRAND when the token appears in the list of brand names.

    ``x.feature`` is the (previous, current, next) token triple of the row;
    the function abstains (-1) when the current token is not a known brand.
    """
    current_token = x.feature[1]
    if current_token in brand_names:
        return BRAND
    return -1

@labeling_function(resources={'brand_names': brand_names})
def lf_is_after_brand(x, brand_names):
    """Vote MODELNAME when the preceding token is a known brand name.

    E.g. beats *solo3* wireless. ``x.feature`` is the (previous, current,
    next) token triple of the row; the function abstains (-1) otherwise.
    """
    previous_token = x.feature[0]
    if previous_token in brand_names:
        return MODELNAME
    return -1

### Tune the LFs

In [34]:
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis

# Labeling functions to apply.
# NOTE(review): lf_is_after_brand is defined earlier in the notebook but is
# not part of this list; confirm that this is intended.
lfs = [lf_is_brand, lf_is_before_num, lf_is_numeric, lf_is_other]
applier = PandasLFApplier(lfs=lfs)

# One label matrix per split, computed in train / dev / test order.
L_train, L_dev, L_test = (
    applier.apply(df=split_features)
    for split_features in (X_train, X_dev, X_test)
)

# Per-LF summary table, scored against the dev gold labels.
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(y_dev.values)

  from pandas import Panel
100%|████████████████████████████████████████████████████████████████████████████| 1349/1349 [00:00<00:00, 9659.35it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 354/354 [00:00<00:00, 9849.77it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 202/202 [00:00<00:00, 9629.80it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_is_brand,0,[2],0.096045,0.0,0.0,33,1,0.970588
lf_is_before_num,1,[1],0.053672,0.0,0.0,16,3,0.842105
lf_is_numeric,2,[1],0.053672,0.0,0.0,17,2,0.894737
lf_is_other,3,[3],0.008475,0.0,0.0,3,0,1.0


### Train a Label Model

In [29]:
from snorkel.labeling import LabelModel
from snorkel.labeling import MajorityLabelVoter

# Baseline over the four entity classes: majority vote of the labeling functions.
majority_model = MajorityLabelVoter(cardinality=4)
preds_train = majority_model.predict(L=L_train)

# Score on the dev split, for which gold labels are available.
majority_metrics = majority_model.score(L=L_dev, Y=y_dev)
majority_acc = majority_metrics["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")



Majority Vote Accuracy:   92.0%


In [31]:
# Fit Snorkel's LabelModel on the unlabeled training label matrix; the seed is
# fixed so that the fit is repeatable.
label_model = LabelModel(cardinality=4, verbose=True)
label_model.fit(L_train=L_train, n_epochs=2000, lr=1e-3, seed=123)

# Score on the dev split, for which gold labels are available.
label_model_metrics = label_model.score(L=L_dev, Y=y_dev)
label_model_acc = label_model_metrics["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")



Label Model Accuracy:     92.0%


#### Choose the label with highest soft probability

In [24]:
# Hard training labels: for every token, the class with the highest
# label-model probability.
# NOTE(review): argmax always returns a class, even for tokens on which every
# labeling function abstained; confirm that this is the desired behaviour.
train_labels = label_model.predict_proba(L_train).argmax(axis=1)
# assign() builds a new frame instead of writing into a slice of `df`, which
# avoids SettingWithCopyWarning and keeps this cell safe to re-run.
X_train = X_train.assign(label=train_labels)
X_train.head(10)

Unnamed: 0,product_name,word_idx,feature,label
0,apple watch,0,"([S], apple, watch)",2
1,apple watch,1,"(apple, watch, [E])",0
2,ipad,0,"([S], ipad, [E])",0
3,apple watch series 3,0,"([S], apple, watch)",2
4,apple watch series 3,1,"(apple, watch, series)",0
5,apple watch series 3,2,"(watch, series, 3)",1
6,apple watch series 3,3,"(series, 3, [E])",1
7,apple watch series 2,0,"([S], apple, watch)",2
8,apple watch series 2,1,"(apple, watch, series)",0
9,apple watch series 2,2,"(watch, series, 2)",1


### Build dataframe for seq2seq entity recognition task

In [25]:
def get_tag(df, product, word_idx):
    """Return the tag name of the token at ``word_idx`` in ``product``.

    ``df`` needs 'product_name', 'word_idx' and 'label' columns; the label of
    the first matching row is used as an index into ``label_list``.
    """
    is_match = (df.product_name == product) & (df.word_idx == word_idx)
    matching_labels = df.loc[is_match, 'label'].values
    return label_list[matching_labels[0]]

def build_seq_tag_df(df):
    """Collapse a token-level dataframe into one tag sequence per product.

    Args:
        df: dataframe with 'product_name', 'word_idx' and 'label' columns,
            one row per token; 'label' holds indices into ``label_list``.

    Returns:
        A dataframe with columns 'product' (the product name) and 'label'
        (the space-separated tag names of its tokens), one row per product
        in order of first appearance.

    Raises:
        KeyError: if a token position of a product has no row in ``df``.
    """
    product_tags = []
    # Group once instead of re-filtering the whole frame for every token;
    # sort=False keeps the products in order of first appearance.
    for product, rows in df.groupby('product_name', sort=False):
        tag_by_idx = {}
        for word_idx, tag_id in zip(rows['word_idx'], rows['label']):
            # Keep the first row seen for each token position.
            tag_by_idx.setdefault(word_idx, tag_id)
        tags = [label_list[tag_by_idx[i]] for i in range(len(product.split()))]
        product_tags.append((product, ' '.join(tags)))

    return pd.DataFrame(product_tags, columns=['product', 'label'])

In [26]:
# One row per product for each split. The train frame carries the labels
# produced by the label model; dev and test carry the gold labels.
train_seq_df, dev_seq_df, test_seq_df = (
    build_seq_tag_df(token_df) for token_df in (X_train, dev_df, test_df)
)

train_seq_df.head()

Unnamed: 0,product,label
0,apple watch,brand category
1,ipad,category
2,apple watch series 3,brand category modelname modelname
3,apple watch series 2,brand category modelname modelname
4,apple homepod,brand category


### Save the Dataframes
We will train our end model using dataframes.

In [36]:
# Persist every split next to the notebook so that the end model can load them.
for seq_df, pickle_path in (
    (train_seq_df, "./train_seq_df.pkl"),
    (dev_seq_df, "./dev_seq_df.pkl"),
    (test_seq_df, "./test_seq_df.pkl"),
):
    seq_df.to_pickle(pickle_path)