# Receipt Cord Data

Link: <https://huggingface.co/datasets/naver-clova-ix/cord-v2>

# Data preprocessing

In [1]:
import pandas as pd
import os

In [2]:
df_list = []
for filepath in os.listdir('receipt_cord/'):
    df = pd.read_parquet(os.path.join('receipt_cord', filepath))
    df.drop(columns=['image'], inplace=True)
    df['split'] = 'train' if 'train' in filepath else 'test' if 'test' in filepath else 'val'
    df_list.append(df)

In [3]:
df = pd.concat(df_list, ignore_index=True)
df.shape

(1000, 2)

In [4]:
# parse nested json to get product name
import ast

def get_prod_names(nested_json):
    try:
        nested_json = ast.literal_eval(nested_json)
    except:
        return None
    
    if 'gt_parse' in nested_json:
        if 'menu' in nested_json['gt_parse']:
            if isinstance(nested_json['gt_parse']['menu'], list):
                try:
                    product_names = list(map(lambda x: x['nm'], nested_json['gt_parse']['menu']))
                    return product_names
                except:
                    return None
            else:
                try:
                    return [nested_json['gt_parse']['menu']['nm']]
                except:
                    return None
    else:
        return None

In [5]:
df['product_name'] = df['ground_truth'].apply(get_prod_names)

In [6]:
df.dropna(inplace=True)

In [7]:
df = df.explode('product_name').reset_index(drop=True)

In [8]:
df.drop(columns=['ground_truth'], inplace=True)

In [9]:
df['type'] = df['product_name'].apply(lambda x: isinstance(x, list))

In [10]:
df = df[df.type==False]

In [11]:
df

Unnamed: 0,split,product_name,type
0,test,-TICKET CP,False
1,test,J.STB PROMO,False
2,test,Y.B.BAT,False
3,test,Y.BASO PROM,False
4,test,JASMINE MT ( L ),False
...,...,...,...
2563,val,PAHA BAWAH,False
2564,val,Choco Cheese,False
2565,val,Lemon Tea (L),False
2566,val,Hulk Topper Package,False


## For labelling

In [13]:
# text_file = open("for_labelling_2.txt", "w")
# product_list = df.product_name.unique().tolist()[1004:]
# product_list = list(set(product_list))
# print(len(product_list))
# n = text_file.write('\n'.join(product_list))
# text_file.close()

716


In [None]:
# stopped at 1004 row

In [None]:
# stopped at 1004 + 224

In [56]:
len(df.product_name.tolist()), len(df.product_name.unique().tolist())

(2564, 1720)

## Using labelled annotations

In [15]:
import pandas as pd
import json

In [16]:
with open('annotations.json') as json_file:
    data = json.load(json_file)

In [17]:
with open('annotations_2.json') as json_file:
    add_data = json.load(json_file)

In [18]:
annotations = data['annotations']

In [20]:
annotations += add_data['annotations']

In [21]:
annotations = [i for i in annotations if i is not None]

In [22]:
text = list(map(lambda x: x[0].replace('\r', ''), annotations))
entities = list(map(lambda x: x[1]['entities'], annotations))

In [23]:
df = pd.DataFrame({'text': text, 'entities': entities})
df

Unnamed: 0,text,entities
0,J.STB PROMO,"[[0, 5, PROD], [6, 11, ADJ]]"
1,Y.B.BAT,"[[0, 7, PROD]]"
2,Y.BASO PROM,"[[0, 6, PROD], [7, 11, ADJ]]"
3,JASMINE MT ( L ),"[[0, 10, PROD], [11, 16, ADJ]]"
4,DONAT GULA,"[[0, 5, PROD], [6, 10, ADJ]]"
...,...,...
1053,DADAR PISANG,"[[0, 12, PROD]]"
1054,JAPANEESE GR TEA HOT,"[[0, 16, PROD], [17, 20, ADJ]]"
1055,<FC Winger HC,"[[4, 10, PROD]]"
1056,SUPER CHEESE,"[[0, 12, PROD]]"


In [24]:
df = df.groupby('text')['entities'].first().reset_index()

In [25]:
def split_text_to_tags(row):
    text = row['text']
    entities = row['entities']
    
    all_tags = []
    left = 0
    while len(entities) > 0:
        if left!=entities[0][0]:
            all_tags.append([left, entities[0][0], 'O'])
        all_tags.append(entities[0])
        left = entities[0][1]
        entities = entities[1:]
    if left!=len(text):
        all_tags.append([left, len(text), 'O'])
    return all_tags

In [26]:
df['tags'] = df.apply(split_text_to_tags, axis=1)

In [27]:
df = df.explode('tags').reset_index().rename(columns={'index': 'text_index'})
df.dropna(inplace=True)
df['entity'] = df.apply(lambda row: row['text'][row['tags'][0]: row['tags'][1]], axis=1)
df = df[df.entity!=' '].reset_index(drop=True)

In [28]:
df['label'] = df['tags'].apply(lambda x: x[2])

In [29]:
import re
df['entity'] = df['entity'].apply(lambda x: re.split(r"([^a-zA-Z0-9])", x))

In [30]:
df = df.explode('entity').reset_index()
df = df[~df.entity.isin(['', ' '])].reset_index(drop=True)

In [31]:
df['pos'] = df.groupby(['index']).cumcount()+1

In [32]:
df['pos'] = df.apply(lambda row: '' if row['label']=='O' else 'B-' if row['pos']==1 else 'I-', axis=1)

In [33]:
df['pos_tag'] = df['pos'] + df['label']

In [34]:
df = df[['text_index', 'text', 'entity', 'pos_tag']]

In [35]:
data = df.groupby('text_index')['pos_tag'].apply(list).reset_index()

In [36]:
new_df = df.groupby('text_index')['text'].first().reset_index()

In [37]:
data = pd.merge(data, new_df, on='text_index')

In [38]:
data

Unnamed: 0,text_index,pos_tag,text
0,0,"[O, O, B-PROD, B-PROD, O]",#F11 CREAM HAMBURG 0
1,1,"[O, B-PROD]",#PKTPOLSBTSPON2S
2,2,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) BIHUN GORENG SEAFOOD
3,3,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) KWETIAW SEAFOOD SIRAM
4,4,"[O, O, O, B-PROD, I-PROD]",(TA) NASI GORENG
...,...,...,...
933,933,"[B-ADJ, B-PROD]",green tea
934,934,"[B-PROD, I-PROD]",phad thai
935,935,"[B-ADJ, B-PROD, I-PROD]",red curry beef
936,936,"[B-PROD, I-PROD]",steamed rice


In [39]:
# data.to_csv('labelled_data_new.csv', index=False)

## Split data and Define Unique Labels

In [86]:
data = pd.read_csv('labelled_data_new.csv')

In [87]:
import ast

In [88]:
data['pos_tag'] = data['pos_tag'].apply(ast.literal_eval)

In [89]:
data

Unnamed: 0,text_index,pos_tag,text
0,0,"[O, O, B-PROD, B-PROD, O]",#F11 CREAM HAMBURG 0
1,1,"[O, B-PROD]",#PKTPOLSBTSPON2S
2,2,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) BIHUN GORENG SEAFOOD
3,3,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) KWETIAW SEAFOOD SIRAM
4,4,"[O, O, O, B-PROD, I-PROD]",(TA) NASI GORENG
...,...,...,...
933,933,"[B-ADJ, B-PROD]",green tea
934,934,"[B-PROD, I-PROD]",phad thai
935,935,"[B-ADJ, B-PROD, I-PROD]",red curry beef
936,936,"[B-PROD, I-PROD]",steamed rice


In [None]:
data.rename(columns={'pos_tag': 'labels'}, inplace=True)

In [90]:
labels = data['pos_tag'].tolist()
unique_labels = set()

for lb in labels:
    [unique_labels.add(i) for i in lb if i not in unique_labels]
    
labels_to_ids = {k: v for v, k in enumerate(unique_labels)}
ids_to_labels = {v: k for v, k in enumerate(unique_labels)}

# BERT pretrained

In [121]:
import pandas as pd
import torch 
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, BertModel
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

## Build Model

In [119]:
class MyBertModel(torch.nn.Module):

    def __init__(self):

        super(MyBertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

## Model

In [120]:
model = MyBertModel()
model.load_state_dict(torch.load('bert-base-uncased.pt'))
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

MyBertModel(
  (bert): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=T

In [95]:
from seqeval.metrics import f1_score

def evaluate(model, df_test):
    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0
    total_f1_test = 0

    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][test_label[i] != -100]
                label_clean = test_label[i][test_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_test += acc

                pred_labels = [[ids_to_labels[i] for i in label_clean.tolist()]]
                val_labels = [[ids_to_labels[i] for i in predictions.tolist()]]
                f1 = f1_score(val_labels, pred_labels)
                total_f1_test += f1

    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f} | Test F1 Score: {total_f1_test / len(df_test): .3f}')

In [96]:
def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)

In [97]:
evaluate_one_text(model, '(TA) BIHUN GORENG SEAFOOD')

(TA) BIHUN GORENG SEAFOOD
['O', 'O', 'O', 'B-BRAND', 'B-PROD', 'B-PROD']


# Embeddings

Using the predicted tags, remove all but product tags

In [98]:
import re
data['text_tokenized'] = data['text'].apply(lambda x: re.split(r"([^a-zA-Z0-9])", x))
data['text_tokenized'] = data['text_tokenized'].apply(lambda x: [i for i in x if i not in ['',' ']])

In [99]:
data['match'] = data.apply(lambda row: len(row['labels']) == len(row['text_tokenized']), axis=1)

In [100]:
data = data[data.match==True].reset_index(drop=True)

In [101]:
data

Unnamed: 0,text_index,labels,text,text_tokenized,match
0,0,"[O, O, B-PROD, B-PROD, O]",#F11 CREAM HAMBURG 0,"[#, F11, CREAM, HAMBURG, 0]",True
1,1,"[O, B-PROD]",#PKTPOLSBTSPON2S,"[#, PKTPOLSBTSPON2S]",True
2,2,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) BIHUN GORENG SEAFOOD,"[(, TA, ), BIHUN, GORENG, SEAFOOD]",True
3,3,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) KWETIAW SEAFOOD SIRAM,"[(, TA, ), KWETIAW, SEAFOOD, SIRAM]",True
4,4,"[O, O, O, B-PROD, I-PROD]",(TA) NASI GORENG,"[(, TA, ), NASI, GORENG]",True
...,...,...,...,...,...
928,933,"[B-ADJ, B-PROD]",green tea,"[green, tea]",True
929,934,"[B-PROD, I-PROD]",phad thai,"[phad, thai]",True
930,935,"[B-ADJ, B-PROD, I-PROD]",red curry beef,"[red, curry, beef]",True
931,936,"[B-PROD, I-PROD]",steamed rice,"[steamed, rice]",True


In [102]:
def get_prod_names(row):
    keep = []
    for i in range(len(row['labels'])):
        if 'PROD' in row['labels'][i]:
            keep.append(row['text_tokenized'][i])
    return keep

In [103]:
def get_adj(row):
    keep = []
    for i in range(len(row['labels'])):
        if 'ADJ' in row['labels'][i]:
            keep.append(row['text_tokenized'][i])
    return keep

In [104]:
data['adj_tags'] = data.apply(get_adj, axis=1)
data['product_tags'] = data.apply(get_prod_names, axis=1)

In [106]:
data['product_name'] = data['product_tags'].apply(lambda x: ' '.join(x))
data['adj'] = data['adj_tags'].apply(lambda x: ' '.join(x))

In [107]:
data[data.product_name=='']

Unnamed: 0,text_index,labels,text,text_tokenized,match,adj_tags,product_tags,product_name,adj
29,29,"[O, O, O, B-ADJ, I-ADJ]",3002-Kyoto Choco Mochi,"[3002, -, Kyoto, Choco, Mochi]",True,"[Choco, Mochi]",[],,Choco Mochi
54,54,"[B-BRAND, B-ADJ, I-ADJ]",AMBUSH DBL CHS,"[AMBUSH, DBL, CHS]",True,"[DBL, CHS]",[],,DBL CHS
193,193,"[B-BRAND, I-BRAND]",CIRENG PANDAWA,"[CIRENG, PANDAWA]",True,[],[],,
194,194,"[O, O, B-ADJ, O]",CK.MANTAP A,"[CK, ., MANTAP, A]",True,[MANTAP],[],,MANTAP
197,197,"[B-BRAND, I-BRAND]",COCA COLA,"[COCA, COLA]",True,[],[],,
268,268,"[O, O, B-BRAND, I-BRAND]",DN. LONG JOHN,"[DN, ., LONG, JOHN]",True,[],[],,
277,278,"[B-BRAND, O]",Dendeng PDS,"[Dendeng, PDS]",True,[],[],,
306,307,"[B-BRAND, O, O]",FIXALL HK 26521,"[FIXALL, HK, 26521]",True,[],[],,
341,342,"[B-ADJ, I-ADJ]",GULAI OTAK,"[GULAI, OTAK]",True,"[GULAI, OTAK]",[],,GULAI OTAK
364,365,[B-ADJ],Hokkaido,[Hokkaido],True,[Hokkaido],[],,Hokkaido


In [108]:
data = data[data.product_name!=''].reset_index(drop=True)
data = data.groupby('product_name')[['labels', 'text']].first().reset_index()
data.shape

(792, 3)

## Convert to embedding

### TF-IDF

In [142]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [143]:
vectorizer = TfidfVectorizer(analyzer='word')
vectors = vectorizer.fit_transform(data.product_name.tolist())
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [144]:
df['product_name'] = data.product_name.tolist()

In [145]:
df.product_name.nunique()

792

In [146]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy

In [147]:
sparse_test = scipy.sparse.csr_matrix(df[feature_names.tolist()].values)

In [148]:
cosine_sim = pd.DataFrame(cosine_similarity(sparse_test))
cosine_sim.index = data.text.tolist()
cosine_sim.columns = data.text.tolist()

In [149]:
cosine_sim

Unnamed: 0,"Oriental ""Ketjap"" Pork Rice",Large *Plastik Kcl,500 days of summer (P),A.MINERAL GELAS,A.MINERAL BOTOL,ABURA UDON (ORIGINAL),ACCESORIES(SUN BABES PENUTUP MATA),AIR MINERAL,ALMOND CHOCO CREAMCHEESE,ALMOND CHOCO CREAM CHEESE,...,garlic pepper beef,caisim special,cashew nuts chkn,red curry beef,Fre ice grentea,phad thai,steamed rice,sun kissed (P),green tea,chapsal twister donnut
"Oriental ""Ketjap"" Pork Rice",1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.308012,0.0,0.0,0.0
Large *Plastik Kcl,0.000000,1.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
500 days of summer (P),0.000000,0.0,1.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
A.MINERAL GELAS,0.000000,0.0,0.0,1.000000,0.707107,0.0,0.0,0.634352,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
A.MINERAL BOTOL,0.000000,0.0,0.0,0.707107,1.000000,0.0,0.0,0.448555,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
phad thai,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.0
steamed rice,0.308012,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0
sun kissed (P),0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.0,0.0,0.0
green tea,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0


In [150]:
cosine_sim.sort_values(by='Ice Lemon Tea', ascending=False).head(50)

Unnamed: 0,"Oriental ""Ketjap"" Pork Rice",Large *Plastik Kcl,500 days of summer (P),A.MINERAL GELAS,A.MINERAL BOTOL,ABURA UDON (ORIGINAL),ACCESORIES(SUN BABES PENUTUP MATA),AIR MINERAL,ALMOND CHOCO CREAMCHEESE,ALMOND CHOCO CREAM CHEESE,...,garlic pepper beef,caisim special,cashew nuts chkn,red curry beef,Fre ice grentea,phad thai,steamed rice,sun kissed (P),green tea,chapsal twister donnut
Ice Lemon Tea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610033,0.0
ICED LEMON TEA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610033,0.0
Lemon iced tea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610033,0.0
Lemon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
green tea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Hot Tea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
CHINESE TEA KWAN'IM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
JERUK LEMON IMP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tebu Lemon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S-Fresh Lemon Lime,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Bert embeddings

Making use of pretrained Bert to do word embeddings

In [109]:
data

Unnamed: 0,product_name,labels,text
0,""" Ketjap "" Pork Rice","[B-ADJ, B-PROD, I-PROD, I-PROD, I-PROD, I-PROD]","Oriental ""Ketjap"" Pork Rice"
1,* Plastik,"[B-ADJ, B-PROD, I-PROD, O]",Large *Plastik Kcl
2,500 days of summer,"[B-PROD, I-PROD, I-PROD, I-PROD, O, O, O]",500 days of summer (P)
3,A . MINERAL,"[B-PROD, I-PROD, I-PROD, B-ADJ]",A.MINERAL GELAS
4,A . MINERAL BOTOL,"[B-PROD, I-PROD, I-PROD, I-PROD]",A.MINERAL BOTOL
...,...,...,...
787,phad thai,"[B-PROD, I-PROD]",phad thai
788,steamed rice,"[B-PROD, I-PROD]",steamed rice
789,sun kissed,"[B-PROD, I-PROD, O, O, O]",sun kissed (P)
790,tea,"[B-ADJ, B-PROD]",green tea


In [110]:
texts = data.product_name.unique().tolist()
tokenized_text = list(map(tokenizer.tokenize, texts))
indexed_tokens = list(map(tokenizer.convert_tokens_to_ids, tokenized_text))

In [111]:
segments_ids = list(map(lambda x: [1] * len(x), indexed_tokens))

In [112]:
tokens_tensor = [torch.tensor([x]) for x in indexed_tokens]
segments_tensors = [torch.tensor([x]) for x in segments_ids]

In [115]:
BertModel

__main__.BertModel

In [122]:
# Load pre-trained model (weights)
emb_model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
emb_model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [123]:
outputs = []
for i in range(len(tokens_tensor)):
    with torch.no_grad():
        outputs.append(emb_model(tokens_tensor[i], segments_tensors[i]))

In [124]:
token_vecs = list(map(lambda x: x[2][-2][0], outputs))

In [125]:
sentence_embeddings = list(map(lambda x: torch.mean(x, dim=0), token_vecs))

In [127]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy

In [128]:
sparse_bert = scipy.sparse.csr_matrix(torch.stack(sentence_embeddings))

In [151]:
cosine_sim = pd.DataFrame(cosine_similarity(sparse_bert))
cosine_sim.columns=data.text.unique().tolist()
cosine_sim.index = data.text.unique().tolist()

In [153]:
cosine_sim[['Ice Lemon Tea']].sort_values(by='Ice Lemon Tea', ascending=False).head(50)

Unnamed: 0,Ice Lemon Tea
Ice Lemon Tea,1.000001
ICED LEMON TEA,1.000001
Lemon iced tea,1.000001
S-Fresh Lemon Lime,0.959027
Sweet Tea,0.923163
CHOCOLATE TWIST,0.877407
Chocolate Twist,0.877407
REGULAR JASMINE TEA,0.873812
REAL CHOCOLATE ROLL,0.871326
AQUA AIR MINERAL 600,0.870291


## Fuzzy Matching

### Levenshtein Distance

In [56]:
import Levenshtein as lev

In [63]:
texts = data.text.tolist()
product_names = data.product_name.tolist()

In [66]:
text_mat = {}
for i in texts:
    curr_row = []
    for j in texts:
        curr_row.append(lev.distance(i, j))
    text_mat[i] = curr_row

In [72]:
text_lev = pd.DataFrame(text_mat)
text_lev.index = texts

In [75]:
text_lev.sort_values(by='Ice Lemon Tea').head(20)

Unnamed: 0,Large *Plastik Kcl,500 days of summer (P),A.MINERAL GELAS,A.MINERAL BOTOL,ACCESORIES(SUN BABES PENUTUP MATA),AIR MINERAL,ALMOND CHOCO CREAMCHEESE,ALMOND CHOCO CREAM CHEESE,ALMOND CROSSIANT,AMANDEL BROOD,...,Mie Jumbo Pst/bakso,garlic pepper beef,cashew nuts chkn,red curry beef,Fre ice grentea,phad thai,steamed rice,sun kissed (P),green tea,chapsal twister donnut
A.MINERAL GELAS,18,21,0,5,27,9,18,18,12,10,...,18,18,16,14,15,14,14,15,14,21
A.MINERAL BOTOL,18,21,5,0,28,9,19,19,13,8,...,18,18,16,14,15,14,14,15,14,21
* MINERAL WATER,18,20,7,6,28,9,19,19,13,11,...,18,18,16,14,15,14,14,15,14,20
DONAT GULA,17,21,9,12,28,10,20,21,13,11,...,18,17,15,13,14,9,11,14,9,21
JASMINE MT ( L ),17,18,9,10,26,12,20,20,13,12,...,18,17,15,15,15,15,15,13,15,20
AIR MINERAL,17,21,9,9,29,0,20,21,13,11,...,18,17,15,13,14,11,12,13,11,21
ICED TEA,17,21,10,11,29,8,20,21,13,11,...,18,17,15,13,14,8,11,13,8,21
CUMI BANGKA,16,21,10,12,27,9,21,22,13,12,...,18,17,15,14,14,10,12,14,11,21
MIKA SEDANG,17,21,10,12,29,9,21,22,11,12,...,17,17,15,14,14,10,12,14,11,21
MIKA KECIL,17,21,10,10,30,8,20,21,13,11,...,17,17,15,14,14,9,12,14,10,21


### Jaro Winkler

In [76]:
import jaro

In [79]:
texts = data.text.tolist()
product_names = data.product_name.tolist()

In [80]:
text_mat = {}
for i in texts:
    curr_row = []
    for j in texts:
        curr_row.append(jaro.jaro_winkler_metric(i, j))
    text_mat[i] = curr_row

In [81]:
text_lev = pd.DataFrame(text_mat)
text_lev.index = texts

In [87]:
text_lev.sort_values(by='Ice Lemon Tea', ascending=False).head(20)

Unnamed: 0,Large *Plastik Kcl,500 days of summer (P),A.MINERAL GELAS,A.MINERAL BOTOL,ACCESORIES(SUN BABES PENUTUP MATA),AIR MINERAL,ALMOND CHOCO CREAMCHEESE,ALMOND CHOCO CREAM CHEESE,ALMOND CROSSIANT,AMANDEL BROOD,...,Mie Jumbo Pst/bakso,garlic pepper beef,cashew nuts chkn,red curry beef,Fre ice grentea,phad thai,steamed rice,sun kissed (P),green tea,chapsal twister donnut
green tea,0.555556,0.378788,0.392593,0.392593,0.380174,0.400673,0.384259,0.383704,0.391204,0.396011,...,0.551657,0.626984,0.489352,0.57672,0.65291,0.555556,0.611111,0.404762,1.0,0.458754
Free Ice Tea,0.498148,0.421717,0.266667,0.372222,0.400327,0.282828,0.416667,0.415556,0.479167,0.386752,...,0.51462,0.578704,0.444444,0.646825,0.836667,0.462963,0.611111,0.376984,0.722222,0.414646
Sweet Tea,0.472222,0.378788,0.392593,0.451852,0.427015,0.400673,0.384259,0.383704,0.391204,0.396011,...,0.468324,0.477778,0.556019,0.515873,0.562963,0.444444,0.611111,0.28836,0.722222,0.460943
Pepenero Pastel,0.534259,0.386869,0.377778,0.377778,0.397386,0.385859,0.369444,0.368889,0.376389,0.381197,...,0.516458,0.466667,0.415278,0.496825,0.563889,0.511111,0.516667,0.430159,0.682407,0.521549
Ice Lemon Tea,0.420798,0.466977,0.365812,0.441453,0.475113,0.445221,0.340812,0.339145,0.361645,0.269231,...,0.592443,0.542735,0.565705,0.518926,0.591239,0.458689,0.533761,0.447802,0.653846,0.476024
Fre ice grentea,0.534259,0.453535,0.377778,0.377778,0.397386,0.385859,0.405556,0.404444,0.376389,0.381197,...,0.516374,0.660269,0.425,0.659921,1.0,0.451852,0.588095,0.434127,0.65291,0.521549
Organic Green Sa,0.523148,0.379924,0.4625,0.376389,0.372549,0.38447,0.472222,0.47,0.458333,0.379808,...,0.458553,0.660185,0.482143,0.550595,0.716035,0.395833,0.530754,0.489881,0.643188,0.546086
Lychee Ice Tea,0.531746,0.394805,0.25873,0.360317,0.384454,0.274892,0.446429,0.444762,0.511905,0.432234,...,0.498747,0.534392,0.545635,0.507937,0.576587,0.515873,0.59127,0.365079,0.637566,0.511544
Arem Arem,0.5,0.378788,0.4,0.4,0.473856,0.535354,0.486111,0.484444,0.449074,0.410256,...,0.385965,0.477778,0.449074,0.637566,0.562963,0.407407,0.590741,0.28836,0.62963,0.378788
Mineral Water,0.547212,0.41317,0.42906,0.42906,0.368778,0.278555,0.412393,0.411282,0.426282,0.435897,...,0.536887,0.56161,0.499038,0.447253,0.591239,0.500712,0.533761,0.364469,0.629223,0.493007


## Create product groups

In [157]:
data['product_group'] = data['product_name'].apply(lambda x: 'lemon' if 'lemon' in x.lower() else 'ayam' if 'ayam' in x.lower() else 'cheese' if 'cheese' in x.lower() else None)

In [162]:
data.product_group.value_counts()

product_group
ayam      21
cheese    20
lemon     10
Name: count, dtype: int64

In [165]:
data[data.product_group=='lemon']

Unnamed: 0,product_name,labels,text,product_group
347,JERUK LEMON,"[B-PROD, I-PROD, O]",JERUK LEMON IMP,lemon
403,LEMON TEA,"[B-ADJ, B-PROD, I-PROD]",ICED LEMON TEA,lemon
404,LEMONADE,"[B-ADJ, B-PROD]",KOREAN LEMONADE,lemon
416,Lemon,[B-PROD],Lemon,lemon
417,Lemon Lime,"[B-ADJ, I-ADJ, I-ADJ, B-PROD, I-PROD]",S-Fresh Lemon Lime,lemon
418,Lemon Tea,"[B-ADJ, B-PROD, I-PROD]",Ice Lemon Tea,lemon
419,Lemon tea,"[B-PROD, B-ADJ, B-PROD]",Lemon iced tea,lemon
420,Lemonade,"[B-ADJ, I-ADJ, B-PROD]",Home Made Lemonade,lemon
421,Lemongrass Aloe,"[O, B-PROD, B-PROD]",RTD Lemongrass Aloe,lemon
740,Tebu Lemon,"[B-PROD, I-PROD]",Tebu Lemon,lemon


In [166]:
anomalies = ['steamed rice', 'garlic pepper beef', 'Pepenero Pastel']