# Receipt Cord Data

Link: <https://huggingface.co/datasets/naver-clova-ix/cord-v2>

In [1]:
import pandas as pd
import os

In [2]:
# Load every parquet shard found in `receipt_cord/`, drop the image column and
# tag each row with the split inferred from the shard's file name.
df_list = []
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of the concatenated frame differ between machines/runs.
for filepath in sorted(os.listdir('receipt_cord/')):
    if not filepath.endswith('.parquet'):
        # Skip stray entries (e.g. checkpoint folders) that read_parquet cannot open.
        continue
    df = pd.read_parquet(os.path.join('receipt_cord', filepath))
    df = df.drop(columns=['image'])
    # File names containing neither 'train' nor 'test' are treated as validation shards.
    df['split'] = 'train' if 'train' in filepath else 'test' if 'test' in filepath else 'val'
    df_list.append(df)

In [3]:
df = pd.concat(df_list, ignore_index=True)
df.shape

(1000, 2)

In [4]:
# parse nested json to get product name
import ast

def get_prod_names(nested_json):
    """Extract the product names (``nm`` fields) from one ground-truth record.

    Parameters
    ----------
    nested_json : str or dict
        The ground-truth payload, either as a JSON string, as a Python-literal
        string (single-quoted dict repr), or as an already-parsed dict.

    Returns
    -------
    list or None
        The ``nm`` value of every entry under ``gt_parse -> menu`` (a single
        menu dict yields a one-element list). ``None`` when the payload cannot
        be parsed, has no ``gt_parse``/``menu`` section, or any menu entry
        lacks an ``nm`` field.
    """
    import json  # local import: keeps this cell self-contained

    if isinstance(nested_json, dict):
        parsed = nested_json
    else:
        try:
            # json.loads understands true/false/null, which literal_eval rejects.
            parsed = json.loads(nested_json)
        except (TypeError, ValueError):
            try:
                # Fall back to Python-literal syntax (e.g. single-quoted keys).
                parsed = ast.literal_eval(nested_json)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return None

    # A scalar or list payload has no 'gt_parse' section to read.
    if not isinstance(parsed, dict):
        return None

    gt_parse = parsed.get('gt_parse')
    if not isinstance(gt_parse, dict) or 'menu' not in gt_parse:
        return None

    menu = gt_parse['menu']
    # A receipt with a single line item stores the menu as a dict, not a list.
    items = menu if isinstance(menu, list) else [menu]
    try:
        return [item['nm'] for item in items]
    except (KeyError, TypeError, IndexError):
        # One malformed entry invalidates the whole record.
        return None

In [5]:
df['product_name'] = df['ground_truth'].apply(get_prod_names)

In [6]:
df.dropna(inplace=True)

In [7]:
df = df.explode('product_name').reset_index(drop=True)

In [8]:
df.drop(columns=['ground_truth'], inplace=True)

In [9]:
df['type'] = df['product_name'].apply(lambda x: isinstance(x, list))

In [10]:
df = df[df.type==False]

In [11]:
df

Unnamed: 0,split,product_name,type
0,test,-TICKET CP,False
1,test,J.STB PROMO,False
2,test,Y.B.BAT,False
3,test,Y.BASO PROM,False
4,test,JASMINE MT ( L ),False
...,...,...,...
2563,val,PAHA BAWAH,False
2564,val,Choco Cheese,False
2565,val,Lemon Tea (L),False
2566,val,Hulk Topper Package,False


In [13]:
# Export the product names that still need manual labelling, one per line.
# The slice start (1004) skips the unique names covered earlier; see the
# "stopped at 1004 row" note in the notebook.
# `unique()` already de-duplicates and keeps first-seen order, so no set() round
# trip is needed (it would only scramble the order of the exported lines).
product_list = df.product_name.unique().tolist()[1004:]
print(len(product_list))
# `with` guarantees the file is closed even if the write fails; an explicit
# encoding avoids platform-dependent failures on non-ASCII product names.
with open("for_labelling_2.txt", "w", encoding="utf-8") as text_file:
    n = text_file.write('\n'.join(product_list))

716


In [None]:
# stopped at 1004 row

In [None]:
# stopped at 1004 + 224

In [56]:
len(df.product_name.tolist()), len(df.product_name.unique().tolist())

(2564, 1720)

# Spacy NER

In [9]:
import spacy
from spacy import displacy

# Load the large English pipeline.
# NOTE(review): despite the `nlp_sm` name, this loads `en_core_web_lg` (large), not a small model.
nlp_sm = spacy.load('en_core_web_lg')
# Load the transformer-based English pipeline (`en_core_web_trf`).
roberta_nlp = spacy.load('en_core_web_trf')

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
document = roberta_nlp(df.iloc[4]['product_name'])
for entity in document.ents:
    print(entity.text + ':', entity.label_)

JASMINE MT: PERSON


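spaCy's pretrained NER doesn't work on these receipt item names: for example, `JASMINE MT ( L )` is tagged as `PERSON` instead of being recognised as a product.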

# BERT pretrained

## Data preprocessing

In [24]:
import pandas as pd
import json

In [25]:
with open('annotations.json') as json_file:
    data = json.load(json_file)

In [26]:
annotations = data['annotations']

In [27]:
annotations = [i for i in annotations if i is not None]

In [28]:
# Each annotation is indexed as [raw_text, {'entities': [...]}].
# Carriage returns are stripped from the raw text; the entities are taken as-is.
text = [annotation[0].replace('\r', '') for annotation in annotations]
entities = [annotation[1]['entities'] for annotation in annotations]

In [29]:
df = pd.DataFrame({'text': text, 'entities': entities})
df

Unnamed: 0,text,entities
0,J.STB PROMO,"[[0, 5, PROD], [6, 11, ADJ]]"
1,Y.B.BAT,"[[0, 7, PROD]]"
2,Y.BASO PROM,"[[0, 6, PROD], [7, 11, ADJ]]"
3,JASMINE MT ( L ),"[[0, 10, PROD], [11, 16, ADJ]]"
4,DONAT GULA,"[[0, 5, PROD], [6, 10, ADJ]]"
...,...,...
841,CHOCOLATE SUNDAE,"[[0, 16, PROD]]"
842,REGULAR FRIES,"[[0, 7, ADJ], [8, 13, PROD]]"
843,KUE CUBIT OVO/ SKIPPY,"[[0, 9, PROD]]"
844,ES BUAH,"[[0, 2, ADJ], [3, 7, PROD]]"


In [30]:
df = df.groupby('text')['entities'].first().reset_index()

In [31]:
def split_text_to_tags(row):
    """Turn a row's entity spans into a gap-free list of tagged spans.

    Parameters
    ----------
    row : mapping
        Must provide ``row['text']`` (the raw string) and ``row['entities']``
        (a list of ``[start, end, label]`` spans).

    Returns
    -------
    list
        The entity spans ordered by start offset, with an ``[start, end, 'O']``
        span inserted for every stretch of text not covered by an entity
        (before the first entity, between entities, and after the last one).
    """
    text = row['text']
    # Annotation tools do not guarantee ordering; walking unsorted spans would
    # emit reversed/duplicated 'O' gaps.
    spans = sorted(row['entities'], key=lambda span: span[0])

    all_tags = []
    cursor = 0  # end offset of the text covered so far
    for span in spans:
        start, end = span[0], span[1]
        # Only emit a gap when there really is uncovered text before this span
        # (an overlapping span starts before the cursor).
        if cursor < start:
            all_tags.append([cursor, start, 'O'])
        all_tags.append(span)
        # max() keeps the cursor from moving backwards on nested spans.
        cursor = max(cursor, end)
    if cursor != len(text):
        all_tags.append([cursor, len(text), 'O'])
    return all_tags

In [32]:
df['tags'] = df.apply(split_text_to_tags, axis=1)

In [33]:
df = df.explode('tags').reset_index().rename(columns={'index': 'text_index'})
df.dropna(inplace=True)
df['entity'] = df.apply(lambda row: row['text'][row['tags'][0]: row['tags'][1]], axis=1)
df = df[df.entity!=' '].reset_index(drop=True)

In [34]:
df['label'] = df['tags'].apply(lambda x: x[2])

In [35]:
import re
df['entity'] = df['entity'].apply(lambda x: re.split(r"([^a-zA-Z0-9])", x))

In [36]:
df = df.explode('entity').reset_index()
df = df[~df.entity.isin(['', ' '])].reset_index(drop=True)

In [37]:
df['pos'] = df.groupby(['index']).cumcount()+1

In [38]:
df['pos'] = df.apply(lambda row: '' if row['label']=='O' else 'B-' if row['pos']==1 else 'I-', axis=1)

In [39]:
df['pos_tag'] = df['pos'] + df['label']

In [40]:
df = df[['text_index', 'text', 'entity', 'pos_tag']]

In [41]:
data = df.groupby('text_index')['pos_tag'].apply(list).reset_index()

In [42]:
new_df = df.groupby('text_index')['text'].first().reset_index()

In [43]:
data = pd.merge(data, new_df, on='text_index')

In [44]:
data

Unnamed: 0,text_index,pos_tag,text
0,0,"[O, O, B-PROD, B-PROD, O]",#F11 CREAM HAMBURG 0
1,1,"[O, B-PROD]",#PKTPOLSBTSPON2S
2,2,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) BIHUN GORENG SEAFOOD
3,3,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) KWETIAW SEAFOOD SIRAM
4,4,"[O, O, O, B-PROD, I-PROD]",(TA) NASI GORENG
...,...,...,...
721,721,"[B-ADJ, B-PROD]",green tea
722,722,"[B-PROD, I-PROD]",phad thai
723,723,"[B-ADJ, B-PROD, I-PROD]",red curry beef
724,724,"[B-PROD, I-PROD]",steamed rice


In [45]:
# data.to_csv('labelled_data.csv', index=False)

## Initialize Tokenizer

In [17]:
import pandas as pd
import torch 
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

# NOTE(review): this loads the 'distilbert-base-uncased' files through BertTokenizerFast,
# which triggers the tokenizer-class mismatch warning recorded below. The tokenizer must
# match the checkpoint the model weights were trained with ('model_multilingual.pt' is
# loaded later in this notebook) -- TODO confirm and load the same checkpoint name here.
tokenizer = BertTokenizerFast.from_pretrained('distilbert-base-uncased')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizerFast'.


In [18]:
label_all_tokens = False

def align_label(texts, labels):
    """Build the per-token label-id list for one text.

    The text is tokenized with the module-level ``tokenizer`` (padded/truncated
    to 512 tokens) and every token receives either a label id or ``-100``:

    * tokens without a word id (special/padding tokens) get ``-100``;
    * the first token of each word gets ``labels_to_ids[labels[word_idx]]``;
    * the remaining tokens of a word get the same id when the module-level
      ``label_all_tokens`` flag is true, otherwise ``-100``;
    * if the label lookup fails (e.g. fewer labels than words, or a label that
      is not in ``labels_to_ids``) the token gets ``-100``.

    Parameters
    ----------
    texts : str
        The raw text of a single sample.
    labels : sequence
        One label per word of the text.

    Returns
    -------
    list of int
        One entry per token.
    """
    # str(): DataSequence tokenizes str(text); use the same input here so both
    # tokenizations agree for non-string cells.
    tokenized_inputs = tokenizer(str(texts), padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        starts_new_word = word_idx != previous_word_idx

        if word_idx is None or not (starts_new_word or label_all_tokens):
            label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except Exception:
                # Best effort: a missing/unknown label is ignored instead of
                # aborting dataset construction. `Exception` (not a bare
                # except) so KeyboardInterrupt still stops the cell.
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):
    """Dataset pairing each tokenized text with its aligned label ids.

    Built from a frame that provides a ``labels`` and a ``text`` column. Every
    text is tokenized up front with the module-level ``tokenizer`` (padded and
    truncated to 512 tokens, returned as PyTorch tensors) and its labels are
    aligned through ``align_label``.
    """

    def __init__(self, df):
        raw_labels = df['labels'].values.tolist()
        raw_texts = df['text'].values.tolist()

        encodings = []
        for sample in raw_texts:
            encodings.append(
                tokenizer(str(sample),
                          padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
            )
        self.texts = encodings
        self.labels = [align_label(sample, tags) for sample, tags in zip(raw_texts, raw_labels)]

    def __len__(self):
        """Number of samples (one label list per sample)."""
        return len(self.labels)

    def get_batch_data(self, idx):
        """Return the stored tokenizer output for sample ``idx``."""
        return self.texts[idx]

    def get_batch_labels(self, idx):
        """Return the label ids of sample ``idx`` as a ``LongTensor``."""
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        """Return the ``(tokenizer output, label tensor)`` pair for ``idx``."""
        return self.get_batch_data(idx), self.get_batch_labels(idx)

## Split data and Define Unique Labels

In [19]:
data = pd.read_csv('labelled_data.csv')

In [20]:
import ast

In [21]:
data['pos_tag'] = data['pos_tag'].apply(ast.literal_eval)

In [22]:
data

Unnamed: 0,text_index,pos_tag,text
0,0,"[O, O, B-PROD, B-PROD, O]",#F11 CREAM HAMBURG 0
1,1,"[O, B-PROD]",#PKTPOLSBTSPON2S
2,2,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) BIHUN GORENG SEAFOOD
3,3,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) KWETIAW SEAFOOD SIRAM
4,4,"[O, O, O, B-PROD, I-PROD]",(TA) NASI GORENG
...,...,...,...
721,721,"[B-ADJ, B-PROD]",green tea
722,722,"[B-PROD, I-PROD]",phad thai
723,723,"[B-ADJ, B-PROD, I-PROD]",red curry beef
724,724,"[B-PROD, I-PROD]",steamed rice


In [23]:
# Collect every distinct tag that occurs in the labelled data.
labels = data['pos_tag'].tolist()
unique_labels = set()

for lb in labels:
    unique_labels.update(lb)

# Enumerate the labels in sorted order: iterating a set of strings directly
# yields a different order in every Python process, so the label <-> id mapping
# would not match the one a saved checkpoint was trained with.
# NOTE(review): a checkpoint trained under a different mapping must be retrained
# or shipped together with its own mapping.
sorted_labels = sorted(unique_labels)
labels_to_ids = {label: idx for idx, label in enumerate(sorted_labels)}
ids_to_labels = {idx: label for idx, label in enumerate(sorted_labels)}

In [24]:
data.rename(columns={'pos_tag': 'labels'}, inplace=True)

In [25]:
# Shuffle once with a fixed seed, then cut into 80% train / 10% val / 10% test.
# Positional slicing replaces np.split, whose use on a DataFrame is deprecated.
shuffled = data.sample(frac=1, random_state=42)
train_end, val_end = int(.8 * len(data)), int(.9 * len(data))
df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

In [26]:
df_test

Unnamed: 0,text_index,labels,text
401,401,"[O, B-ADJ, O, O, B-PROD]",Mie Jumbo Pst/bakso
476,476,"[B-ADJ, B-PROD]",PKT AYAM
105,105,"[B-PROD, I-PROD]",BUBUR GO
565,565,"[O, O, B-BRAND, B-PROD]",S-Ovaltine Macchiat
389,389,"[B-PROD, I-PROD]",MIKA SEDANG
...,...,...,...
71,71,"[B-PROD, B-PROD, I-PROD, I-PROD, B-PROD]",Ayam goreng+Sayur asem
106,106,"[B-PROD, B-ADJ]",BUBUR UNGU
270,270,"[B-PROD, B-PROD]",GULAI HATI
435,435,"[B-PROD, I-PROD]",Nasi Liwet


## Build Model

In [27]:
class BertModel(torch.nn.Module):
    """Thin wrapper around ``BertForTokenClassification``.

    Parameters
    ----------
    pretrained_name : str, optional
        Checkpoint to initialise from. Defaults to
        ``'bert-base-multilingual-cased'``: the weights loaded later in this
        notebook come from ``'model_multilingual.pt'``, and a DistilBERT
        checkpoint cannot be loaded into the BERT architecture.
    num_labels : int, optional
        Size of the classification head. Defaults to the number of entries in
        the module-level ``unique_labels``.
    """

    def __init__(self, pretrained_name='bert-base-multilingual-cased', num_labels=None):

        super().__init__()

        if num_labels is None:
            # Resolved at call time so the label set can be defined after the class.
            num_labels = len(unique_labels)

        self.bert = BertForTokenClassification.from_pretrained(pretrained_name, num_labels=num_labels)

    def forward(self, input_id, mask, label=None):
        """Run the wrapped model and return its raw tuple output.

        ``return_dict=False`` makes the model return a tuple; callers in this
        notebook unpack ``(loss, logits)`` when ``label`` is given and read the
        logits from index 0 when ``label`` is ``None``.
        """
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

### Metrics

In [31]:
from seqeval.metrics import f1_score

In [38]:
y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]

In [39]:
f1_score(y_true, y_pred)

0.5

In [41]:
f1_score([['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O']], [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O']])

0.0

In [42]:
f1_score([['B-PER', 'I-PER', 'O']], [['B-PER', 'I-PER', 'O']])

1.0

In [29]:
import numpy as np
from datasets import load_metric
metric = load_metric("seqeval")

def compute_metrics(predictions, labels, id2label=None):
    """Compute seqeval precision/recall/F1/accuracy from raw logits.

    Parameters
    ----------
    predictions : array-like
        Logits; the arg-max is taken over axis 2
        (assumes ``(batch, seq, num_labels)`` -- TODO confirm against caller).
    labels : array-like
        Gold label ids per token; positions equal to ``-100`` are ignored.
    id2label : mapping or sequence, optional
        Lookup from label id to label string. Defaults to the module-level
        ``ids_to_labels`` (the previously referenced ``label_names`` is not
        defined anywhere in this notebook).

    Returns
    -------
    dict
        ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from the
        module-level seqeval ``metric``.
    """
    if id2label is None:
        id2label = ids_to_labels

    # Select the predicted index with the maximum logit for each token.
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Drop ignored positions (special tokens) from both sequences at once.
        kept = [(int(p), int(l)) for p, l in zip(prediction, label) if l != -100]
        true_predictions.append([id2label[p] for p, _ in kept])
        true_labels.append([id2label[l] for _, l in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Model

In [34]:
LEARNING_RATE = 5e-3
EPOCHS = 5
BATCH_SIZE = 2

model = BertModel()
# map_location='cpu': weights saved from a GPU session would otherwise fail to
# load on a CPU-only machine; the evaluation helpers move the model to CUDA
# themselves when it is available.
model.load_state_dict(torch.load('model_multilingual.pt', map_location='cpu'))
model.eval()
# train_loop(model, df_train, df_val)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

BertModel(
  (bert): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tr

In [35]:
def evaluate(model, df_test):
    """Print the mean per-sequence token accuracy of ``model`` on ``df_test``.

    Each row of ``df_test`` is turned into one sample through ``DataSequence``;
    for every sample the accuracy is computed over the positions whose label is
    not ``-100`` and the per-sample accuracies are averaged.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask, labels)`` and expected to
        return ``(loss, logits)``.
    df_test : pandas.DataFrame
        Frame accepted by ``DataSequence``.
    """
    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0
    scored = 0  # number of sequences that had at least one real label

    # Evaluate in eval mode (no dropout) and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        # no_grad: inference only, so skip building the autograd graph.
        with torch.no_grad():
            for test_data, test_label in test_dataloader:

                test_label = test_label.to(device)
                # The stored encodings carry an extra leading dimension of 1
                # (return_tensors="pt" on a single text); squeeze it away.
                mask = test_data['attention_mask'].squeeze(1).to(device)
                input_id = test_data['input_ids'].squeeze(1).to(device)

                loss, logits = model(input_id, mask, test_label)

                for i in range(logits.shape[0]):
                    keep = test_label[i] != -100
                    if not keep.any():
                        # No scorable token: mean() of an empty tensor is NaN
                        # and would poison the running total.
                        continue

                    predictions = logits[i][keep].argmax(dim=1)
                    acc = (predictions == test_label[i][keep]).float().mean()
                    total_acc_test += acc.item()
                    scored += 1
    finally:
        model.train(was_training)

    accuracy = total_acc_test / scored if scored else float('nan')
    print(f'Test Accuracy: {accuracy: .3f}')

In [36]:
def align_word_ids(texts):
    """Build the token mask used to pick word-level predictions for one text.

    The text is tokenized with the module-level ``tokenizer`` (padded/truncated
    to 512 tokens). Tokens without a word id (special/padding tokens) get
    ``-100``; the first token of each word gets ``1``; the remaining tokens of
    a word get ``1`` only when the module-level ``label_all_tokens`` flag is
    true, otherwise ``-100``.

    Returns
    -------
    list of int
        One entry (``1`` or ``-100``) per token.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            label_ids.append(1)
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):
    """Predict and print one tag per word of ``sentence``.

    The sentence is tokenized with the module-level ``tokenizer``, run through
    ``model`` without labels, and the arg-max label of every position kept by
    ``align_word_ids`` is mapped back to its name through ``ids_to_labels``.
    The sentence and the predicted labels are printed; nothing is returned.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    # Predict in eval mode (no dropout) without tracking gradients, then
    # restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_id, mask, None)
    finally:
        model.train(was_training)

    # Without labels the model output's first element holds the logits.
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)

In [37]:
evaluate_one_text(model, '(TA) BIHUN GORENG SEAFOOD')

(TA) BIHUN GORENG SEAFOOD
['B-ADJ', 'B-ADJ', 'B-ADJ', 'I-PROD', 'B-QTY', 'B-QTY']


## DistilBert

In [14]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)lve/main/config.json: 100%|████████████████████████████████████████████████████| 483/483 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)okenizer_config.json: 100%|██████████████████████████████████████████████████| 28.0/28.0 [00:00<?, ?B/s]
Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████| 232k/232k [00:00<00:00, 502kB/s]
Downloading (…)/main/tokenizer.json: 100%|███████████████████████████████████████████| 466k/466k [00:00<00:00, 660kB/s]


In [15]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AdamW

In [16]:
#check if gpu is present
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [None]:
# Size the classification head from the label set built earlier in this
# notebook; the previously referenced `label_names` is not defined anywhere.
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(unique_labels))
model.to(device)

# Step 2

Using the predicted tags, keep only the tokens tagged as product (`B-PROD` / `I-PROD`) and drop everything else.

In [38]:
import re
data['text_tokenized'] = data['text'].apply(lambda x: re.split(r"([^a-zA-Z0-9])", x))
data['text_tokenized'] = data['text_tokenized'].apply(lambda x: [i for i in x if i not in ['',' ']])

In [39]:
data['match'] = data.apply(lambda row: len(row['labels']) == len(row['text_tokenized']), axis=1)

In [40]:
data = data[data.match==True].reset_index(drop=True)

In [41]:
data

Unnamed: 0,text_index,labels,text,text_tokenized,match
0,0,"[O, O, B-PROD, B-PROD, O]",#F11 CREAM HAMBURG 0,"[#, F11, CREAM, HAMBURG, 0]",True
1,1,"[O, B-PROD]",#PKTPOLSBTSPON2S,"[#, PKTPOLSBTSPON2S]",True
2,2,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) BIHUN GORENG SEAFOOD,"[(, TA, ), BIHUN, GORENG, SEAFOOD]",True
3,3,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) KWETIAW SEAFOOD SIRAM,"[(, TA, ), KWETIAW, SEAFOOD, SIRAM]",True
4,4,"[O, O, O, B-PROD, I-PROD]",(TA) NASI GORENG,"[(, TA, ), NASI, GORENG]",True
...,...,...,...,...,...
720,721,"[B-ADJ, B-PROD]",green tea,"[green, tea]",True
721,722,"[B-PROD, I-PROD]",phad thai,"[phad, thai]",True
722,723,"[B-ADJ, B-PROD, I-PROD]",red curry beef,"[red, curry, beef]",True
723,724,"[B-PROD, I-PROD]",steamed rice,"[steamed, rice]",True


In [42]:
def only_prod_names(row):
    """Return the tokens of ``row`` whose label contains ``'PROD'``.

    ``row['labels']`` and ``row['text_tokenized']`` are read position by
    position; the token at each position whose label contains the substring
    ``'PROD'`` is kept, in order.
    """
    tokens = row['text_tokenized']
    return [tokens[position] for position, tag in enumerate(row['labels']) if 'PROD' in tag]

In [43]:
data['product_tags'] = data.apply(only_prod_names, axis=1)

In [44]:
data

Unnamed: 0,text_index,labels,text,text_tokenized,match,product_tags
0,0,"[O, O, B-PROD, B-PROD, O]",#F11 CREAM HAMBURG 0,"[#, F11, CREAM, HAMBURG, 0]",True,"[CREAM, HAMBURG]"
1,1,"[O, B-PROD]",#PKTPOLSBTSPON2S,"[#, PKTPOLSBTSPON2S]",True,[PKTPOLSBTSPON2S]
2,2,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) BIHUN GORENG SEAFOOD,"[(, TA, ), BIHUN, GORENG, SEAFOOD]",True,"[BIHUN, GORENG, SEAFOOD]"
3,3,"[O, O, O, B-PROD, I-PROD, I-PROD]",(TA) KWETIAW SEAFOOD SIRAM,"[(, TA, ), KWETIAW, SEAFOOD, SIRAM]",True,"[KWETIAW, SEAFOOD, SIRAM]"
4,4,"[O, O, O, B-PROD, I-PROD]",(TA) NASI GORENG,"[(, TA, ), NASI, GORENG]",True,"[NASI, GORENG]"
...,...,...,...,...,...,...
720,721,"[B-ADJ, B-PROD]",green tea,"[green, tea]",True,[tea]
721,722,"[B-PROD, I-PROD]",phad thai,"[phad, thai]",True,"[phad, thai]"
722,723,"[B-ADJ, B-PROD, I-PROD]",red curry beef,"[red, curry, beef]",True,"[curry, beef]"
723,724,"[B-PROD, I-PROD]",steamed rice,"[steamed, rice]",True,"[steamed, rice]"


In [45]:
data['product_name'] = data['product_tags'].apply(lambda x: ' '.join(x))

In [46]:
data[data.product_name=='']

Unnamed: 0,text_index,labels,text,text_tokenized,match,product_tags,product_name
20,20,"[O, O, O, B-ADJ, I-ADJ]",3002-Kyoto Choco Mochi,"[3002, -, Kyoto, Choco, Mochi]",True,[],
41,41,"[B-BRAND, B-ADJ, I-ADJ]",AMBUSH DBL CHS,"[AMBUSH, DBL, CHS]",True,[],
163,163,"[B-BRAND, I-BRAND]",CIRENG PANDAWA,"[CIRENG, PANDAWA]",True,[],
164,164,"[O, O, B-ADJ, O]",CK.MANTAP A,"[CK, ., MANTAP, A]",True,[],
223,223,"[B-BRAND, O]",Dendeng PDS,"[Dendeng, PDS]",True,[],
245,245,"[B-BRAND, O, O]",FIXALL HK 26521,"[FIXALL, HK, 26521]",True,[],
271,271,"[B-ADJ, I-ADJ]",GULAI OTAK,"[GULAI, OTAK]",True,[],
288,288,[B-ADJ],Hokkaido,[Hokkaido],True,[],
300,301,"[B-ADJ, B-ADJ]",ICED White,"[ICED, White]",True,[],
345,346,"[B-BRAND, I-BRAND, O]",KP BRANDING L,"[KP, BRANDING, L]",True,[],


In [47]:
data = data[data.product_name!=''].reset_index(drop=True)
data = data.groupby('product_name')[['labels', 'text']].first().reset_index()
data.shape

(613, 3)

## Convert to embedding

### TF-IDF

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
# Fit a word-level TF-IDF model on the cleaned product names and expose the
# weights as a frame with one row per product name and one column per term.
vectorizer = TfidfVectorizer(analyzer='word')
vectors = vectorizer.fit_transform(data.product_name.tolist())
feature_names = vectorizer.get_feature_names_out()
# toarray() gives a plain ndarray directly, avoiding the np.matrix -> nested
# Python list round trip.
df = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [50]:
df['product_name'] = data.product_name.tolist()

In [51]:
df.product_name.nunique()

613

### Cosine Similarity

In [52]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy

In [53]:
sparse_test = scipy.sparse.csr_matrix(df[feature_names.tolist()].values)

In [54]:
cosine_sim = pd.DataFrame(cosine_similarity(sparse_test))
cosine_sim.index = data.text.tolist()
cosine_sim.columns = data.text.tolist()

In [55]:
cosine_sim

Unnamed: 0,Large *Plastik Kcl,500 days of summer (P),A.MINERAL GELAS,A.MINERAL BOTOL,ACCESORIES(SUN BABES PENUTUP MATA),AIR MINERAL,ALMOND CHOCO CREAMCHEESE,ALMOND CHOCO CREAM CHEESE,ALMOND CROSSIANT,AMANDEL BROOD,...,Mie Jumbo Pst/bakso,garlic pepper beef,cashew nuts chkn,red curry beef,Fre ice grentea,phad thai,steamed rice,sun kissed (P),green tea,chapsal twister donnut
Large *Plastik Kcl,1.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
500 days of summer (P),0.0,1.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A.MINERAL GELAS,0.0,0.0,1.000000,0.685718,0.0,0.631184,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A.MINERAL BOTOL,0.0,0.0,0.685718,1.000000,0.0,0.432815,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACCESORIES(SUN BABES PENUTUP MATA),0.0,0.0,0.000000,0.000000,1.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
phad thai,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
steamed rice,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
sun kissed (P),0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
green tea,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [110]:
cosine_sim.sort_values(by='Ice Lemon Tea', ascending=False).head(50)

Unnamed: 0,Large *Plastik Kcl,500 days of summer (P),A.MINERAL GELAS,A.MINERAL BOTOL,ACCESORIES(SUN BABES PENUTUP MATA),AIR MINERAL,ALMOND CHOCO CREAMCHEESE,ALMOND CHOCO CREAM CHEESE,ALMOND CROSSIANT,AMANDEL BROOD,...,Mie Jumbo Pst/bakso,garlic pepper beef,cashew nuts chkn,red curry beef,Fre ice grentea,phad thai,steamed rice,sun kissed (P),green tea,chapsal twister donnut
ICED LEMON TEA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587059,0.0
Ice Lemon Tea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587059,0.0
CHINESE TEA KWAN'IM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
green tea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Hot Tea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
S-Fresh Lemon Lime,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EARL GREY MILK TEA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642189,0.0
DumDum Thai Iced Green Tea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.601543,0.0
Lychee Ice Tea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.601543,0.0
ES CHOCO GREEN TEA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.601543,0.0


### Levenshtein Distance

In [56]:
import Levenshtein as lev

In [63]:
texts = data.text.tolist()
product_names = data.product_name.tolist()

In [66]:
# Pairwise Levenshtein distance: each text maps to its distances to every text,
# in the order of `texts`.
text_mat = {
    source: [lev.distance(source, target) for target in texts]
    for source in texts
}

In [72]:
text_lev = pd.DataFrame(text_mat)
text_lev.index = texts

In [75]:
text_lev.sort_values(by='A.MINERAL GELAS').head(20)

Unnamed: 0,Large *Plastik Kcl,500 days of summer (P),A.MINERAL GELAS,A.MINERAL BOTOL,ACCESORIES(SUN BABES PENUTUP MATA),AIR MINERAL,ALMOND CHOCO CREAMCHEESE,ALMOND CHOCO CREAM CHEESE,ALMOND CROSSIANT,AMANDEL BROOD,...,Mie Jumbo Pst/bakso,garlic pepper beef,cashew nuts chkn,red curry beef,Fre ice grentea,phad thai,steamed rice,sun kissed (P),green tea,chapsal twister donnut
A.MINERAL GELAS,18,21,0,5,27,9,18,18,12,10,...,18,18,16,14,15,14,14,15,14,21
A.MINERAL BOTOL,18,21,5,0,28,9,19,19,13,8,...,18,18,16,14,15,14,14,15,14,21
* MINERAL WATER,18,20,7,6,28,9,19,19,13,11,...,18,18,16,14,15,14,14,15,14,20
DONAT GULA,17,21,9,12,28,10,20,21,13,11,...,18,17,15,13,14,9,11,14,9,21
JASMINE MT ( L ),17,18,9,10,26,12,20,20,13,12,...,18,17,15,15,15,15,15,13,15,20
AIR MINERAL,17,21,9,9,29,0,20,21,13,11,...,18,17,15,13,14,11,12,13,11,21
ICED TEA,17,21,10,11,29,8,20,21,13,11,...,18,17,15,13,14,8,11,13,8,21
CUMI BANGKA,16,21,10,12,27,9,21,22,13,12,...,18,17,15,14,14,10,12,14,11,21
MIKA SEDANG,17,21,10,12,29,9,21,22,11,12,...,17,17,15,14,14,10,12,14,11,21
MIKA KECIL,17,21,10,10,30,8,20,21,13,11,...,17,17,15,14,14,9,12,14,10,21


### Jaro Winkler

In [76]:
import jaro

In [79]:
texts = data.text.tolist()
product_names = data.product_name.tolist()

In [80]:
# Pairwise Jaro-Winkler metric: each text maps to its scores against every
# text, in the order of `texts`.
text_mat = {
    source: [jaro.jaro_winkler_metric(source, target) for target in texts]
    for source in texts
}

In [81]:
text_lev = pd.DataFrame(text_mat)
text_lev.index = texts

In [87]:
text_lev.sort_values(by='green tea', ascending=False).head(20)

Unnamed: 0,Large *Plastik Kcl,500 days of summer (P),A.MINERAL GELAS,A.MINERAL BOTOL,ACCESORIES(SUN BABES PENUTUP MATA),AIR MINERAL,ALMOND CHOCO CREAMCHEESE,ALMOND CHOCO CREAM CHEESE,ALMOND CROSSIANT,AMANDEL BROOD,...,Mie Jumbo Pst/bakso,garlic pepper beef,cashew nuts chkn,red curry beef,Fre ice grentea,phad thai,steamed rice,sun kissed (P),green tea,chapsal twister donnut
green tea,0.555556,0.378788,0.392593,0.392593,0.380174,0.400673,0.384259,0.383704,0.391204,0.396011,...,0.551657,0.626984,0.489352,0.57672,0.65291,0.555556,0.611111,0.404762,1.0,0.458754
Free Ice Tea,0.498148,0.421717,0.266667,0.372222,0.400327,0.282828,0.416667,0.415556,0.479167,0.386752,...,0.51462,0.578704,0.444444,0.646825,0.836667,0.462963,0.611111,0.376984,0.722222,0.414646
Sweet Tea,0.472222,0.378788,0.392593,0.451852,0.427015,0.400673,0.384259,0.383704,0.391204,0.396011,...,0.468324,0.477778,0.556019,0.515873,0.562963,0.444444,0.611111,0.28836,0.722222,0.460943
Pepenero Pastel,0.534259,0.386869,0.377778,0.377778,0.397386,0.385859,0.369444,0.368889,0.376389,0.381197,...,0.516458,0.466667,0.415278,0.496825,0.563889,0.511111,0.516667,0.430159,0.682407,0.521549
Ice Lemon Tea,0.420798,0.466977,0.365812,0.441453,0.475113,0.445221,0.340812,0.339145,0.361645,0.269231,...,0.592443,0.542735,0.565705,0.518926,0.591239,0.458689,0.533761,0.447802,0.653846,0.476024
Fre ice grentea,0.534259,0.453535,0.377778,0.377778,0.397386,0.385859,0.405556,0.404444,0.376389,0.381197,...,0.516374,0.660269,0.425,0.659921,1.0,0.451852,0.588095,0.434127,0.65291,0.521549
Organic Green Sa,0.523148,0.379924,0.4625,0.376389,0.372549,0.38447,0.472222,0.47,0.458333,0.379808,...,0.458553,0.660185,0.482143,0.550595,0.716035,0.395833,0.530754,0.489881,0.643188,0.546086
Lychee Ice Tea,0.531746,0.394805,0.25873,0.360317,0.384454,0.274892,0.446429,0.444762,0.511905,0.432234,...,0.498747,0.534392,0.545635,0.507937,0.576587,0.515873,0.59127,0.365079,0.637566,0.511544
Arem Arem,0.5,0.378788,0.4,0.4,0.473856,0.535354,0.486111,0.484444,0.449074,0.410256,...,0.385965,0.477778,0.449074,0.637566,0.562963,0.407407,0.590741,0.28836,0.62963,0.378788
Mineral Water,0.547212,0.41317,0.42906,0.42906,0.368778,0.278555,0.412393,0.411282,0.426282,0.435897,...,0.536887,0.56161,0.499038,0.447253,0.591239,0.500712,0.533761,0.364469,0.629223,0.493007
