## Tokenizers and Pre-trained BERT

In [45]:
import os
import pandas as pd
import numpy as np

import torch

from tokenizers import Tokenizer, BertWordPieceTokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Directory containing the raw CSV files read by the cells below,
# given relative to the notebook's working directory.
raw_path = "../data/raw"

In [3]:
# Load the training CSV, keeping only the two title/image pairs and the Label column.
train_csv = os.path.join(raw_path, "new_training_set.csv")
train_columns = ["title_1", "image_1", "title_2", "image_2", "Label"]
d_train = pd.read_csv(train_csv, usecols=train_columns)

In [4]:
# Drop every row that has a missing value in any of the loaded columns.
# This mutates d_train in place, so the unfiltered frame can only be
# recovered by re-running the read_csv cell.
d_train.dropna(inplace=True)

In [5]:
# (rows, columns) of the frame after the incomplete rows were dropped.
d_train.shape

(10179, 5)

In [6]:
# Sanity check: number of missing values left in each column.
d_train.isna().sum()

title_1    0
image_1    0
title_2    0
image_2    0
Label      0
dtype: int64

In [11]:
# Lower-case both title columns of the training frame.
for col in ("title_1", "title_2"):
    d_train[col] = d_train[col].str.lower()

In [12]:
# All titles as one flat list: every title_1 first, followed by every title_2.
title_list = pd.concat([d_train.title_1, d_train.title_2]).to_list()

In [15]:
# Dump every title, one per line, into the text file the WordPiece trainer reads.
# The file is only written when it does not exist yet, so delete it to
# regenerate the corpus after the titles change.
corpus_path = "../data/bert_tokenizer/raw.txt"
if not os.path.exists(corpus_path):
    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(corpus_path), exist_ok=True)
    # Write UTF-8 explicitly: the titles contain non-ASCII characters
    # (e.g. ’ and ®) that the platform default encoding may not represent.
    with open(corpus_path, 'w', encoding="utf-8") as f:
        for title in title_list:
            # Skip anything that is not a string (e.g. a stray NaN).
            if isinstance(title, str):
                f.write(title)
                f.write('\n')

In [16]:
# Train a WordPiece vocabulary on the dumped titles (constructor and train()
# are called without extra options) and save it into the tokenizer directory;
# save_model returns the list of files it wrote.
corpus_file = '../data/bert_tokenizer/raw.txt'
tokenizer_dir = '../data/bert_tokenizer'

tokenizer = BertWordPieceTokenizer()
tokenizer.train(corpus_file)
tokenizer.save_model(tokenizer_dir)

['../data/bert_tokenizer/vocab.txt']

In [17]:
# Preview the first rows; the title columns are already lower-cased here.
d_train.head()

Unnamed: 0,title_1,image_1,title_2,image_2,Label
0,johnson’s ® top to toe hair & body bath 500ml,fdff8b9b8229da091dd7d070aae05f81.jpg,johnson's cottontouch top to toe hair & body b...,41e191742760932598c7bd201e5dad47.jpg,0
1,sandal humble,906cc44f0be72d4e767669b5b63e3a17.jpg,sandal humble glass - glanzton,7a556b836bfdd08ea592216440524a34.jpg,0
2,promo likuid likuit liquit baby pod liquid sal...,475c26635de18b9f93032400732ff336.jpg,voporizer liquit - likuit - likuid - liquid pr...,ace93bec689f3f1565800c500a8341fa.jpg,0
3,6 pasang / set anting tusuk bentuk lingkaran a...,e630997f6217555d6026547ad1c15f0b.jpg,subei 6 pasang / set anting tusuk boho bohemia...,31abbc176b09f5bd1728cfc3ecbbfb9c.jpg,0
4,rorec natural skin care mask rorec sheet mask ...,a27d11700a7902febd039dc3a96f10f2.jpg,rorec 86 natural skin care shert mask all variant,813ad9dd638c10f1765db9dde20c9e42.jpg,1


In [18]:
# BERT configuration created without arguments, i.e. with the library's defaults.
# NOTE(review): vocab_size is therefore the default value, not the size of the
# vocabulary trained above — confirm every token id produced by the custom
# vocab.txt is smaller than config.vocab_size.
config = BertConfig()

In [19]:
# Echo the configuration so its field values appear in the cell output.
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [20]:
# Re-bind `tokenizer` to a transformers BertTokenizer built from the saved
# vocabulary file; this replaces the BertWordPieceTokenizer instance that
# previously held the same name.
vocab_path = '../data/bert_tokenizer/vocab.txt'
tokenizer = BertTokenizer(vocab_file=vocab_path)

In [21]:
# Build the sequence classifier directly from the config object; no checkpoint
# path or model name is passed.
# NOTE(review): as far as this cell shows, no pretrained weights are loaded, so
# the scores computed further down come from an untrained model — confirm this
# is intended given the notebook title.
model = BertForSequenceClassification(config)

In [22]:
# Plain Python lists of the first and second titles, aligned row by row.
t1, t2 = (d_train[name].tolist() for name in ("title_1", "title_2"))

In [29]:
def predict(t1_batch, t2_batch):
    """Predict a class index for every (title_1, title_2) pair in a batch.

    Uses the module-level ``tokenizer`` and ``model``. The model is switched
    to evaluation mode and run without gradient tracking, so dropout is
    disabled and repeated calls on the same input give the same result.

    Args:
        t1_batch: list of first titles.
        t2_batch: list of second titles, aligned with ``t1_batch``.

    Returns:
        1-D integer tensor with one predicted class index per pair
        (empty when the batch is empty).
    """
    # Nothing to encode; avoid handing empty lists to the tokenizer.
    if len(t1_batch) == 0:
        return torch.empty(0, dtype=torch.long)

    # Encode each pair as one sequence, padded to the longest pair in the batch
    # and truncated to the model's position-embedding limit.
    t_encode = tokenizer(
        t1_batch,
        t2_batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )

    model.eval()
    with torch.no_grad():
        logits = model(**t_encode)[0]

    # argmax over raw logits picks the same class as argmax over softmax(logits).
    y_pred = logits.argmax(dim=1)

    return y_pred

In [38]:
# Run `predict` over all title pairs in consecutive batches and collect the
# predicted class indices in input order.
batch_size = 32
y_pred_batch = []
# Stepping by batch_size never yields an empty trailing batch, whereas
# int(len(t1) / 32) + 1 iterations does whenever len(t1) is a multiple of 32.
for start in range(0, len(t1), batch_size):
    end = start + batch_size
    t1_batch = t1[start:end]
    t2_batch = t2[start:end]

    y_pred = predict(t1_batch, t2_batch)
    y_pred_batch.extend(y_pred.tolist())

In [43]:
# Collected per-batch predictions as a single NumPy array.
y_pred = np.asarray(y_pred_batch)

In [48]:
# Ground-truth labels as a NumPy array; despite the name they come from d_train.
y_test = d_train["Label"].to_numpy()

In [49]:
# Accuracy of the predictions against the labels.
# y_test holds the d_train labels, so this is measured on the training data.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.5124275469103056

In [50]:
# F1 score of the predictions against the same (training) labels.
f1_score(y_true=y_test, y_pred=y_pred)

0.5792284866468843

## Test

In [53]:
# Load the test sample CSV with all of its columns.
# NOTE(review): unlike d_train, no dropna() is applied to this frame in the
# visible cells — confirm the title columns contain no missing values.
test_csv = os.path.join(raw_path, "new_test_sample.csv")
d_test = pd.read_csv(test_csv)

In [55]:
# Lower-case both title columns of the test frame, mirroring the training data.
for col in ("title_1", "title_2"):
    d_test[col] = d_test[col].str.lower()

In [58]:
# Re-bind t1/t2 to the test titles; the training title lists are replaced.
t1, t2 = (d_test[name].to_list() for name in ("title_1", "title_2"))