# setup

In [1]:
HOME = "/data/git/product-category"

In [2]:
# http://localhost:8080/notebooks/git/product-category/notebooks/prep_20210304A1.ipynb
prfx_prp = 'prep_20210304A1'
p_out = f'{HOME}/data/transformer_20210304A1'
!mkdir -p {p_out}

In [3]:
import pandas as pd
import numpy as np
from collections import Counter

# eda 

In [4]:
%%time
# df = pd.read_csv(f'../data/data__{prfx_prp}.csv')
df = pd.read_csv(f'{HOME}/data/data_sample__{prfx_prp}.csv')

print(df.shape)
df.sample(3)

(10000, 9)
CPU times: user 109 ms, sys: 29.1 ms, total: 138 ms
Wall time: 191 ms


Unnamed: 0,category,description,title,brand,feature,asin,domain,txt,is_validation
8181,Appliances|Parts & Accessories|Refrigerator Pa...,,"AQUACREST Refrigerator Water Filter, Compatibl...",AQUA CREST,"NSF 42 certified to reduce chlorine, taste and...",B01G4XRSXI,Appliance,"AQUACREST Refrigerator Water Filter, Compatibl...",0
6293,Appliances|Parts & Accessories|Refrigerator Pa...,This is an O.E.M. Authorized part . This is an...,GE MXRC Refrigerator Water Filter,GE,This is an O.E.M. Authorized part\nThis is an ...,B0002GTTOU,Appliance,GE MXRC Refrigerator Water Filter GE This is a...,0
1721,"Appliances|Refrigerators, Freezers & Ice Maker...",ExtendFresh Temperature Management System Inte...,Kitchen Aid KRFC300ESS KRFC300ESS 20 Cu. Ft. s...,KitchenAid,Total (cu. Ft.): 20 / Fridge: 14.38 / Freezer:...,B00UZK9HUY,Appliance,Kitchen Aid KRFC300ESS KRFC300ESS 20 Cu. Ft. s...,0


In [5]:
MIN_CNT = 50
dmn2cnt = Counter(df.domain.value_counts().to_dict())
i2dmn = sorted(dmn2cnt.keys())
dmn2i = {v:k for k,v in enumerate(i2dmn)}
cat2cnt = Counter((j for i in df.category.apply(lambda x: x.split('|')) for j in i))
i2cat = sorted(k for k,v in cat2cnt.items() if v>50)
cat2i = {v:k for k,v in enumerate(i2cat)}

print("len(i2dmn), len(i2cat)", len(i2dmn), len(i2cat))
print("|".join(i2cat))

len(i2dmn), len(i2cat) 1 39
Accessories|Appliances|Bins|Built-In Dishwashers|Cooktop Parts & Accessories|Cooktops|Dishwasher Parts & Accessories|Dishwashers|Drip Pans|Dryer Parts & Accessories|Dryers|Filters|Freestanding Ranges|Freezer Parts & Accessories|Humidifier Parts & Accessories|Humidity Meters|Ice Makers|Knobs|Laundry Appliances|Motors|Oven Parts & Accessories|Parts & Accessories|Parts &amp; Accessories|Range Hood Parts & Accessories|Range Hoods|Range Parts & Accessories|Ranges|Ranges, Ovens & Cooktops|Refrigerator Parts & Accessories|Refrigerators|Refrigerators, Freezers & Ice Makers|Replacement Parts|Replacement Wicks|Vents|Wall Ovens|Washer Parts & Accessories|Washers|Washers & Dryers|Water Filters


In [6]:
## make ys

ys = np.zeros((len(df), len(i2cat)))

for i,cats in enumerate(df.category):
    idx_pos = [cat2i[cat] for cat in cats if cat in cat2i]
    ys[i,idx_pos] = 1

print("ys.shape", ys.shape)

ys.shape (10000, 39)


# modeling setup

- https://colab.research.google.com/drive/1F_RNcHzTfFuQf-LeKvSlud6x7jXYkG31#scrollTo=goRmGIRI5cfC

In [7]:
from argparse import ArgumentParser
import pytorch_lightning as pl
import transformers as tfm
from transformers.optimization import AdamW

# dataset

In [102]:
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer
import torch

In [107]:
def mk_tensors(txt, tokenizer, max_seq_length):
    tok_res = tokenizer(
        txt, truncation=True, padding='max_length', max_length=max_seq_length
    )
    input_ids = tok_res["input_ids"]
    attention_mask = tok_res["attention_mask"]
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    return input_ids, attention_mask

def mk_ds(txt, tokenizer, max_seq_length, ys):
    input_ids, attention_mask = mk_tensors(txt, tokenizer, max_seq_length)
    return TensorDataset(input_ids, 
                         attention_mask, 
                         torch.tensor(ys, dtype=torch.long)) 

In [159]:
class PCDataModule(pl.LightningDataModule):
    def __init__(self, 
                 model_name_or_path, 
                 max_seq_length, 
                 min_products_for_category,
                 train_batch_size,
                 val_batch_size,
                 data_file_path=None,
                 dataframe=None):
        super().__init__()
        self.data_file_path = data_file_path
        self.dataframe = dataframe
        self.min_products_for_category = min_products_for_category
        self.model_name_or_path = model_name_or_path
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
  
    def setup(self, stage):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)

        if self.dataframe is None:
            self.dataframe = pd.read_csv(self.data_file_path)

        cats = self.dataframe.category.apply(lambda x: x.split('|'))
        cat2cnt = Counter((j for i in cats for j in i))
        i2cat = sorted(k for k,v in cat2cnt.items() if v>self.min_products_for_category)
        cat2i = {v:k for k,v in enumerate(i2cat)}
        self.num_classes = len(i2cat)
        
        ys = np.zeros((len(df), len(i2cat)))
        for i,cats in enumerate(self.dataframe.category):
            idx_pos = [cat2i[cat] for cat in cats if cat in cat2i]
            ys[i,idx_pos] = 1
        
        msk_val = self.dataframe.is_validation==1
        idx_val = np.where(msk_val)[0]
        idx_trn = np.where(~msk_val)[0]
        ys_trn, ys_val = ys[idx_trn], ys[idx_val]
        
        txt = self.dataframe.txt.values
        self.train_dataset = mk_ds(list(txt[idx_trn]), tokenizer, self.max_seq_length, ys_trn)
        self.eval_dataset  = mk_ds(list(txt[idx_val]), tokenizer, self.max_seq_length, ys_val)
    
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.train_batch_size,
        )

    def val_dataloader(self):
        return DataLoader(
            self.eval_dataset,
            batch_size=self.val_batch_size,
        )

In [160]:
parser = ArgumentParser()
parser.add_argument('--model_name_or_path', type=str,
                    default="distilbert-base-cased")
parser.add_argument('--max_seq_length', type=int, default=32)
parser.add_argument('--min_products_for_category', type=int, default=100)
parser.add_argument('--train_batch_size', type=int, default=16)
parser.add_argument('--val_batch_size', type=int, default=8)
parser = pl.Trainer.add_argparse_args(parser)


args = parser.parse_args([
    '--default_root_dir', p_out,
])

data_module = PCDataModule(
    model_name_or_path=args.model_name_or_path,
    data_file_path=f'{HOME}/data/data_sample__{prfx_prp}.csv',
    min_products_for_category=args.min_products_for_category,
    max_seq_length=args.max_seq_length,
    train_batch_size=args.train_batch_size,
    val_batch_size=args.val_batch_size,
)

In [161]:
data_module.setup(0)

In [163]:
data_module.num_classes

30

In [152]:
dl_trn = data_module.train_dataloader()
dl_val = data_module.val_dataloader()

In [153]:
for dat in dl_trn:
    dat
    break

[o.shape for o in dat]

[torch.Size([16, 32]), torch.Size([16, 32]), torch.Size([16, 30])]

In [170]:
dat[0]

tensor([[  101,   160, 14518,  1233, 13764,  4539,  7421,   160, 10424, 16229,
          1604,  1604, 22737,   160, 14518,  1233, 13764,   160, 14518,  1233,
         13764,   118,   160, 10424, 16229,  1604,  1604, 22737,   118,   153,
         25810,   102],
        [  101, 25075,   160,  2064, 22639,  3190, 22737, 10973,  1475,  7389,
          1457, 18122,   152,  7912,   144, 10132,  7916, 25075,  4539,  7421,
           160,  2064, 22639,  3190, 22737, 10973,  1475, 22974,   160,  2064,
         22639,   102],
        [  101,   160, 14518,  1233, 13764,   160, 10424, 22737,  1604, 11964,
          1495, 11662,  4434, 17355, 18041,   160, 14518,  1233, 13764,  1188,
          1110,   170,  9198, 14846,  1162, 20777, 17510,  1880,  4539,   117,
          1109,   102],
        [  101,  5928,  4982, 12120,  2737, 24745,  1200, 12120,  2737,  3982,
          4679,  8226,   120,   153, 15629,  2970,   160,  2137, 25129,  3190,
         20150,  1568,  1495,  5928,  4982,  1188,  1110,  

In [154]:
for dat in dl_val:
    dat
    break

[o.shape for o in dat]

[torch.Size([8, 32]), torch.Size([8, 32]), torch.Size([8, 30])]

# model

In [171]:
import torch.nn as nn
import torch.nn.functional as F


In [248]:
class PCModel(pl.LightningModule):
    def __init__(self, model_name_or_path, num_classes, learning_rate, adam_beta1, adam_beta2, adam_epsilon):
        super().__init__()
        self.save_hyperparameters()
        self.bert = tfm.AutoModel.from_pretrained(model_name_or_path)
        self.num_classes = num_classes
        self.W = nn.Linear(self.bert.config.hidden_size, self.num_classes)

    def forward(self, input_ids, attention_mask):
        h = self.bert(input_ids, attention_mask)['last_hidden_state']
        h_cls = h[:, 0]
        return self.W(h_cls)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, ys = batch    
        logits = self(input_ids, attention_mask)
        loss = F.binary_cross_entropy_with_logits(logits, ys)
        self.log('train_loss', loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, ys = batch    
        loss = F.binary_cross_entropy_with_logits(logits, ys)
        self.log('valid_loss', loss, on_step=True, sync_dist=True)
        return {'val_loss': loss}
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(),
                          self.hparams.learning_rate,
                          betas=(self.hparams.adam_beta1,
                                 self.hparams.adam_beta2),
                          eps=self.hparams.adam_epsilon,)
        return optimizer    
    
    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--learning_rate', type=float, default=5e-5)
        parser.add_argument('--adam_beta1', type=float, default=0.9)
        parser.add_argument('--adam_beta2', type=float, default=0.999)
        parser.add_argument('--adam_epsilon', type=float, default=1e-8)
        return parser

In [249]:
parser = ArgumentParser()
parser.add_argument('--model_name_or_path', type=str,
                    default="distilbert-base-cased")
parser = pl.Trainer.add_argparse_args(parser)
parser = PCModel.add_model_specific_args(parser)

args = parser.parse_args([])

pcmodel = PCModel(
    model_name_or_path=args.model_name_or_path,
    num_classes= data_module.num_classes,
    learning_rate=args.learning_rate,
    adam_beta1=args.adam_beta1,
    adam_beta2=args.adam_beta2,
    adam_epsilon=args.adam_epsilon,
)

In [250]:
pl.seed_everything(1234)


# trainer = pl.Trainer.from_argparse_args(args)
trainer = pl.Trainer.from_argparse_args(args, fast_dev_run=True)
trainer.fit(pcmodel, data_module)


Global seed set to 1234
GPU available: True, used: False
TPU available: None, using: 0 TPU cores
Running in fast_dev_run mode: will run a full train, val and test loop using 1 batch(es).

  | Name | Type            | Params
-----------------------------------------
0 | bert | DistilBertModel | 65.2 M
1 | W    | Linear          | 23.1 K
-----------------------------------------
65.2 M    Trainable params
0         Non-trainable params
65.2 M    Total params
260.856   Total estimated model params size (MB)


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…




RuntimeError: result type Float can't be cast to the desired output type Long