## Import library

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, DataCollatorWithPadding, BertModel
from tqdm import tqdm

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

## Load Data

In [2]:
dataPath = Path("../data/shopify/")
[files for files in dataPath.glob("*")]

[PosixPath('../data/shopify/models'),
 PosixPath('../data/shopify/raw'),
 PosixPath('../data/shopify/shopify_apps.csv')]

In [3]:
apps_df = pd.read_csv(dataPath/"shopify_apps.csv")
apps_df.head(10)

Unnamed: 0,app_id,url,title,description,tagline,category
0,9e4748a9-7eda-4814-83b6-0537d44152b1,https://apps.shopify.com/translate,Panda Language Translate,Panda Language Translation improve your busine...,Translate your store into multiple languages,Store design
1,d1476138-a608-4bb9-8d39-b30f3ca7617d,https://apps.shopify.com/instant-brand-page,Instant Brand Page,Having a brand page gives your shoppers a quic...,A-Z Brand Index Page and Favourites Slider,Store design
2,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,https://apps.shopify.com/powr-faq,FAQ Accordion | Help Center,"Create, organize and display Frequently Asked ...","FAQ page, FAQ accordion menu for product Info ...","Store design,Customer support"
3,0ef0087f-3ae5-4dbc-84e0-193b576d82ed,https://apps.shopify.com/mps-promote-me,Promote Me | Many apps in one,Promote Me app is a bundle of apps that includ...,"Spin Wheel,Currency Converter,Quick ATC Button...","Store design,Sales and conversion optimization"
4,7aac2a1f-ff03-4f38-aeb7-7619403a6f05,https://apps.shopify.com/installify-boost-mobi...,Instalify,We are trusted by hundreds of Shopify and Shop...,Supercharge Your Mobile App Installs,Store design
5,c13bfb7f-8b5a-40c6-a338-dbdec5cfd130,https://apps.shopify.com/easy-product-feed,EASY product feed,There are a lot of product feed apps out there...,Get your products listed on Google Shopping th...,Marketing
6,6a71634f-f94f-498d-8713-d2b01ca90917,https://apps.shopify.com/posbill-connect,PosBill Connect,PosBill Connect - the perfection connection to...,Connect your Webshop to the solutions of PosBill.,"Orders and shipping,Inventory management"
7,da50f0bf-d116-46a0-b0c1-0d90c05d8ffe,https://apps.shopify.com/bulk-fulfill,Bulk Fulfill,Upload to fulfill: \n Bulk Fulfill by Upatra a...,Reduce wasted time - Automatically sync your t...,"Orders and shipping,Productivity"
8,89735c3c-4d25-40f2-8b54-150f59cfb099,https://apps.shopify.com/show-price-in-btc,Show Price in BTC,Simply convert your store's prices into ANY cu...,A better currency converter app. Premium featu...,Store design
9,3c89c108-4858-4b07-893a-460ea5a0d91a,https://apps.shopify.com/product-copy,POKY ‑ Product Importer,Poky \n Allows you to import/copy products fro...,Copy / Import products from any Shop store wit...,"Productivity,Finding and adding products"


## Creating Multilabel targets

In [4]:
apps_df['category'] = apps_df.category.apply(lambda x: x.split(','))
mlb = MultiLabelBinarizer()
targets = mlb.fit_transform(apps_df.category)

## Train Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(apps_df.drop(['category'],axis=1), apps_df.category, test_size=0.2, random_state =21)

In [6]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train = targets[y_train.index]
y_test = targets[y_test.index]

In [7]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3800, 5), (3800, 12), (950, 5), (950, 12))

## Instantiating a Tokenizer

In [8]:
## Specifying which bert model to use
checkpoint = "bert-base-uncased"
## Loading Tokenizer for the respective checkpoint
tokenizer = BertTokenizer.from_pretrained(checkpoint)

## BertDataset Class

In [10]:
class BertDataset:
    def __init__(self, sentence1, target, tokenizer):
        self.sentence1 = sentence1
        self.target = target
        self.tokenizer = tokenizer
        pass
    
    def __len__(self):
        return len(self.sentence1)
    
    def __getitem__(self, idx):
        sentence1 = self.sentence1[idx]
        
        ##inputs = self.tokenizer(sentence1, sentence2, truncation=True)
        inputs = self.tokenizer.encode_plus(
            sentence1,
            None,
            add_special_tokens=True,
            max_length=128, ## Sequence length
            padding='max_length', ## Pad to 128
            truncation=True ## Truncate if bigger
        )
        
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        
        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[idx], dtype=torch.float)
        }

In [12]:
train_dataset = BertDataset(
    sentence1 = X_train.description.astype('str'),
    target=y_train,
    tokenizer=tokenizer
)

test_dataset = BertDataset(
    sentence1 = X_test.description.astype('str'),
    target = y_test,
    tokenizer=tokenizer
)

In [13]:
## check
train_dataset[0]

{'input_ids': tensor([  101,  6412,  6047,  9598, 10463,  2131,  5310,  3295,  8073,  1998,
         10463,  2035,  7597,  2000,  2037,  2334, 12731, 14343, 14767,  1012,
          2036,  8834,  1037,  4530,  7698, 12183,  2000,  2689,  9598,  4289,
          1012,  9958,  2248,  5198,  5198,  2013,  2060,  9530,  3775,  6072,
          2514,  8796,  2000,  3191,  7597,  1999,  2060,  9598,  1010,  2000,
          2012,  7559,  6593,  2122,  2828,  1997,  5198,  2256, 10439,  7126,
          2017,  1012,  8272,  1999,  2309, 11562,  6047,  9598, 10463,  2121,
         10439,  2003,  2200,  4248,  1998,  3733,  2000, 16500,  1012,  2689,
          2028,  2465,  1999,  2115,  3573, 10906,  1998,  2717,  2097,  5047,
          2011, 10439,  8073,  1012,  2057,  3073,  2035,  2828,  1997,  2490,
          1012,  2022, 21823, 12031,  7597, 10439,  2097,  2022,  8352,  2011,
          2048, 26066,  3182,  2061,  2009,  2097,  2507,  3835,  5310,  8278,
          1012,  2838,  1024,  6882,  3

## Creating Data Loaders

In [14]:
train_dataloader = DataLoader(
    train_dataset, shuffle=True, batch_size=8
)

test_dataloader  = DataLoader(
    test_dataset, batch_size=8
)

In [15]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 128]),
 'attention_mask': torch.Size([8, 128]),
 'token_type_ids': torch.Size([8, 128]),
 'targets': torch.Size([8, 12])}

## Model Class

In [16]:
class MultilabeledSequenceModel(nn.Module):
    def __init__(self,
                 pretrained_model_name,
                 label_nbr):
        """
        Just extends the AutoModelForSequenceClassification for N labels
        pretrained_model_name string -> name of the pretrained model to be fetched from HuggingFace repo
        label_nbr int -> number of labels of the dataset
        """
        super(MultilabeledSequenceModel,self).__init__()
        self.transformer = BertModel.from_pretrained(pretrained_model_name)
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(768,label_nbr)
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        x = self.transformer(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids
        )[1]
        return self.classifier(x)

## Loss Function

In [17]:
def loss_fn(logits, targets):
    bceLoss = nn.BCEWithLogitsLoss()
    return bceLoss(logits, targets)

In [18]:
## check
model = MultilabeledSequenceModel(checkpoint, 12)

preds = model(
    input_ids=batch["input_ids"], 
    attention_mask=batch["attention_mask"],  
    token_type_ids=batch["token_type_ids"])

del model
loss_fn(torch.tensor(preds, dtype = float), batch['targets'])

  loss_fn(torch.tensor(preds, dtype = float), batch['targets'])


tensor(0.7686)

## Training and Testing Function

In [19]:
## Training Function
def train_fn(data_loader, model, optimizer, device, scheduler):
    ## Setting model in training mode
    model.to(device)
    model.train()
    
    for idx, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)
        
        ## Making optimizer gradient to zero
        optimizer.zero_grad()
        
        ## Calcualte output
        outputs = model(
            input_ids = input_ids, 
            attention_mask= attention_mask, 
            token_type_ids=token_type_ids
        )
        
        ## Calculate loss
        loss = loss_fn(outputs, targets)
        
        ## Compute gradients
        loss.backward()
        
        ## updating weights
        optimizer.step()
        
        scheduler.step()

In [20]:
def eval_fn(data_loader, model, device):
    model.to(device)
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            
            targets = batch["targets"]
            fin_targets.extend(targets.detach().numpy().tolist())
            
            outputs = model(input_ids = input_ids, attention_mask= attention_mask, token_type_ids=token_type_ids)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            
    return fin_outputs, fin_targets

## Metric Function

In [21]:
def accuracy_thresh(y_pred, y_true, thresh=0.5):
    "Compute accuracy when `y_pred` and `y_true` are the same size."
    return ((np.array(y_pred)>thresh).astype(float)==np.array(y_true)).mean()

In [22]:
## check
model = MultilabeledSequenceModel(checkpoint, 12)
o,t = eval_fn(
    data_loader = test_dataloader, 
    model=model,
    device = torch.device("cuda")
)
del model
accuracy_thresh(y_pred = o, y_true = t, thresh=0.5)

100%|██████████| 119/119 [00:10<00:00, 11.33it/s]


0.4791666666666667

## Run Model

In [23]:
## Initiate model
model = MultilabeledSequenceModel(checkpoint, 12)

In [24]:
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]


num_train_steps = int(len(X_train) / 8 * 5)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

In [25]:
## Train Model
best_accuracy = 0
n_epoch = 3
device = torch.device("cuda")
for epoch in range(3):
    ## Call Training
    train_fn(train_dataloader, model=model, optimizer=optimizer, device=device, scheduler=scheduler)
    
    ## Call Evaluation
    o, t = eval_fn(test_dataloader, model, device)
    
    ## Compute metric
    accuracy = accuracy_thresh(y_pred = o, y_true = t, thresh= 0.5)
    print(f"Epoch : {epoch} Accuracy Score = {accuracy*100}%")

100%|██████████| 475/475 [01:39<00:00,  4.79it/s]
100%|██████████| 119/119 [00:10<00:00, 11.64it/s]
  0%|          | 0/475 [00:00<?, ?it/s]

Epoch : 0 Accuracy Score = 100.0%


100%|██████████| 475/475 [01:39<00:00,  4.78it/s]
100%|██████████| 119/119 [00:10<00:00, 11.62it/s]
  0%|          | 0/475 [00:00<?, ?it/s]

Epoch : 1 Accuracy Score = 100.0%


100%|██████████| 475/475 [01:39<00:00,  4.77it/s]
100%|██████████| 119/119 [00:10<00:00, 11.65it/s]

Epoch : 2 Accuracy Score = 100.0%





In [26]:
o,t = eval_fn(
    data_loader = test_dataloader, 
    model=model,
    device = torch.device("cuda")
)
accuracy_thresh(y_pred = o, y_true = t, thresh=0.5)

100%|██████████| 119/119 [00:10<00:00, 11.62it/s]


1.0

In [34]:
## save model
torch.save(model.state_dict(), dataPath/"models/hf_bert.bin")

In [37]:
## Loading Model
model = MultilabeledSequenceModel(checkpoint, 12)
model.load_state_dict(torch.load(dataPath/"models/hf_bert.bin"))
model.to(torch.device("cuda"))
o,t = eval_fn(
    data_loader = test_dataloader, 
    model=model,
    device = torch.device("cuda")
)
accuracy_thresh(y_pred = o, y_true = t, thresh=0.5)

100%|██████████| 119/119 [00:10<00:00, 11.67it/s]


1.0

In [41]:
mlb.inverse_transform(np.array(o)[0:5] > 0.5), mlb.inverse_transform(np.array(t)[0:5])

([('Finances', 'Reporting'),
  ('Sales and conversion optimization', 'Store design'),
  ('Store design',),
  ('Store design',),
  ('Trust and security',)],
 [('Finances', 'Reporting'),
  ('Sales and conversion optimization', 'Store design'),
  ('Store design',),
  ('Store design',),
  ('Trust and security',)])