In [4]:
# One-time setup: uncomment to install the project's dependencies into the kernel's environment.
# %pip install -r ./bert/requirements.txt

In [3]:
import torch
import pickle
import subprocess
import numpy as np
import pandas as pd
from tqdm import tqdm

from typing import List
from os.path import join
from bert.tokenizer import tokenizer
from transformers import BertTokenizer, BertModel

# Preprocessing data

In [5]:
# Read the article (item) metadata table and preview its first two rows.
articles_csv = "../../data/articles.csv"
articles = pd.read_csv(articles_csv)
articles.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [6]:
# Descriptive text attributes kept alongside the article id.
text_columns = [
    "prod_name",
    "product_type_name",
    "product_group_name",
    "graphical_appearance_name",
    "index_group_name",
    "section_name",
    "colour_group_name",
    "perceived_colour_value_name",
]
articles = articles[["article_id", *text_columns]].copy()

# Join the text attributes of each row into one string, then keep only the
# first occurrence of every word (dict keys preserve insertion order).
joined_text = articles[text_columns].agg(" ".join, axis=1)
articles["combined"] = joined_text.map(
    lambda text: " ".join(dict.fromkeys(text.split()))
)

# Articles that end up with the same combined text are collapsed to the first one.
articles = articles.drop_duplicates("combined")
articles.loc[0, "combined"]

'Strap top Vest Garment Upper body Solid Ladieswear Womens Everyday Basics Black Dark'

# Train new model

In [7]:
class BERT:
    """Thin wrapper around the project's tokenizer builder and MLM training script."""

    def build_tokenizer(self, training_data: List, output_dir: str):
        """Build a tokenizer by delegating to ``bert.tokenizer.tokenizer``.

        Args:
            training_data: Forwarded unchanged as ``training_data``.
            output_dir: Forwarded unchanged as ``output_dir``.
        """
        tokenizer(training_data=training_data, output_dir=output_dir)

    def train(self, mlm_file_path: str, **kwargs):
        """Run the training script in a child process and echo its output live.

        The script is started as ``python3 <mlm_file_path>``. Every keyword
        argument becomes a ``--name value`` pair, in the order given; a value
        whose string form is empty (e.g. ``do_train=""``) is emitted as a bare
        ``--name`` flag.

        The command is passed as an argument list rather than through a shell,
        so paths and values containing spaces or shell metacharacters reach
        the script unchanged.

        Args:
            mlm_file_path: Path of the Python script to execute.
            **kwargs: Command-line options for the script.

        Raises:
            subprocess.CalledProcessError: If the script exits with a non-zero
                status, so a failed run cannot go unnoticed.
        """
        command = ["python3", mlm_file_path]
        for arg, value in kwargs.items():
            command.append(f"--{arg}")
            value_text = str(value)
            if value_text:
                command.append(value_text)

        # The context manager closes the pipe and reaps the child even if
        # printing is interrupted.
        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr with stdout
            encoding="utf-8",
            errors="replace",
        ) as process:
            # Iterating the pipe yields lines as they arrive and stops at EOF.
            for line in process.stdout:
                print(line.strip(), flush=True)
            return_code = process.wait()

        if return_code != 0:
            raise subprocess.CalledProcessError(return_code, command)

In [9]:
# Tokenizer corpus: every distinct combined item description.
training_data = articles["combined"].unique().tolist()
output_dir = "./bert/checkpoint/"
mlm_file_path = "./bert/run_mlm.py"

# Options forwarded to the training script as `--name value`, in this order.
train_options = {
    "model_type": "bert",
    "tokenizer_name": join(output_dir, "config/"),
    "config_overrides": "hidden_size=256,num_attention_heads=8,num_hidden_layers=2",
    "train_file": "./bert/input.txt",
    "output_dir": "./bert/output/",
    "save_strategy": "epoch",
    "do_train": "",
    "num_train_epochs": 20,
    "overwrite_output_dir": False,
    "line_by_line": True,
    "mlm_probability": 0.3,
    "max_seq_length": 256,
    "per_device_train_batch_size": 16,
    "logging_strategy": "epoch",
    "disable_tqdm": False,
}

bert = BERT()
bert.build_tokenizer(training_data, output_dir)
bert.train(mlm_file_path, **train_options)

# Load Trained Model and Infer


In [7]:
# Use the GPU whenever torch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer files come from the checkpoint's config directory.
tokenizer_dir = "./bert/checkpoint/config/"
tokenize = BertTokenizer.from_pretrained(tokenizer_dir)

In [8]:
# Load the trained weights and ask the model to return every layer's hidden
# states, which the embedding step reads.
model = BertModel.from_pretrained("./bert/output", output_hidden_states=True)
model = model.to(device)
model.eval()  # switch the module to evaluation mode
print("model loaded")

Some weights of the model checkpoint at ./bert/output were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ./bert/output and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should prob

model loaded


In [9]:
def get_embedding(model, text):
    """Embed ``text`` as one vector by averaging the model's hidden states.

    The text is wrapped in ``[CLS]`` / ``[SEP]``, tokenized with the
    notebook-level ``tokenize`` object and run through ``model`` on the
    notebook-level ``device`` without tracking gradients. Element 2 of the
    model output (the per-layer hidden states) is averaged over layers and
    then over tokens.

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments whose output holds the hidden-state tensors at index 2
            (for ``BertModel`` this requires ``output_hidden_states=True``).
        text: Raw text to embed.

    Returns:
        The averaged embedding tensor; 1-D of length ``hidden_size`` assuming
        each hidden-state tensor is shaped ``(1, tokens, hidden_size)``.
    """
    tokens = tokenize.tokenize("[CLS] " + text + " [SEP]")
    token_ids = tokenize.convert_tokens_to_ids(tokens)

    input_ids = torch.tensor([token_ids]).to(device)
    # Single unpadded sequence: every position is a real token, so the mask is
    # all ones.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        # Passed by keyword on purpose: the second positional parameter of
        # BertModel.forward is attention_mask, not token_type_ids.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs[2]

    # Concatenating on dim 0 stacks the layers because the batch size is 1.
    token_vecs = torch.mean(torch.cat(hidden_states), dim=0)
    return torch.mean(token_vecs, dim=0)


# Sanity check: embed one sample description and display the resulting vector.
sample_text = "POP TART seamless tee T-shirt Garment Upper body Solid Black Dark"
get_embedding(model, sample_text)

tensor([-0.3275, -0.0493, -0.3559, -0.3456, -0.0847, -0.0233,  0.8762, -0.0392,
         0.1263,  0.3204, -0.1534, -0.0048,  0.2028, -0.1132, -0.2078,  0.3503,
         0.4012,  0.2594, -0.3952,  0.0934, -0.0383,  0.3705, -0.2891, -0.0314,
         0.0613, -0.3339, -0.2797, -0.2661,  0.4252, -0.0125,  0.0588, -0.0103,
        -0.0290,  0.3385, -0.1068, -0.0578,  0.1480,  0.1303,  0.5816,  0.3885,
         0.1377,  0.5050, -0.5104, -0.5025, -0.4040, -0.0362, -0.7397, -0.1146,
         0.3284, -0.4785,  0.2793, -0.0295,  0.3171,  0.0466, -0.4530,  0.0068,
        -0.1888, -0.7240,  0.2058,  0.0711,  0.1197, -0.0370,  1.3536, -0.2934,
        -0.3175, -0.1377,  0.5862, -0.2475,  0.0646,  0.7662, -0.2638, -0.5100,
         0.2307,  0.1332,  0.0287,  0.1223,  0.1541, -0.5523,  0.1427,  0.1104,
         0.1932,  0.3432,  0.3786, -0.0476, -0.4729, -0.2059,  0.1740, -0.2800,
         0.0158, -0.0989, -0.0049, -0.1887, -0.0382,  0.0589,  0.2237, -0.1506,
        -0.4239, -0.1613, -0.4059, -0.12

In [11]:
# Embed every combined item description and persist the corpus to disk.
items = articles["combined"].tolist()
item_embedding = [np.array(get_embedding(model, name).cpu()) for name in tqdm(items)]
identifier = articles["article_id"].tolist()

# Layout of the pickled list: [descriptions, embedding vectors, article ids].
product_embeddings = [items, item_embedding, identifier]

with open("./checkpoint/custom_trained_item_embedding.pkl", "wb") as f:
    pickle.dump(product_embeddings, f, protocol=2)

In [12]:
# Reload the pickled corpus ([descriptions, embedding vectors, article ids]).
# pickle.load can execute arbitrary code, so only point this at a file this
# notebook produced itself.
with open("./checkpoint/custom_trained_item_embedding.pkl", "rb") as f:
    product_embeddings = pickle.load(f)

# Normalise every embedding row to unit length so that a dot product with a
# unit-length query vector is the cosine similarity. The list of arrays is
# stacked into one ndarray first because torch.tensor is very slow on a
# list of ndarrays and warns about it.
product_embeddings[1] = torch.nn.functional.normalize(
    torch.tensor(np.array(product_embeddings[1]))
)

  product_embeddings[1] = torch.nn.functional.normalize(torch.tensor(product_embeddings[1]))


In [13]:
def upsell_topn(vector, item_embeddings, n):
    """Return the ``n`` rows of ``item_embeddings`` most similar to ``vector``.

    Args:
        vector: 1-D query embedding; it is L2-normalised here.
        item_embeddings: 2-D tensor with one embedding per row. Rows are used
            as given, so the scores are cosine similarities only when the rows
            are already unit length.
        n: Maximum number of matches to return. Values above the number of
            rows are clamped; a non-positive ``n`` yields two empty lists.

    Returns:
        ``(scores, indices)``: two Python lists of equal length, ordered from
        the highest score to the lowest; ``indices`` are row positions in
        ``item_embeddings``.
    """
    emb = torch.nn.functional.normalize(vector, dim=0)
    cosine = torch.matmul(item_embeddings, emb)

    # topk picks the best k directly instead of sorting every score, which
    # matters because this runs once per query against the whole corpus.
    k = max(0, min(n, cosine.shape[-1]))
    score, idx = torch.topk(cosine, k, dim=-1)
    return score.tolist(), idx.tolist()

In [23]:
result = []

# For every article, collect its 20 most similar corpus entries as
# (article_id, similar_article_id, rounded score) tuples.
# The loop variable is deliberately not called `id`: at notebook top level
# that would shadow the builtin for the rest of the kernel session.
for article_id, description in tqdm(
    zip(articles["article_id"], articles["combined"]), total=len(articles)
):
    embedded = get_embedding(model, description).cpu()
    scores, indices = upsell_topn(embedded, product_embeddings[1], n=20)
    result.extend(
        (article_id, product_embeddings[2][idx], round(score, 5))
        for score, idx in zip(scores, indices)
    )

In [24]:
# One row per (article, similar article) pair, with self-matches removed.
similar_pairs = pd.DataFrame(
    result, columns=["article_id", "similar_article_id", "similar_score"]
)
is_other_article = similar_pairs["article_id"] != similar_pairs["similar_article_id"]
similar_pairs = similar_pairs[is_other_article]

# NOTE(review): `data` is not defined in the visible part of this notebook;
# presumably it is the customer/article interaction table. Confirm where it
# is loaded.
customer_articles = data[["customer_id", "article_id"]].drop_duplicates()

# Per customer, add up the similarity each candidate article receives from
# all of that customer's articles.
candidate_scores = (
    customer_articles.merge(similar_pairs, on=["article_id"])
    .groupby(["customer_id", "similar_article_id"])["similar_score"]
    .sum()
    .reset_index()
)

# Final layout: customer, recommended article, summed score; best score first
# within each customer.
result = (
    candidate_scores[["customer_id", "similar_article_id", "similar_score"]]
    .rename(columns={"similar_article_id": "article_id"})
    .sort_values(["customer_id", "similar_score"], ascending=[True, False])
)
result.head()

In [25]:
# Write the recommendations to CSV without the DataFrame index.
recommendation_csv = "./upsell_recommendation.csv"
result.to_csv(recommendation_csv, index=False)

# Qualitative Analysis

In [16]:
def infer(item, count=10):
    """Look up the corpus descriptions most similar to a free-text item.

    Args:
        item: Description text to embed and compare against the corpus.
        count: Number of nearest neighbours requested before filtering.

    Returns:
        DataFrame with columns ``item``, ``recommendation`` and ``score``
        (rounded to 5 decimals). Rows whose recommendation text equals
        ``item`` are dropped, so fewer than ``count`` rows may be returned.
    """
    query_vector = get_embedding(model, item).cpu()
    scores, indices = upsell_topn(query_vector, product_embeddings[1], n=count)

    rows = []
    for score, idx in zip(scores, indices):
        # product_embeddings[0] holds the description text for each corpus row.
        rows.append((item, product_embeddings[0][idx], round(score, 5)))

    table = pd.DataFrame(rows, columns=["item", "recommendation", "score"])
    is_self_match = table["item"] == table["recommendation"]
    return table[~is_self_match]

In [17]:
# Qualitative check: neighbours of a children's jacket description.
infer(item="Trento Jacket Garment Upper body Solid Baby/Children Baby Girl Dark Red")

Unnamed: 0,item,recommendation,score
1,Trento Jacket Garment Upper body Solid Baby/Ch...,Trento jkt Jacket Garment Upper body Solid Bab...,0.96805
2,Trento Jacket Garment Upper body Solid Baby/Ch...,Trento Jacket Garment Upper body Solid Baby/Ch...,0.95886
3,Trento Jacket Garment Upper body Solid Baby/Ch...,Cheapy Jacket Garment Upper body Solid Baby/Ch...,0.95067
4,Trento Jacket Garment Upper body Solid Baby/Ch...,RUT Jacket Garment Upper body Solid Baby/Child...,0.9484
5,Trento Jacket Garment Upper body Solid Baby/Ch...,Stenmark jacket Jacket Garment Upper body Soli...,0.94739
6,Trento Jacket Garment Upper body Solid Baby/Ch...,Luke Jacket Garment Upper body Solid Baby/Chil...,0.94535
7,Trento Jacket Garment Upper body Solid Baby/Ch...,Mango Jacket Garment Upper body Solid Baby/Chi...,0.9405
8,Trento Jacket Garment Upper body Solid Baby/Ch...,OLINE Jacket Garment Upper body Solid Baby/Chi...,0.93968
9,Trento Jacket Garment Upper body Solid Baby/Ch...,Zion Jacket Garment Upper body Solid Baby/Chil...,0.93806


In [18]:
# Qualitative check: neighbours of a ladieswear blouse description.
blouse_query = "Strip silk blouse Shirt Garment Upper body Solid Ladieswear Womens Trend Dark Green Bright"
infer(blouse_query)

Unnamed: 0,item,recommendation,score
1,Strip silk blouse Shirt Garment Upper body Sol...,Strip silk blouse Shirt Garment Upper body Sol...,0.93804
2,Strip silk blouse Shirt Garment Upper body Sol...,Maine Linen Blouse Garment Upper body Solid La...,0.92414
3,Strip silk blouse Shirt Garment Upper body Sol...,Lala Tunic Blouse Garment Upper body Solid Lad...,0.92125
4,Strip silk blouse Shirt Garment Upper body Sol...,Strip shirt Shirt Garment Upper body Solid Lad...,0.91619
5,Strip silk blouse Shirt Garment Upper body Sol...,Pastis blouse Blouse Garment Upper body Solid ...,0.91253
6,Strip silk blouse Shirt Garment Upper body Sol...,Granger Top Blouse Garment Upper body Check La...,0.91145
7,Strip silk blouse Shirt Garment Upper body Sol...,Pastis blouse Blouse Garment Upper body Solid ...,0.91027
8,Strip silk blouse Shirt Garment Upper body Sol...,Bonjour jaquard blouse Blouse Garment Upper bo...,0.90902
9,Strip silk blouse Shirt Garment Upper body Sol...,Lucy blouse Shirt Garment Upper body Solid Lad...,0.9086


In [22]:
# Qualitative check: neighbours of an accessories (bag) description.
bag_query = "Albus cross Bag Accessories All over pattern Ladieswear Womens Big accessories Light Beige Dusty"
infer(bag_query)

Unnamed: 0,item,recommendation,score
1,Albus cross Bag Accessories All over pattern L...,Thirteen baguette bag Bag Accessories All over...,0.95104
2,Albus cross Bag Accessories All over pattern L...,Gracie Camera Cross Bag Accessories All over p...,0.94876
3,Albus cross Bag Accessories All over pattern L...,London bucket shopper Bag Accessories All over...,0.94668
4,Albus cross Bag Accessories All over pattern L...,Ritz mini bag Bag Accessories All over pattern...,0.94313
5,Albus cross Bag Accessories All over pattern L...,Twelve cross bag Bag Accessories All over patt...,0.94248
6,Albus cross Bag Accessories All over pattern L...,Bagsy snake bag Bag Accessories All over patte...,0.93745
7,Albus cross Bag Accessories All over pattern L...,Mickey Quilted Cross Bag Accessories All over ...,0.9333
8,Albus cross Bag Accessories All over pattern L...,Flirty crossbag Bag Accessories All over patte...,0.92667
9,Albus cross Bag Accessories All over pattern L...,W Anna coin purse Bag Accessories All over pat...,0.92594
