In [4]:
import torch
import pickle
import pandas as pd
from tqdm import tqdm
from torch.nn.functional import cosine_similarity
from sentence_transformers import SentenceTransformer

In [5]:
# read sample transaction data
data = pd.read_csv("../../data/transactions_train.csv")
data.head(2)

In [19]:
# item metadata
articles = pd.read_csv("../../data/articles.csv")
articles.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [16]:
# selecting the relevant attributes
articles = articles[
    [
        "article_id",
        "prod_name",
        "product_type_name",
        "product_group_name",
        "graphical_appearance_name",
        "index_group_name",
        "section_name",
        "colour_group_name",
        "perceived_colour_value_name",
    ]
].copy()

# combining all columns except the id and performing ordered deduplication
articles["combined"] = articles.iloc[:, 1:].agg(" ".join, axis=1)
articles["combined"] = [
    " ".join(list(dict.fromkeys(name.split()))) for name in articles.combined
]
articles.drop_duplicates("combined", inplace=True)
articles.combined[0]

'Strap top Vest Garment Upper body Solid Black Dark'

In [1]:
# using a pre-trained model
bert = SentenceTransformer("bert-base-nli-mean-tokens")

In [8]:
# creating and saving embedding for every product
items = list(articles["combined"])
item_embedding = bert.encode(items)

identifier = list(articles["article_id"])
product_embeddings = [items, item_embedding, identifier]

with open("./checkpoint/pretrained_item_embedding.pkl", "wb") as f:
    pickle.dump(product_embeddings, f, protocol=2)

In [9]:
product_embeddings = pickle.load(
    open("./checkpoint/pretrained_item_embedding.pkl", "rb")
)
product_embeddings[1] = torch.nn.functional.normalize(
    torch.tensor(product_embeddings[1])
)

In [10]:
def upsell_topn(vector, item_embeddings, n):
    emb = torch.nn.functional.normalize(vector, dim=0)
    cosine = torch.matmul(item_embeddings, emb)
    score, idx = torch.sort(cosine, dim=-1, descending=True)
    return score[:n].tolist(), idx[:n].tolist()

In [6]:
# get top n recommendations for each item
result = []

for id, name in tqdm(articles[["article_id", "combined"]].values):
    embedded = torch.from_numpy(bert.encode(name))
    result.extend(
        [
            (id, product_embeddings[2][idx], round(score, 5))
            for score, idx in zip(*upsell_topn(embedded, product_embeddings[1], n=20))
        ]
    )

In [7]:
# map user's purchased items to their respective similar items
result = pd.DataFrame(
    result, columns=["article_id", "similar_article_id", "similar_score"]
)
result = result[result["article_id"] != result["similar_article_id"]]

result = (
    data[["customer_id", "article_id"]]
    .drop_duplicates()
    .merge(result, on=["article_id"])
)

result = (
    result.groupby(["customer_id", "similar_article_id"])["similar_score"]
    .sum()
    .reset_index()
)

result = result[["customer_id", "similar_article_id", "similar_score"]].rename(
    columns={"similar_article_id": "article_id"}
)
result.sort_values(
    ["customer_id", "similar_score"], ascending=[True, False], inplace=True
)
result.head()

In [8]:
result.to_csv("./upsell_recommendation.csv", index=False)

# Qualitative Analysis

In [11]:
def infer(item, count=10):
  """
  Returns products similar to given item 
  item : name of the item to get recommendations for
  count: number of recommendations to return
  """
    embedded = torch.from_numpy(bert.encode(item))
    res = pd.DataFrame([(item, product_embeddings[0][idx], round(score, 5)) 
          for score, idx in zip(*upsell_topn(embedded, product_embeddings[1], n=count))],
              columns=['item', 'recommendation', 'score'])

    return res[res['item'] != res['recommendation']]

In [12]:
infer("Trento Jacket Garment Upper body Solid Baby/Children Baby Girl Dark Red")

Unnamed: 0,item,recommendation,score
1,Trento Jacket Garment Upper body Solid Baby/Ch...,Trento jkt Jacket Garment Upper body Solid Bab...,0.99067
2,Trento Jacket Garment Upper body Solid Baby/Ch...,Nika hood Hoodie Garment Upper body Solid Baby...,0.96047
3,Trento Jacket Garment Upper body Solid Baby/Ch...,Angela sweater Sweater Garment Upper body Soli...,0.95901
4,Trento Jacket Garment Upper body Solid Baby/Ch...,Lucia sweater Sweater Garment Upper body Solid...,0.95886
5,Trento Jacket Garment Upper body Solid Baby/Ch...,Nova Cardigan Garment Upper body Solid Baby/Ch...,0.95688
6,Trento Jacket Garment Upper body Solid Baby/Ch...,NOVA Cardigan Garment Upper body Solid Baby/Ch...,0.95688
7,Trento Jacket Garment Upper body Solid Baby/Ch...,Ferne trousers Trousers Garment Lower body Sol...,0.95648
8,Trento Jacket Garment Upper body Solid Baby/Ch...,Cora chenille Sweater Garment Upper body Solid...,0.95139
9,Trento Jacket Garment Upper body Solid Baby/Ch...,CORA chenille Sweater Garment Upper body Solid...,0.95139


In [13]:
infer(
    "Strip silk blouse Shirt Garment Upper body Solid Ladieswear Womens Trend Dark Green Bright"
)

Unnamed: 0,item,recommendation,score
1,Strip silk blouse Shirt Garment Upper body Sol...,Ferenz sweater Dress Garment Full body Solid L...,0.96156
2,Strip silk blouse Shirt Garment Upper body Sol...,Strip shirt Shirt Garment Upper body Solid Lad...,0.95791
3,Strip silk blouse Shirt Garment Upper body Sol...,Hofvander top Blouse Garment Upper body Solid ...,0.95704
4,Strip silk blouse Shirt Garment Upper body Sol...,Unni dress Dress Garment Full body Solid Ladie...,0.9569
5,Strip silk blouse Shirt Garment Upper body Sol...,Como dress Dress Garment Full body Solid Ladie...,0.95494
6,Strip silk blouse Shirt Garment Upper body Sol...,Star knot dress Dress Garment Full body Solid ...,0.95365
7,Strip silk blouse Shirt Garment Upper body Sol...,Twix dress Dress Garment Full body Solid Ladie...,0.95293
8,Strip silk blouse Shirt Garment Upper body Sol...,Paulie blouse Blouse Garment Upper body Solid ...,0.95288
9,Strip silk blouse Shirt Garment Upper body Sol...,Lala Tunic Blouse Garment Upper body Solid Lad...,0.94986


In [14]:
infer(
    "Albus cross Bag Accessories All over pattern Ladieswear Womens Big accessories Light Beige Dusty"
)

Unnamed: 0,item,recommendation,score
1,Albus cross Bag Accessories All over pattern L...,Pansy cross bag Bag Accessories Embroidery Lad...,0.97224
2,Albus cross Bag Accessories All over pattern L...,Allegra bucket bag Bag Accessories Other patte...,0.96066
3,Albus cross Bag Accessories All over pattern L...,Felicity visor Cap/peaked Accessories All over...,0.95953
4,Albus cross Bag Accessories All over pattern L...,Natalia cross Bag Accessories Colour blocking ...,0.95643
5,Albus cross Bag Accessories All over pattern L...,Brandy waist belt Belt Accessories All over pa...,0.95548
6,Albus cross Bag Accessories All over pattern L...,Melrose Scarf Accessories All over pattern Lad...,0.95359
7,Albus cross Bag Accessories All over pattern L...,Ritz mini bag Bag Accessories All over pattern...,0.95323
8,Albus cross Bag Accessories All over pattern L...,Zitta Scarf Accessories All over pattern Ladie...,0.95295
9,Albus cross Bag Accessories All over pattern L...,Hedi top handle Bag Accessories Other structur...,0.95279
