# Notebook to test ranking proposals by sentence transformers

In [1]:
%load_ext autoreload
%autoreload 2

## Load resources

In [2]:
from data import load_data

train, val, test = load_data()

In [3]:
from postprocessing import read_proposals_files

models = [
    "BART-large-submission-21", 
    "BART-base-submission-23",
    "pegasus-large-submission-30",
    "t5-base-submission-40",
    "t5-small-submission-36",
    "t5-small-submission-38"
]
    
val["name_proposals"] = read_proposals_files([f"val_{model}.csv" for model in models], remove_duplicates='global')
test["name_proposals"] = read_proposals_files([f"submission_{model.split('-')[-1]}.csv" for model in models], remove_duplicates='global')

In [4]:
train

Unnamed: 0,name,description
25291,fabric tote bag,tote bag in a combination of colours. braided ...
5328,knit cardigan with ruffle trims,knit cardigan with a round neck. featuring lon...
28974,mercurised glass soap dish,mercurised glass soap dish.
10697,joggers,relaxed fit trousers made of a linen blend. fe...
8656,ribbed knit cardigan,"cardigan with round neckline, long sleeves and..."
...,...,...
7820,ripped skinny jeans,five-pocket skinny jeans with an adjustable in...
15507,flannel shirt with stand-up collar,regular fit shirt with a flannel finish. stand...
17874,basic denim jacket,collared denim jacket featuring long buttoned ...
28245,low rattan basket with lid,seagrass basket in two tones with lid and hand...


In [5]:
val

Unnamed: 0,name,description,name_proposals
1190,t-shirt with polka dot flocking,round neck t-shirt featuring long sleeves with...,"[ruff t-shirt, t-shirt with ruffles, plain t-s..."
2774,double strap wrap dress trf,short v-neck dress featuring double straps and...,"[satin dress, printed dress, printed dress trf..."
32709,cloud design bedspread with metallic thread,bedspread with a cloud design and metallic thr...,"[cloud bedspread, cloudbedspread, cloud bedspr..."
32403,ceramic door knob with transfer (pack of 2),round white ceramic door knob featuring a blue...,"[ceramic door knob with sketches (pack of 2).,..."
14237,puffer jacket,long sleeve puffer jacket featuring a detachab...,"[water repellent puffer jacket, quilted jacket..."
...,...,...,...
31096,water lily voile dress,children's dress featuring a water lily print ...,"[water lily-print dress, water lily print mini..."
9682,striped sweatshirt,long sleeve hoodie. button fastening on the yo...,"[cropped sweatshirt, textured hoodie, ribbed s..."
2355,ruffled t-shirt trf,round neck t-shirt with short sleeves and a ru...,"[ruff t-shirt, frilled hem t-shirt, plain t-sh..."
31358,pine cone and sleigh bells napkin holders (pac...,"napkin holder with faux twigs, pine cone, slei...","[pine pine cone napkin holder (pack of 2), pin..."


In [6]:
test

Unnamed: 0,description,name_proposals
0,"knit midi dress with a v-neckline, straps and ...","[knit dress with lace trim, lace dress trf, kn..."
1,"loose-fitting dress with a round neckline, lon...","[floral print dress trf, flowing dress trf, lo..."
2,nautical cap with peak.this item must be retur...,"[foil nautical cap, check nautical cap, nautic..."
3,nautical cap with peak. adjustable inner strap...,"[nautical cap with inner strap, check nautical..."
4,nautical cap with side button detail.this item...,"[check nautical cap, nautical cap, nautical ca..."
...,...,...
1436,striped print cotton cushion cover. cushion fi...,"[stripedt/crg 05, stripedt print cushion cover..."
1437,rectangular cushion featuring a gnome print.,"[nogome cushion, nogger cushion, noggin cushio..."
1438,cotton jersey eye mask featuring an elastic ba...,"[cotton jersey eye mask, c cotton jersey eye m..."
1439,padded chipboard hanger featuring an iron hook...,"[paisley jacquard padded hanger (pack of 3), p..."


## Compute similarities with sentence transformers

In [7]:
#from sentence_transformers import SentenceTransformer
#model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [17]:
def combine_proposals(dataset):
    models = list(set(val.columns) - set(["name", "description"]))
    proposals = [[] for _ in range(len(dataset))]
    for model in models:
        proposals = [p + added for p, added in zip(proposals, dataset[model])]
    return proposals

In [21]:
import numpy as np
from sentence_transformers import util

def rank_sentence_embeddings(model, dataset):
    return [
        rank_proposals(model, description, proposals)
        #for description, proposals in zip(dataset["description"], combine_proposals(dataset))
        for description, proposals in zip(dataset["description"], dataset["name_proposals"])
    ]
    
    
def rank_proposals(model, description, proposals, limit=10):
    #Compute embedding for both lists
    embedding_description = model.encode([description], convert_to_tensor=True)
    embeddings_proposals = model.encode(proposals, convert_to_tensor=True)

    #Compute cosine-similarities
    cosine_scores = util.pytorch_cos_sim(embedding_name, embeddings_proposals)[0]
    
    # Sort by similarity
    sorted_proposals = [proposals[z[0]] for z in sorted(enumerate(cosine_scores), key=lambda x: x[1], reverse=True)][:limit]
    return sorted_proposals

In [22]:
sample = 100

In [23]:
%%time
ranked_proposals = rank_sentence_embeddings(model, val.iloc[:sample])

NameError: name 'embedding_name' is not defined

In [12]:
from ranker import majority_ranking

majority_ranked_proposals = majority_ranking(combine_proposals(val.iloc[:sample]))

In [13]:
from model import dcg

print(f"DCG majority ranked = {dcg(majority_ranked_proposals, val['name'][:sample])}")
print(f"DCG s-transformers  = {dcg(ranked_proposals, val['name'][:sample])}")

DCG majority ranked = 21.259635900581586


NameError: name 'ranked_proposals' is not defined

## Train similarity model with sentence tranformers

In [14]:
from postprocessing import unroll_data

unrolled_val = unroll_data(val["name"], val["description"], val["name_proposals"])

In [15]:
unrolled_val

Unnamed: 0,name,description,label
0,t-shirt with polka dot flocking,round neck t-shirt featuring long sleeves with...,1
1,striped t-shirt,round neck t-shirt featuring long sleeves with...,0
2,ribbed top with ruffles,round neck t-shirt featuring long sleeves with...,0
3,poplin t-shirt,round neck t-shirt featuring long sleeves with...,0
4,gathered t-shirt with ruffles,round neck t-shirt featuring long sleeves with...,0
...,...,...,...
379310,sequinned oxford sweatshirt,"long sleeve sweatshirt with a round neckline, ...",0
379311,sequinverted lapel collar sweatshirt,"long sleeve sweatshirt with a round neckline, ...",0
379312,asymmetric sequinned sweatshirt,"long sleeve sweatshirt with a round neckline, ...",0
379313,sweatshirt with sequinned sleeves,"long sleeve sweatshirt with a round neckline, ...",0


In [75]:
from sentence_transformers import SentenceTransformer

#Define the model. Either from scratch of by loading a pre-trained model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [78]:
from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader

#Define your train examples.
train_examples = [
    InputExample(texts=[name, description], label=float(label))
    for name, description, label in zip(unrolled_val["name"], unrolled_val["description"], unrolled_val["label"])
]

#Define your train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)
train_loss = losses.CosineSimilarityLoss(model)

#Tune the model
#evaluator = evaluation.EmbeddingSimilarityEvaluator(unrolled_val["name"].iloc[:128], unrolled_val["description"].iloc[:128], unrolled_val["label"].iloc[:128])
#model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10, warmup_steps=100, evaluator=evaluator, evaluation_steps=1)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2964 [00:00<?, ?it/s]

In [79]:
model.evaluate(evaluator)

0.056261097289757186

* 1-epoch train:  0.0460318068734377
* 2-epochs train: 0.056261097289757186

## Test model

In [35]:
import numpy as np
from sentence_transformers import util

def rank_sentence_embeddings(model, dataset):
    return [
        rank_proposals(model, description, proposals)
        for description, proposals in zip(dataset["description"], dataset["name_proposals"])
    ]
    
    
def rank_proposals(model, description, proposals, limit=10):
    #Compute embedding for both lists
    embedding_description = model.encode([description], convert_to_tensor=True)
    embeddings_proposals = model.encode(proposals, convert_to_tensor=True)

    #Compute cosine-similarities
    cosine_scores = util.pytorch_cos_sim(embedding_description, embeddings_proposals)[0]
    
    # Sort by similarity
    sorted_proposals = [proposals[z[0]] for z in sorted(enumerate(cosine_scores), key=lambda x: x[1], reverse=True)][:limit]
    return sorted_proposals

In [36]:
sample = 100

In [37]:
%%time
ranked_proposals = rank_sentence_embeddings(model, val.iloc[:sample])

CPU times: user 2min 49s, sys: 30.8 s, total: 3min 20s
Wall time: 26.5 s


In [40]:
ranked_proposals

[['printed t-shirt',
  'printed t-shirt',
  'printed t-shirt with ruffles',
  'printed t-shirt with ruffles',
  'floral print t-shirt',
  'long sleeve t-shirt with ruffles',
  'animal print t-shirt',
  'contrast sleeve t-shirt',
  'ruffled print t-shirt',
  'ruffle-trimmed t-shirt'],
 ['printed dress',
  'printed dress',
  'printed dress',
  'printed dress',
  'printed dress',
  'check dress',
  'printed mini dress',
  'printed mini dress',
  'printed mini dress',
  'printed mini dress'],
 ['cloud bedpread',
  'cloud bedpread',
  'Cloud bedpread',
  'cloud bedpread',
  'cloud print cotton bedspread',
  'cloud mesh bedspread',
  'metallic thread cloud bedspread',
  'metallic thread cloud bedspread',
  'metallic thread cloud bedspread',
  'cloud bed sheet'],
 ['round ceramic door knob',
  'round ceramic door knob',
  'cerceramic door knob',
  'round ceramic door knob with sketch design transfer',
  'ceramic door knob (pack of 2).',
  'ceramic door knob (pack of 2).',
  'blue and black ce

In [38]:
from ranker import majority_ranking

majority_ranked_proposals = majority_ranking(combine_proposals(val.iloc[:sample]))

In [39]:
from model import dcg

print(f"DCG majority ranked = {dcg(majority_ranked_proposals, val['name'][:sample])}")
print(f"DCG s-transformers  = {dcg(ranked_proposals, val['name'][:sample])}")

DCG majority ranked = 21.259635900581586
DCG s-transformers  = 4.003598325126213
