# Notebook to test ranking model

In [1]:
%load_ext autoreload
%autoreload 2

## Load data

In [2]:
import pandas as pd

from data import load_data

train, val, test = load_data()

Load proposals

In [37]:
from postprocessing import read_proposals_files

In [38]:
files = [
    "BART-base-submission-23.csv", 
    "BART-large-submission-21.csv",
    "pegasus-large-submission-30.csv",
    "t5-base-submission-40.csv",
    "t5-small-submission-36.csv",
]

In [39]:
# Attach to each training row the candidate names read from the per-model
# proposal files (the file names in `files`, prefixed with "train_").
train["name_proposals"] = read_proposals_files([f"train_{file}" for file in files])

In [40]:
train

Unnamed: 0,name,description,name_proposals
25291,fabric tote bag,tote bag in a combination of colours. braided ...,"[braided tote bag with bag detail, plaited tot..."
5328,knit cardigan with ruffle trims,knit cardigan with a round neck. featuring lon...,"[rib knit cardigan with ruffles, check knit ca..."
28974,mercurised glass soap dish,mercurised glass soap dish.,"[, enameled glass soap dish, mercurized glass ..."
10697,joggers,relaxed fit trousers made of a linen blend. fe...,"[loose fit trousers, limited edition linen tro..."
8656,ribbed knit cardigan,"cardigan with round neckline, long sleeves and...","[rib knit cardigan, basic knit cardigan trf, c..."
...,...,...,...
7820,ripped skinny jeans,five-pocket skinny jeans with an adjustable in...,"[just soft bleach wash skinny jeans, skin carg..."
15507,flannel shirt with stand-up collar,regular fit shirt with a flannel finish. stand...,"[flannel jersey shirt, striped striped shirt, ..."
17874,basic denim jacket,collared denim jacket featuring long buttoned ...,"[hooded denim jacket, denim jacket with stripe..."
28245,low rattan basket with lid,seagrass basket in two tones with lid and hand...,"[seagrass basket and lid, seagrass basket, sea..."


In [41]:
# Same as for the training split, using the "val_"-prefixed proposal files.
val["name_proposals"] = read_proposals_files([f"val_{file}" for file in files])
val

Unnamed: 0,name,description,name_proposals
1190,t-shirt with polka dot flocking,round neck t-shirt featuring long sleeves with...,"[t-shirt, poplin t-shirt, t-shirt with ruffled..."
2774,double strap wrap dress trf,short v-neck dress featuring double straps and...,"[, belted dress, polka dot print dress, pleate..."
32709,cloud design bedspread with metallic thread,bedspread with a cloud design and metallic thr...,"[metallic thread clouds bedspread, blanket wit..."
32403,ceramic door knob with transfer (pack of 2),round white ceramic door knob featuring a blue...,"[round ceramic door knob, ceramic door knob wi..."
14237,puffer jacket,long sleeve puffer jacket featuring a detachab...,"[jacket with faux fur trim, long puffer jacket..."
...,...,...,...
31096,water lily voile dress,children's dress featuring a water lily print ...,"[water lily kids dress, water lily slip dress,..."
9682,striped sweatshirt,long sleeve hoodie. button fastening on the yo...,"[buttoned knit hoodie, sweatshirt with contras..."
2355,ruffled t-shirt trf,round neck t-shirt with short sleeves and a ru...,"[poplin t-shirt, t-shirt with ruffle detail, t..."
31358,pine cone and sleigh bells napkin holders (pac...,"napkin holder with faux twigs, pine cone, slei...",[pine cone and sleigh bell napkin holders (pac...


## Input data statistics

Measure the length, in characters, of the descriptions and names (the maxima are used below as the padded input lengths of the model)

In [None]:
# Series.append was removed in pandas 2.0; pd.concat stacks the Series the same way
# (original index labels are kept).
all_descriptions = pd.concat([train["description"], val["description"], test["description"]])

In [None]:
all_descriptions

In [None]:
# Series.append was removed in pandas 2.0; pd.concat is the supported replacement.
all_names = pd.concat([train["name"], val["name"]])

In [None]:
all_names

In [None]:
# Series.append was removed in pandas 2.0; pd.concat is the supported replacement.
all_texts = pd.concat([all_descriptions, all_names])

In [None]:
# Longest description and longest name, measured in characters.
max_len_descriptions = max(map(len, all_descriptions))
max_len_names = max(map(len, all_names))

In [None]:
max_len_descriptions

In [None]:
max_len_names

Size of the character-level vocabulary (number of distinct characters across all descriptions and names)

In [None]:
from itertools import chain

# Every text is iterated character by character, so the set holds distinct characters.
vocabulary = set(chain.from_iterable(chain(all_descriptions, all_names)))
vocabulary_len = len(vocabulary)

## Siamese network model

In [None]:
from keras import Model
from keras.layers import Input, Embedding, GRU, Bidirectional, GlobalAveragePooling1D, Dense, Dot, Activation, Dropout
from keras import backend as K

# Hyperparameters of the two-tower model built below.
embedding_size = 32   # width of each character embedding
mixer_size = 128      # units of each GRU direction
hidden_size = 256     # width of the dense projection at the top of each tower
dropout = 0           # dropout rate; 0 leaves the Dropout layers inactive

# A single embedding layer is reused by both towers.
# The +1 presumably leaves room for index 0, which pad_sequences uses as padding
# in the data-preparation cells — TODO confirm.
# NOTE(review): no mask_zero is set, so padded positions also flow into the GRUs
# and into the average pooling below.
shared_embedding = Embedding(vocabulary_len+1, embedding_size)

# Name tower: embed -> bidirectional GRU -> average over the sequence -> dense
name_input = Input(shape=(max_len_names,))
name_embedding = shared_embedding(name_input)
name_mix = Bidirectional(GRU(mixer_size, return_sequences=True))(name_embedding)
name_pool = GlobalAveragePooling1D()(name_mix)
name_dense = Dense(hidden_size, activation="relu")(name_pool)
name_drop = Dropout(dropout)(name_dense)

# Description tower: same layer stack as the name tower; only the embedding is
# shared, the GRU and Dense layers are separate instances.
description_input = Input(shape=(max_len_descriptions,))
description_embedding = shared_embedding(description_input)
description_mix = Bidirectional(GRU(mixer_size, return_sequences=True))(description_embedding)
description_pool = GlobalAveragePooling1D()(description_mix)
description_dense = Dense(hidden_size, activation="relu")(description_pool)
description_drop = Dropout(dropout)(description_dense)

# Comparison: with normalize=True the Dot layer L2-normalises both vectors first,
# i.e. it outputs their cosine similarity. The ReLU then clamps negative values to 0.
dot_product = Dot(axes=1, normalize=True)([name_drop, description_drop])
output = Activation('relu')(dot_product)

# Final model: takes [names, descriptions] and outputs one score per pair,
# trained against 0/1 labels with binary cross-entropy.
model = Model(inputs=[name_input, description_input], outputs=output)
model.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])

In [None]:
model.summary()

## Data generator

In [None]:
from keras.preprocessing.text import Tokenizer

# Character-level tokenizer (char_level=True), fitted on every description and name.
tokenizer = Tokenizer(filters=None, char_level=True)
tokenizer.fit_on_texts(all_texts)

In [None]:
from keras.preprocessing.sequence import pad_sequences 

# Encode the training texts as integer sequences and pad them to the fixed
# lengths the model inputs were declared with.
X_train_desc = pad_sequences(
    tokenizer.texts_to_sequences(train["description"]),
    maxlen=max_len_descriptions,
)
X_train_name = pad_sequences(
    tokenizer.texts_to_sequences(train["name"]),
    maxlen=max_len_names,
)

In [None]:
# Encode the validation texts as integer sequences and pad them to the fixed
# lengths the model inputs were declared with.
X_val_desc = pad_sequences(
    tokenizer.texts_to_sequences(val["description"]),
    maxlen=max_len_descriptions,
)
X_val_name = pad_sequences(
    tokenizer.texts_to_sequences(val["name"]),
    maxlen=max_len_names,
)

In [None]:
from itertools import islice
import numpy as np

def splitevery(iterable, n):
    """Yield consecutive lists of up to ``n`` items taken from ``iterable``.

    The last list may be shorter than ``n``; nothing more is yielded once the
    underlying iterator is exhausted.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps pulling chunks until an empty one comes back.
    yield from iter(lambda: list(islice(source, n)), [])

        
def data_generator(X_name, X_desc, batch_size):
    """Endlessly yield batches of (name, description) pairs for ranker training.

    Each batch contains up to ``batch_size // 2`` matching pairs labelled 1,
    followed by the same names paired with the description of a *different*
    row, labelled 0.

    Args:
        X_name: array of encoded names, indexable with an integer index array.
        X_desc: array of encoded descriptions, row-aligned with ``X_name``.
        batch_size: total number of pairs per batch (positives + negatives).

    Yields:
        ``((names, descriptions), labels)``: both inputs have ``2 * k`` rows and
        ``labels`` has shape ``(2 * k, 1)``, where ``k`` is ``batch_size // 2``
        except for a possibly shorter last batch of each pass over the data.

    Raises:
        ValueError: on the first iteration, if the two arrays differ in length,
            if ``batch_size`` is below 2, or if there are fewer than two rows
            (no other row to draw a negative from).
    """
    n_samples = len(X_desc)
    pairs_per_batch = batch_size // 2
    if len(X_name) != n_samples:
        raise ValueError("X_name and X_desc must have the same number of rows")
    if pairs_per_batch < 1:
        # A chunk size of 0 would make the endless loop spin without ever yielding.
        raise ValueError("batch_size must be at least 2")
    if n_samples < 2:
        raise ValueError("at least two rows are needed to draw negative descriptions")

    while True:
        for start in range(0, n_samples, pairs_per_batch):
            idx = np.arange(start, min(start + pairs_per_batch, n_samples))
            # Positive samples
            names = X_name[idx]
            positive_descriptions = X_desc[idx]
            # Negative samples: shift every row index by a random non-zero offset
            # (modulo the data size) so a name is never paired with its own row.
            # Two different rows may still hold identical description text; that
            # is not checked here.
            offsets = np.random.randint(1, n_samples, size=len(idx))
            negative_descriptions = X_desc[(idx + offsets) % n_samples]
            # Yield batch
            inputs = (
                np.vstack([names, names]),
                np.vstack([positive_descriptions, negative_descriptions]),
            )
            outputs = np.expand_dims(np.array([1] * len(idx) + [0] * len(idx)), axis=1)
            yield inputs, outputs

## Training

In [None]:
from keras.callbacks import EarlyStopping

batch_size = 64
# Each generator batch covers batch_size // 2 data rows (every row appears once as a
# positive and once as a negative pair), so a full pass over n rows takes
# ceil(n / (batch_size // 2)) steps. Flooring instead makes epochs drift across passes.
rows_per_step = batch_size // 2
model.fit(
    data_generator(X_train_name, X_train_desc, batch_size),
    # batch_size is deliberately not passed: the Keras docs say not to specify it
    # for generator inputs, since the generator already yields complete batches.
    steps_per_epoch=int(np.ceil(len(X_train_desc) / rows_per_step)),
    validation_data=data_generator(X_val_name, X_val_desc, batch_size),
    validation_steps=int(np.ceil(len(X_val_desc) / rows_per_step)),
    epochs=20,
    callbacks=[
        EarlyStopping(patience=10, restore_best_weights=True)
    ]
)

# Different approach: DistilBERT model

In [26]:
from postprocessing import unroll_data

unrolled_train = unroll_data(train["name"], train["description"], train["name_proposals"])

In [27]:
unrolled_train["description"][0]

'tote bag in a combination of colours. braided exterior in a combination of materials. shoulder straps with a decorative stud. lined interior with pocket and zip purse. magnetic clasp closure.height x length x width 26.3 x 38.5 x 14.5 cm. / 10.3 x 15.1 x 5.7″'

In [28]:
# Among the names paired with the first description, print those containing "fabric".
for name in unrolled_train[unrolled_train["description"] == unrolled_train["description"][0]]["name"]:
    if "fabric" in name:
        print(name)

fabric tote bag
fabric tote bag - limited edition
fabric tote bag with stud
fabric tote bag trf
fabric tote bag with studs


In [11]:
from ranker import train_ranker
# NOTE(review): this rebinds `model` and `tokenizer`, which the Keras section above
# also defines; after this cell those names refer to the objects returned here.
trainer, model, tokenizer, collator = train_ranker("distilbert-base-uncased", train, val, train_batch_size=32)
# Variant restricted to the first 1024 rows of each split (disabled):
#trainer, model, tokenizer, collator = train_ranker("distilbert-base-uncased", train[:1024], val[:1024], train_batch_size=32)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Step,Training Loss,Validation Loss,Auc
100,No log,0.111319,0.50926
200,No log,0.115204,0.510436
300,No log,0.113703,0.511998


AUC=0.5092601779696868
AUC=0.5104362569276104
AUC=0.5119978168695545


KeyboardInterrupt: 

### Try predicting for some texts

In [13]:
from data import normalize_text

sample_description = normalize_text("Knit midi dress with a V-neckline, straps and matching lace detail.<br/><br/>HEIGHT OF MODEL: 177 CM. / 69.6")
sample_description

'knit midi dress with a v-neckline, straps and matching lace detail.'

In [14]:
name_candidates = "knit dress with lace trim,lace dress trf,knit dress with lace trf,knit dress with lace,knit camisole dress,lace dress,knit dress with lace detail,cable-knit dress,knit dress,contrast lace dress trf,pepino,beautiful dress,hand-knit dress,hand-knit dress with lace,floral print dress,pleated dress trf,printed dress trf,printed dress,pleated dress,flowing dress trf,jacquilted dress,floral print dress trf,jacquard dress,loose-fitting dress".split(",")
name_candidates

['knit dress with lace trim',
 'lace dress trf',
 'knit dress with lace trf',
 'knit dress with lace',
 'knit camisole dress',
 'lace dress',
 'knit dress with lace detail',
 'cable-knit dress',
 'knit dress',
 'contrast lace dress trf',
 'pepino',
 'beautiful dress',
 'hand-knit dress',
 'hand-knit dress with lace',
 'floral print dress',
 'pleated dress trf',
 'printed dress trf',
 'printed dress',
 'pleated dress',
 'flowing dress trf',
 'jacquilted dress',
 'floral print dress trf',
 'jacquard dress',
 'loose-fitting dress']

In [17]:
from ranker import load_ranker, rank_candidates

# `ranker` is otherwise only created by a cell further down the notebook, so on a
# fresh top-to-bottom run this cell failed with NameError. Load the saved
# checkpoint here so the cell is self-sufficient.
ranker = load_ranker("./ranker/checkpoint-20000", "distilbert-base-uncased")
rank_candidates(ranker, [sample_description], [name_candidates])

[['lace dress trf',
  'contrast lace dress trf',
  'knit dress with lace detail',
  'knit dress with lace',
  'jacquard dress',
  'knit dress with lace trim',
  'knit dress with lace trf',
  'pleated dress trf',
  'jacquilted dress',
  'floral print dress trf']]

### Load a previously saved model and rank validation proposals

In [16]:
from ranker import load_ranker

ranker = load_ranker("./ranker/checkpoint-20000", "distilbert-base-uncased")

In [25]:
# Rank only a preview of the validation set. Descriptions and proposals are cut to
# the same rows; previously all descriptions were passed with only 128 proposal lists.
preview_rows = 128
result_loaded = rank_candidates(
    ranker,
    val["description"].values[:preview_rows],
    val["name_proposals"].values[:preview_rows],
)

In [26]:
result_loaded

[['ruffled t-shirt trf',
  'ruff t-shirt trf',
  'ruffled poplin t-shirt',
  'floral t-shirt with ruffles',
  'satin t-shirt with ruffles',
  'ruffled top',
  'frilled t-shirt trf',
  'ruffled t-shirt with ruffles',
  't-shirt with ruffle trim trf',
  'textured t-shirt with ruffle trim'],
 ['satin dress',
  'floral print dress',
  'printed dress',
  'printed midi dress',
  'printed mini dress',
  'satin dress trf',
  'flowing dress',
  'pleated dress',
  'pleated dress trf',
  'jacquard dress'],
 ['melange cloud bedspread',
  'clouds bedspread',
  'cloudbedspread',
  'cloud melange bedspread',
  'cloud bedspread',
  'metallic thread cloud bedspread',
  'cloudspread with metallic thread',
  'cloudsatin bedspread',
  'cloud embroidery bedspread',
  'cloud motif bedspread'],
 ['cerceramic door knob',
  'ceramic door knob with sketch design',
  'cer ceramic door knob',
  'cerceramic door knob with sketch (pack of 2)',
  'cerceramic door knob with transfer (pack of 2)',
  'cerceramic door k

In [27]:
val["name"]

1190                       t-shirt with polka dot flocking
2774                           double strap wrap dress trf
32709          cloud design bedspread with metallic thread
32403          ceramic door knob with transfer (pack of 2)
14237                                        puffer jacket
                               ...                        
31096                               water lily voile dress
9682                                    striped sweatshirt
2355                                   ruffled t-shirt trf
31358    pine cone and sleigh bells napkin holders (pac...
9442                            sequinned heart sweatshirt
Name: name, Length: 3730, dtype: object

### Rank an ensemble of previous submissions

In [28]:
submissions = ["submission_21.csv", "submission_23.csv"]

proposals = read_proposals_files(submissions)

In [29]:
proposals

[['lace-trimmed dress',
  'knit camisole dress',
  'knit dress with lace detail',
  'strappy camisole dress',
  'camisole dress trf',
  'knit dress with lace',
  'contrast lace dress trf',
  'camisole dress',
  'knit dress',
  'contrast camisole dress',
  'lace dress',
  'lace dress trf',
  'knit dress with lace trf',
  'knit dress with lace trim',
  'combined camisole dress',
  'camisole dress with lace',
  'cable-knit dress'],
 ['printed dress',
  'printed midi dress trf',
  'pleated dress trf',
  'long printed dress trf',
  'long pleated dress trf',
  'pleated dress',
  'flowing dress with pleats',
  'jacquard dress',
  'flowing dress trf',
  'jacquilted dress',
  'printed dress trf',
  'loose-fitting dress',
  'satin dress with pleats',
  'floral print dress',
  'floral print dress trf',
  'faux leather dress trf'],
 ['textured nautical cap',
  'nautical cap',
  'check nautical cap',
  'long nautical cap',
  'printed nautical cap',
  'cap with peak',
  'seersucker nautical cap',
  

In [None]:
with open("submission_21.csv") as f:
    # Skip the header row, then split each line on commas.
    # rstrip("\n") replaces line[:-1], which chopped a real character off the last
    # line whenever the file did not end with a newline.
    name_candidates = [line.rstrip("\n").split(",") for line in f.readlines()[1:]]

In [None]:
with open("submission_23.csv") as f:
    # Skip the header row, then split each line on commas.
    # rstrip("\n") replaces line[:-1], which chopped a real character off the last
    # line whenever the file did not end with a newline.
    name_candidates_2 = [line.rstrip("\n").split(",") for line in f.readlines()[1:]]

In [None]:
len(name_candidates)

In [None]:
len(name_candidates_2)

In [None]:
# Merge the two candidate lists row by row, dropping duplicates.
# dict.fromkeys keeps first-seen order; list(set(...)) produced an arbitrary order
# that could change from one interpreter run to the next.
name_candidates = [list(dict.fromkeys(n1 + n2)) for n1, n2 in zip(name_candidates, name_candidates_2)]

In [30]:
test["description"]

0       knit midi dress with a v-neckline, straps and ...
1       loose-fitting dress with a round neckline, lon...
2       nautical cap with peak.this item must be retur...
3       nautical cap with peak. adjustable inner strap...
4       nautical cap with side button detail.this item...
                              ...                        
1436    striped print cotton cushion cover. cushion fi...
1437         rectangular cushion featuring a gnome print.
1438    cotton jersey eye mask featuring an elastic ba...
1439    padded chipboard hanger featuring an iron hook...
1440    iron hanger suitable for hanging all kinds of ...
Name: description, Length: 1441, dtype: object

In [31]:
from ranker import rank_candidates

result = rank_candidates(ranker, test["description"].values, proposals)
result

[['lace dress trf',
  'contrast lace dress trf',
  'knit dress with lace detail',
  'knit dress with lace',
  'knit dress with lace trf',
  'knit dress with lace trim',
  'lace-trimmed dress',
  'contrast camisole dress',
  'camisole dress',
  'camisole dress with lace'],
 ['jacquard dress',
  'pleated dress trf',
  'flowing dress trf',
  'floral print dress trf',
  'printed dress',
  'printed midi dress trf',
  'long printed dress trf',
  'long pleated dress trf',
  'flowing dress with pleats',
  'jacquilted dress'],
 ['herringbone nautical cap',
  'accesories 05',
  'textured nautical cap',
  'nautical cap',
  'neon nautical cap',
  'aquamarine nautical cap',
  'navy cap',
  'cap with peak',
  'n nautical cap',
  'sporty nautical cap'],
 ['herringbone nautical cap',
  'seersucker nautical cap',
  'linen nautical cap',
  'cap with inner strap detail',
  'textured nautical cap',
  'nautical cap with inner strap',
  'metallic nautical cap',
  'seamless nautical cap',
  'sporty nautical 

In [32]:
from postprocessing import save_submission

save_submission(result, "submission_27")

## Silly idea: just do majority voting

In [5]:
from postprocessing import read_proposals_files

# Earlier selection of single-model ensembles. Every entry is commented out and the
# list is overwritten by the assignment below, so this one has no effect.
# The numbers after each file are the author's recorded scores (metric not stated here).
submissions = [
#    "submission_56.csv",  # 30.29  BART-large x22
#    "submission_71.csv",  # 30.10  BART-large x14 crawl v3
#    "submission_77.csv",  # 29.76  BART-large x5 crawl v4
#    "submission_58.csv",  # 29.79  Pegasus-large x11
#    "submission_48.csv",  # 29.38  BART-base x21
#    "submission_55.csv",  # 29.08  t5-base x12
#    "submission_60.csv"   # 26.68  t5-small x12
]

# Hyper ensemble: join the best ensembles so far
# (this is the list actually used; commented entries are excluded from the vote)
submissions = [
    "submission_82.csv",  # 37.00
    "submission_79.csv",  # 36.97
    "submission_74.csv",  # 35.37
    "submission_66.csv",  # 34.83
    "submission_59.csv",  # 34.29
    "submission_57.csv",  # 34.21
    "submission_61.csv",  # 34.04
    #"submission_53.csv",  # 33.71
    "submission_49.csv",  # 32.72
    "submission_45.csv",  # 32.16
    "submission_41.csv",  # 32.91
    "submission_37.csv",  # 32.00
    "submission_32.csv",  # 30.94
    #"submission_29.csv",  # 29.03
]

    # Further submissions, not part of any list (kept for reference only):
    #"submission_50.csv",  # 29.07  t5-base x5
    #"submission_40.csv",  # 27.71  t5-base
    #"submission_21.csv",  # 27.37  BART-large
    #"submission_31.csv",  # 26.04  Pegasus-large
    #"submission_36.csv"   # 25.1   t5-small

# remove_duplicates=None presumably keeps repeated names so that each repetition
# counts as a vote in the majority ranking — TODO confirm in read_proposals_files.
proposals = read_proposals_files(submissions, remove_duplicates=None)

In [6]:
#from postprocessing import normalize_proposals_list
#
#proposals = normalize_proposals_list(proposals)
# TODO: seems to worsen results

In [7]:
from collections import Counter

def majority_ranking(proposals, top_k=10):
    """Rank the candidate names of every item by how often they were proposed.

    Args:
        proposals: iterable holding, per item, a collection of proposed names;
            a name that appears several times counts as several votes.
        top_k: maximum number of names kept per item. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        A list with one list per item: its names ordered from most to least
        voted and truncated to ``top_k``. Names with equal vote counts keep the
        order in which they first appeared.
    """
    return [
        [name for name, _votes in Counter(names).most_common(top_k)]
        for names in proposals
    ]

In [8]:
result = majority_ranking(proposals)

In [9]:
result

[['knit dress with lace trim',
  'lace dress trf',
  'knit dress with lace trf',
  'knit dress with lace detail',
  'knit dress with lace',
  'lace-trimmed dress',
  'combined lace dress',
  'contrast lace dress trf',
  'knit dress with matching lace',
  'camisole dress'],
 ['pleated dress trf',
  'printed dress trf',
  'flowing dress trf',
  'floral print dress trf',
  'floral print dress',
  'pleated dress',
  'pleated midi dress',
  'oversized dress trf',
  'textured dress with pleats',
  'printed dress'],
 ['nautical cap',
  'textured nautical cap',
  'nautical cap with peak',
  'striped nautical cap',
  'check nautical cap',
  'n nautical cap',
  'limited edition nautical cap',
  'seashell nautical cap',
  'sporty nautical cap',
  'faux leather nautical cap'],
 ['textured nautical cap',
  'nautical cap',
  'striped nautical cap',
  'limited edition nautical cap',
  'faux leather nautical cap',
  'nautical cap with strap',
  'sporty nautical cap',
  'technical nautical cap',
  'che

In [10]:
from postprocessing import save_submission

submission_name = "submission_84"
save_submission(result, submission_name)
save_submission(result, submission_name, zip=False)