# Notebook to test measuring log likelihoods of predictions

In [1]:
%load_ext autoreload
%autoreload 2

## Config

In [2]:
model_path = "./good_models/BART-base-submission-23/"
model_class = 'facebook/bart-base'

## Load resources

In [3]:
from transformers import BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained(model_path)

In [4]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768, padding_idx=1)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
   

In [5]:
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained(model_class)

In [6]:
from model import SpainAICollator

collator = SpainAICollator(tokenizer, model)

In [7]:
from data import load_data

train, val, test = load_data()

## Generate log-probabilities for a name

In [8]:
sample_description = train["description"].iloc[0]
sample_description

'tote bag in a combination of colours. braided exterior in a combination of materials. shoulder straps with a decorative stud. lined interior with pocket and zip purse. magnetic clasp closure.height x length x width 26.3 x 38.5 x 14.5 cm. / 10.3 x 15.1 x 5.7″'

In [9]:
sample_name = train["name"].iloc[0]
sample_name

'fabric tote bag'

In [30]:
import torch

def model_logprob(model, collator, input_text, output_text, length_penalty=0):
    encoded_inputs = collator.encode_inputs([input_text])
    encoded_outputs = collator.encode_outputs([output_text])
    with torch.no_grad():
        output = model(**encoded_inputs, **encoded_outputs)
    total_logprob = sum([
        output["logits"][0, i].log_softmax(dim=0)[encoded_outputs["labels"][0][i]]
        for i in range(1, len(encoded_outputs["labels"][0]) - 1)
    ])
    # Normalize by length: https://www.aclweb.org/anthology/W18-6322.pdf
    num_output_tokens = len(encoded_outputs["labels"][0]) - 2  # Ignore tokens for text start/end
    total_logprob /= (5+num_output_tokens)**length_penalty / (5+1)**length_penalty
    return total_logprob

In [31]:
def resort_name_proposals(model, collator, descriptions, name_proposals, length_penalty=0):
    resorted_proposals = []
    for description, proposals in zip(descriptions, name_proposals):
        logprobs = [model_logprob(model, collator, description, name, length_penalty) for name in proposals]
        #resorted_proposals.append([z[0] for z in sorted(zip(proposals, logprobs), key=lambda x: x[1], reverse=True)])
        resorted_proposals.append([z for z in sorted(zip(proposals, logprobs), key=lambda x: x[1], reverse=True)])
    return resorted_proposals

In [11]:
from postprocessing import read_proposals_files, normalize_proposals_list

name_proposals = normalize_proposals_list(read_proposals_files(["val_BART-base-submission-23.csv"]))

In [38]:
resort_name_proposals(model, collator, train["description"][0:1], name_proposals[0:1], length_penalty=1)

[[('fabric tote bag', tensor(-0.4432, device='cuda:0')),
  ('fabric tote bag with studs', tensor(-0.7342, device='cuda:0')),
  ('fabric tote bag with stud', tensor(-0.7806, device='cuda:0')),
  ('braided tote bag', tensor(-1.1178, device='cuda:0')),
  ('braided tote bag with studs', tensor(-1.2826, device='cuda:0')),
  ('plaited tote bag with stud', tensor(-1.3074, device='cuda:0')),
  ('plaited tote bag', tensor(-1.3076, device='cuda:0')),
  ('contrast braided tote bag', tensor(-1.3372, device='cuda:0')),
  ('braided tote bag with stud', tensor(-1.3636, device='cuda:0')),
  ('combined braided tote bag', tensor(-1.9073, device='cuda:0')),
  ('fabric tote bag trf', tensor(-1.9641, device='cuda:0')),
  ('woven tote bag with studs', tensor(-2.3306, device='cuda:0')),
  ('braided tote bag with purse', tensor(-2.3714, device='cuda:0')),
  ('multicoloured braided tote bag', tensor(-2.4489, device='cuda:0')),
  ('braided maxi handbag trf', tensor(-2.5390, device='cuda:0')),
  ('beaded tote ba

In [12]:
from postprocessing import unroll_test_data

unrolled_proposals = unroll_test_data(val["description"], name_proposals)
unrolled_proposals

Unnamed: 0,name,description,original_row
0,textured t-shirt with ruffle trim,round neck t-shirt featuring long sleeves with...,0
1,ruffled poplin t-shirt,round neck t-shirt featuring long sleeves with...,0
2,t-shirt with frilled neck,round neck t-shirt featuring long sleeves with...,0
3,plush t-shirt with ruffles,round neck t-shirt featuring long sleeves with...,0
4,ruffled t-shirt with ruffles,round neck t-shirt featuring long sleeves with...,0
...,...,...,...
110965,sweatshirt with reversible sequin hood,"long sleeve sweatshirt with a round neckline, ...",3729
110966,plush sweatshirt with reversible sequins,"long sleeve sweatshirt with a round neckline, ...",3729
110967,sequinned doll sweatshirt trf,"long sleeve sweatshirt with a round neckline, ...",3729
110968,sequinned long sleeve sweatshirt,"long sleeve sweatshirt with a round neckline, ...",3729


In [13]:
len(unrolled_proposals)

110970

In [55]:
%%time
from postprocessing import model_logprobs
#logprobs = model_logprobs(model, collator, unrolled_proposals["description"], unrolled_proposals["name"])
logprobs_penalty = model_logprobs(model, collator, unrolled_proposals["description"], unrolled_proposals["name"], length_penalty=1)

CPU times: user 5min 20s, sys: 50.2 s, total: 6min 11s
Wall time: 6min 2s


In [15]:
len(logprobs)

110970

In [56]:
unrolled_proposals["score"] = logprobs_penalty

In [57]:
unrolled_proposals

Unnamed: 0,name,description,original_row,score
0,textured t-shirt with ruffle trim,round neck t-shirt featuring long sleeves with...,0,-14.405650
1,ruffled poplin t-shirt,round neck t-shirt featuring long sleeves with...,0,-23.951827
2,t-shirt with frilled neck,round neck t-shirt featuring long sleeves with...,0,-23.716456
3,plush t-shirt with ruffles,round neck t-shirt featuring long sleeves with...,0,-14.545201
4,ruffled t-shirt with ruffles,round neck t-shirt featuring long sleeves with...,0,-14.691045
...,...,...,...,...
110965,sweatshirt with reversible sequin hood,"long sleeve sweatshirt with a round neckline, ...",3729,-26.678821
110966,plush sweatshirt with reversible sequins,"long sleeve sweatshirt with a round neckline, ...",3729,-26.217781
110967,sequinned doll sweatshirt trf,"long sleeve sweatshirt with a round neckline, ...",3729,-28.294693
110968,sequinned long sleeve sweatshirt,"long sleeve sweatshirt with a round neckline, ...",3729,-28.489211


In [58]:
from postprocessing import reroll_score_test_data

sorted_proposals = reroll_score_test_data(unrolled_proposals, ascending=False)
sorted_proposals

0       [ribbed t-shirt with ruffles, floral t-shirt w...
1       [flowing dress with belt, satin dress with bel...
2       [cloudbedspread with metallic thread, cloud be...
3       [ceramic door knob with drawings (pack of 2), ...
4       [puffer jacket with faux fur trim, reflective-...
                              ...                        
3725    [water lily dress - limited edition, water lil...
3726    [buttoned knit hoodie, plain sweatshirt with b...
3727    [t-shirt with ruffled hem, plain t-shirt with ...
3728    [pine cone and tartan napkin holder (pack of 2...
3729    [sweatshirt with reversible sequin detail, lim...
Name: proposals, Length: 3730, dtype: object

In [59]:
unrolled_proposals.iloc[0:31]

Unnamed: 0,name,description,original_row,score
0,textured t-shirt with ruffle trim,round neck t-shirt featuring long sleeves with...,0,-14.40565
1,ruffled poplin t-shirt,round neck t-shirt featuring long sleeves with...,0,-23.951827
2,t-shirt with frilled neck,round neck t-shirt featuring long sleeves with...,0,-23.716456
3,plush t-shirt with ruffles,round neck t-shirt featuring long sleeves with...,0,-14.545201
4,ruffled t-shirt with ruffles,round neck t-shirt featuring long sleeves with...,0,-14.691045
5,t-shirt with ruffles,round neck t-shirt featuring long sleeves with...,0,-24.523283
6,floral t-shirt,round neck t-shirt featuring long sleeves with...,0,-33.848721
7,t-shirt with frilled neck trf,round neck t-shirt featuring long sleeves with...,0,-14.972052
8,t-shirt with ruffles trf,round neck t-shirt featuring long sleeves with...,0,-15.221459
9,frilly t-shirt,round neck t-shirt featuring long sleeves with...,0,-35.405031


In [36]:
sorted_proposals[0]

['ribbed t-shirt with ruffles',
 'floral t-shirt with ruffles',
 'textured t-shirt with ruffles',
 'ribbed t-shirt with ruffle trim',
 'long sleeve t-shirt with ruffles',
 'satin t-shirt with ruffles',
 'textured t-shirt with ruffle trim',
 'gathered t-shirt with ruffles',
 'plush t-shirt with ruffles',
 'striped t-shirt with ruffles']

In [62]:
from model import dcg
import numpy as np

print(f"Original DCG={dcg(name_proposals, val['name'])}")
print(f"Loglikelihood DCG={dcg(sorted_proposals, val['name'])}")
print(f"Random DCG={dcg([list(np.random.permutation(x)) for x in sorted_proposals], val['name'])}")

Original DCG=12.70006517501936
Loglikelihood DCG=7.455891613002281
Random DCG=5.073725060921905
