# Notebook for generating predictions from a model

In [1]:
# Load IPython's autoreload extension and enable mode 2, so edits to the
# project's local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Config

In [2]:
# Local directory of the saved checkpoint whose weights are loaded below.
model_path = "./good_models/pegasus-large-submission-30/"
# Identifier of the pretrained checkpoint; used below to load the tokenizer.
model_class = "google/pegasus-large"

## Load resources

In [3]:
from transformers import BartForConditionalGeneration, PegasusForConditionalGeneration

# To load a BART checkpoint instead, swap in the line below:
#model = BartForConditionalGeneration.from_pretrained(model_path)
# Load the Pegasus weights from the directory configured in `model_path`.
model = PegasusForConditionalGeneration.from_pretrained(model_path)

In [4]:
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Move the model to the chosen device. As the cell's last expression, this
# also displays the module structure.
model.to(device)

PegasusForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): BartSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=Tru

In [5]:
from transformers import AutoTokenizer

# The tokenizer is loaded from the checkpoint id in `model_class`, not from
# `model_path`. NOTE(review): assumes the saved model uses the same vocabulary
# as that checkpoint — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_class)

In [6]:
from model import SpainAICollator

# Build the project's collator from the tokenizer and model loaded above; it is
# passed to generate_names in the cells below.
collator = SpainAICollator(tokenizer, model)

In [7]:
from data import load_data

# load_data() returns three splits; each one is indexed by "description" in
# the generation cells below.
train, val, test = load_data()

## Generate predictions for test data

In [8]:
from model import generate_names
from postprocessing import save_submission

In [9]:
# Generate names for the test descriptions (batchsize=4).
names = generate_names(
    model,
    tokenizer,
    collator,
    test["description"],
    batchsize=4,
)

In [10]:
# Show the generated names; the output is a list of lists of name strings.
names

[['knit dress with lace',
  'lace dress',
  'knit dress',
  'lace dress trf',
  'knit dress trf',
  'printed dress trf',
  'lace knit dress',
  'limited edition knit dress',
  'contrast knit dress',
  'ribbed dress trf'],
 ['printed dress',
  'printed dress trf',
  'pleated dress trf',
  'pleated dress',
  'floral print dress',
  'loose-fitting dress',
  'polka dot print dress',
  'poplin dress',
  'flowing dress trf',
  'satin dress trf'],
 ['nautical cap',
  'nautical cap with peak',
  'peak nautical cap',
  'nautical cap trf',
  'limited edition nautical cap',
  'cap with peak',
  'nautical cap with bow',
  'marine cap with peak',
  'printed nautical cap',
  'striped nautical cap'],
 ['nautical cap',
  'nautical cap trf',
  'nautical cap with peak',
  'limited edition nautical cap',
  'nautical cap with bow',
  'printed nautical cap',
  'nautical cap with strap',
  'embroidered nautical cap',
  'striped nautical cap',
  'sporty nautical cap'],
 ['nautical cap',
  'nautical cap with 

In [11]:
# Save the generated names with the project's save_submission helper under the
# name "submission_30".
save_submission(names, "submission_30")

In [12]:
# Save again with zip=False. NOTE(review): presumably this writes an
# uncompressed copy of the same submission — confirm in
# postprocessing.save_submission.
save_submission(names, "submission_30", zip=False)

In [None]:
# Same generation as for `names`, but additionally passing num_sequences=30.
extended_names = generate_names(
    model,
    tokenizer,
    collator,
    test["description"],
    batchsize=4,
    num_sequences=30,
)

In [None]:
# Show the names produced by the num_sequences=30 run.
extended_names

In [None]:
# NOTE(review): saved as "submission_23", whereas the loaded checkpoint and the
# earlier saves use "submission-30"/"submission_30" — confirm the name is intended.
save_submission(extended_names, "submission_23")

Check all lines have 10 names

In [None]:
from collections import Counter

# Tally how many names each entry of extended_names holds.
Counter(map(len, extended_names))

Check the names in the basic file are the first in the extended file

In [None]:
# Every name in a basic list must also appear somewhere in the matching
# extended list (order is ignored here).
for n, xn in zip(names, extended_names):
    assert set(n) <= set(xn), f"{set(n)} not in {set(xn)}"

In [None]:
# Stricter check: each extended list must begin with exactly the matching
# basic list, in the same order.
for n, xn in zip(names, extended_names):
    assert n == xn[:len(n)], f"{xn} != {n}"

Neither the same elements nor the same ordering! So we might be able to produce better results with a different generation method?

Now try using top_k and top_p filters, and removing the length penalties.

In [None]:
# NOTE(review): these arguments are identical to the extended_names call, so
# any top_k / top_p / length-penalty change must come from an edit inside
# model.generate_names (picked up via autoreload) — confirm, since that makes
# this result impossible to reproduce from the notebook alone.
refined_names = generate_names(
    model,
    tokenizer,
    collator,
    test["description"],
    batchsize=4,
    num_sequences=30,
)

In [None]:
# Show the names produced by the refined generation run.
refined_names

In [None]:
# NOTE(review): saved as "submission_22", which does not match the
# "submission-30" checkpoint loaded above — confirm the name is intended.
save_submission(refined_names, "submission_22")

In [None]:
# For each pair of entries, count the names that appear in only one of the two
# candidate lists, then tally how often each count occurs.
Counter(
    len(set(rn).symmetric_difference(xn))
    for xn, rn in zip(extended_names, refined_names)
)

## Generate predictions for training and validation data

This is useful for training ranker models

In [None]:
from model import generate_names
from postprocessing import save_submission

In [None]:
# Generate names for the training descriptions with max_candidates=9999.
train_names = generate_names(
    model,
    tokenizer,
    collator,
    train["description"],
    batchsize=16,
    max_candidates=9999,
)
# NOTE(review): the output name says "BART-base-submission-23", but
# `model_path` points at a pegasus-large checkpoint — confirm the label.
save_submission(train_names, "train_BART-base-submission-23")

In [None]:
# Generate names for the validation descriptions with max_candidates=9999.
val_names = generate_names(
    model,
    tokenizer,
    collator,
    val["description"],
    batchsize=16,
    max_candidates=9999,
)
# NOTE(review): the output name says "BART-base-submission-23", but
# `model_path` points at a pegasus-large checkpoint — confirm the label.
save_submission(val_names, "val_BART-base-submission-23")