# Spain AI hackathon notebook

## Prepare environment

In [None]:
import transformers

In [None]:
!nvidia-smi

## Prepare data

In [None]:
import pandas as pd

train = pd.read_csv("train.csv")

In [None]:
train.head()

In [None]:
test = pd.read_csv("test_descriptions.csv")
test.head()

Load extra crawled data

In [None]:
import json

# Load the extra crawled samples. The encoding is given explicitly so that
# reading the JSON file does not depend on the platform's default locale.
with open("crawleddata.json", encoding="utf-8") as f:
  crawled = json.load(f)

# Turn the parsed JSON into a DataFrame and preview the first rows
crawled = pd.DataFrame(crawled)
crawled.head()

Normalize data

In [None]:
import re

def normalize_texts(texts):
    """Normalize every text of an iterable.

    Returns a new list with the result of `normalize_text` for each element
    of `texts`, in the same order.
    """
    return list(map(normalize_text, texts))

def normalize_text(text):
    """Normalize a single text.

    The text is lowercased, every "<br/>" tag is removed, and everything from
    the first "height of model" occurrence onwards is dropped.
    """
    lowered = text.lower()
    without_breaks = lowered.replace("<br/>", "")
    # partition() returns the whole string as the head when the marker is absent
    kept, _marker, _discarded = without_breaks.partition("height of model")
    return kept

In [None]:
train = train.apply(normalize_texts, axis=0)
test = test.apply(normalize_texts, axis=0)
crawled = crawled.apply(normalize_texts, axis=0)

In [None]:
train

In [None]:
test

In [None]:
crawled

Append crawled data to train data, then split off a validation dataset

In [None]:
# Stack the crawled rows under the training rows, drop exact duplicate rows and shuffle.
# pd.concat is used because DataFrame.append was deprecated and later removed from pandas.
train = pd.concat([train, crawled]).drop_duplicates().sample(frac=1., random_state=123)

In [None]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(train, test_size=0.1, random_state=42)

In [None]:
len(train)

In [None]:
len(val)

In [None]:
#FIXME: downsample
#train = train[:10]
#val = val[:10]

Create PyTorch data collator

In [None]:
import torch
from transformers.models.bart.modeling_bart import shift_tokens_right

class SpainAICollator:
    """Data collator for SpainAI hackathon data

    Turns a batch of input texts, or of (input_text, output_text) tuples, into
    a dictionary of tensors placed on the computing device.
    """

    def __init__(self, tokenizer, model):
        """Initializes the collator

        Arguments:
            - tokenizer: object exposing batch_encode_plus, used for inputs and targets
            - model: model whose config.pad_token_id is used to build decoder inputs and labels
        """
        self.tokenizer = tokenizer
        # Use the GPU when one is available, otherwise fall back to the CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model

    def to_device(self, tensors):
        """Moves to the computing device a dictionary of Pytorch tensors (in place) and returns it"""
        for key in tensors:
            tensors[key] = tensors[key].to(self.device)
        return tensors

    def encode_inputs(self, texts):
        """Transforms an iterable of input texts into a dictionary of model input tensors, stored in the computing device"""
        input_encodings = self.tokenizer.batch_encode_plus(list(texts), padding="longest", return_tensors="pt")
        return self.to_device({
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
        })

    def encode_outputs(self, texts):
        """Transforms an iterable of target texts into decoder inputs and labels, stored in the computing device"""
        # Use the collator's own tokenizer, not whatever global happens to be named `tokenizer`
        target_encodings = self.tokenizer.batch_encode_plus(list(texts), padding="longest", return_tensors="pt")
        labels = target_encodings['input_ids']
        pad_token_id = self.model.config.pad_token_id
        # Decoder inputs must be built before the padding positions of the labels are overwritten
        decoder_input_ids = shift_tokens_right(labels, pad_token_id)
        # Padding positions are replaced by -100 in the labels
        labels[labels == pad_token_id] = -100
        return self.to_device({
            'decoder_input_ids': decoder_input_ids,
            'labels': labels,
        })

    def __call__(self, patterns):
        """Collate a batch of patterns

        Arguments:
            - patterns: iterable of tuples in the form (input_text, output_text),
              or just iterable of input texts

        Output: dictionary of torch tensors ready for model input

        Raises ValueError if the batch is empty, or if the first pattern is neither
        a text nor a tuple with exactly two texts.
        """
        # Check kind of input
        if len(patterns) < 1:
            raise ValueError(f"At least one pattern is required, found {len(patterns)}")
        first = patterns[0]
        if not isinstance(first, (tuple, str)):
            raise ValueError(f"Each pattern must be one text, or a tuple with two texts. Found {first}")
        # Decide on the type, not on the length: a two-character text also has len() == 2
        targets_provided = isinstance(first, tuple)
        if targets_provided and len(first) != 2:
            raise ValueError(f"Each pattern must be one text, or a tuple with two texts. Found {first}")
        # Split input and output texts from the list of tuples
        if targets_provided:
            input_texts, output_texts = zip(*patterns)
        else:
            input_texts = patterns
        # Encode inputs
        tensors = self.encode_inputs(input_texts)
        # Encode outputs (if provided)
        if targets_provided:
            tensors = {**tensors, **self.encode_outputs(output_texts)}
        return tensors

## Train BART conditional generation model

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

In [None]:
model_name = 'facebook/bart-base'
#model_name = 'facebook/bart-large'
tokenizer = BartTokenizer.from_pretrained(model_name)

In [None]:
model = BartForConditionalGeneration.from_pretrained(model_name)

In [None]:
collator = SpainAICollator(tokenizer, model)

In [None]:
from itertools import islice

def splitevery(iterable, n):
    """Yields consecutive lists of up to n elements taken from an iterable"""
    source = iter(iterable)
    # iter(callable, sentinel) keeps pulling chunks until an empty list comes back
    yield from iter(lambda: list(islice(source, n)), [])

def generate_names(model, tokenizer, collator, descriptions, num_sequences=20, batchsize=16, max_names=10):
    """Generates candidate names for each description using beam search

    Arguments:
        - model: generation model exposing generate()
        - tokenizer: tokenizer used to decode the generated token ids
        - collator: object whose encode_inputs() turns texts into input tensors
        - descriptions: iterable of description texts
        - num_sequences: number of beams, and of sequences returned per description
        - batchsize: number of descriptions sent to the model at once
        - max_names: maximum number of distinct names kept per description

    Output: list with one list of distinct names per description, in generation order
    """
    name_proposals = []
    # Generate predictions in batches
    for batch in splitevery(descriptions, batchsize):
        tensors = collator.encode_inputs(batch)
        summary_ids = model.generate(
            tensors['input_ids'],
            # The batch is padded to its longest text, so hand over the padding mask explicitly
            attention_mask=tensors['attention_mask'],
            num_beams=num_sequences,
            num_return_sequences=num_sequences,
            do_sample=False,
            early_stopping=True,
            # NOTE(review): top_k/top_p are sampling options; with do_sample=False
            # they presumably have no effect - confirm against the generate() docs
            top_k=50,
            top_p=0.95,
            length_penalty=0,
        )
        decoded = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
        # Group proposals for each element: num_sequences consecutive outputs per description
        for candidates in splitevery(decoded, num_sequences):
            # Remove duplicates while preserving first-seen order
            distinct = list(dict.fromkeys(candidates))
            name_proposals.append(distinct[:max_names])
    return name_proposals

In [None]:
from transformers import TrainerCallback
import numpy as np

class DCGCallback(TrainerCallback):
    """Trainer callback that adds a DCG score over a validation set to the evaluation metrics"""

    def __init__(self, tokenizer, collator, val):
        """Stores the tokenizer, collator and validation data used on each evaluation"""
        self.val = val
        self.collator = collator
        self.tokenizer = tokenizer

    def on_evaluate(self, args, state, control, model, metrics, **kwargs):
        """Generates names for the validation descriptions and stores their DCG in metrics["eval_dgc"]"""
        descriptions = self.val["description"]
        # Generate names
        proposed_names = generate_names(model, self.tokenizer, self.collator, descriptions)
        proposals_val = pd.DataFrame({
            "description": descriptions,
            "name": self.val["name"],
            "proposed": proposed_names,
        })
        # Accumulate the discounted gain of each true name found among its proposals
        gain = 0
        for true_name, candidates in zip(proposals_val["name"], proposals_val["proposed"]):
            if true_name not in candidates:
                continue
            position = candidates.index(true_name)
            gain += 1 / np.log2(position + 2)
        # Average over the validation rows, expressed as a percentage
        metrics["eval_dgc"] = gain / len(proposals_val) * 100

dgc_callback = DCGCallback(tokenizer, collator, val)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration. Checkpoints go to ./model and logs to ./logs.
training_args = TrainingArguments(
    output_dir='./model',
    # NOTE(review): the training log recorded below this cell shows 10 epochs,
    # while this value is 1 - confirm which one is intended before re-running.
    num_train_epochs=1,
    # Alternative batch configuration kept for reference (batch 8 with 4 accumulation steps)
    #per_device_train_batch_size=8,
    per_device_train_batch_size=32,
    #gradient_accumulation_steps=4,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    # Evaluate once per epoch
    evaluation_strategy="epoch",
    # "eval_dgc" is the key that DCGCallback.on_evaluate writes into the metrics dict;
    # higher values are treated as better when picking the best model.
    metric_for_best_model="eval_dgc",
    greater_is_better=True,
    load_best_model_at_end=True,
    save_steps=1000,
    save_total_limit=7
)

# Datasets are plain lists of (description, name) tuples, which is the pair
# format that SpainAICollator.__call__ accepts.
trainer = Trainer(
    model=model,
    data_collator=collator,
    args=training_args,
    train_dataset=list(zip(train['description'], train['name'])),
    eval_dataset=list(zip(val['description'], val['name'])),
    # The callback computes the DCG metric after each evaluation
    callbacks=[dgc_callback]
)

In [None]:
trainer.train()

Epoch 	Training Loss 	Validation Loss 	Dgc
1 	1.407647 	1.071616 	20.505750
2 	1.086566 	0.929239 	24.267701
3 	0.937676 	0.866479 	27.718055
4 	0.821126 	0.823135 	29.488869
5 	0.720493 	0.790940 	31.402732
6 	0.663704 	0.764436 	32.658482
7 	0.616000 	0.759198 	34.027026
8 	0.562149 	0.743891 	34.719459
9 	0.531981 	0.748388 	35.706914
10 	0.498206 	0.751276 	36.305989

In [None]:
# Load from checkpoint
#model = BartForConditionalGeneration.from_pretrained("./model/checkpoint-15128")
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = model.to(device)

Names generation

In [None]:
# List of common english words
!wget https://github.com/dwyl/english-words/raw/master/words.txt

In [None]:
# Build the set of lowercased words, one per line of the downloaded word list
with open("words.txt") as f:
  words = set(map(str.lower, f.read().splitlines()))

In [None]:
from spacy.lang.en import English
# Blank English pipeline; only its tokenizer is needed
nlp = English()
# Defaults.create_tokenizer is no longer available in spaCy 3; the pipeline's
# own tokenizer attribute is available in both spaCy 2.x and 3.x.
spacy_tokenizer = nlp.tokenizer

In [None]:
samples = 10
names = generate_names(model, tokenizer, collator, val["description"][:samples])
proposals_val = pd.DataFrame({
    "description": val["description"][:samples],
    "name": val["name"][:samples],
    "proposed": names,
})
proposals_val

In [None]:
i = 4
print(proposals_val.iloc[i]["description"])
print(proposals_val.iloc[i]["name"])
print(len(set(proposals_val.iloc[i]["proposed"])))
proposals_val.iloc[i]["proposed"]

In [None]:
#names = generate_names(model, tokenizer, collator, train["description"])
#proposals_train = pd.DataFrame({
#    "description": train["description"],
#    "name": train["name"],
#    "proposed": names,
#})
#proposals_train

In [None]:
#!tar -zcvf model.tar.gz model/checkpoint-4725/

## Prepare submission

In [None]:
names = generate_names(model, tokenizer, collator, test["description"])

In [None]:
names[1]

In [None]:
submission = pd.DataFrame({
    "name": [",".join(name_list) for name_list in names]
})

In [None]:
submission.to_csv("submission.csv", index=False)

## Results so far

|Pre-processing|Model|Generation|Validation loss|Validation DCG|Test result|
|--------------|-----|----------|-----------------|--------------|-----------|
|Lowercase, remove br|Simple BART base|Beam 10, sequences 10|-|-|10.41|
|Lowercase, remove br|BART-base, batchsize 32, warmup steps 500, weight_decay=0.01, max epochs 10, select lowest loss in 10% val.|Beam 10, sequences 10|0.883147|-|11.47|
|Lowercase, remove br|BART-large, batchsize 8, warmup steps 500, weight_decay=0.01, max epochs 6, select lowest loss in 10% val.|Beam 10, sequences 10|1.198717|-|13.57|
|Lowercase, remove br|BART-large, batchsize 8x4, warmup steps 500, weight_decay=0.01, max epochs 10, select lowest loss in 10% val in last 4 steps|Beam 10, sequences 10|1.118869|-|14.84|
|Lowercase, remove br|BART-large, batchsize 8x4, warmup steps 500, weight_decay=0.01, max epochs 4, select lowest loss in 10% val in last 4 steps|Beam 10, sequences 10|1.032960|-|15.12|
|Lowercase, remove br, remove height of model (WRONG PREPROCESSING)|BART-large, batchsize 8x4, warmup steps 500, weight_decay=0.01, max epochs 7, select lowest loss in 10% val in last 4 steps|Beam 10, sequences 10|1.075144|-|0.04|
|Lowercase, remove br, remove height of model|BART-large, batchsize 8x4, warmup steps 500, weight_decay=0.01, max epochs 9, select lowest loss in 10% val in last 4 steps|Beam 10, sequences 10|1.108078|-|15.4|
|Lowercase, remove br, remove height of model, add crawled data v2, remove duplicates|BART-large, batchsize 8x4, warmup steps 500, weight_decay=0.01, max epochs 7, select lowest loss in 10% val in last 4 steps|Beam 10, sequences 10|0.941261|-|16.72|
|Lowercase, remove br, remove height of model, add crawled data v2, remove duplicates|BART-large, batchsize 8x4, warmup steps 500, weight_decay=0.01, max epochs 7, select lowest loss in 10% val in last 4 steps|Beam 20, sequences 20, keep 10 different|0.941261|-|12.09|
|Lowercase, remove br, remove height of model, add crawled data v2, remove duplicates|BART-base, batchsize 32, warmup steps 500, weight_decay=0.01, max epochs 10, select highest DCG in 10% val in last 7 steps|Beam 20, sequences 20, top_k=50, top_p=0.95, keep 10 different|0.751276|36.305989|13.71|

Note: in the last run the best validation result was obtained at the last epoch, so training for additional epochs might improve it further.

## Improvements

* Fine-tune model parameters with a validation dataset and GridSearch, using [competition metric](https://en.wikipedia.org/wiki/Discounted_cumulative_gain)
* Use back-translation to generate more samples
* Filter generated texts using a dictionary. Discard texts with out-of-dictionary words.
* Try to obtain probabilities out of beam search. Sort results by probabilities.
* Try creating a simple model with one class per word in the name, that predicts which words appear in the name. Then sort the words according to most frequent order in train data.