In [None]:
# hide
# Print the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Fri May 28 11:42:45 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 960M    Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   41C    P8    N/A /  N/A |    312MiB /  4046MiB |     24%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# hide
import sys
if 'google.colab' in sys.modules:
    !pip install -Uqq fastai transformers datasets wandb
    !pip install git+git://github.com/aikindergarten/fasthugs.git

In [None]:
#all_slow

# Translation: French to English with MarianMT

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, MarianConfig, AutoTokenizer, AutoConfig
from fastai.text.all import *
from fastai.callback.wandb import *

from fasthugs.learner import TransLearner
from fasthugs.data import TransformersTextBlock

## Setup

Let's define the main settings for the run in one place:

In [None]:
# Dataset label used to name the W&B run further down; it must be defined here,
# otherwise the tracking cell fails with a NameError on a fresh kernel.
ds_name = 'questions_easy'
model_name = "Helsinki-NLP/opus-mt-fr-en"

max_len = 512
bs = 8
val_bs = bs*2

lr = 2e-5

In [None]:
# Question pairs with an English (`en`) and a French (`fr`) column.
csv_path = '/notebooks/data/questions_easy.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,en,fr
0,What is light ?,Qu’est-ce que la lumière?
1,Who are we?,Où sommes-nous?
2,Where did we come from?,D'où venons-nous?
3,What would we do without it?,Que ferions-nous sans elle ?
4,"What is the absolute location (latitude and longitude) of Badger, Newfoundland and Labrador?","Quelle sont les coordonnées (latitude et longitude) de Badger, à Terre-Neuve-etLabrador?"


## Dataloaders

In [None]:
# `model_name` comes from the settings cell above; re-declaring it here would
# let the two values drift apart.
config = AutoConfig.from_pretrained(model_name)
config

MarianConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_length": 512,
  "max_position_embeddings": 512,
  "mode

In [None]:
# Load the tokenizer that belongs to the `model_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
#export
class TokBatchTransform(Transform):
    """
    Tokenizes texts in batches using pretrained HuggingFace tokenizer.
    The first element in a batch can be single string or 2-tuple of strings.
    If `with_labels=True` the "labels" are added to the output dictionary.
    If `do_targets=True` and the second element of the first sample is a string,
    the second elements are tokenized as target texts and stored under "labels".

    Either pass a ready `tokenizer`, or a `pretrained_model_name` from which
    `tokenizer_cls.from_pretrained` builds one. Extra `kwargs` are forwarded to
    every tokenizer call.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None, 
                 do_targets=False, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        self.kwargs = kwargs
        # Remembered by `decodes` once a batch of text pairs has been encoded.
        self._two_texts = False
        # `encodes` reads `padding`, `truncation`, `max_length`, `with_labels`
        # and `do_targets` from `self`; they are set here from the arguments.
        store_attr()
    
    def encodes(self, batch):
        "Tokenize a list of samples; returns a tuple whose first item is the tokenizer output"
        # batch is a list of tuples of ({text or (text1, text2)}, {targets...})
        if is_listy(batch[0][0]): # 1st element is tuple
            self._two_texts = True
            texts = ([s[0][0] for s in batch], [s[0][1] for s in batch])
        elif is_listy(batch[0]): 
            texts = ([s[0] for s in batch],)
        else: # batch is list of texts
            texts = (list(batch),)
            batch = [(s, ) for s in batch]
        inps = self.tokenizer(*texts,
                              add_special_tokens=True,
                              padding=self.padding,
                              truncation=self.truncation,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)
        
        # Samples may hold nothing but the input text (e.g. a plain list of
        # strings), so check that a second element exists before inspecting it.
        has_extra = len(batch[0]) > 1
        if self.do_targets and has_extra and isinstance(batch[0][1], str):
            target_texts = [s[1] for s in batch]
            with self.tokenizer.as_target_tokenizer():
                targets = self.tokenizer(target_texts,
                                  padding=self.padding,
                                  truncation=self.truncation,
                                  max_length=self.max_length,
                                  return_tensors='pt', 
                                  **self.kwargs).input_ids
            # NOTE(review): targets are padded by the tokenizer exactly like the
            # inputs; presumably the padded positions then count towards the
            # model's loss unless they are masked elsewhere — TODO confirm.
            inps['labels'] = targets
            res = (inps, )
        else:
            # inps are batched, collate targets into batches too
            labels = default_collate([s[1:] for s in batch])
            if self.with_labels and len(labels) > 0:
                # TODO consider cases when there are multiple labels
                inps['labels'] = labels[0]
                res = (inps, )
            else:
                # No labels requested, or none present: pass through whatever was collated.
                res = (inps, ) + tuple(labels)
        return res
    
    def decodes(self, x:TensorText):
        "Turn token ids back into text (a pair of texts if text pairs were encoded)"
        if self._two_texts:
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStr(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStr(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        return TitledStr(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [None]:
#export
class TransformersTextBlock(TransformBlock):
    "`TransformBlock` for texts that are processed per batch with a pretrained Huggingface tokenizer"
    @delegates(TokBatchTransform)
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, preprocessed=False, **kwargs):
        # Choose the batch-level transform: `PadBatchTransform` when `preprocessed`
        # is set, `TokBatchTransform` otherwise.
        if preprocessed:
            tfm_cls = PadBatchTransform
        else:
            tfm_cls = TokBatchTransform
        tfm = tfm_cls(pretrained_model_name=pretrained_model_name,
                      tokenizer_cls=tokenizer_cls,
                      config=config,
                      tokenizer=tokenizer,
                      **kwargs)
        dl_kwargs = dict(before_batch=tfm, create_batch=fa_convert)
        super().__init__(dl_type=SortedDL, dls_kwargs=dl_kwargs, batch_tfms=Undict())

In [None]:
# Load the pretrained weights of the checkpoint. `from_config` would only build
# the architecture with randomly initialised weights, which is training from
# scratch rather than fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
   

In [None]:
from fasthugs.data import TextGetter, Undict

In [None]:
from fastcore.transform import ItemTransform

In [None]:
@ItemTransform
def untuple1(x):
    "Unpack the first element of `x` into a flat tuple"
    first = x[0]
    return tuple(first)

In [None]:
# One text block; `do_targets=True` makes the batch transform tokenize the second
# string of every sample as the target. `TextGetter('fr', 'en')` presumably
# yields the (`fr`, `en`) pair of a row, which `untuple1` then flattens.
# The split is seeded so the validation set, and with it the reported metrics,
# stay the same across runs.
dblock = DataBlock(
    blocks = [TransformersTextBlock(tokenizer=tokenizer, do_targets=True)],
    get_x=TextGetter('fr', 'en'),
    item_tfms=untuple1,
    splitter=RandomSplitter(seed=42))

In [None]:
# Step through the data block on `df` and print what each stage produces.
dblock.summary(df)

Setting-up type transforms pipelines
Collecting items from                                                                                                  en  \
0                                                                                   What is light ?   
1                                                                                       Who are we?   
2                                                                           Where did we come from?   
3                                                                      What would we do without it?   
4      What is the absolute location (latitude and longitude) of Badger, Newfoundland and Labrador?   
...                                                                                             ...   
40133                                                         What actions can lead to termination?   
40134                                           What makes one crew do a go-around and another not?   
40135         

In [None]:
%%time
# NOTE(review): `bs` is reassigned here, replacing the value from the settings
# cell, and the `val_bs` defined there is not used — confirm which is intended.
bs = 16
dls = dblock.dataloaders(df, bs=bs, val_bs=bs*2)

CPU times: user 14.7 s, sys: 4.18 ms, total: 14.7 s
Wall time: 14.7 s


In [None]:
# Fetch a single batch to inspect the tokenized inputs, attention mask and labels.
b = dls.one_batch()
b

({'input_ids': tensor([[ 3625, 12775,  1538,  ..., 19226,    54,     0],
          [  350,    19,   898,  ..., 59513, 59513, 59513],
          [ 1590,    15,    21,  ..., 59513, 59513, 59513],
          ...,
          [ 4717,    43,     8,  ..., 59513, 59513, 59513],
          [ 1276,     6,    82,  ..., 59513, 59513, 59513],
          [ 1955,    16, 31548,  ..., 59513, 59513, 59513]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          ...,
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
  'labels': tensor([[  430, 11156,  1538,  ...,   632,    54,     0],
          [ 9074,     4,   898,  ..., 59513, 59513, 59513],
          [  430,   373,    45,  ..., 59513, 59513, 59513],
          ...,
          [  430,    32,     4,  ..., 59513, 59513, 59513],
          [  430,   664,    84,  ..., 59513, 59513, 59513],


In [None]:
# NOTE(review): the saved output of this cell is
# "AttributeError: 'Tensor' object has no attribute 'show'", so the call fails
# as written — TODO fix or remove before sharing.
dls.show_batch(max_n=4)

AttributeError: 'Tensor' object has no attribute 'show'

## Tracking with W&B

Here come some details on W&B tracking and the leaderboard to be established...

In [None]:
import wandb

# Run metadata for Weights & Biases. Relies on `ds_name`, `model_name` and `lr`
# being defined earlier in the notebook.
WANDB_NAME = f'{ds_name}-{model_name}'
GROUP = f'{ds_name}-{model_name}-simple-{lr:.0e}'
# NOTE(review): 'RAdam'/'radam' are hard-coded below, while no optimizer is
# selected explicitly anywhere in this notebook — confirm they describe the run.
NOTES = f'finetuning {model_name} with RAdam lr={lr:.0e}'
CONFIG = {}
TAGS =[model_name, ds_name, 'radam']

In [None]:
#hide_output
# Collect the run settings first, then start (or restart) the W&B run.
run_kwargs = dict(
    reinit=True,
    project="fasthugs",
    entity="fastai_community",
    name=WANDB_NAME,
    group=GROUP,
    notes=NOTES,
    tags=TAGS,
    config=CONFIG,
)
wandb.init(**run_kwargs);

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mfastai_community[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.28 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


## Training

In [None]:
# `loss_func=noop`: no fastai loss function is applied here (presumably the
# model returns its own loss when `labels` are part of the input — confirm).
bleu = CorpusBLEUMetric()
learn = TransLearner(dls, model, metrics=bleu, loss_func=noop)
learn = learn.to_fp16()

In [None]:
# Train for 2 epochs with the one-cycle schedule.
# NOTE(review): the literal 1e-4 differs from the `lr` setting (2e-5) that the
# W&B notes report — confirm which one is intended.
learn.fit_one_cycle(2, 1e-4)

epoch,train_loss,valid_loss,corpus_bleu,time
0,8.208636,8.356507,0.056414,06:35
1,7.910537,8.097847,0.064456,06:35


In [None]:
# Display the module tree of the underlying base model.
model.base_model

MarianModel(
  (shared): Embedding(59514, 512, padding_idx=59513)
  (encoder): MarianEncoder(
    (embed_tokens): Embedding(59514, 512, padding_idx=59513)
    (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
    (layers): ModuleList(
      (0): MarianEncoderLayer(
        (self_attn): MarianAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): MarianEncoderLayer(
        (self_attn): MarianAtten

In [None]:
# Display the output projection (`lm_head`) weights for comparison with the embeddings below.
model.lm_head.weight

Parameter containing:
tensor([[-9.0407e-03,  2.8072e-02,  1.4343e-02,  ..., -3.6860e-02,
         -4.2824e-02,  9.6411e-03],
        [ 3.2397e-03,  3.7629e-02,  1.3613e-02,  ..., -1.7274e-02,
         -4.1874e-03, -3.3502e-02],
        [ 3.9743e-03,  2.9589e-02,  1.3218e-02,  ..., -3.3958e-02,
          2.0799e-02, -1.6180e-02],
        ...,
        [ 2.5598e-02,  1.8182e-02,  1.2885e-02,  ...,  1.1126e-02,
          2.1903e-02,  3.5934e-03],
        [-9.4439e-03, -6.3931e-03,  2.4074e-03,  ..., -1.0516e-02,
          1.6802e-02, -2.4351e-02],
        [ 1.0829e-09,  1.3621e-08, -1.2080e-08,  ..., -1.1688e-08,
          6.7046e-09, -4.8634e-09]], device='cuda:0', requires_grad=True)

In [None]:
# Display the `shared` embedding weights of the base model.
model.base_model.shared.weight

Parameter containing:
tensor([[-9.0407e-03,  2.8072e-02,  1.4343e-02,  ..., -3.6860e-02,
         -4.2824e-02,  9.6411e-03],
        [ 3.2397e-03,  3.7629e-02,  1.3613e-02,  ..., -1.7274e-02,
         -4.1874e-03, -3.3502e-02],
        [ 3.9743e-03,  2.9589e-02,  1.3218e-02,  ..., -3.3958e-02,
          2.0799e-02, -1.6180e-02],
        ...,
        [ 2.5598e-02,  1.8182e-02,  1.2885e-02,  ...,  1.1126e-02,
          2.1903e-02,  3.5934e-03],
        [-9.4439e-03, -6.3931e-03,  2.4074e-03,  ..., -1.0516e-02,
          1.6802e-02, -2.4351e-02],
        [ 1.0829e-09,  1.3621e-08, -1.2080e-08,  ..., -1.1688e-08,
          6.7046e-09, -4.8634e-09]], device='cuda:0', requires_grad=True)

In [None]:
# Display the encoder's token-embedding weights.
model.base_model.encoder.embed_tokens.weight

Parameter containing:
tensor([[-9.0407e-03,  2.8072e-02,  1.4343e-02,  ..., -3.6860e-02,
         -4.2824e-02,  9.6411e-03],
        [ 3.2397e-03,  3.7629e-02,  1.3613e-02,  ..., -1.7274e-02,
         -4.1874e-03, -3.3502e-02],
        [ 3.9743e-03,  2.9589e-02,  1.3218e-02,  ..., -3.3958e-02,
          2.0799e-02, -1.6180e-02],
        ...,
        [ 2.5598e-02,  1.8182e-02,  1.2885e-02,  ...,  1.1126e-02,
          2.1903e-02,  3.5934e-03],
        [-9.4439e-03, -6.3931e-03,  2.4074e-03,  ..., -1.0516e-02,
          1.6802e-02, -2.4351e-02],
        [ 1.0829e-09,  1.3621e-08, -1.2080e-08,  ..., -1.1688e-08,
          6.7046e-09, -4.8634e-09]], device='cuda:0', requires_grad=True)

In [None]:
# Element-wise comparison of the decoder and encoder token-embedding weights;
# `.all()` reduces it to a single boolean tensor.
(model.base_model.decoder.embed_tokens.weight == model.base_model.encoder.embed_tokens.weight).all()

tensor(True, device='cuda:0')

In [None]:
# Element-wise comparison of the `lm_head` weights with the `shared` embedding
# weights; `.all()` reduces it to a single boolean tensor.
(model.lm_head.weight == model.base_model.shared.weight).all()

tensor(True, device='cuda:0')

In [None]:
# NOTE(review): the saved output of this cell has `text`/`category` columns
# filled with movie reviews, which does not look like it came from this
# translation run — re-run the cell to refresh it.
learn.show_results()

Unnamed: 0,text,category,category_
0,"There's a sign on The Lost Highway that says:<br /><br />*MAJOR SPOILERS AHEAD*<br /><br />(but you already knew that, didn't you?)<br /><br />Since there's a great deal of people that apparently did not get the point of this movie, I'd like to contribute my interpretation of why the plot makes perfect sense. As others have pointed out, one single viewing of this movie is not sufficient. If you have the DVD of MD, you can ""cheat"" by looking at David Lynch's ""Top 10 Hints to Unlocking MD"" (but only upon second or third viewing, please.) ;)<br /><br />First of all, Mulholland Drive is downright brilliant. A masterpiece. This is the kind of movie that refuse to leave your head. Not often are the comments on the DVDs very accurate, but Vogue's ""It gets inside your head and stays there"" really hit the mark.<br /><br />David Lynch deserves praise for creating",pos,pos
1,"Yeah, what did I expect? I thought this would be a film about young adults at their turning-point in life, something like ""Sonnenallee"" or ""American Pie"", which I liked a lot. I wanted to see a funny film, perhaps with an ironic look on idyllic Wuerzburg. And what did I get?<br /><br />Attention, spoilers ahead!<br /><br />This film starts with a lengthy dialogue which gives you a good hint of what will inevitably follow: more lengthy dialogues. Sometimes I thought Moritz Bleibtreu might have forgotten his text and trying to hide that fact by improvising and just repeating what he was saying before. But as I think of Bleibtreu as one of the better german actors, I believe that this effect really was intended. I think the author wanted to show how boring talking to close friends can be - especially when they are stoned. But really, I don't need",neg,neg
2,"This was one of the most dishonest, meaningless, and non-peaceful of the films I have ever seen. The representation of the other, of the Israelis, was racist, backward, and unfair. For one, the song played on E.S' car radio when pulled up alongside a very right-wing Israeli driver was ""I put a spell on you"" by Natacha Atlas. The song's style is quite Arabic, but it was released on an Israeli compilation CD, and I have even heard it on the radio in Israel. Many Israeli songs (as well as architecture, foods, and slang) are influenced by Arabic culture, and there is no reason an Israeli Jew would be offended or angered by a nearby car playing that song. The way E.S. appears so calm and collected with his sunglasses and cool glare, via a long, still shot, is meant to force the viewer into seeing the Jew as haggard",neg,neg
3,"Pistol-packing Pam Grier takes names and kicks butt as the heroine in ""Asylum of Satan"" director William Girdler's entertaining blaxploitation actioneer ""Sheba Baby,"" co-starring D'Urville Martin and Austin Stoker. ""Sheba Baby"" is one of several tough chick flicks that Grier appeared in during the 1970s, including ""Coffy,"" ""Foxy Brown,"" and ""Friday Foster."" The short-lived Girdler co-wrote this thoroughly routine private eye potboiler with producer David Shelton in one night and it features a headstrong female shamus that refuses to rely on a man to help her take care of business. Unfortunately, ""Sheba Baby"" isn't nearly as good as the blaxploitation movies that Grier made under the supervision of director Jack Hill. Hill helmed the African-American North Carolina native in ""Coffy,"" ""Foxy Brown,"" ""The Big Bird Cage,"" and ""The Big Doll House."" Anybody that analyzes images of African-American women in cinema should be familiar with these epics. The chief problem with",pos,pos
4,"Once again I must play something of the contrarian. Most of the reviews for Ab Tak Chappan have been extremely positive. Mine is positive, but only slightly. A 7 out of 10 is equivalent to a ""C"" letter grade from me.<br /><br />It seems that a lot of the praise is rooted in two factors: One, that Ab Tak Chappan is more realistic than the typical Bollywood film, and two, that it is trying to do things differently.<br /><br />The first point I couldn't care less about. I'm not looking for realism in films, and so I do not score higher for a film that shows a story and characters closer to how I believe the real world to be--I'm a big fan of surrealism, fantasy, absurdism, and so on, although I do not dislike realist films merely for the fact that they're realist.<br /><br />For the second point, I",pos,pos
5,"Master of Italian horror Dario Argento is called a lot of bad things by non-fans. And is deserving of absolutely none of the backlash. In fact, every time I hear something bad about Argento- I think they're really talking about Michele Soavi. He just doesn't get the same amount of attention because his films were never as successful in theaters. In fact, his best film - 1994's Cemetery Man - was probably his least successful. Or just didn't get the attention he felt it deserved, because after that, he left film and went into directing television. He's never gone back. So people really don't know how inferior his other films are because by the time they've seen them, they're already fans of the Italian horror aesthetic. Which means you have to accept the fact that they make almost zero sense and are usually very unattractive films. This is where The",neg,pos
6,"Wallace and Gromit are the main characters in some of the best cartoons ever crafted. The excellent mix of visual humor and claymation makes ""A Grand Day Out,"" ""The Wrong Trousers,"" and also ""A Close Shave"" some of the best animated footage ever put on television. Winning several Oscars and also countless other awards, Nick Park became quite the popular man in the U.K., yet his impact on the United States has not been big. After the third Wallace and Gromit short, there was all this speculation about a full-length Wallace and Gromit movie, yet for years nothing had happened. Then in 2000 instead of a full-length Wallace and Gromit film, we get another brilliant claymation film from Nick Park, which was Chicken Run, which almost got nominated for best picture in the Academy Awards. Perhaps it was the success of this film that ultimately drove Park to finally work",pos,pos
7,"It's almost impossible for me to sit down and write a conscientious review of THREE COLORS: RED without letting people in on some of the ideas that Krysztoff Kieszlowski has explored in the previous two entries to this fascinating trilogy. The more I see them and think of them, and imagine myself in their world, the more I get its theme: that we are more linked to each other than we would want to think ourselves, and all it takes is a little hand of fate to set some events in motion. In BLUE, Juliette Binoche played a grieving widow whose plan to live her life without connections to the past had her meet someone unexpected. In WHITE, an act of cruelty spawns an unlikely friendship between two men who will, against the odds, conspire to bring the perpetrator to justice and full circle. And now, in RED, all the",pos,pos
8,"A blaxploitation classic, this movie was terribly influential in rap music for the ""toasts"" that Rudy Ray Moore performs. Toasts are long rhyming stories that are funny and deliver a point, and you can see how they would naturally evolve into rap. For more on toasts, Rudy Ray Moore, and why this movie is important, go to Dolemite.com.<br /><br />Which leaves us just to talk about the movie itself. This movie packs in a great deal of ""laugh-at-the-funny-outfits-and-hairstyles"" bang for the buck, as nearly every shot has some sort of outrageous element or dialogue. It starts as Dolemite is being released from prison in order to find out who framed him and bring him to justice. I was unaware that prisons release people so they can prove their own innocence, but that's me, I'm a neophyte in the prison scene. He is helped in this by Queen Bee, who is",pos,neg
