<a href="https://colab.research.google.com/github/binliu0630/Deep_Learning/blob/master/Fastai%2BTransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Get Started

In [4]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/70/1a/364556102943cacde1ee00fdcae3b1615b39e52649eddbf54953e5b144c9/transformers-2.2.1-py3-none-any.whl (364kB)
[K     |█                               | 10kB 26.2MB/s eta 0:00:01[K     |█▉                              | 20kB 28.9MB/s eta 0:00:01[K     |██▊                             | 30kB 33.0MB/s eta 0:00:01[K     |███▋                            | 40kB 35.1MB/s eta 0:00:01[K     |████▌                           | 51kB 37.6MB/s eta 0:00:01[K     |█████▍                          | 61kB 39.6MB/s eta 0:00:01[K     |██████▎                         | 71kB 39.4MB/s eta 0:00:01[K     |███████▏                        | 81kB 40.2MB/s eta 0:00:01[K     |████████                        | 92kB 40.5MB/s eta 0:00:01[K     |█████████                       | 102kB 41.7MB/s eta 0:00:01[K     |█████████▉                      | 112kB 41.7MB/s eta 0:00:01[K     |██████████▊                     | 

In [5]:
import fastai
import transformers
# Print the installed library versions so the run can be reproduced.
print(f'fastai version: {fastai.__version__}')
print(f'transformers version: {transformers.__version__}')

fastai version: 1.0.59
transformers version: 2.2.1


In [0]:
import numpy as np
import pandas as pd
from pathlib import Path

import os

import torch
import torch.optim as optim

import random

# fastai
from fastai import *
from fastai.text import *
from fastai.callback import *

# transformer
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig 

In [7]:
torch.cuda.is_available()

True

## Loading the Data

In [0]:
# URL of the Amazon reviews CSV; it is read directly over HTTPS.
data_path = "https://s3.amazonaws.com/tomk/h2o-world/megan/AmazonReviews.csv"
# Load the whole file into a DataFrame; the train/test split below is taken from it.
data = pd.read_csv(data_path)

In [25]:
data.head(2)

Unnamed: 0,ProductId,UserId,Summary,Score,HelpfulnessDenominator,Id,ProfileName,HelpfulnessNumerator,Time,Text
0,B00141QYSQ,A1YS02UZZGRDCT,Do Not Buy,1,2,41471,Evan Eberhardt,2,1348358400,These are made in China (do not buy ANY pet fo...
1,B0089SPEO2,A3JOYNYL458QHP,Less lemon and less zing,3,0,28582,coleridge,0,1323907200,"Everything is ok, except it just isn't as good..."


In [0]:
# create the binary label: '1' for scores above 3, '0' otherwise.
# The comparison runs on a float view of the column so this cell can be re-run
# after `Score` has been converted to a category below (an unordered categorical
# rejects `>` with a TypeError).
data['label'] = np.where(data['Score'].astype(float) > 3, '1', '0')

# change score into categorical so it can be the multiclass label
data['Score'] = data['Score'].astype('category')


In [27]:
data.sample()

Unnamed: 0,ProductId,UserId,Summary,Score,HelpfulnessDenominator,Id,ProfileName,HelpfulnessNumerator,Time,Text,label
50921,B001UFFZ1I,A1L1RJE3R29CLI,Packed with Nutrition,5,13,434608,"Skylar ""Health Nut""",12,1256774400,This rice is good and very nutritious. It does...,1


In [0]:
# split the data by timestamp: `timesplit` is the 0.8 quantile of `Time`
# and serves as the cutoff between train and test.
timesplit = data['Time'].quantile(0.8)

In [22]:
# Rows with `Time` strictly below the cutoff form the training set;
# rows at or after the cutoff are held out as the test set.
train = data[data['Time'] < timesplit]
test = data[data['Time'] >= timesplit]
# Show both shapes as the cell output.
train.shape, test.shape

((79992, 11), (20008, 11))

## FASTAI + TRANSFORMER

In [0]:
# Maps a model-type key to a tuple of
# (sequence-classification model class, tokenizer class, config class).
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)
}

In [0]:
seed = 42          # used for seed_all() and for the random validation split
use_fp16 = False
bs = 16            # batch size handed to the databunch

# Model selection: keep exactly ONE (model_type, pretrained_model_name) pair
# uncommented. Previously several pairs were assigned in sequence and only the
# last one took effect, which hid the actual choice.

# model_type = 'roberta'
# pretrained_model_name = 'roberta-base'

# model_type = 'bert'
# pretrained_model_name = 'bert-base-uncased'

model_type = 'distilbert'
pretrained_model_name = 'distilbert-base-uncased-distilled-squad'  # alternative: 'distilbert-base-uncased'

# model_type = 'xlm'
# pretrained_model_name = 'xlm-clm-enfr-1024'

# model_type = 'xlnet'
# pretrained_model_name = 'xlnet-base-cased'

In [0]:
# Look up the model, tokenizer and config classes for the selected `model_type`.
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [82]:
# all the pretrained model for the specific model_class
model_class.pretrained_model_archive_map

{'xlm-clm-ende-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin',
 'xlm-clm-enfr-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin',
 'xlm-mlm-100-1280': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin',
 'xlm-mlm-17-1280': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin',
 'xlm-mlm-en-2048': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin',
 'xlm-mlm-ende-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin',
 'xlm-mlm-enfr-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin',
 'xlm-mlm-enro-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin',
 'xlm-mlm-tlm-xnli15-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-p

In [0]:
def seed_all(seed_value):
    """Seed the Python, NumPy and PyTorch random number generators with `seed_value`.

    When CUDA is available, the CUDA generators are seeded too, and cuDNN is
    switched to deterministic mode with benchmarking turned off.
    """
    # CPU-side generators: Python stdlib, NumPy, PyTorch (in that order)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU-side generators
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # cuDNN flags
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all generators up front with the configured `seed`.
seed_all(seed)

### 1 Setup FASTAI Databunch

In [84]:
# load the pretrained tokenizer
transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)

100%|██████████| 1452741/1452741 [00:01<00:00, 1112496.20B/s]
100%|██████████| 1008321/1008321 [00:01<00:00, 776455.21B/s]


In [85]:
transformer_tokenizer.max_len

512

###### Create Custom FASTAI TOKENIZER from pretrained transformer tokenizer

In [0]:
class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai.

    Tokenizes with the wrapped pretrained tokenizer, truncates to the tokenizer's
    maximum length (leaving room for two special tokens) and adds the CLS/SEP
    tokens in the layout expected by `model_type`.
    """
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'bert', **kwargs):
        # Extra keyword arguments are accepted but not used.
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = pretrained_tokenizer.max_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs):
        # Calling the instance returns the instance itself, whatever the arguments.
        # NOTE(review): presumably this lets it stand in where fastai expects a
        # tokenizer factory (`tok_func`) -- confirm against fastai's Tokenizer.
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length and adds the special tokens.

        Returns the token list for `t`, truncated to `max_seq_len - 2` tokens and
        wrapped as `[CLS] + tokens + [SEP]`, except for XLNet where the special
        tokens go at the end: `tokens + [SEP] + [CLS]`.
        """
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        if self.model_type in ['roberta']:
            # RoBERTa is tokenized with a leading space added to the text.
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:self.max_seq_len - 2]
        else:
            tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
        if self.model_type in ['xlnet']:
            # XLNet's single-sequence format puts <sep> and <cls> after the tokens;
            # this matches the xlnet-only `pad_first` handling elsewhere in the notebook.
            return tokens + [SEP] + [CLS]
        return [CLS] + tokens + [SEP]

In [0]:
# Wrap the pretrained tokenizer; the wrapper instance itself is passed to fastai as `tok_func`.
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
# Empty pre/post rule lists are passed explicitly
# (presumably so fastai's default text rules do not alter the input -- confirm).
fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])

In [88]:
tokenizer_class.pretrained_vocab_files_map

{'merges_file': {'xlm-clm-ende-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt',
  'xlm-clm-enfr-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt',
  'xlm-mlm-100-1280': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt',
  'xlm-mlm-17-1280': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt',
  'xlm-mlm-en-2048': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt',
  'xlm-mlm-ende-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt',
  'xlm-mlm-enfr-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt',
  'xlm-mlm-enro-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt',
  'xlm-mlm-tlm-xnli15-1024': 'https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt',
  'xlm-mlm-xnli15-10

###### Create Custom FASTAI Vocab from the pretrained transformer tokenizer

In [0]:
class TransformersVocab(Vocab):
    """fastai `Vocab` whose token/id conversions are delegated to a transformers tokenizer."""
    def __init__(self, tokenizer: PreTrainedTokenizer):
        # The base class is initialised with an empty `itos`; conversions go through `tokenizer`.
        super().__init__(itos = [])
        self.tokenizer = tokenizer

    def numericalize(self, t:Collection[str]) -> List[int]:
        "Map the tokens in `t` to ids using the wrapped tokenizer."
        token_ids = self.tokenizer.convert_tokens_to_ids(t)
        return token_ids

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Map the ids in `nums` to tokens; joined with `sep`, or returned as a list when `sep` is None."
        id_list = np.array(nums).tolist()
        tokens = self.tokenizer.convert_ids_to_tokens(id_list)
        if sep is None:
            return tokens
        return sep.join(tokens)

In [0]:
# Vocab backed by the pretrained transformer tokenizer.
fastai_vocab = TransformersVocab(tokenizer=transformer_tokenizer)

###### Create Custom FASTAI Processor

In [0]:
# Numericalization goes through the transformer-backed vocab.
numericalize_processor = NumericalizeProcessor(vocab=fastai_vocab)

# include_bos / include_eos are switched off: the custom tokenizer's `tokenizer()` method
# already adds the CLS and SEP tokens itself.
tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False)


In [0]:
# Processors in the order listed: tokenization first, then numericalization.
fastai_processor = [tokenize_processor, numericalize_processor]

###### Create Custom FASTAI Databunch

In [93]:
print('[CLS] token :', transformer_tokenizer.cls_token)
print('[SEP] token :', transformer_tokenizer.sep_token)
print('[PAD] token :', transformer_tokenizer.pad_token)

[CLS] token : </s>
[SEP] token : </s>
[PAD] token : <pad>


In [0]:
# `pad_first` is True only when the selected model type is 'xlnet'.
pad_first = model_type == 'xlnet'
# Id of the tokenizer's padding token, later passed to the databunch as `pad_idx`.
pad_idx = transformer_tokenizer.pad_token_id

In [95]:
# Build the DataBunch from `train`: text from the `Text` column, a 10% random
# validation split (seeded), and the binary `label` column as target.
# The test items are wrapped in a TextList restricted to the `Text` column.
# Passing the raw `test` DataFrame would hand every column (ids, scores,
# timestamps, ...) to the text processors as if it were review text.
# NOTE(review): confirm against the fastai 1.0.x `add_test` behaviour in use.
databunch = (TextList.from_df(train, cols='Text', processor=fastai_processor)
             .split_by_rand_pct(0.1,seed=seed)
             .label_from_df(cols= 'label')
             .add_test(TextList.from_df(test, cols='Text'))
             .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

In [96]:
databunch.show_batch(10)

text,target
"<</w> span</w> class</w> =</w> ""</w> tiny</w> ""</w> ></w> length</w> :</w> :</w> 4</w> :</w> 17</w> mins</w> <</w> b r</w> /</w> ></w> <</w> b r</w> /</w> ></w> <</w> /</w> span</w> ></w> i</w> talk</w> in</w> the</w> video</w> about</w> how</w> it</w> is</w> confusing</w> when</w> the</w> product</w> sent</w> is</w> somewhat</w> different</w> from</w> the</w> product</w> that</w> you</w> see</w> on</w> the</w> web</w> .</w> <</w> b r</w> /</w> ></w> <</w> b r</w> /</w> ></w> i</w> 'm</w> disappointed</w> that</w>",1
"this</w> product</w> is</w> simply</w> fabulous</w> .</w> i</w> purchased</w> it</w> for</w> my</w> 7</w> month</w> old</w> pit</w> bull</w> ,</w> who</w> is</w> always</w> into</w> something</w> and</w> very</w> highly</w> food</w> motivated</w> .</w> prior</w> to</w> the</w> ever lasting</w> treat</w> ball</w> her</w> favorite</w> toy</w> was</w> the</w> chuck le</w> ,</w> made</w> by</w> premier</w> .</w> the</w> advantage</w> to</w> the</w> chuck le</w> is</w> that</w> i</w> could</w> fill</w> it</w> with</w> her</w> food</w> for</w> a</w> nutri tious</w> but</w> fun</w> toy</w> .</w>",1
"i</w> bought</w> this</w> product</w> around</w> the</w> beginning</w> to</w> middle</w> of</w> may</w> 2011</w> &</w> ,</w> at</w> that</w> time</w> ,</w> it</w> showed</w> a</w> picture</w> of</w> the</w> box</w> from</w> the</w> company</w> cultures</w> for</w> health</w> ll c</w> .</w> it</w> also</w> had</w> (</w> &</w> still</w> has</w> )</w> their</w> name</w> on</w> it</w> as</w> ""</w> by</w> cultures</w> of</w> health</w> ll c</w> ""</w> but</w> then</w> further</w> down</w> is</w> says</w> ""</w> ships</w> from</w> and</w> sold</w> by</w> lifetime</w> kef ir</w>",1
"i</w> have</w> many</w> reviews</w> for</w> b arry</w> farm</w> products</w> here</w> on</w> amaz on</w> .</w> maybe</w> a</w> dozen</w> or</w> so</w> .</w> if</w> they</w> are</w> not</w> the</w> largest</w> distributor</w> of</w> dried</w> ve g gies</w> ,</w> spices</w> ,</w> herbs</w> ,</w> etc</w> ,</w> than</w> they</w> have</w> to</w> be</w> very</w> close</w> .</w> i</w> became</w> familiar</w> with</w> b arry</w> farm</w> products</w> many</w> years</w> ago</w> when</w> i</w> first</w> started</w> doing</w> a</w> lot</w> of</w> hiking</w> .</w> their</w> fresh",1
"i</w> have</w> been</w> on</w> something</w> of</w> honey</w> j ag</w> this</w> year</w> .</w> it</w> seems</w> to</w> find</w> it</w> 's</w> way</w> into</w> everything</w> around</w> me</w> from</w> baked</w> goods</w> to</w> bath</w> products</w> .</w> honey</w> is</w> better</w> for</w> you</w> than</w> the</w> vast</w> majority</w> what</w> it</w> there</w> .</w> (</w> i</w> discovered</w> ,</w> just</w> a</w> day</w> or</w> 2</w> ago</w> ,</w> when</w> per using</w> <</w> a</w> h re f</w> =</w> ""</w> http</w> :</w> /</w> /</w> www. amaz",1
"i</w> believe</w> that</w> people</w> need</w> to</w> be</w> aware</w> of</w> the</w> fact</w> that</w> the</w> ki bble</w> size</w> of</w> the</w> adult</w> oral</w> care</w> dry</w> cat</w> food</w> is</w> very</w> large</w> and</w> presents</w> a</w> potential</w> cho king</w> hazard</w> to</w> their</w> pets</w> .</w> i</w> transi tioned</w> my</w> cat</w> to</w> this</w> food</w> from</w> the</w> science</w> diet</w> kit ten</w> formula</w> when</w> he</w> was</w> about</w> 15</w> months</w> old</w> .</w> from</w> day</w> one</w> ,</w> i</w> thought</w> that</w> the</w> ki",0
"i</w> decided</w> to</w> re- write</w> my</w> review</w> in</w> order</w> to</w> let</w> you</w> guys</w> in</w> on</w> how</w> my</w> cat</w> 's</w> been</w> doing</w> on</w> the</w> war u va</w> cat</w> food</w> .</w> <</w> b r</w> /</w> ></w> <</w> b r</w> /</w> ></w> we</w> decided</w> to</w> try</w> pretty</w> much</w> every</w> flavor</w> on</w> here</w> because</w> i</w> couldn</w> 't</w> find</w> any</w> really</w> good</w> specific</w> reviews</w> for</w> anything</w> other</w> than</w> one</w> or</w> two</w> flavors</w> ,</w> here</w> 's</w>",1
"i</w> decided</w> to</w> switch</w> from</w> grocery</w> store</w> milk</w> to</w> sac o</w> 's</w> powder</w> for</w> mathem atical</w> reasons</w> .</w> as</w> a</w> single</w> lady</w> who</w> uses</w> milk</w> primarily</w> for</w> my</w> tea</w> ,</w> i</w> rarely</w> finish</w> a</w> one- gallon</w> bottle</w> before</w> it</w> s ours</w> (</w> hate</w> that</w> race</w> to</w> use</w> it</w> all</w> !</w> )</w> ,</w> and</w> buying</w> half</w> gallon</w> containers</w> absolutely</w> bugs</w> me</w> ,</w> as</w> it</w> costs</w> five</w> bucks</w> for</w> two</w> (</w> purchased</w>",1
"go</w> to</w> [</w> ...</w> ]</w> .</w> search</w> for</w> the</w> video</w> entitled</w> :</w> whole</w> foods</w> market</w> ""</w> organic</w> ""</w> food</w> made</w> in</w> ch ina</w> !</w> !</w> !</w> !</w> <</w> b r</w> /</w> ></w> <</w> b r</w> /</w> ></w> according</w> to</w> my</w> research</w> ,</w> ch ina</w> is</w> the</w> major</w> supplier</w> of</w> go ji</w> berries</w> .</w> there</w> 's</w> been</w> an</w> unresolved</w> growing</w> concern</w> about</w> food</w> ,</w> any</w> food</w> produced</w> in</w> ch ina</w> ,</w>",0
"i</w> realize</w> i</w> am</w> just</w> adding</w> my</w> voice</w> to</w> the</w> chorus</w> of</w> approval</w> about</w> til da</w> bas mat i</w> rice</w> ,</w> but</w> i</w> feel</w> strongly</w> enough</w> to</w> write</w> a</w> review</w> anyway</w> !</w> <</w> b r</w> /</w> ></w> <</w> b r</w> /</w> ></w> in</w> a</w> nut shell</w> :</w> <</w> b r</w> /</w> ></w> if</w> you</w> enjoy</w> rice</w> that</w> is</w> d rier</w> in</w> texture</w> and</w> has</w> separate</w> grains</w> ,</w> til da</w> bas",1


In [97]:
databunch.one_batch()[0].shape

torch.Size([16, 512])

### 2 Setup FASTAI Learner 

###### Create Custom Transformer Model

TypeError: ignored

### Reference
https://www.kaggle.com/maroberti/fastai-with-transformers-bert-roberta