Setup

In [1]:
# Let's start by adjusting the working directory so that it is the root of the
# repository; relative paths used later (e.g. "data/germeval18") are resolved
# against it. The notebook is assumed to live one level below that root.
#
# The directory the kernel started in is remembered in NOTEBOOK_DIR the first
# time this cell runs, and the target is always computed from it. Re-running the
# cell is therefore safe: it no longer climbs one more level on every execution.

import os

if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
print("Current working directory is {}".format(os.getcwd()))

In [2]:
#importing libraries

import torch
from farm.modeling.tokenization import Tokenizer
from farm.data_handler.processor import TextClassificationProcessor
from farm.data_handler.data_silo import DataSilo
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.optimization import initialize_optimizer
from farm.train import Trainer
from farm.utils import MLFlowLogger

I0320 14:07:18.910329 13716 file_utils.py:38] PyTorch version 1.4.0 available.


In [4]:
# FARM can log parameters and metrics of a run through MLflow.
# The tracking server, experiment name and run name are kept as named constants
# so they are easy to spot and change.
MLFLOW_TRACKING_URI = "https://public-mlflow.deepset.ai/"
MLFLOW_EXPERIMENT_NAME = "Public_FARM"
MLFLOW_RUN_NAME = "Tutorial1_Colab"

ml_logger = MLFlowLogger(tracking_uri=MLFLOW_TRACKING_URI)
ml_logger.init_experiment(
    experiment_name=MLFLOW_EXPERIMENT_NAME,
    run_name=MLFLOW_RUN_NAME,
)


 __          __  _                            _        
 \ \        / / | |                          | |       
  \ \  /\  / /__| | ___ ___  _ __ ___   ___  | |_ ___  
   \ \/  \/ / _ \ |/ __/ _ \| '_ ` _ \ / _ \ | __/ _ \ 
    \  /\  /  __/ | (_| (_) | | | | | |  __/ | || (_) |
     \/  \/ \___|_|\___\___/|_| |_| |_|\___|  \__\___/ 
  ______      _____  __  __  
 |  ____/\   |  __ \|  \/  |              _.-^-._    .--.
 | |__ /  \  | |__) | \  / |           .-'   _   '-. |__|
 |  __/ /\ \ |  _  /| |\/| |          /     |_|     \|  |
 | | / ____ \| | \ \| |  | |         /               \  |
 |_|/_/    \_\_|  \_\_|  |_|        /|     _____     |\ |
                                     |    |==|==|    |  |
|---||---|---|---|---|---|---|---|---|    |--|--|    |  |
|---||---|---|---|---|---|---|---|---|    |==|==|    |  |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



In [5]:
# Pick the device the model will be trained on.
#
# FORCE_CPU keeps the notebook on the CPU even when CUDA is available (this is
# what the notebook did before, via a second assignment that silently overwrote
# the auto-detected device). Set it to False to use a GPU when one is present.
FORCE_CPU = True

use_cuda = torch.cuda.is_available() and not FORCE_CPU
device = torch.device("cuda" if use_cuda else "cpu")
print("Devices available: {}".format(device))

Devices available: cpu


Data handling

In [7]:
# Load the tokenizer used to preprocess the raw text.
# For this model name FARM loads a BertTokenizer, which splits words into
# WordPiece sub-tokens. The vocabulary is the cased German BERT one, so the
# text must not be lower-cased.

TOKENIZER_NAME = "bert-base-german-cased"

tokenizer = Tokenizer.load(pretrained_model_name_or_path=TOKENIZER_NAME, do_lower_case=False)

I0320 14:10:05.839990 13716 tokenization.py:72] Loading tokenizer of type 'BertTokenizer'
I0320 14:10:06.002772 13716 tokenization_utils.py:418] loading file https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt from cache at C:\Users\ajankowski\.cache\torch\transformers\da299cdd121a3d71e1626f2908dda0d02658f42e925a3d6abd8273ec08cf41a6.2a48e6c60dcdb582effb718237ce5894652e3b4abb94f0a4d9a857b70333308d


In [8]:
# A Processor bundles the functions that turn data files into PyTorch Datasets.
# Each new data source needs its own Processor object.
# The abstract base class is farm.data_handler.processor.Processor.
#
# This one is configured for the GermEval18 coarse task: two labels read from
# the "coarse_label" column, evaluated with macro-averaged F1.

LABEL_LIST = ["OTHER", "OFFENSE"]

processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir="data/germeval18",
    label_list=LABEL_LIST,
    metric="f1_macro",
    label_column_name="coarse_label",
)

In [9]:
# The DataSilo keeps the train, dev and test sets apart. It uses the Processor
# to build those sets and exposes PyTorch DataLoader objects (data_silo.loaders)
# that are later handed to the model.
# The batch size of those loaders is chosen here.

BATCH_SIZE = 32

data_silo = DataSilo(processor=processor, batch_size=BATCH_SIZE)

I0320 14:26:10.223584 13716 data_silo.py:179] 
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
I0320 14:26:10.225584 13716 data_silo.py:188] Loading train set from: data\germeval18\train.tsv 
I0320 14:26:10.226576 13716 utils.py:39]  Couldn't find data\germeval18\train.tsv locally. Trying to download ...
I0320 14:26:10.227574 13716 utils.py:145] downloading and extracting file germeval18 to dir C:\Personal\AI\analiza sentymentu\data
100%|██████████████████████████████████████████████████████████████████████| 525101/525101 [00:00<00:00, 3681330.41B/s]
I0320 14:26:10.763776 13716 data_silo.py:139] Got ya 7 parallel workers to convert 5009 dictionaries to pytorch datasets (chunksize = 144)...
I0320 14:26:10.776438 13716 utils.py:244]  0    0    0    0    0    0    0 
I0320 14:26:10.777435 13716 utils.py:244] /w\  /w\  /w\  /w\  /w\  /|\  /w\
I0320 14:26:10.77

Modeling
In FARM, we make a strong distinction between the language model and prediction head so that you can mix and match different building blocks for your needs.

For example, in the transfer learning paradigm, you might have one language model that you use for both document classification and NER. Or perhaps you have a pretrained language model which you would like to adapt to your domain and then use for a downstream task such as question answering.

All this is possible within FARM and requires only the replacement of a few modular components, as we shall see below.

Let's first have a look at how we might set up a model.

In [10]:
# Language model: the task-independent foundation of the system. It captures a
# general understanding of sentence semantics and is shared by whatever
# prediction head is placed on top of it.
#
# We load a German BERT model (HuggingFace implementation) by name.
# MODEL_NAME_OR_PATH may instead point to a BERT model saved on disk, or name
# another model from the HuggingFace repository; see
# farm.modeling.language_model.PRETRAINED_MODEL_ARCHIVE_MAP for the list of
# available models.

MODEL_NAME_OR_PATH = "bert-base-german-cased"
language_model = LanguageModel.load(MODEL_NAME_OR_PATH)

I0320 14:39:33.548645 13716 filelock.py:274] Lock 2792498684816 acquired on C:\Users\ajankowski\.cache\torch\transformers\e653e2fe0970d519c5a3b6c0286e1630ad2f0eade78f82b4916ec945d6f06d48.4154b6bb468532f5a3035a2e706fc9db941628923ea897f73c727d9c8a9c0d1a.lock
I0320 14:39:33.548645 13716 file_utils.py:413] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json not found in cache or force_download set to True, downloading to C:\Users\ajankowski\.cache\torch\transformers\tmpobzprxhe


HBox(children=(IntProgress(value=0, description='Downloading', max=336, style=ProgressStyle(description_width=…




I0320 14:39:34.170804 13716 file_utils.py:423] storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json in cache at C:\Users\ajankowski\.cache\torch\transformers\e653e2fe0970d519c5a3b6c0286e1630ad2f0eade78f82b4916ec945d6f06d48.4154b6bb468532f5a3035a2e706fc9db941628923ea897f73c727d9c8a9c0d1a
I0320 14:39:34.171834 13716 file_utils.py:426] creating metadata file for C:\Users\ajankowski\.cache\torch\transformers\e653e2fe0970d519c5a3b6c0286e1630ad2f0eade78f82b4916ec945d6f06d48.4154b6bb468532f5a3035a2e706fc9db941628923ea897f73c727d9c8a9c0d1a
I0320 14:39:34.174793 13716 filelock.py:318] Lock 2792498684816 released on C:\Users\ajankowski\.cache\torch\transformers\e653e2fe0970d519c5a3b6c0286e1630ad2f0eade78f82b4916ec945d6f06d48.4154b6bb468532f5a3035a2e706fc9db941628923ea897f73c727d9c8a9c0d1a.lock
I0320 14:39:34.772385 13716 filelock.py:274] Lock 2792498684816 acquired on C:\Users\ajankowski\.cache\torch\transformers\e32f648561b03f77a129832928b7f16decdc5e087

HBox(children=(IntProgress(value=0, description='Downloading', max=438869143, style=ProgressStyle(description_…




I0320 14:40:36.326134 13716 file_utils.py:423] storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin in cache at C:\Users\ajankowski\.cache\torch\transformers\e32f648561b03f77a129832928b7f16decdc5e0870f1e6558857e046169d4133.4e5eda3a0f09b32a0b7d1a9185034da1b3506d5c5b0c6880a7ca0122ab5eef2e
I0320 14:40:36.329135 13716 file_utils.py:426] creating metadata file for C:\Users\ajankowski\.cache\torch\transformers\e32f648561b03f77a129832928b7f16decdc5e0870f1e6558857e046169d4133.4e5eda3a0f09b32a0b7d1a9185034da1b3506d5c5b0c6880a7ca0122ab5eef2e
I0320 14:40:36.333117 13716 filelock.py:318] Lock 2792498684816 released on C:\Users\ajankowski\.cache\torch\transformers\e32f648561b03f77a129832928b7f16decdc5e0870f1e6558857e046169d4133.4e5eda3a0f09b32a0b7d1a9185034da1b3506d5c5b0c6880a7ca0122ab5eef2e.lock
I0320 14:40:36.334113 13716 modeling_utils.py:458] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_mo

In [11]:
# Prediction head: a task-specific model that consumes the language model's
# output. Its shape depends on the task (text classification, Named Entity
# Recognition, question answering, ...). It produces logits over the prediction
# classes and provides the methods that turn logits into losses or predictions.
#
# TextClassificationHead takes one fixed-length sentence vector and feeds it
# through a feed-forward network described by layer_dims:
#   [input_dims, hidden_1_dims, hidden_2_dims, ..., output_dims]
#
# With no hidden sizes listed this is a single-layer network:
#   input  -> 768 (the default size of BERT's output)
#   output -> 2   (number of classes in the GermEval18 coarse dataset)

BERT_OUTPUT_DIM = 768
N_CLASSES = 2
LAYER_DIMS = [BERT_OUTPUT_DIM, N_CLASSES]

prediction_head = TextClassificationHead(layer_dims=LAYER_DIMS)

W0320 14:41:46.517942 13716 prediction_head.py:265] `layer_dims` will be deprecated in future releases
I0320 14:41:46.519937 13716 prediction_head.py:272] Prediction head initialized with size [768, 2]


In [12]:
# AdaptiveModel couples the language model with its prediction head(s).
# It handles saving and loading of the combined model and coordinates the
# heads when there is more than one.

# Probability that an element of the language model's output vector is zeroed.
EMBEDS_DROPOUT_PROB = 0.1

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    lm_output_types=["per_sequence"],
    embeds_dropout_prob=EMBEDS_DROPOUT_PROB,
    device=device,
)

Training

In [16]:
# Set up the optimizer and its learning-rate schedule.
# The learning rate and the number of training epochs are configured here; the
# schedule length follows from the number of batches in the train loader.
# Note that this call also returns the model, which is rebound to `model`.

LEARNING_RATE = 2e-5
N_EPOCHS = 1

n_train_batches = len(data_silo.loaders["train"])

model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=LEARNING_RATE,
    n_batches=n_train_batches,
    n_epochs=N_EPOCHS,
    device=device,
)

I0320 14:44:26.583294 13716 optimization.py:171] Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 2e-05}'
I0320 14:44:26.906943 13716 optimization.py:241] Using scheduler 'get_linear_schedule_with_warmup'
I0320 14:44:26.906943 13716 optimization.py:255] Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 14.0, 'num_training_steps': 140}'


In [20]:
# The Trainer runs the training loop.
# It also triggers evaluation during training using the dev data
# and after training using the test data.

# Number of GPUs handed to the Trainer. It is derived from `device` instead of
# being hard-coded, so the two settings cannot drift apart: 0 while training on
# the CPU, otherwise the number of CUDA devices PyTorch can see.
N_GPU = torch.cuda.device_count() if device.type == "cuda" else 0

trainer = Trainer(
    model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=N_EPOCHS,
    n_gpu=N_GPU,
    lr_schedule=lr_schedule,
    device=device,
)

In [22]:
# Run the training loop and rebind `model` to the value returned by
# trainer.train(); the inference cell below uses this `model`.
model = trainer.train()

I0320 14:48:12.871213 13716 train.py:406] 
 

          &&& &&  & &&             _____                   _             
      && &\/&\|& ()|/ @, &&       / ____|                 (_)            
      &\/(/&/&||/& /_/)_&/_&     | |  __ _ __ _____      ___ _ __   __ _ 
   &() &\/&|()|/&\/ '%" & ()     | | |_ | '__/ _ \ \ /\ / / | '_ \ / _` |
  &_\_&&_\ |& |&&/&__%_/_& &&    | |__| | | | (_) \ V  V /| | | | | (_| |
&&   && & &| &| /& & % ()& /&&    \_____|_|  \___/ \_/\_/ |_|_| |_|\__, |
 ()&_---()&\&\|&&-&&--%---()~                                       __/ |
     &&     \|||                                                   |___/
             |||
             |||
             |||
       , -=-~  .-^- _
              `

Train epoch 1/1 (Cur. train loss: 0.7012):  72%|██████████████████████████▋          | 101/140 [32:16<12:35, 19.36s/it]
Evaluating:   0%|                                                                               | 0/18 [00:00<?, ?it/s]
Evaluating:  11%|███████▉        

In [23]:
# Test your model on a sample (Inference)
from farm.infer import Inferencer
from pprint import PrettyPrinter

# Run inference on the same kind of device the model was set up on, rather than
# unconditionally requesting a GPU: the flag is True only when `device` is CUDA.
infer_model = Inferencer(processor=processor, model=model, gpu=(device.type == "cuda"))

# Each sample is a dict with the raw text under the "text" key.
basic_texts = [
    {"text": "Martin ist ein Idiot"},
    {"text": "Martin Müller spielt Handball in Berlin"},
]
result = infer_model.inference_from_dicts(dicts=basic_texts)
PrettyPrinter().pprint(result)

I0320 16:22:02.685729 13716 utils.py:81] device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
I0320 16:22:02.705676 13716 infer.py:262] Got ya 1 parallel workers to do inference on 2dicts (chunksize = 128)...
I0320 16:22:02.707673 13716 utils.py:244]  0 
I0320 16:22:02.708668 13716 utils.py:244] /w\
I0320 16:22:02.709667 13716 utils.py:244] /'\
I0320 16:22:02.710663 13716 utils.py:244] 
  0%|                                                                                        | 0/2 [00:00<?, ? Dicts/s]
Inferencing:   0%|                                                                               | 0/1 [00:00<?, ?it/s]
128 Dicts [00:06, 20.16 Dicts/s]                                                                                       


[{'predictions': [{'context': 'Martin ist ein Idiot',
                   'end': None,
                   'label': 'OFFENSE',
                   'probability': 0.85267246,
                   'start': None},
                  {'context': 'Martin Müller spielt Handball in Berlin',
                   'end': None,
                   'label': 'OTHER',
                   'probability': 0.88546455,
                   'start': None}],
  'task': 'text_classification'}]
