# Install library

In [None]:
# Install FARM
!pip install farm
!pip install grpcio-tools==1.34.1
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html

Collecting farm
  Downloading farm-0.8.0-py3-none-any.whl (204 kB)
[K     |████████████████████████████████| 204 kB 5.5 MB/s 
[?25hCollecting torch<1.9,>1.5
  Downloading torch-1.8.1-cp37-cp37m-manylinux1_x86_64.whl (804.1 MB)
[K     |████████████████████████████████| 804.1 MB 2.7 kB/s 
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.2 MB/s 
Collecting boto3
  Downloading boto3-1.20.25-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 41.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 45.1 MB/s 
Collecting dotmap
  Downloading dotmap-1.3.26-py3-none-any.whl (11 kB)
Collecting flask-restplus
  Downloading flask_restplus-0.13.0-py2.py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 41.7 MB/s 
[?25hCollecting transformers==4.6.1
  Downl

In [None]:
# Make Google Drive visible inside the Colab file system; the dataset is read from there.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that holds the SQuAD-style JSON files;
# the trained reader is later saved underneath it as well.
path = "/content/drive/MyDrive/Data/Squad2.0"

In [None]:
# Here are the imports we need
import logging
import os
import pprint
import torch
from pathlib import Path

from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import SquadProcessor
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.optimization import initialize_optimizer
from farm.modeling.prediction_head import QuestionAnsweringHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

12/21/2021 06:57:52 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [None]:
# Select the device the model will be trained on: the GPU when one is visible to
# PyTorch, otherwise the CPU.
# To get a GPU on Colab, go to Runtime > Change runtime type and choose GPU as
# the hardware accelerator.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Devices available: {}".format(device))


Devices available: cuda


# Data Processing
First, we process the SQuAD-format data so that it can be used by our neural network model.
Since Colab only offers 2 CPU cores, this will likely take some time.

In [None]:
# --- Settings a reader may want to tune ---

# Pretrained language model that is fine-tuned for question answering.
# Alternative checkpoint to try: "deepset/xlm-roberta-base-squad2"
base_LM_model = "bert-base-multilingual-uncased"

# Optimisation schedule
n_epochs = 3
batch_size = 16
learning_rate = 1e-5
warmup_proportion = 0.2

# Number of training steps between two evaluation runs
evaluate_every_steps = 500

# 1. Load the tokenizer that belongs to the chosen base language model
tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)

# 2. Build the DataProcessor that handles the conversion from raw text into a pytorch Dataset.
#    The train/dev/test files are looked up inside `path`.
processor = SquadProcessor(
    tokenizer=tokenizer,
    data_dir=path,
    train_filename="new_train.json",
    dev_filename="new_dev.json",
    test_filename="new_test.json",
    max_seq_len=256,
    label_list=["start_token", "end_token"],
    metric="squad",
)


# 3. Build the DataSilo: it loads the train/dev/test datasets, exposes a DataLoader
#    for each of them and calculates a few descriptive statistics of the data
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size,
    distributed=False,
)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

12/21/2021 06:57:53 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'


Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

12/21/2021 06:57:55 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
12/21/2021 06:57:55 - INFO - farm.data_handler.data_silo -   LOADING TRAIN DATA
12/21/2021 06:57:55 - INFO - farm.data_handler.data_silo -   Loading train set from: /content/drive/MyDrive/Data/Squad2.0/new_train.json 
12/21/2021 06:57:55 - INFO - farm.data_handler.data_silo -   Got ya 2 parallel workers to convert 4101 dictionaries to pytorch datasets (chunksize = 411)...
12/21/2021 06:57:55 - INFO - farm.data_handler.data_silo -    0    0 
12/21/2021 06:57:55 - INFO - farm.data_handler.data_silo -   /|\  /w\
12/21/2021 06:57:55 - INFO - farm.data_handler.data_silo -   /'\  / \
12/21/2021 06:57:55 - INFO - farm.data_handler.data_silo -     
Preprocessing Dataset /content/drive/MyDrive/Data/Squad2.0/new_train.json:   0%|          | 0/4101 [00:00<?, 

# Modelling

In [None]:
# Fix the random seeds before the model is created and trained, so that repeated
# runs of this cell are comparable with each other.
set_all_seeds(seed=42)

# 4. Create an AdaptiveModel from the pretrained transformers checkpoint
model = AdaptiveModel.convert_from_transformers(
    base_LM_model,
    device=device,  # type: ignore
    task_type="question_answering",
)

# 5. Create an optimizer with a linear warm-up learning-rate schedule
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=learning_rate,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    device=device,
    schedule_opts={
        "name": "LinearWarmup",
        "warmup_proportion": warmup_proportion,
    },
)

# 6. Feed everything to the Trainer, which takes care of growing our model and
#    evaluates it from time to time
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    # The device cell may have fallen back to the CPU, so do not claim a GPU
    # that is not there.
    n_gpu=1 if device.type == "cuda" else 0,
    lr_schedule=lr_schedule,
    evaluate_every=evaluate_every_steps,
    device=device,
    # Checkpoints go to a directory relative to the current working directory.
    checkpoint_root_dir=Path("model_checkpoints"),
    checkpoint_every=500,
)

# 7. Let it grow!
#    NOTE: no MLFlowLogger is initialised in this notebook, so nothing is sent to
#    an MLflow server; follow the logging output of this cell instead.
model = trainer.train()

12/21/2021 06:58:27 - INFO - farm.modeling.language_model -   
12/21/2021 06:58:27 - INFO - farm.modeling.language_model -   LOADING MODEL
12/21/2021 06:58:27 - INFO - farm.modeling.language_model -   Could not find bert-base-multilingual-uncased locally.
12/21/2021 06:58:27 - INFO - farm.modeling.language_model -   Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
12/21/2021 06:58:58 - INFO - farm.modeling.language_model -   Automatically detected language from language model name: multilingual
12/21/202

# Save model

In [None]:
# Save the trained model and the processor into the same directory under the data path.
reader_dir = path+"/model_reader"
model.save(reader_dir)
processor.save(reader_dir)