In [1]:
# fmt: off
import logging
import os
import warnings
from pathlib import Path

from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import TextClassificationProcessor
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.optimization import initialize_optimizer
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import initialize_device_settings, set_all_seeds
from jack.logging import wb

warnings.filterwarnings('ignore')

  use_external_format=True if language_model_class is "XLMRoberta" else False
07/23/2022 22:29:08 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
# ----------------------------------------------------------------------
# Experiment settings
# ----------------------------------------------------------------------
# Seed everything first so later cells start from a fixed random state.
set_all_seeds(seed=2077)

# -- Language model / tokenizer ----------------------------------------
# A hub model name; a local checkpoint also works, e.g.
#   lang_model = Path("../saved_models/farm-bert-base-cased")
lang_model = "cointegrated/rubert-tiny"
do_lower_case = False

# -- Training schedule -------------------------------------------------
n_epochs = 10
batch_size = 4
evaluate_every = 1000

# -- Data handling -----------------------------------------------------
dev_split = 0.0
dev_stratification = True
max_processes = 1    # 128 is default

# -- Hardware ----------------------------------------------------------
use_amp = None
device, n_gpu = initialize_device_settings(use_amp=use_amp, use_cuda=False)

07/23/2022 22:29:17 - INFO - farm.utils -   Using device: CPU 
07/23/2022 22:29:17 - INFO - farm.utils -   Number of GPUs: 0
07/23/2022 22:29:17 - INFO - farm.utils -   Distributed Training: False
07/23/2022 22:29:17 - INFO - farm.utils -   Automatic Mixed Precision: None


In [3]:
# 1. Load the tokenizer for the configured language model
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    use_fast=True,
    do_lower_case=do_lower_case,
)

07/23/2022 22:29:21 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'


In [4]:
# Occupation code -> occupation title, in a fixed order.
# NOTE(review): the codes look like O*NET-SOC identifiers — confirm the source.
mapping = dict(
    [
        ("11-1021.00", "General and Operations Managers"),
        ("11-2021.00", "Marketing Managers"),
        ("11-2022.00", "Sales Managers"),
        ("11-3031.02", "Financial Managers, Branch or Department"),
        ("13-1111.00", "Management Analysts"),
        ("13-2051.00", "Financial Analysts"),
        ("15-1121.00", "Computer Systems Analysts"),
        ("15-1122.00", "Information Security Analysts"),
        ("15-1132.00", "Software Developers, Applications"),
        ("15-1133.00", "Software Developers, Systems Software"),
        ("15-1134.00", "Web Developers"),
        ("15-1142.00", "Network and Computer Systems Administrators"),
        ("15-1151.00", "Computer User Support Specialists"),
        ("29-1141.00", "Registered Nurses"),
        ("31-1014.00", "Nursing Assistants"),
        ("33-3021.06", "Intelligence Analysts"),
        ("41-2031.00", "Retail Salespersons"),
        ("43-4051.00", "Customer Service Representatives"),
        ("49-3023.02", "Automotive Specialty Technicians"),
        ("49-9071.00", "Maintenance and Repair Workers, General"),
        ("53-3032.00", "Heavy and Tractor-Trailer Truck Drivers"),
    ]
)

In [5]:
# Metric name handed to the processor, and the label vocabulary (the
# occupation titles from `mapping`, in insertion order).
# NOTE(review): labels are titles while the processor reads the label from a
# column named "Code" — verify that column holds titles rather than codes.
metric = "f1_micro"
label_list = [*mapping.values()]

In [6]:
# 2. Create the processor that converts rows of the CSV files into samples.
# NOTE(review): dev_filename and test_filename both point at "test.csv", so
# dev-set evaluation during training runs on the test data — confirm this is
# intentional.
processor = TextClassificationProcessor(
    # tokenization
    tokenizer=tokenizer,
    max_seq_len=128,
    # labels and metric
    label_list=label_list,
    metric=metric,
    # input files
    data_dir=Path.home() / "Dataset" / "nrich",
    train_filename="train.csv",
    dev_filename="test.csv",
    test_filename="test.csv",
    dev_split=dev_split,
    dev_stratification=dev_stratification,
    # CSV layout
    delimiter=",",
    text_column_name="Title",
    label_column_name="Code",
)

In [7]:
# 3. Build the DataSilo: it loads the train/dev/test datasets through the
#    processor, provides a DataLoader for each of them and computes a few
#    descriptive statistics.
data_silo = DataSilo(batch_size=batch_size, max_processes=max_processes, processor=processor)

07/23/2022 22:29:39 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
07/23/2022 22:29:39 - INFO - farm.data_handler.data_silo -   LOADING TRAIN DATA
07/23/2022 22:29:39 - INFO - farm.data_handler.data_silo -   Loading train set from: /Users/justatom/Dataset/nrich/train.csv 
07/23/2022 22:29:39 - INFO - farm.data_handler.data_silo -   Multiprocessing disabled, using a single worker to convert 12098 dictionaries to pytorch datasets.
Preprocessing Dataset /Users/justatom/Dataset/nrich/train.csv:   0%|          | 0/12098 [00:00<?, ? Dicts/s]07/23/2022 22:29:40 - INFO - farm.data_handler.processor -   *** Show 1 random examples ***
07/23/2022 22:29:40 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (__

In [8]:
# 4. Prepare the two parts of the AdaptiveModel
# a) the pretrained language model that serves as the backbone
language_model = LanguageModel.load(lang_model)

# b) the task-specific prediction head (text classification), one output per
#    label, with class weights supplied by the data silo
prediction_head = TextClassificationHead(
    num_labels=len(label_list),
    loss_fn="crossentropy",
    class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
)

07/23/2022 22:29:47 - INFO - farm.modeling.language_model -   
07/23/2022 22:29:47 - INFO - farm.modeling.language_model -   LOADING MODEL
07/23/2022 22:29:47 - INFO - farm.modeling.language_model -   Could not find cointegrated/rubert-tiny locally.
07/23/2022 22:29:47 - INFO - farm.modeling.language_model -   Looking on Transformers Model Hub (in local cache and online)...
Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification mode

In [9]:
# Combine the language model and the prediction head into one AdaptiveModel
# placed on the selected device.
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    lm_output_types=["per_sequence"],
    embeds_dropout_prob=0.1,
    device=device,
)

07/23/2022 22:29:49 - INFO - farm.modeling.prediction_head -   Resizing input dimensions of TextClassificationHead (text_classification) from [768, 21] to [312, 21] to match language model


In [10]:
# 5. Set up the optimizer and learning-rate schedule; the schedule is sized
#    from the number of training batches and epochs.
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    device=device,
    use_amp=use_amp,
    learning_rate=3e-5,
    n_epochs=n_epochs,
    n_batches=len(data_silo.loaders["train"]),
)

07/23/2022 22:29:50 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 3e-05}'
07/23/2022 22:29:50 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
07/23/2022 22:29:50 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 3025.0, 'num_training_steps': 30250}'


In [11]:
# Experiment-tracking settings.
# The API key is read from the WANDB_API_KEY environment variable so that a
# real secret never has to be typed into (and saved with) the notebook. The
# placeholder remains the fallback when the variable is unset.
API_KEY = os.environ.get("WANDB_API_KEY", "<YOUR_API_KEY>")
project_name = "nrich"
experiment_name = "nrich-cls"
loss_fn = "crossentropy"

In [12]:
# 6a. Experiment tracker handed to the Trainer.
# The Trainer cell passes `tracker=ml_logger`, so the name must always be
# defined; default to no tracker so the notebook survives Restart & Run All.
# NOTE(review): assumes Trainer accepts tracker=None — confirm against the
# Trainer implementation in use.
ml_logger = None
# To enable experiment logging, uncomment the lines below to initialize a logger:
# ml_logger = wb.WANDBLogger.init_experiment(
#     project_name=project_name,
#     experiment_name=experiment_name,
#     prefix=f"{loss_fn} --- ",
#     api=API_KEY,
#     sync_step=False,
# )

In [13]:
# 6. Hand everything to the Trainer, which takes care of growing our model
#    into a powerful plant and evaluates it from time to time
trainer = Trainer(
    # what to train, and on which data
    model=model,
    data_silo=data_silo,
    # optimization
    optimizer=optimizer,
    lr_schedule=lr_schedule,
    epochs=n_epochs,
    # hardware
    device=device,
    n_gpu=n_gpu,
    # logging / evaluation cadence
    prefix="",
    tracker=ml_logger,
    log_loss_every=1,
    evaluate_every=evaluate_every,
)

In [14]:
# 7. Let it grow: run the training loop on the Trainer configured above.
trainer.train()

Evaluating: 100%|██████████| 757/757 [00:12<00:00, 62.96it/s]99/3025 [01:22<02:30, 13.49it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
07/23/2022 21:50:18 - INFO - farm.eval -   

\\|//       \\|//      \\|//       \\|//     \\|//
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
***************************************************
***** EVALUATION | D.E.V. SET | AFTER 1000 BATCHES *****
***************************************************
\\|//       \\|//      \\|//       \\|//     \\|//
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

07/23/2022 21:50:18 - INFO - farm.eval -   
 _________ text_classification _________
07/23/2022 21:50:18 - INFO - farm.eval -   -loss: 2.849041147783768
07/23/2022 21:50:18 - INFO - farm.eval -   task_name: text_classification
07/23/2022 21:50:18 - INFO - farm.eval -   f1_micro: 0.27173553719008264
07/23/2022 21:50:18 - 