In [1]:
%%capture

!pip install transformers==4.17.0
!pip install datasets==2.4.0
!pip install sklearn==0.24.2
!pip install s3fs==2022.01.0
!pip install pandas==1.4.3

In [2]:
from sklearn.metrics import precision_recall_fscore_support
from transformers import BertForSequenceClassification
from sklearn.metrics import accuracy_score
from transformers import TrainingArguments
from transformers import BertTokenizerFast
from transformers import pipeline
from transformers import Trainer
from datasets import load_dataset
from datasets import DatasetDict
import pandas as pd
import numpy as np
import pickle

### Read data

In [3]:
df = pd.read_csv('./data/covid_articles_clf_data.csv')
df.head()

Unnamed: 0,Looking into the truth about modern workplace environments,2
0,Hexo refiles financial statements,2
1,"Japan raid, Turkey arrests in widening Ghosn p...",2
2,Pope's bodyguards criticised over slapping inc...,2
3,Lebanon denies president welcomed fugitive Ghosn,2
4,Lebanese lawyers want Ghosn prosecuted over Is...,2


In [4]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
tokenizer('covid is ravaging south asia')

{'input_ids': [101, 2522, 17258, 2003, 10958, 3567, 4726, 2148, 4021, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
data = load_dataset('csv', 
                     data_files='./data/covid_articles_clf_data.csv', 
                     column_names=['text', 'label'], 
                     delimiter=',', 
                     split='train', 
                     cache_dir='./tmp')

Using custom data configuration default-bb6da80340ff74ce
Reusing dataset csv (./tmp/csv/default-bb6da80340ff74ce/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


In [7]:
train_validation_test = data.train_test_split(shuffle=True, seed=123, test_size=0.1)
validation_test = train_validation_test['test'].train_test_split(shuffle=True, seed=123, test_size=0.5)
data_splits = DatasetDict({'train': train_validation_test['train'],  
                           'validation': validation_test['train'], 
                           'test': validation_test['test']})

Loading cached split indices for dataset at ./tmp/csv/default-bb6da80340ff74ce/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-0096ba6187486b82.arrow and ./tmp/csv/default-bb6da80340ff74ce/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-698b9113c3771b66.arrow
Loading cached split indices for dataset at ./tmp/csv/default-bb6da80340ff74ce/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-157d7449195b3baa.arrow and ./tmp/csv/default-bb6da80340ff74ce/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-b236db7a465dc0e4.arrow


In [8]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True)

In [9]:
tokenized_data = data_splits.map(preprocess_function, batched=True)

  0%|          | 0/430 [00:00<?, ?ba/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

In [10]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 429785
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23877
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23877
    })
})

#### Compute Metrics 

In [11]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 
            'f1': f1, 
            'precision': precision, 
            'recall': recall}

#### Load model

In [12]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5,  force_download=True)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
training_args = TrainingArguments(output_dir='./tmp', 
                                  overwrite_output_dir=True, 
                                  optim='adamw_torch', 
                                  learning_rate=2e-5, 
                                  per_device_train_batch_size=8, 
                                  per_device_eval_batch_size=8, 
                                  num_train_epochs=2,  
                                  weight_decay=0.01, 
                                  save_total_limit=2, 
                                  save_strategy='no',  
                                  load_best_model_at_end=False)

In [14]:
trainer = Trainer(model=model, 
                      args=training_args, 
                      train_dataset=tokenized_data['train'], 
                      eval_dataset=tokenized_data['validation'], 
                      tokenizer=tokenizer, 
                      compute_metrics=compute_metrics)

In [15]:
train_results = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 429785
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 107448


[2022-09-01 14:45:10.850 pytorch-1-8-gpu-py-ml-g4dn-8xlarge-55e67e449159adffe098c7680613:5871 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-09-01 14:45:10.882 pytorch-1-8-gpu-py-ml-g4dn-8xlarge-55e67e449159adffe098c7680613:5871 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


Step,Training Loss


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

INFO:root:
Unfortunately, your original traceback can not be constructed.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-76bcf0aaf355>", line 1, in <module>
    train_results = trainer.train()
  File "/opt/conda/lib/python3.6/site-packages/transformers/trainer.py", line 1405, in train
    and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2044, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/IPython/core/ultratb.py", line 1169, in get_rec

TypeError: object of type 'NoneType' has no len()

In [None]:
trainer.log_metrics('train', train_results.metrics)
trainer.save_metrics('train', train_results.metrics)

In [None]:
results = trainer.evaluate()
trainer.log_metrics('validation', results)
trainer.save_metrics('validation', results)

In [None]:
results = trainer.evaluate(eval_dataset=tokenized_data['test'])
trainer.log_metrics('test', results)
trainer.save_metrics('test', results)

### label config 

In [None]:
with open('.././data/label_map', 'rb') as f:
    label2id = pickle.load(f)

In [None]:
id2label = dict((str(v), k) for k, v in label2id.items())
id2label

In [None]:
trainer.model.config.label2id = label2id
trainer.model.config.id2label = id2label

### Save model

In [None]:
trainer.save_model('./model/')

### Test model 

In [None]:
classifier = pipeline('sentiment-analysis', model='./model/')
prediction = classifier('I hate you')
prediction