In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2080 SUPER


#installing dependencies

In [2]:
#!pip install optuna==2.3.0
#!pip install transformers==4.2.1
#!pip install farasapy
#!pip install pyarabic
#!git clone https://github.com/aub-mind/arabert

In [3]:
#!pip install openpyxl
#!pip install xlrd

#Creating training datasets

In [4]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
all_datasets= []

In [5]:
class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

##HARD - Balanced

In [6]:
df_HARD = pd.read_excel('incident13type.xlsx', engine='openpyxl', header=None)
df_HARD = df_HARD.rename(columns={0: 'tags', 1: 'post'})
print(df_HARD["tags"].value_counts())
# code rating as +ve if > 3, -ve if less, no 3s in dataset

train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
label_list_HARD = list(range(1, 14))

data_Hard = Dataset("incident13type", train_HARD, test_HARD, label_list_HARD)
all_datasets.append(data_Hard)

7     436
3     435
1     272
9     224
11    215
13    191
10    141
12    109
2      98
4      76
6      71
5      59
8      57
Name: tags, dtype: int64


In [7]:
for x in all_datasets:
  print(x.name) 

incident13type


#Trainer

In [8]:
#pip install transformers -U

In [9]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

In [10]:
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [11]:
for x in all_datasets:
  print(x.name)

incident13type


You can choose which model, and dataset from here along with the max sentence length

In [12]:
dataset_name = 'incident13type'
model_name = 'aubmindlab/bert-base-arabertv2'
task_name = 'classification'
max_len = 256

In [13]:
for d in all_datasets:
  if d.name==dataset_name:
    selected_dataset = d
    print('Dataset found')
    break

Dataset found


In [14]:
arabert_prep = ArabertPreprocessor("bert-base-arabertv2")

selected_dataset.train['post'] = selected_dataset.train['post'].apply(lambda x: arabert_prep.preprocess(x))
selected_dataset.test['post'] = selected_dataset.test['post'].apply(lambda x: arabert_prep.preprocess(x))  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_dataset.train['post'] = selected_dataset.train['post'].apply(lambda x: arabert_prep.preprocess(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_dataset.test['post'] = selected_dataset.test['post'].apply(lambda x: arabert_prep.preprocess(x))


In [15]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(BERTDataset).__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())


        
      input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
      )     
    
      attention_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding_length = self.max_len - len(input_ids)
      input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)    
      
      return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [16]:
label_map = { v:index for index, v in enumerate(selected_dataset.label_list) }
print(label_map)
train_dataset = BERTDataset(selected_dataset.train['post'].to_list(),selected_dataset.train['tags'].to_list(),model_name,max_len,label_map)
test_dataset = BERTDataset(selected_dataset.test['post'].to_list(),selected_dataset.test['tags'].to_list(),model_name,max_len,label_map)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12}


In [17]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))

In [18]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  #print(classification_report(p.label_ids,preds))
  #print(confusion_matrix(p.label_ids,preds))

  macro_f1_per_label = f1_score(p.label_ids, preds,average='macro', labels=list(range(1, 14)))
  macro_f1 = f1_score(p.label_ids,preds, average='macro')
  macro_precision = precision_score(p.label_ids, preds,average='macro')
  macro_recall = recall_score(p.label_ids, preds, average='macro')
  acc = accuracy_score(p.label_ids, preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_per_label' : macro_f1_per_label,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

# Regular Training

This paert allows you to do a regular training with no hyper parameter optimization

In [19]:
training_args = TrainingArguments(
    output_dir= "./train",
    do_eval = True,
    do_train = True,
    evaluation_strategy = 'epoch',
    adam_epsilon = 1e-8,
    learning_rate = 5e-5,
    fp16 = False,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = 2,
    num_train_epochs= 8,
    warmup_ratio =0,
    save_strategy = 'epoch',
    seed = 42,
    lr_scheduler_type = 'cosine'
)

In [20]:
training_args.n_gpu

2

In [21]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

In [22]:
trainer.train()

***** Running training *****
  Num examples = 1907
  Num Epochs = 8
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 952
 12%|█▎        | 119/952 [01:22<09:25,  1.47it/s]***** Running Evaluation *****
  Num examples = 477
  Batch size = 8
  _warn_prf(

 12%|█▎        | 119/952 [01:31<09:25,  1.47it/s]Saving model checkpoint to ./train\checkpoint-119
Configuration saved in ./train\checkpoint-119\config.json


{'eval_loss': 0.2951911985874176, 'eval_macro_f1': 0.8710699115849125, 'eval_macro_f1_per_label': 0.7986717215396636, 'eval_macro_precision': 0.8887999157779748, 'eval_macro_recall': 0.8716884286022455, 'eval_accuracy': 0.9245283018867925, 'eval_runtime': 8.8308, 'eval_samples_per_second': 54.016, 'eval_steps_per_second': 6.794, 'epoch': 1.0}


Model weights saved in ./train\checkpoint-119\pytorch_model.bin
 25%|██▌       | 238/952 [02:56<07:57,  1.50it/s]***** Running Evaluation *****
  Num examples = 477
  Batch size = 8
  _warn_prf(

 25%|██▌       | 238/952 [03:06<07:57,  1.50it/s]Saving model checkpoint to ./train\checkpoint-238
Configuration saved in ./train\checkpoint-238\config.json


{'eval_loss': 0.23825226724147797, 'eval_macro_f1': 0.9100712393070602, 'eval_macro_f1_per_label': 0.8385871678229887, 'eval_macro_precision': 0.9191711304230139, 'eval_macro_recall': 0.9064463040285677, 'eval_accuracy': 0.939203354297694, 'eval_runtime': 8.8094, 'eval_samples_per_second': 54.147, 'eval_steps_per_second': 6.811, 'epoch': 2.0}


Model weights saved in ./train\checkpoint-238\pytorch_model.bin
 38%|███▊      | 357/952 [04:31<06:36,  1.50it/s]***** Running Evaluation *****
  Num examples = 477
  Batch size = 8
  _warn_prf(

 38%|███▊      | 357/952 [04:40<06:36,  1.50it/s]Saving model checkpoint to ./train\checkpoint-357
Configuration saved in ./train\checkpoint-357\config.json


{'eval_loss': 0.2560058832168579, 'eval_macro_f1': 0.9072677167124573, 'eval_macro_f1_per_label': 0.8348695266672086, 'eval_macro_precision': 0.9135854181805599, 'eval_macro_recall': 0.9045544936347333, 'eval_accuracy': 0.939203354297694, 'eval_runtime': 8.7927, 'eval_samples_per_second': 54.25, 'eval_steps_per_second': 6.824, 'epoch': 3.0}


Model weights saved in ./train\checkpoint-357\pytorch_model.bin
 50%|█████     | 476/952 [06:06<05:19,  1.49it/s]***** Running Evaluation *****
  Num examples = 477
  Batch size = 8
  _warn_prf(

 50%|█████     | 476/952 [06:15<05:19,  1.49it/s]Saving model checkpoint to ./train\checkpoint-476
Configuration saved in ./train\checkpoint-476\config.json


{'eval_loss': 0.22668638825416565, 'eval_macro_f1': 0.90319027081903, 'eval_macro_f1_per_label': 0.8317061993349587, 'eval_macro_precision': 0.9113027624861694, 'eval_macro_recall': 0.8981172224804321, 'eval_accuracy': 0.9371069182389937, 'eval_runtime': 8.7957, 'eval_samples_per_second': 54.231, 'eval_steps_per_second': 6.822, 'epoch': 4.0}


Model weights saved in ./train\checkpoint-476\pytorch_model.bin
 53%|█████▎    | 500/952 [06:37<05:03,  1.49it/s]

{'loss': 0.3544, 'learning_rate': 2.3022073937664386e-05, 'epoch': 4.2}


 62%|██████▎   | 595/952 [07:40<03:58,  1.50it/s]***** Running Evaluation *****
  Num examples = 477
  Batch size = 8
  _warn_prf(

 62%|██████▎   | 595/952 [07:49<03:58,  1.50it/s]Saving model checkpoint to ./train\checkpoint-595
Configuration saved in ./train\checkpoint-595\config.json


{'eval_loss': 0.24332840740680695, 'eval_macro_f1': 0.9094764792168828, 'eval_macro_f1_per_label': 0.8372629784319534, 'eval_macro_precision': 0.9168160702428504, 'eval_macro_recall': 0.904836301273979, 'eval_accuracy': 0.9433962264150944, 'eval_runtime': 8.7914, 'eval_samples_per_second': 54.258, 'eval_steps_per_second': 6.825, 'epoch': 5.0}


Model weights saved in ./train\checkpoint-595\pytorch_model.bin
 75%|███████▌  | 714/952 [09:16<02:39,  1.49it/s]***** Running Evaluation *****
  Num examples = 477
  Batch size = 8
  _warn_prf(

 75%|███████▌  | 714/952 [09:25<02:39,  1.49it/s]Saving model checkpoint to ./train\checkpoint-714
Configuration saved in ./train\checkpoint-714\config.json


{'eval_loss': 0.2271929383277893, 'eval_macro_f1': 0.9294718661189053, 'eval_macro_f1_per_label': 0.8579877946348339, 'eval_macro_precision': 0.9333994303495654, 'eval_macro_recall': 0.9280611533449848, 'eval_accuracy': 0.9538784067085954, 'eval_runtime': 8.8098, 'eval_samples_per_second': 54.144, 'eval_steps_per_second': 6.811, 'epoch': 6.0}


Model weights saved in ./train\checkpoint-714\pytorch_model.bin
 88%|████████▊ | 833/952 [10:51<01:19,  1.50it/s]***** Running Evaluation *****
  Num examples = 477
  Batch size = 8
  _warn_prf(

 88%|████████▊ | 833/952 [11:00<01:19,  1.50it/s]Saving model checkpoint to ./train\checkpoint-833
Configuration saved in ./train\checkpoint-833\config.json


{'eval_loss': 0.2373264729976654, 'eval_macro_f1': 0.928249159718902, 'eval_macro_f1_per_label': 0.8576055176466884, 'eval_macro_precision': 0.9326334700276384, 'eval_macro_recall': 0.9265818634041564, 'eval_accuracy': 0.9517819706498952, 'eval_runtime': 8.802, 'eval_samples_per_second': 54.192, 'eval_steps_per_second': 6.817, 'epoch': 7.0}


Model weights saved in ./train\checkpoint-833\pytorch_model.bin
100%|██████████| 952/952 [12:25<00:00,  1.49it/s]***** Running Evaluation *****
  Num examples = 477
  Batch size = 8
  _warn_prf(

100%|██████████| 952/952 [12:34<00:00,  1.49it/s]Saving model checkpoint to ./train\checkpoint-952
Configuration saved in ./train\checkpoint-952\config.json


{'eval_loss': 0.24022670090198517, 'eval_macro_f1': 0.928249159718902, 'eval_macro_f1_per_label': 0.8576055176466884, 'eval_macro_precision': 0.9326334700276384, 'eval_macro_recall': 0.9265818634041564, 'eval_accuracy': 0.9517819706498952, 'eval_runtime': 8.8454, 'eval_samples_per_second': 53.927, 'eval_steps_per_second': 6.783, 'epoch': 8.0}


Model weights saved in ./train\checkpoint-952\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 952/952 [12:41<00:00,  1.25it/s]

{'train_runtime': 761.1521, 'train_samples_per_second': 20.043, 'train_steps_per_second': 1.251, 'train_loss': 0.20373680611618428, 'epoch': 8.0}





TrainOutput(global_step=952, training_loss=0.20373680611618428, metrics={'train_runtime': 761.1521, 'train_samples_per_second': 20.043, 'train_steps_per_second': 1.251, 'train_loss': 0.20373680611618428, 'epoch': 8.0})

In [23]:
trainer.save_model("incident13type")

Saving model checkpoint to incident13type
Configuration saved in incident13type\config.json
Model weights saved in incident13type\pytorch_model.bin


In [24]:
train_dataset.tokenizer.save_pretrained("incident13type")

tokenizer config file saved in incident13type\tokenizer_config.json
Special tokens file saved in incident13type\special_tokens_map.json


('incident13type\\tokenizer_config.json',
 'incident13type\\special_tokens_map.json',
 'incident13type\\vocab.txt',
 'incident13type\\added_tokens.json',
 'incident13type\\tokenizer.json')