In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2080 SUPER


#installing dependencies

In [2]:
!pip install optuna==2.3.0
!pip install transformers==4.2.1
!pip install farasapy
!pip install pyarabic
!git clone https://github.com/aub-mind/arabert

Collecting transformers==4.2.1
  Using cached transformers-4.2.1-py3-none-any.whl (1.8 MB)
Collecting tokenizers==0.9.4
  Using cached tokenizers-0.9.4-cp38-cp38-win_amd64.whl (1.9 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.12.1
    Uninstalling tokenizers-0.12.1:
      Successfully uninstalled tokenizers-0.12.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.20.1
    Uninstalling transformers-4.20.1:
      Successfully uninstalled transformers-4.20.1
Successfully installed tokenizers-0.9.4 transformers-4.2.1


fatal: destination path 'arabert' already exists and is not an empty directory.


In [3]:
!pip install openpyxl
!pip install xlrd



#Creating training datasets

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
all_datasets= []

In [3]:
class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

##HARD - Balanced

In [4]:
df_HARD = pd.read_excel('ML1.xlsx', engine='openpyxl', header=None)
df_HARD = df_HARD.rename(columns={0: 'tags', 1: 'post'})
print(df_HARD["tags"].value_counts())
# code rating as +ve if > 3, -ve if less, no 3s in dataset

train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
label_list_HARD = list(range(0, 5))

data_Hard = Dataset("ML1", train_HARD, test_HARD, label_list_HARD)
all_datasets.append(data_Hard)

4    730
3    369
2    325
1    303
0    252
Name: tags, dtype: int64


In [5]:
for x in all_datasets:
  print(x.name) 

ML1


#Trainer

In [27]:
!pip install transformers -U



In [6]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

In [7]:
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [8]:
for x in all_datasets:
  print(x.name)

ML1


You can choose which model, and dataset from here along with the max sentence length

In [9]:
dataset_name = 'ML1'
model_name = 'aubmindlab/bert-base-arabertv2'
task_name = 'classification'
max_len = 256

In [10]:
for d in all_datasets:
  if d.name==dataset_name:
    selected_dataset = d
    print('Dataset found')
    break

Dataset found


In [11]:
arabert_prep = ArabertPreprocessor("bert-base-arabertv2")

selected_dataset.train['post'] = selected_dataset.train['post'].apply(lambda x: arabert_prep.preprocess(x))
selected_dataset.test['post'] = selected_dataset.test['post'].apply(lambda x: arabert_prep.preprocess(x))  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_dataset.train['post'] = selected_dataset.train['post'].apply(lambda x: arabert_prep.preprocess(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_dataset.test['post'] = selected_dataset.test['post'].apply(lambda x: arabert_prep.preprocess(x))


In [13]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(BERTDataset).__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())


        
      input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
      )     
    
      attention_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding_length = self.max_len - len(input_ids)
      input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)    
      
      return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [19]:
label_map = { v:index for index, v in enumerate(selected_dataset.label_list) }
print(label_map)
train_dataset = BERTDataset(selected_dataset.train['post'].to_list(),selected_dataset.train['tags'].to_list(),model_name,max_len,label_map)
test_dataset = BERTDataset(selected_dataset.test['post'].to_list(),selected_dataset.test['tags'].to_list(),model_name,max_len,label_map)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4}


In [20]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))

In [15]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  #print(classification_report(p.label_ids,preds))
  #print(confusion_matrix(p.label_ids,preds))

  macro_f1_per_label = f1_score(p.label_ids, preds,average='macro', labels=list(range(0, 5)))
  macro_f1 = f1_score(p.label_ids,preds, average='macro')
  macro_precision = precision_score(p.label_ids, preds,average='macro')
  macro_recall = recall_score(p.label_ids, preds, average='macro')
  acc = accuracy_score(p.label_ids, preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_per_label' : macro_f1_per_label,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

# Regular Training

This paert allows you to do a regular training with no hyper parameter optimization

In [21]:
training_args = TrainingArguments(
    output_dir= "./train",
    do_eval = True,
    do_train = True,
    evaluation_strategy = 'epoch',
    adam_epsilon = 1e-8,
    learning_rate = 5e-5,
    fp16 = False,
    per_device_train_batch_size =16,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 2,
    num_train_epochs= 8,
    warmup_ratio =0,
    save_strategy = 'epoch',
    seed = 42,
    lr_scheduler_type = 'cosine'
)

In [22]:
training_args.n_gpu

2

In [23]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

In [24]:
trainer.train()

***** Running training *****
  Num examples = 1583
  Num Epochs = 8
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 200
 12%|█▎        | 25/200 [00:31<03:12,  1.10s/it]***** Running Evaluation *****
  Num examples = 396
  Batch size = 32

 12%|█▎        | 25/200 [00:34<03:12,  1.10s/it]Saving model checkpoint to ./train\checkpoint-25
Configuration saved in ./train\checkpoint-25\config.json


{'eval_loss': 0.14836037158966064, 'eval_macro_f1': 0.9758879023774458, 'eval_macro_f1_per_label': 0.9758879023774458, 'eval_macro_precision': 0.9684938691980947, 'eval_macro_recall': 0.9849139188794863, 'eval_accuracy': 0.9772727272727273, 'eval_runtime': 3.0096, 'eval_samples_per_second': 131.579, 'eval_steps_per_second': 4.32, 'epoch': 1.0}


Model weights saved in ./train\checkpoint-25\pytorch_model.bin
 25%|██▌       | 50/200 [01:06<02:46,  1.11s/it]***** Running Evaluation *****
  Num examples = 396
  Batch size = 32

 25%|██▌       | 50/200 [01:09<02:46,  1.11s/it]Saving model checkpoint to ./train\checkpoint-50
Configuration saved in ./train\checkpoint-50\config.json


{'eval_loss': 0.07757890969514847, 'eval_macro_f1': 0.9843262475675848, 'eval_macro_f1_per_label': 0.9843262475675848, 'eval_macro_precision': 0.980004910009981, 'eval_macro_recall': 0.9889407645170702, 'eval_accuracy': 0.9848484848484849, 'eval_runtime': 2.982, 'eval_samples_per_second': 132.797, 'eval_steps_per_second': 4.36, 'epoch': 2.0}


Model weights saved in ./train\checkpoint-50\pytorch_model.bin
 38%|███▊      | 75/200 [01:42<02:17,  1.10s/it]***** Running Evaluation *****
  Num examples = 396
  Batch size = 32

 38%|███▊      | 75/200 [01:45<02:17,  1.10s/it]Saving model checkpoint to ./train\checkpoint-75
Configuration saved in ./train\checkpoint-75\config.json


{'eval_loss': 0.09647765010595322, 'eval_macro_f1': 0.9689198932464368, 'eval_macro_f1_per_label': 0.9689198932464368, 'eval_macro_precision': 0.9749175964780434, 'eval_macro_recall': 0.9641960898745257, 'eval_accuracy': 0.9722222222222222, 'eval_runtime': 2.9438, 'eval_samples_per_second': 134.519, 'eval_steps_per_second': 4.416, 'epoch': 3.0}


Model weights saved in ./train\checkpoint-75\pytorch_model.bin
 50%|█████     | 100/200 [02:17<01:50,  1.11s/it]***** Running Evaluation *****
  Num examples = 396
  Batch size = 32

 50%|█████     | 100/200 [02:20<01:50,  1.11s/it]Saving model checkpoint to ./train\checkpoint-100
Configuration saved in ./train\checkpoint-100\config.json


{'eval_loss': 0.10007987171411514, 'eval_macro_f1': 0.9782160576684067, 'eval_macro_f1_per_label': 0.9782160576684067, 'eval_macro_precision': 0.979424183747121, 'eval_macro_recall': 0.9772395681353954, 'eval_accuracy': 0.9797979797979798, 'eval_runtime': 3.0017, 'eval_samples_per_second': 131.924, 'eval_steps_per_second': 4.331, 'epoch': 4.0}


Model weights saved in ./train\checkpoint-100\pytorch_model.bin
 62%|██████▎   | 125/200 [02:54<01:23,  1.11s/it]***** Running Evaluation *****
  Num examples = 396
  Batch size = 32

 62%|██████▎   | 125/200 [02:57<01:23,  1.11s/it]Saving model checkpoint to ./train\checkpoint-125
Configuration saved in ./train\checkpoint-125\config.json


{'eval_loss': 0.06463334709405899, 'eval_macro_f1': 0.9870770751068181, 'eval_macro_f1_per_label': 0.9870770751068181, 'eval_macro_precision': 0.9840037190432376, 'eval_macro_recall': 0.990283046396265, 'eval_accuracy': 0.9873737373737373, 'eval_runtime': 2.9941, 'eval_samples_per_second': 132.261, 'eval_steps_per_second': 4.342, 'epoch': 5.0}


Model weights saved in ./train\checkpoint-125\pytorch_model.bin
 75%|███████▌  | 150/200 [03:32<00:55,  1.11s/it]***** Running Evaluation *****
  Num examples = 396
  Batch size = 32

 75%|███████▌  | 150/200 [03:35<00:55,  1.11s/it]Saving model checkpoint to ./train\checkpoint-150
Configuration saved in ./train\checkpoint-150\config.json


{'eval_loss': 0.0659913718700409, 'eval_macro_f1': 0.9841688122905513, 'eval_macro_f1_per_label': 0.9841688122905513, 'eval_macro_precision': 0.9824674797455213, 'eval_macro_recall': 0.9859352203093085, 'eval_accuracy': 0.9848484848484849, 'eval_runtime': 3.0141, 'eval_samples_per_second': 131.382, 'eval_steps_per_second': 4.313, 'epoch': 6.0}


Model weights saved in ./train\checkpoint-150\pytorch_model.bin
 88%|████████▊ | 175/200 [04:09<00:27,  1.11s/it]***** Running Evaluation *****
  Num examples = 396
  Batch size = 32

 88%|████████▊ | 175/200 [04:12<00:27,  1.11s/it]Saving model checkpoint to ./train\checkpoint-175
Configuration saved in ./train\checkpoint-175\config.json


{'eval_loss': 0.07619144767522812, 'eval_macro_f1': 0.9812157028157028, 'eval_macro_f1_per_label': 0.9812157028157028, 'eval_macro_precision': 0.9809412774118658, 'eval_macro_recall': 0.9815873942223519, 'eval_accuracy': 0.9823232323232324, 'eval_runtime': 3.0335, 'eval_samples_per_second': 130.543, 'eval_steps_per_second': 4.286, 'epoch': 7.0}


Model weights saved in ./train\checkpoint-175\pytorch_model.bin
100%|██████████| 200/200 [04:47<00:00,  1.11s/it]***** Running Evaluation *****
  Num examples = 396
  Batch size = 32

100%|██████████| 200/200 [04:50<00:00,  1.11s/it]Saving model checkpoint to ./train\checkpoint-200
Configuration saved in ./train\checkpoint-200\config.json


{'eval_loss': 0.07642978429794312, 'eval_macro_f1': 0.9812157028157028, 'eval_macro_f1_per_label': 0.9812157028157028, 'eval_macro_precision': 0.9809412774118658, 'eval_macro_recall': 0.9815873942223519, 'eval_accuracy': 0.9823232323232324, 'eval_runtime': 3.0149, 'eval_samples_per_second': 131.347, 'eval_steps_per_second': 4.312, 'epoch': 8.0}


Model weights saved in ./train\checkpoint-200\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 200/200 [04:55<00:00,  1.48s/it]

{'train_runtime': 295.3835, 'train_samples_per_second': 42.873, 'train_steps_per_second': 0.677, 'train_loss': 0.13046462059020997, 'epoch': 8.0}





TrainOutput(global_step=200, training_loss=0.13046462059020997, metrics={'train_runtime': 295.3835, 'train_samples_per_second': 42.873, 'train_steps_per_second': 0.677, 'train_loss': 0.13046462059020997, 'epoch': 8.0})

In [25]:
trainer.save_model("types")

Saving model checkpoint to types
Configuration saved in types\config.json
Model weights saved in types\pytorch_model.bin


In [27]:
train_dataset.tokenizer.save_pretrained("types")

tokenizer config file saved in types\tokenizer_config.json
Special tokens file saved in types\special_tokens_map.json


('types\\tokenizer_config.json',
 'types\\special_tokens_map.json',
 'types\\vocab.txt',
 'types\\added_tokens.json',
 'types\\tokenizer.json')