#installing dependencies

In [25]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 860M


In [26]:
# !pip install optuna==2.3.0
# !pip install transformers==4.2.1
# !pip install farasapy
# !pip install pyarabic
# !git clone https://github.com/aub-mind/arabert

In [27]:
# !pip install openpyxl
# !pip install xlrd

#Creating training datasets

In [28]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
all_datasets= []

In [29]:
class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

##HARD - Balanced

In [30]:
df_HARD = pd.read_excel('ml2.xlsx', engine='openpyxl', header=None)
df_HARD = df_HARD.rename(columns={0: 'tags', 1: 'post'})
print(df_HARD["tags"].value_counts())
# code rating as +ve if > 3, -ve if less, no 3s in dataset

train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
label_list_HARD = list(range(1, 14))

data_Hard = Dataset("ml2", train_HARD, test_HARD, label_list_HARD)
all_datasets.append(data_Hard)

13    1025
12     998
11     755
10     646
9      574
8      482
7      477
3      440
5      425
6      414
4      410
1      304
2      292
Name: tags, dtype: int64


In [31]:
for x in all_datasets:
  print(x.name) 

ml2


#Trainer

In [32]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

In [33]:
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [34]:
for x in all_datasets:
  print(x.name)

ml2


You can choose which model, and dataset from here along with the max sentence length

In [35]:
dataset_name = 'ml2'
model_name = 'aubmindlab/bert-base-arabertv2'
task_name = 'classification'
max_len = 256

In [36]:
for d in all_datasets:
  if d.name==dataset_name:
    selected_dataset = d
    print('Dataset found')
    break

Dataset found


In [37]:
arabert_prep = ArabertPreprocessor("bert-base-arabertv2")

selected_dataset.train['post'] = selected_dataset.train['post'].apply(lambda x: arabert_prep.preprocess(x))
selected_dataset.test['post'] = selected_dataset.test['post'].apply(lambda x: arabert_prep.preprocess(x))  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [38]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(BERTDataset).__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())


        
      input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
      )     
    
      attention_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding_length = self.max_len - len(input_ids)
      input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)    
      
      return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [39]:
label_map = { v:index for index, v in enumerate(selected_dataset.label_list) }
print(label_map)
train_dataset = BERTDataset(selected_dataset.train['post'].to_list(),selected_dataset.train['tags'].to_list(),model_name,max_len,label_map)
test_dataset = BERTDataset(selected_dataset.test['post'].to_list(),selected_dataset.test['tags'].to_list(),model_name,max_len,label_map)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12}


In [40]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))

In [41]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  #print(classification_report(p.label_ids,preds))
  #print(confusion_matrix(p.label_ids,preds))

  macro_f1_per_label = f1_score(p.label_ids, preds,average='macro', labels=list(range(1, 14)))
  macro_f1 = f1_score(p.label_ids,preds, average='macro')
  macro_precision = precision_score(p.label_ids, preds,average='macro')
  macro_recall = recall_score(p.label_ids, preds, average='macro')
  acc = accuracy_score(p.label_ids, preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_per_label' : macro_f1_per_label,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

# HyperParameter Search

you can change the batch size and gradient accumulation from here



In [42]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.lr_scheduler_type = 'cosine'
training_args.fp16 = True
training_args.per_device_train_batch_size = 1
training_args.per_device_eval_batch_size = 1
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 100000
# training_args.save_steps = 
#training_args.eval_steps = 
training_args.disable_tqdm = True
# print("Logging Step:", training_args.logging_steps)
# print("Eval Step:",training_args.eval_steps)

In [None]:
steps_per_epoch = (len(selected_dataset.train)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)

In [None]:
trainer = Trainer(
    args=training_args,
    train_dataset=train_dataset, 
    eval_dataset=test_dataset, 
    model_init=model_init,
    compute_metrics=compute_metrics,
)

here you can define your search space.

the `my_hp_space` function defines the hyper parameter super set, of which you can choose a subset (or even the whole set) for the grid search


Note: You can include the opch count as a hyperparameter, but this will drasticly increase the search space, I prefer setting a fixed epcoh size, then I manually search for the highest score between the epochs since optuna can't do that as far as I know

In [None]:
def my_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 2e-5, 7e-5, step=1e-5),
        "seed": trial.suggest_categorical("seed", [0, 1, 42, 666, 123, 12345]),
        "warmup_steps": trial.suggest_int("warmup_steps",0,total_steps*0.1,step=total_steps*0.1*0.5)
    }

search_space = {
    "learning_rate":  list(np.arange(2e-5, 7e-5, 1e-5)),
    "seed":  [0, 1, 42, 666, 123, 12345],
    "warmup_steps": list(range(0, int((total_steps)*0.1)+1, int(total_steps*0.1*0.5)))
}
search_space

In [None]:
def my_objective(metrics):
    return metrics['eval_macro_f1']

choose a study name to save it on disk

In [None]:
name = "ml2-arabert-base-v2"

In [None]:
best_run = trainer.hyperparameter_search(direction="maximize",
                                         hp_space=my_hp_space,
                                         compute_objective=my_objective,
                                         n_trials=None,
                                         pruner=optuna.pruners.NopPruner(),
                                         sampler=optuna.samplers.GridSampler(search_space),
                                         study_name=name,
                                         load_if_exists=False # you can change this to true, for continuing the search
                                         )

In [None]:
best_run

### Regular Training

This paert allows you to do a regular training with no hyper parameter optimization

In [None]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8


steps_per_epoch = (len(selected_dataset.train)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
training_args.warmup_steps = total_steps*warmup_ratio # or you can set the warmup steps directly 

training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 100000 #don't want to save any model, there is probably a better way to do this :)
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

In [None]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("ml2")