# Installing dependencies

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [None]:
!pip install optuna==2.3.0
!pip install transformers==4.2.1
!pip install farasapy
!pip install pyarabic
# !git clone https://github.com/aub-mind/arabert
!wget https://huggingface.co/UBC-NLP/MARBERT/resolve/main/MARBERT_pytorch_verison.tar.gz
!tar -xvf MARBERT_pytorch_verison.tar.gz
!wget https://raw.githubusercontent.com/UBC-NLP/marbert/main/examples/UBC_AJGT_final_shuffled_train.tsv
!wget https://raw.githubusercontent.com/UBC-NLP/marbert/main/examples/UBC_AJGT_final_shuffled_test.tsv
!mkdir -p AJGT
!mv UBC_AJGT_final_shuffled_train.tsv ./AJGT/UBC_AJGT_final_shuffled_train.tsv
!mv UBC_AJGT_final_shuffled_test.tsv ./AJGT/UBC_AJGT_final_shuffled_test.tsv
!pip install GPUtil pytorch_pretrained_bert transformers

In [3]:
!mkdir data
!mkdir train

# Creating training datasets

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# tqdm_notebook is deprecated; tqdm.auto selects the notebook widget automatically.
from tqdm.auto import tqdm

# Registry of Dataset objects; later cells append to it and select from it by name.
all_datasets = []

In [5]:
class Dataset:
    """Named bundle of a train split, a test split and the list of class labels.

    Note: a later cell imports ``Dataset`` from ``torch.utils.data``, which rebinds
    this name. Instances of this container must be created before that import runs.
    """

    def __init__(self, name, train, test, label_list):
        # Store the four constructor arguments unchanged as attributes.
        (self.name, self.train, self.test, self.label_list) = (
            name,
            train,
            test,
            label_list,
        )

In [14]:
# Names of the text column and of the label column in the loaded CSV.
DATA_COLUMN, LABEL_COLUMN = "Tweets", "OFF"

## HARD - Balanced

In [6]:
# Load the labelled data from the CSV uploaded to the Colab runtime.
df_HARD = pd.read_csv(filepath_or_buffer="/content/subTask A (OFF&NOT_OFF).csv")

In [7]:
# Binary label set used for this dataset.
label_list_HARD = [0, 1]

# Shuffled 90/10 train/test split with a fixed seed.
train_HARD, test_HARD = train_test_split(
    df_HARD,
    shuffle=True,
    random_state=42,
    test_size=0.1,
)

# Wrap the splits in a Dataset container and register it for later selection by name.
data_Hard = Dataset(
    name="HARD",
    train=train_HARD,
    test=test_HARD,
    label_list=label_list_HARD,
)
all_datasets.append(data_Hard)

# Trainer

In [None]:
!git clone https://github.com/aub-mind/arabert.git
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna

In [9]:
# Module-level logger; the root logger is configured to show warnings and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

In [10]:
# Print the name of every registered dataset, one per line.
for dataset in all_datasets:
    print(dataset.name)

HARD


Choose the model, the dataset, and the maximum sentence length here.

In [11]:
# Experiment selection: dataset to use, pretrained checkpoint, and task label.
dataset_name, model_name, task_name = 'HARD', 'UBC-NLP/MARBERT', 'classification'
# Fixed length every example is truncated / padded to.
max_len = 266

In [12]:
# Look up the configured dataset in the registry. Fail loudly when the name is
# unknown instead of leaving `selected_dataset` undefined (or stale from an
# earlier run), which would only surface as a confusing error in a later cell.
for d in all_datasets:
    if d.name == dataset_name:
        selected_dataset = d
        print('Dataset found')
        break
else:
    raise ValueError(f"Dataset {dataset_name!r} not found in all_datasets")

Dataset found


In [None]:
# Text preprocessor from the AraBERT repo, configured with the short model name
# (the part of `model_name` after the last "/").
arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])

# Build new frames holding the preprocessed text instead of writing into the
# frames returned by train_test_split, which pandas may flag with
# SettingWithCopyWarning. The bound method is passed directly; no lambda needed.
selected_dataset.train = selected_dataset.train.assign(
    **{DATA_COLUMN: selected_dataset.train[DATA_COLUMN].apply(arabert_prep.preprocess)}
)
selected_dataset.test = selected_dataset.test.assign(
    **{DATA_COLUMN: selected_dataset.test[DATA_COLUMN].apply(arabert_prep.preprocess)}
)

In [16]:
# Preview the preprocessed text column of the test split.
selected_dataset.test.loc[:, DATA_COLUMN]

2019                                                 خياس
9688    الحمدلله والشكر هالمنطق حتي كافره شيء اسمه ذوق...
5225                            الله يصلحك نواف قاعد افطر
3592                                           انت الشافي
6822                مبشغلش دماغي بعالم متكبره ملهاش لازمه
                              ...                        
2611     السناجل اقولهم فاتكم النكد والغم انا حتجوز علشان
95      امن العقوبه اساء الادب وين رجال الامن مايجري ا...
8761    بايع الكليجا ماشوف جمال عشان يطق الترند سلامات...
9136      اسال سيدك حسن نصر الله قبض قطر لبناء الجنوب عبد
2157    مره تحطي صوره واحده طويله وبترقص ومش فاهم ليه ...
Name: Tweets, Length: 1338, dtype: object

In [17]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenises text on access and yields ``InputFeatures``.

    Args:
        text: Sequence of raw input strings, indexed by position.
        target: Sequence of raw labels, parallel to ``text``.
        model_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        max_len: Fixed length each example is truncated / padded to.
        label_map: Mapping from the raw label values in ``target`` to the
            label ids stored in the returned features.

    Raises:
        ValueError: If ``text`` and ``target`` have different lengths.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # Zero-argument super(): the one-argument form ``super(BERTDataset)``
        # creates an unbound proxy and never reaches the parent initialiser.
        super().__init__()
        if len(text) != len(target):
            raise ValueError(
                "text and target must have the same length, got %d and %d"
                % (len(text), len(target))
            )
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenise example ``item`` and return it padded to ``max_len``."""
        # Collapse every run of whitespace into a single space before tokenising.
        text = " ".join(str(self.text[item]).split())

        input_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='longest_first'
        )

        # Real tokens get mask 1; padding positions get mask 0 and the pad token id.
        pad_count = self.max_len - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * pad_count
        input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_count

        return InputFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
            label=self.label_map[self.target[item]],
        )

In [None]:
# Map every raw label to its position in the dataset's label list.
label_map = dict(zip(selected_dataset.label_list, range(len(selected_dataset.label_list))))
print(label_map)

# Wrap the text / label columns of each split in a BERTDataset.
train_dataset = BERTDataset(
    text=selected_dataset.train[DATA_COLUMN].to_list(),
    target=selected_dataset.train[LABEL_COLUMN].to_list(),
    model_name=model_name,
    max_len=max_len,
    label_map=label_map,
)
test_dataset = BERTDataset(
    text=selected_dataset.test[DATA_COLUMN].to_list(),
    target=selected_dataset.test[LABEL_COLUMN].to_list(),
    model_name=model_name,
    max_len=max_len,
    label_map=label_map,
)


In [20]:
def model_init():
    """Load a sequence-classification model from ``model_name``.

    The classification head gets one output per entry in ``label_map``.
    """
    num_classes = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_classes,
        return_dict=True,
    )

In [21]:
def compute_metrics(p): #p should be of type EvalPrediction
    """Turn model outputs into macro-averaged metrics plus accuracy.

    Reads ``p.predictions`` (arg-maxed over axis 1 to get the predicted class)
    and ``p.label_ids``; returns a dict of metric name -> value.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    assert len(y_pred) == len(y_true)

    macro = dict(average='macro')
    return {
        'macro_f1': f1_score(y_true, y_pred, **macro),
        # Macro F1 restricted to labels 0 and 1.
        'macro_f1_pos_neg': f1_score(y_true, y_pred, labels=[0, 1], **macro),
        'macro_precision': precision_score(y_true, y_pred, **macro),
        'macro_recall': recall_score(y_true, y_pred, **macro),
        'accuracy': accuracy_score(y_true, y_pred),
    }

You can change the batch size and the gradient accumulation steps here.



# Regular Training

This part runs a regular training with no hyperparameter optimization.

In [22]:
# Hyper-parameters for the plain (non-Optuna) fine-tuning run.
train_batch_size = 32
eval_batch_size = 32
grad_accum_steps = 5
num_epochs = 4

# One optimizer step consumes train_batch_size * grad_accum_steps examples.
steps_per_epoch = len(selected_dataset.train) // (train_batch_size * grad_accum_steps)
total_steps = steps_per_epoch * num_epochs
print(steps_per_epoch)
print(total_steps)

# Warmup_ratio
warmup_ratio = 0.1

# All options are passed to the constructor so TrainingArguments can validate and
# normalise them; attributes assigned after construction bypass that step.
# The former `evaluate_during_training = True` assignment is dropped: set after
# construction it had no effect, and evaluation is driven by `evaluation_strategy`.
training_args = TrainingArguments(
    output_dir="./train",
    adam_epsilon=1e-8,
    learning_rate=5e-5,
    # Mixed precision needs CUDA; fall back to full precision on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=grad_accum_steps,
    num_train_epochs=num_epochs,
    # warmup_steps must be an integer step count (or set the warmup steps directly).
    warmup_steps=round(total_steps * warmup_ratio),
    evaluation_strategy=EvaluationStrategy.EPOCH,
    # logging_steps=200,
    save_steps=100000,  # don't want to save any model, there is probably a better way to do this :)
    seed=30,
    disable_tqdm=False,
    lr_scheduler_type='cosine',
)

75
300


In [None]:
# Build the model first, then hand everything to the Trainer.
classifier = model_init()
trainer = Trainer(
    model=classifier,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [24]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
0,No log,0.331748,0.866191,0.866191,0.8662,0.866184,0.866218,7.4008,180.79
1,No log,0.330242,0.878921,0.878921,0.878962,0.879023,0.878924,7.3574,181.857
2,No log,0.432658,0.87734,0.87734,0.877773,0.877226,0.877429,7.3836,181.212
3,No log,0.509461,0.879535,0.879535,0.880411,0.879378,0.879671,7.3838,181.207


TrainOutput(global_step=300, training_loss=0.19560380299886068, metrics={'train_runtime': 725.1939, 'train_samples_per_second': 0.414, 'total_flos': 12500797330983528, 'epoch': 3.99})

In [25]:
# Write the fine-tuned model into the "OFF" directory.
trainer.save_model(output_dir="OFF")

In [26]:
! zip -r '/content/OFF.zip' '/content/OFF'

  adding: content/OFF/ (stored 0%)
  adding: content/OFF/config.json (deflated 51%)
  adding: content/OFF/training_args.bin (deflated 44%)
  adding: content/OFF/pytorch_model.bin (deflated 7%)


In [27]:
from google.colab import drive

# Attach Google Drive under /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [28]:
import shutil

# Move the archive into Drive; the destination path is shown as the cell result.
shutil.move(src='/content/OFF.zip', dst='/content/drive/MyDrive/')

'/content/drive/MyDrive/OFF.zip'

In [None]:
from zipfile import ZipFile

# Bound to `archive` rather than `zip`, so the built-in zip() is not shadowed
# for every cell that runs after this one.
with ZipFile('OFF_Not_OFF.zip', 'r') as archive:
    # printing all the contents of the zip file
    archive.printdir()

    # extracting all the files
    print('Extracting all the files now...')
    archive.extractall()
    print('Done!')

In [None]:
import torch

# map_location="cpu" lets the checkpoint load on a runtime without CUDA even when
# its tensors were saved from a GPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
modell = torch.load('/content/content/rename/pytorch_model.bin', map_location="cpu")
print(modell)