# NLP Project (Arabic Dialect Classification)-DL-AraBERT (Colab)

### Importing necessary libraries

In [None]:
! pip install transformers
! pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, ht

In [None]:
import warnings

import numpy as np
import pandas as pd
import torch

from datasets import load_metric
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import TextClassificationPipeline
from transformers import Trainer, TrainingArguments

warnings.filterwarnings('ignore')
tqdm.pandas()

### Reading the cleaned data

In [None]:
# Mount Google Drive at /content/drive so the project folders stored there can be
# read and written from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Remove pandas' display limits on column width and row count so full texts
# and full tables are rendered.
for display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Project root on the mounted Drive; the other names are joined onto it by plain
# string concatenation, so the trailing slashes matter.
path = "/content/drive/MyDrive/Arabic_Dialect_Identification/"
# Sub-folder that holds the preprocessing artefacts.
preprocess = "Data_Preprocessing/"
# File name of the cleaned dataset.
clean_data_csv = "clean_data.csv"
# Sub-folder that receives checkpoints, logs and saved models.
modeling = "Modeling/"

In [None]:
# Load the cleaned dataset; the first CSV column is used as the index.
clean_data_path = path + preprocess + clean_data_csv
df = pd.read_csv(clean_data_path, index_col=0)
df.head()

Unnamed: 0_level_0,text,dialect
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1164474450408038400,امي ديما ادير,LY
1116319804804222976,حاجه كده زي زبادي خلاط,EG
1146658966669287424,الجبل بيحضن البحر طبيعه لبنان الجميله,LB
1022926186098225152,شو قصه انجليك كلنا شفنا التقرير عرفنا انو توقفت التدخل تبعكم كانت بعدها منقوعه بالحبس معقول بتعرف انها فاسده بتعرف مصيبه بتعرف لمصيبه الاكبر,LB
1164211560216170496,الدولار صرلو شهرين بالسوق اللبناني مبارح الحمد الله الليره ثابته وبالف خير,LB


In [None]:
# Discard every row that contains a missing value.
df = df.dropna()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(147708, 2)

In [None]:
# Model input: the `text` column as a plain Python list.
X = df['text'].tolist()

In [None]:
# One-hot encode the `dialect` column: one indicator column per distinct dialect.
Y = pd.get_dummies(df['dialect']).to_numpy()
Y.shape

(147708, 5)

In [None]:
# Hold out 20% of the examples, stratified on the labels, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [None]:
# Sanity check: print the size of each split (train texts, train labels,
# test texts, test labels), one per line.
for split in (x_train, y_train, x_test, y_test):
    print(len(split))

118166
118166
29542
29542


### Loading the tokenizer and the model

In [None]:
# Load the tokenizer published with the 'aubmindlab/bert-base-arabertv02' checkpoint,
# the same checkpoint name the classification model is loaded from.
tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02')

Downloading (…)okenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/751k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Load the pretrained checkpoint as a sequence classifier with 5 output labels,
# matching the 5 columns of the one-hot label matrix Y.
model = BertForSequenceClassification.from_pretrained('aubmindlab/bert-base-arabertv02', num_labels=5)

Downloading (…)lve/main/config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [None]:
# Value passed to the tokenizer as `max_length` (used together with truncation=True).
max_len = 128

In [None]:
# Tokenize both splits with one shared set of options so they cannot drift apart.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=max_len)
train_encodings = tokenizer(x_train, **tokenizer_kwargs)
test_encodings = tokenizer(x_test, **tokenizer_kwargs)

### Initializing the dataset

In [None]:
class DilacetDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with dialect labels.

    Parameters
    ----------
    encodings : mapping
        Maps each feature name to a per-example sequence; ``encodings[key][idx]``
        must be convertible with ``torch.tensor``.
    labels : array-like
        Either a 2-D one-hot matrix (one row per example, one column per class)
        or a 1-D array of class ids.

    Each item is a dict holding one tensor per encoding key, plus ``'labels'``
    (the label row/id of the example) and ``'weights'`` (the scalar balanced
    class weight of the example's class).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

        # Per-class sample counts and the class index of every example.
        # For one-hot labels the counts are the column sums; running np.unique on
        # the matrix itself would only count the values 0 and 1, not the classes.
        label_array = np.asarray(labels)
        if label_array.ndim == 2:
            counts = label_array.sum(axis=0)
            class_ids = label_array.argmax(axis=1)
        else:
            _, class_ids, counts = np.unique(
                label_array, return_inverse=True, return_counts=True
            )
        counts = np.asarray(counts, dtype='float64')
        class_ids = np.asarray(class_ids).reshape(-1)

        # "Balanced" weights: n_samples / (n_classes * n_samples_in_class).
        # Classes with no samples keep weight 0 instead of dividing by zero.
        balanced = np.zeros(len(counts), dtype='float64')
        present = counts > 0
        balanced[present] = counts.sum() / (counts[present] * len(counts))
        # One weight per class, float32.
        self.class_weights = torch.tensor(balanced, dtype=torch.float32)

        # One weight per example, kept in a single tensor rather than a Python
        # list with one small tensor per sample.
        self.weights = self.class_weights[torch.as_tensor(class_ids, dtype=torch.long)]

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        item['weights'] = self.weights[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
# Wrap each split in a DilacetDataset; the one-hot labels are cast to float64 first.
train_labels = y_train.astype('float64')
test_labels = y_test.astype('float64')
train_dataset = DilacetDataset(train_encodings, train_labels)
test_dataset = DilacetDataset(test_encodings, test_labels)

### Training

In [None]:
# Drop the notebook-level names of intermediate objects that are no longer
# referenced directly, to reduce memory pressure before training.
del train_encodings, test_encodings, x_train, y_train

In [None]:
# Accuracy is computed directly with NumPy, so the evaluation loop does not depend
# on the deprecated `datasets.load_metric` helper or on downloading a metric script.
def compute_metrics(eval_pred):
    """Compute the accuracy of one evaluation pass.

    Parameters
    ----------
    eval_pred : tuple
        ``(logits, labels)`` pair. ``logits`` holds one score per class for each
        example; ``labels`` is the one-hot label matrix for the same examples.

    Returns
    -------
    dict
        ``{"accuracy": float}`` - the fraction of examples whose highest-scoring
        class equals the class marked in the one-hot label row.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Labels are one-hot, so the reference class is the position of the 1.
    references = np.argmax(labels, axis=1)
    return {"accuracy": float(np.mean(predictions == references))}

In [None]:
# Checkpoints and logs go under the modeling folder of the project.
output_dir = path + modeling + 'output'
logging_dir = path + modeling + 'log'

training_args = TrainingArguments(
    # where to write
    output_dir=output_dir,
    logging_dir=logging_dir,
    # optimisation schedule
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    # logging, evaluation and checkpointing cadence
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=15,
    save_steps=2000,
    load_best_model_at_end=True,
)

# The Trainer evaluates on `test_dataset` and reports `compute_metrics` each epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1438,0.177771,0.824284
2,0.1447,0.164527,0.840126


TrainOutput(global_step=14772, training_loss=0.17881030625858638, metrics={'train_runtime': 5664.779, 'train_samples_per_second': 41.72, 'train_steps_per_second': 2.608, 'total_flos': 1.5060002673350496e+16, 'train_loss': 0.17881030625858638, 'epoch': 2.0})

In [None]:
# Save the trainer's model into the 'trial_0' folder under the modeling directory.
trial_dir = path + modeling + 'trial_0'
trainer.save_model(trial_dir)

In [None]:
# Run evaluation on the held-out split; the returned metrics dict is displayed as the cell output.
trainer.evaluate(test_dataset)

{'eval_loss': 0.16452705314637148,
 'eval_accuracy': 0.840125922415544,
 'eval_runtime': 146.8976,
 'eval_samples_per_second': 201.106,
 'eval_steps_per_second': 3.145,
 'epoch': 2.0}

In [None]:
# Keep only the model outputs (first field of the prediction result) for the test split.
output = trainer.predict(test_dataset).predictions

In [None]:
# Per-class precision / recall / F1 on the test split.
# Both the one-hot labels and the model outputs are reduced to class indices first.
true_ids = y_test.argmax(axis=1)
predicted_ids = output.argmax(axis=1)
cm = classification_report(true_ids, predicted_ids)
print(cm)

              precision    recall  f1-score   support

           0       0.85      0.92      0.89     11527
           1       0.86      0.85      0.85      5523
           2       0.82      0.82      0.82      7299
           3       0.83      0.71      0.76      2307
           4       0.78      0.65      0.71      2886

    accuracy                           0.84     29542
   macro avg       0.83      0.79      0.81     29542
weighted avg       0.84      0.84      0.84     29542



### Prediction

In [None]:
# Inference pipeline built from the model and tokenizer used above.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

# Translate the pipeline's label names ('LABEL_<index>') into dialect codes.
# NOTE(review): index i is presumed to correspond to the i-th column produced by
# pd.get_dummies when the labels were one-hot encoded (alphabetical order of the
# dialect codes) - verify whenever the label set changes.
label_dict = {
    'LABEL_0': 'EG',
    'LABEL_1': 'LB',
    'LABEL_2': 'LY',
    'LABEL_3': 'MA',
    'LABEL_4': 'SD',
}

In [None]:
def predict_dialect(text, pipe, label_dict):
  """Return the dialect code predicted for ``text``.

  Parameters
  ----------
  text : input passed unchanged to ``pipe``.
  pipe : callable whose result is indexable; its first element must be a
      mapping with a ``'label'`` entry.
  label_dict : mapping from the pipeline's label names to dialect codes.

  Returns
  -------
  The ``label_dict`` value for the label of the first result.

  Raises
  ------
  KeyError
      If the predicted label is missing from ``label_dict``.
  """
  first_result = pipe(text)[0]
  return label_dict[first_result['label']]

In [None]:
# Smoke test on a short phrase; the recorded output of this cell is 'EG'.
predict_dialect("ازيك يا اسطا",pipe,label_dict)

'EG'