## Preparing The Dataset

In [None]:
!pip install transformers datasets

In [None]:
from datasets import DatasetDict

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

# Load your dataset
data = pd.read_csv('/content/final_dataset.csv')

In [None]:
data.head()

Unnamed: 0,tweet,category
0,سلاما لمن يجلس وحيدا مشتت لافكاار مبعثر لمشاا...,Diminished ability to think or concentrate
1,مشتت,Diminished ability to think or concentrate
2,واني اعلم ان الطريق طويل وانا لوحدي والخوف يت...,Diminished ability to think or concentrate
3,مو قادره اركز كثر التشتت,Diminished ability to think or concentrate
4,طبعا لان جماعه ابن مشتت الفكر,Diminished ability to think or concentrate


In [None]:
data.isna().sum()

Unnamed: 0,0
tweet,7
category,7


In [None]:
data.dropna(inplace=True)

In [None]:
data.isna().sum()

Unnamed: 0,0
tweet,0
category,0


In [None]:
# Remove the stray lower-case 'feelings of worthlessness' rows so they do not
# become a separate class from 'Feelings of worthlessness'.
# Selecting the labels through a boolean mask drops every matching row and is a
# no-op when nothing matches, so this cell can be re-run safely (taking
# `.index[0]` raised IndexError once the row was already gone).
stray_rows = data.index[data['category'] == 'feelings of worthlessness']

# Drop the rows
data = data.drop(index=stray_rows)

In [None]:
# Learn the category -> integer id encoding, then replace the text column with the ids.
# Note: the text categories are overwritten by their integer codes, so re-running
# this cell without reloading `data` fits the encoder on the codes instead of the names.
label_encoder = LabelEncoder()
label_encoder.fit(data['category'])
data['category'] = label_encoder.transform(data['category'])

In [None]:
# Integer id -> original category name, in the encoder's class order.
label_mapping = dict(enumerate(label_encoder.classes_))

In [None]:
print(label_mapping)

{0: 'Diminished ability to think or concentrate', 1: 'Feelings of worthlessness', 2: 'Psychomotor agitation or retardation', 3: 'Suicidality', 4: 'losing interest or pleasure in activities', 5: 'loss of energy', 6: 'low mood', 7: 'sleep disorder', 8: 'weight disorder'}


In [None]:
data.head()

Unnamed: 0,tweet,category
0,سلاما لمن يجلس وحيدا مشتت لافكاار مبعثر لمشاا...,0
1,مشتت,0
2,واني اعلم ان الطريق طويل وانا لوحدي والخوف يت...,0
3,مو قادره اركز كثر التشتت,0
4,طبعا لان جماعه ابن مشتت الفكر,0


In [None]:
data.category.value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
0,271
8,226
2,120
7,117
6,109
5,101
3,100
1,97
4,80


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1221 entries, 0 to 1228
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet     1221 non-null   object
 1   category  1221 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 28.6+ KB


In [None]:
data.category.value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
0,271
8,226
2,120
7,117
6,109
5,101
3,100
1,97
4,80


In [None]:
# Split the dataset into training and validation sets.
# The categories are imbalanced, so stratify on the label to keep the same class
# proportions in both splits, and fix the seed so the split (and therefore the
# reported validation numbers) is reproducible across runs.
train_df, val_df = train_test_split(
    data,
    test_size=0.2,
    stratify=data['category'],
    random_state=42,
)

In [None]:
# Convert to Hugging Face Dataset.
# The DataFrame index is no longer a clean 0..n-1 range after rows were dropped and
# shuffled; preserve_index=False keeps it from being stored as an extra
# '__index_level_0__' column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

In [None]:
# Bundle both splits under the names used later ('train' / 'validation').
dataset = DatasetDict(train=train_dataset, validation=val_dataset)

## Training AraBERT

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Pretrained AraBERT checkpoint used for both the tokenizer and the classifier.
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The classification head gets one output per category known to the label encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_encoder.classes_),
)


tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples):
    """Tokenize the 'tweet' texts of a batch, truncating and padding them."""
    tweets = examples['tweet']
    return tokenizer(tweets, truncation=True, padding=True)


def _category_as_labels(examples):
    """Expose the encoded 'category' values under the 'labels' column name."""
    return {'labels': examples['category']}


# Tokenize every split, then add the 'labels' column alongside the token columns.
tokenized_datasets = (
    dataset
    .map(preprocess_function, batched=True)
    .map(_category_as_labels, batched=True)
)

# Return torch tensors and expose only the columns listed here.
tokenized_datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)


Map:   0%|          | 0/976 [00:00<?, ? examples/s]

Map:   0%|          | 0/245 [00:00<?, ? examples/s]

Map:   0%|          | 0/976 [00:00<?, ? examples/s]

Map:   0%|          | 0/245 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: the (predictions, label_ids) pair the Trainer passes in;
            predictions are the per-class logits.

    Returns:
        dict with 'accuracy' and 'macro_f1'; the Trainer reports them with an
        'eval_' prefix next to the loss.
    """
    from sklearn.metrics import accuracy_score, f1_score

    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)
    return {
        'accuracy': accuracy_score(labels, predicted),
        # Macro average so that small categories count as much as large ones.
        'macro_f1': f1_score(labels, predicted, average='macro'),
    }


# Warm up over roughly the first 10% of training instead of a fixed 500 steps:
# on a training set this small the whole run is only a few hundred optimizer
# steps, so 500 warmup steps kept the learning rate ramping for most of training.
# Assumes a single device and no gradient accumulation.
num_train_epochs = 10
per_device_train_batch_size = 16
steps_per_epoch = -(-len(tokenized_datasets['train']) // per_device_train_batch_size)  # ceil division
warmup_steps = (steps_per_epoch * num_train_epochs) // 10

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=num_train_epochs,              # total number of training epochs
    per_device_train_batch_size=per_device_train_batch_size,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=warmup_steps,       # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_datasets['train'],         # training dataset
    eval_dataset=tokenized_datasets['validation'],     # evaluation dataset
    compute_metrics=compute_metrics,     # accuracy / macro-F1 on top of the eval loss
)

trainer.train()


Step,Training Loss
500,0.6022


TrainOutput(global_step=610, training_loss=0.49386752514077015, metrics={'train_runtime': 121.0522, 'train_samples_per_second': 80.626, 'train_steps_per_second': 5.039, 'total_flos': 230730007178880.0, 'train_loss': 0.49386752514077015, 'epoch': 10.0})

In [None]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.03795882314443588, 'eval_runtime': 0.6428, 'eval_samples_per_second': 381.126, 'eval_steps_per_second': 6.222, 'epoch': 10.0}


## Saving The Model

In [None]:
trainer.save_model('./trained_model')
tokenizer.save_pretrained('./trained_model')

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
trainer.save_model('/content/gdrive/MyDrive/MHD_Project/PHQ9_Classifier')
tokenizer.save_pretrained('/content/gdrive/MyDrive/MHD_Project/PHQ9_Classifier')

('/content/gdrive/MyDrive/MHD_Project/PHQ9_Classifier/tokenizer_config.json',
 '/content/gdrive/MyDrive/MHD_Project/PHQ9_Classifier/special_tokens_map.json',
 '/content/gdrive/MyDrive/MHD_Project/PHQ9_Classifier/vocab.txt',
 '/content/gdrive/MyDrive/MHD_Project/PHQ9_Classifier/added_tokens.json',
 '/content/gdrive/MyDrive/MHD_Project/PHQ9_Classifier/tokenizer.json')

## Load The Model and Make Predictions

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = '/content/gdrive/MyDrive/MHD_Project/PHQ9_Classifier'
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
from transformers import TextClassificationPipeline

pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

text = "اليوم مو قادر اشتغل مزبوط"

predictions = pipeline(text)



In [None]:
def map_predictions(predictions, label_mapping):
    """Turn pipeline output for the first input text into ranked (category, score) pairs.

    Args:
        predictions: list with one entry per input text; each entry is a list of
            {'label': ..., 'score': ...} dicts (one per class).
        label_mapping: dict from integer class id to category name.

    Returns:
        List of (category_name, score) tuples for the first text, sorted by score
        in descending order. Empty list when `predictions` is empty.

    Raises:
        KeyError: if a 'LABEL_<id>'-style label refers to an id missing from
            `label_mapping`.
    """
    # Nothing to rank: avoid indexing into an empty list.
    if not predictions:
        return []

    # Only the first text's scores are returned, so only those are mapped.
    mapped_prediction = {}
    for pred in predictions[0]:
        label = pred['label']
        suffix = str(label).rsplit('_', 1)[-1]
        if suffix.isdecimal():
            # Generic model label such as 'LABEL_3': translate the trailing id.
            actual_label = label_mapping[int(suffix)]
        else:
            # The model already returns a readable label; keep it unchanged.
            actual_label = label
        mapped_prediction[actual_label] = pred['score']

    return sorted(mapped_prediction.items(), key=lambda item: item[1], reverse=True)

# Use the function.
# `label_mapping` is created while preparing the training data. This section reloads
# the saved model and can run in a fresh session where that name does not exist, so
# fall back to the id -> category mapping printed during training.
# Update this fallback if the set of categories in the dataset changes.
if 'label_mapping' not in globals():
    label_mapping = {
        0: 'Diminished ability to think or concentrate',
        1: 'Feelings of worthlessness',
        2: 'Psychomotor agitation or retardation',
        3: 'Suicidality',
        4: 'losing interest or pleasure in activities',
        5: 'loss of energy',
        6: 'low mood',
        7: 'sleep disorder',
        8: 'weight disorder',
    }

mapped_predictions = map_predictions(predictions, label_mapping)
def printMappedPreds(mapped_predictions):
  """Print every (label, score) pair as ``label: score``, one pair per line."""
  for label_name, probability in mapped_predictions:
    line = f"{label_name}: {probability}"
    print(line)

printMappedPreds(mapped_predictions)

NameError: name 'label_mapping' is not defined