#1) Installations and imports

##a. Mount drive (if you are running on colab)

In [68]:
from google.colab import drive
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import os
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##b. Clone or update competition repository

After cloning, under MyDrive, you will see afrisenti-semeval-2023 folder with all the the data for the afrisenti shared task (training and dev) 

In [69]:
%cd /content/drive/MyDrive

PROJECT_DIR = '/content/drive/MyDrive/afrisent-semeval-2023'
PROJECT_GITHUB_URL = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023.git'

print(os.path.isdir(PROJECT_DIR))


/content/drive/MyDrive
True


##c. Install required libraries



In [70]:
os.listdir(os.path.join(PROJECT_DIR, 'starter_kit'))

['afrisenti_get_scores1.py',
 'run_textclass.py',
 'run_textclass.sh',
 'requirements.txt',
 'run_predict.py']

In [71]:
#Install other required libraries
!pip install -r '/content/drive/MyDrive/afrisent-semeval-2023/starter_kit/requirements.txt'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


##d. Import libraries

Import libraries below

In [72]:
import pandas as pd
import numpy as np

# Please don not edit anything here
languages = ['am', 'dz', 'ha', 'ig', 'ma', 'pcm', 'pt', 'sw', 'yo']

#2) Dataset

In [73]:
# Training Data Paths

TASK = 'SubtaskA'
TRAINING_DATA_DIR = os.path.join(PROJECT_DIR, TASK, 'train')
FORMATTED_TRAIN_DATA = os.path.join(TRAINING_DATA_DIR, 'formatted-train-data')

if os.path.isdir(TRAINING_DATA_DIR):
  print('Data directory found.')
  if not os.path.isdir(FORMATTED_TRAIN_DATA):
    print('Creating directory to store formatted data.')
    os.mkdir(FORMATTED_TRAIN_DATA)
else:
  print(TRAINING_DATA_DIR + ' is not a valid directory or does not exist!')

Data directory found.


In [74]:
LANGUAGE_CODE = 'dz'
# file_path = os.path.join(PROJECT_DIR, TASK, 'train', 'splitted-train-dev-test', LANGUAGE_CODE, 'test.tsv')
file_path = r'/content/drive/MyDrive/afrisent-semeval-2023/SubtaskA/train/dz_train.tsv'
print('splitted-train-dev-test/am/test.tsv')

splitted-train-dev-test/am/test.tsv


In [75]:
df_dz = pd.read_csv(file_path, sep='\t')

In [76]:
df_dz

Unnamed: 0,ID,tweet,label
0,dz_train_00001,@user على حسب موقعك يبدو أنك صاحب نظرة ثاقبة ....,negative
1,dz_train_00002,@user تبهليل هاذا,negative
2,dz_train_00003,@user هاذي تبهليل ماشي فهامة,negative
3,dz_train_00004,@user @user تخاف نجاوب يا ناصر ببلوك لانو طريق...,negative
4,dz_train_00005,@user مرنكة أقسم بالله 😂😂😂تبهليل ما بعد منتصف ...,negative
...,...,...,...
1646,dz_train_01647,@user نسيت هذي أنت تحب بزاف هذي @user,positive
1647,dz_train_01648,@user يخدمو ما يهدروش بزاف،تخية لواد سوف,positive
1648,dz_train_01649,@user مليحة بزاف صراحة عجبتني,positive
1649,dz_train_01650,RT @user الخميس و البحر فور بزاف🤗😍 @user,positive


In [77]:
df_dz.label.value_counts()

negative    892
positive    417
neutral     342
Name: label, dtype: int64

In [78]:
#Regex patterns
import re
def clean_text(row, options):
    """Removes url, mentions, emoji and uppercase from tweets"""
    if options['lowercase']:
        row = row.lower()

    if options['remove_url']:
        row = re.sub(r"(?:\@|https?\://)\S+", "", row)

    if options['remove_mentions']:
        row = re.sub(r"[^\w\s]", "", row)
            
    if options['remove_punctuation']:
        row = re.sub("@[A-Za-z0-9_]+","", row)
    
    if options['remove_numbers']:
        row = re.sub("[0-9_]+","", row)

    if options['remove_eng_chars']:
        row = re.sub("[A-Za-z]+","", row)
        
    if options['demojify']:
      emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
      row = re.sub(emoj, '', row)

    return row

In [79]:
clean_config = {
    'remove_url': True,
    'remove_mentions': True,
    'lowercase': True,
    'remove_numbers': True,
    'remove_eng_chars': True,
    'remove_punctuation':True,
    'demojify': True
    }

In [80]:
df_dz['text'] = df_dz['tweet'].apply(clean_text, args=(clean_config,))

In [81]:
df_dz.drop(['ID', 'tweet'], axis=1, inplace=True)

In [82]:
train_f, dev_f, test_f = np.split(df_dz.sample(frac=1, random_state=1), [int(.7*len(df_dz)), int(.85*len(df_dz))])


In [83]:
fil_split_path = '/content/drive/MyDrive/afrisent-semeval-2023/SubtaskA/train/split-dz'

In [84]:
train_f.sample(frac=1).to_csv(os.path.join(fil_split_path,'train.tsv'), sep='\t', index=False)
dev_f.sample(frac=1).to_csv(os.path.join(fil_split_path,'dev.tsv'), sep='\t', index=False)
test_f.sample(frac=1).to_csv(os.path.join(fil_split_path,'test.tsv'), sep='\t', index=False)

# 3) Evaluate on test subsets from train set -- p/r/f1 scores

## a. predict and create a scores folder

### Check the pre-training step and set the parameters

In [85]:
%cd {PROJECT_DIR}

#Language to train sentiment classifier for dz
LANGUAGE_CODE = 'dz'
if LANGUAGE_CODE in languages:
  # Model Training Parameters
  MODEL_NAME_OR_PATH = "alger-ia/dziribert"  
  BATCH_SIZE = 8
  LEARNING_RATE = 2e-05
  NUMBER_OF_TRAINING_EPOCHS = 2
  MAXIMUM_SEQUENCE_LENGTH = 128
  SAVE_STEPS = -1

  print('Everything set. You can now start model training.')
else:
  print("Invalid language code/Dataset not released. Please input any of the following released data\n\n\t- 'am'\n\t- 'dz'\n\t- 'ha'\n\t- 'ig'\n\t- 'ma'\n\t- 'pcm'\n\t- 'pt'\n\t- 'sw'\n\t- 'yo'")


/content/drive/MyDrive/afrisent-semeval-2023
Everything set. You can now start model training.


In [86]:
os.path.join(TRAINING_DATA_DIR, 'fil-splitted-train-dev-test', LANGUAGE_CODE)

'/content/drive/MyDrive/afrisent-semeval-2023/SubtaskA/train/fil-splitted-train-dev-test/dz'

### Actual training step

In [99]:
#DATA_DIR = os.path.join(TRAINING_DATA_DIR, 'splitted-train-dev-test', LANGUAGE_CODE)
# DATA_DIR = os.path.join(TRAINING_DATA_DIR, 'fil-splitted-train-dev-test', LANGUAGE_CODE)
DATA_DIR = r'/content/drive/MyDrive/afrisent-semeval-2023/SubtaskA/train/split-dz'#os.path.join(TRAINING_DATA_DIR, 'fil-splitted-train-dev-test', LANGUAGE_CODE)
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'models', LANGUAGE_CODE)

#python starter_kit/run_textclass.py \
!CUDA_VISIBLE_DEVICES=0 python starter_kit/run_textclass.py \
  --model_name_or_path {MODEL_NAME_OR_PATH} \
  --data_dir {DATA_DIR} \
  --do_train \
  --do_eval \
  --do_predict \
  --per_device_train_batch_size {BATCH_SIZE} \
  --learning_rate {LEARNING_RATE} \
  --num_train_epochs {NUMBER_OF_TRAINING_EPOCHS} \
  --max_seq_length {MAXIMUM_SEQUENCE_LENGTH} \
  --output_dir {OUTPUT_DIR} \
  --save_steps {SAVE_STEPS} \
  --overwrite_output_dir


INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_met

### Do the predictions on the test set

In [100]:
## file will be saved in submission folder

%cd {PROJECT_DIR}

OUTPUT_DIR = os.path.join(PROJECT_DIR, 'models', LANGUAGE_CODE)
# FILE_NAME = os.path.join(PROJECT_DIR, TASK, 'train', 'fil-splitted-train-dev-test', LANGUAGE_CODE, 'test.tsv')  ## test file name
FILE_NAME = r'/content/drive/MyDrive/afrisent-semeval-2023/SubtaskA/train/split-dz/test.tsv'## test file name
TEXT_COLUMN = 'text'

## file will be saved in submission folder
!python starter_kit/run_predict.py \
  --model_path {OUTPUT_DIR} \
  --file_name {FILE_NAME} \
  --text_column {TEXT_COLUMN} \
  --lang_code {LANGUAGE_CODE}

print('current language code is:', LANGUAGE_CODE)
print('current model path is:', OUTPUT_DIR)

/content/drive/MyDrive/afrisent-semeval-2023
***** Running Prediction *****
  Num examples = 248
  Batch size = 8
100% 31/31 [00:00<00:00, 46.76it/s]
Data directory found.
current language code is: dz
current model path is: /content/drive/MyDrive/afrisent-semeval-2023/models/dz


## b. evaluation method

In [101]:
import pandas as pd
from sklearn.metrics import classification_report

def evaluate(file_true, file_pred):

  df_true = pd.read_csv(file_true, sep='\t')
  df_pred = pd.read_csv(file_pred, sep='\t')
  true = df_true.label
  pred = df_pred.label
  return classification_report(true, pred)

## c. get scores for 'dz'

In [102]:
LANGUAGE_CODE = 'dz'
file_true = r'/content/drive/MyDrive/afrisent-semeval-2023/SubtaskA/train/split-dz/test.tsv'## test file name
file_pred = os.path.join(PROJECT_DIR, 'models', LANGUAGE_CODE, 'submission', 'pred_' + LANGUAGE_CODE + '.tsv')
print(evaluate(file_true, file_pred))

              precision    recall  f1-score   support

    negative       0.71      0.82      0.76       126
     neutral       0.41      0.39      0.40        49
    positive       0.75      0.58      0.65        73

    accuracy                           0.66       248
   macro avg       0.62      0.59      0.60       248
weighted avg       0.66      0.66      0.66       248



In [103]:
df_true = pd.read_csv(file_true, sep='\t')
df_pred = pd.read_csv(file_pred, sep='\t')
true = df_true.label
pred = df_pred.label

In [104]:
print(classification_report(true,pred))
print("f1-score:")
print(f1_score(true,pred, average='macro'))
print("Accuarcy:")
print(accuracy_score(true,pred) )
print("Precision:")
print(precision_score(true,pred, average=None))
print("Recall:")
print(recall_score(true,pred, average=None))

              precision    recall  f1-score   support

    negative       0.71      0.82      0.76       126
     neutral       0.41      0.39      0.40        49
    positive       0.75      0.58      0.65        73

    accuracy                           0.66       248
   macro avg       0.62      0.59      0.60       248
weighted avg       0.66      0.66      0.66       248

f1-score:
0.6028385772913816
Accuarcy:
0.6612903225806451
Precision:
[0.70547945 0.41304348 0.75      ]
Recall:
[0.81746032 0.3877551  0.57534247]
