# 1) Installations and imports

## a. Mount drive (if you are running on Colab)

In [1]:
# Colab-only: mount Google Drive at /content/drive; later cells read the project from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## b. Clone or update competition repository

After cloning, you will see the afrisent-semeval-2023 folder under MyDrive, containing all the data for the AfriSenti shared task (training and dev).

In [2]:
%cd /content/drive/MyDrive

import os

# Location of the project folder on the mounted Drive, and the URL of the competition repository.
# NOTE(review): PROJECT_GITHUB_URL is not used in this cell — no clone/pull is performed here.
PROJECT_DIR = '/content/drive/MyDrive/afrisent-semeval-2023'
PROJECT_GITHUB_URL = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023.git'

# The files were already uploaded to Drive following the required folder structure,
# so this cell only reports whether the project folder exists (prints True/False).

print(os.path.isdir(PROJECT_DIR))


/content/drive/MyDrive
True


## c. Install required libraries

In [3]:
!pip install -r '/content/drive/MyDrive/afrisent-semeval-2023/starter_kit/requirements.txt'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 14.2 MB/s 
Collecting accelerate
  Downloading accelerate-0.14.0-py3-none-any.whl (175 kB)
[K     |████████████████████████████████| 175 kB 67.5 MB/s 
[?25hCollecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 67.1 MB/s 
[?25hCollecting datasets>=1.8.0
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 64.0 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████

## d. Import libraries

Import libraries below

In [4]:
import pandas as pd
import numpy as np

# Language codes of the released datasets; LANGUAGE_CODE is validated against this list later on.
# Please do not edit anything here.
languages = ['am', 'dz', 'ha', 'ig', 'ma', 'pcm', 'pt', 'sw', 'yo']

# 2) Dataset

In [5]:
# Where the training data for the selected subtask lives, plus the folder for its formatted copy.

TASK = 'SubtaskA'
TRAINING_DATA_DIR = os.path.join(PROJECT_DIR, TASK, 'train')
FORMATTED_TRAIN_DATA = os.path.join(TRAINING_DATA_DIR, 'formatted-train-data')

# Report a missing data directory first; otherwise make sure the formatted-data folder exists.
if not os.path.isdir(TRAINING_DATA_DIR):
  print(TRAINING_DATA_DIR + ' is not a valid directory or does not exist!')
else:
  print('Data directory found.')
  if not os.path.isdir(FORMATTED_TRAIN_DATA):
    print('Creating directory to store formatted data.')
    os.mkdir(FORMATTED_TRAIN_DATA)

Data directory found.


# 3) Evaluate on test subsets from train set -- p/r/f1 scores

## a. predict and create a scores folder

### Check the pre-training step and set the parameters

In [7]:
%cd {PROJECT_DIR}

# # Language to train sentiment classifier for
# LANGUAGE_CODE = 'am' # 'am', 'dz', 'ha', 'ig', 'ma', 'pcm', 'pt', 'sw', 'yo' DO NOT EASILTY UNCOMMENT THIS LINE

LANGUAGE_CODE = 'am'
if LANGUAGE_CODE in languages:
  # Model Training Parameters
  # MODEL_NAME_OR_PATH = 'uhhlt/am-roberta'
  MODEL_NAME_OR_PATH = 'Davlan/afro-xlmr-mini'
  # MODEL_NAME_OR_PATH = 'Davlan/xlm-roberta-base-finetuned-amharic'
  # MODEL_NAME_OR_PATH = 'Davlan/afro-xlmr-large'
  # MODEL_NAME_OR_PATH = "Davlan/naija-twitter-sentiment-afriberta-large"
  # MODEL_NAME_OR_PATH = "Davlan/bert-base-multilingual-cased-finetuned-amharic"
  
  BATCH_SIZE = 32
  LEARNING_RATE = 0.0005 #0.005
  NUMBER_OF_TRAINING_EPOCHS = 5 #100
  MAXIMUM_SEQUENCE_LENGTH = 128
  SAVE_STEPS = -1

  print('Everything set. You can now start model training.')

else:
  print("Invalid language code/Dataset not released. Please input any of the following released data\n\n\t- 'am'\n\t- 'dz'\n\t- 'ha'\n\t- 'ig'\n\t- 'ma'\n\t- 'pcm'\n\t- 'pt'\n\t- 'sw'\n\t- 'yo'")


/content/drive/MyDrive/afrisent-semeval-2023
Everything set. You can now start model training.


### Actual training step

In [8]:
# Per-language data split used for training (per the notes below, presumably the filtered + balanced version).
DATA_DIR = os.path.join(TRAINING_DATA_DIR, 'fil-bal-splitted-train-dev-test', LANGUAGE_CODE) 
## other paths - replace "fil-bal-splitted-train-dev-test" with below:
## 'fil-splitted-train-dev-test' for unbalanced version
## 'splitted-train-dev-test' for raw version of files.
# Passed to the script as --output_dir: <PROJECT_DIR>/models/<LANGUAGE_CODE>.
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'models', LANGUAGE_CODE)

# Run the starter-kit training script on GPU 0 with the parameters set in the previous cell.
# Training, evaluation and prediction are all requested (--do_train / --do_eval / --do_predict),
# and --overwrite_output_dir is set, so an existing OUTPUT_DIR is reused rather than rejected.
#python starter_kit/run_textclass.py \
!CUDA_VISIBLE_DEVICES=0 python starter_kit/run_textclass.py \
  --model_name_or_path {MODEL_NAME_OR_PATH} \
  --data_dir {DATA_DIR} \
  --do_train \
  --do_eval \
  --do_predict \
  --per_device_train_batch_size {BATCH_SIZE} \
  --learning_rate {LEARNING_RATE} \
  --num_train_epochs {NUMBER_OF_TRAINING_EPOCHS} \
  --max_seq_length {MAXIMUM_SEQUENCE_LENGTH} \
  --output_dir {OUTPUT_DIR} \
  --save_steps {SAVE_STEPS} \
  --overwrite_output_dir


INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_met

### Do the predictions on the test set

In [9]:
%cd {PROJECT_DIR}

# Model directory written by the training cell, and the test split to run predictions on.
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'models', LANGUAGE_CODE)
FILE_NAME = os.path.join(PROJECT_DIR, TASK, 'train', 'fil-bal-splitted-train-dev-test', LANGUAGE_CODE, 'test.tsv')
## other paths - replace "fil-bal-splitted-train-dev-test" with below:
## 'fil-splitted-train-dev-test' for unbalanced version
## 'splitted-train-dev-test' for raw version of files.
# Column name handed to the script as --text_column.
TEXT_COLUMN = 'text'

# NOTE(review): the evaluation cells below read <OUTPUT_DIR>/scores/pred_<LANGUAGE_CODE>.tsv —
# presumably written by this script; confirm in afrisenti_get_scores1.py.
!python starter_kit/afrisenti_get_scores1.py \
  --model_path {OUTPUT_DIR} \
  --file_name {FILE_NAME} \
  --text_column {TEXT_COLUMN} \
  --lang_code {LANGUAGE_CODE}

# Echo which language/model the predictions above belong to.
print('current language code is:', LANGUAGE_CODE)
print('current model path is:', OUTPUT_DIR)

/content/drive/MyDrive/afrisent-semeval-2023
***** Running Prediction *****
  Num examples = 587
  Batch size = 8
100% 74/74 [00:01<00:00, 48.09it/s]
Data directory found.
current language code is: am
current model path is: /content/drive/MyDrive/afrisent-semeval-2023/models/am


## Evaluation method

In [10]:
import pandas as pd
from sklearn.metrics import classification_report

def evaluate(file_true, file_pred):
  """Build a classification report comparing gold labels with predicted labels.

  Args:
    file_true: path to a tab-separated file whose `label` column holds the gold labels.
    file_pred: path to a tab-separated file whose `label` column holds the predictions.

  Returns:
    The result of `classification_report(gold, predicted)` for the two label columns.
  """
  # Read both files first (gold, then predictions), then pull out the label columns.
  gold_frame, pred_frame = (pd.read_csv(path, sep='\t') for path in (file_true, file_pred))
  return classification_report(gold_frame.label, pred_frame.label)

## Get scores for 'am'

In [11]:
# Score the saved 'am' predictions against the gold labels of the test split.
LANGUAGE_CODE = 'am'

# Gold labels: test.tsv of the chosen data split.
## other paths - replace "fil-bal-splitted-train-dev-test" with below:
## 'fil-splitted-train-dev-test' for unbalanced version
## 'splitted-train-dev-test' for raw version of files.
file_true = os.path.join(
  PROJECT_DIR, TASK, 'train', 'fil-bal-splitted-train-dev-test', LANGUAGE_CODE, 'test.tsv'
)

# Predictions: pred_<language>.tsv inside the model's scores folder.
file_pred = os.path.join(
  PROJECT_DIR, 'models', LANGUAGE_CODE, 'scores', 'pred_' + LANGUAGE_CODE + '.tsv'
)

print(evaluate(file_true, file_pred))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       198
     neutral       0.00      0.00      0.00       196
    positive       0.33      1.00      0.49       193

    accuracy                           0.33       587
   macro avg       0.11      0.33      0.16       587
weighted avg       0.11      0.33      0.16       587



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [13]:
# Load gold and predicted frames at cell level so the individual metrics below can reuse the label columns.
df_true, df_pred = (pd.read_csv(path, sep='\t') for path in (file_true, file_pred))
true, pred = df_true.label, df_pred.label

In [14]:
# Full per-class report first, then the individual headline metrics.
print(classification_report(true,pred))
print("f1-score:")
# Macro average: unweighted mean of the per-class F1 scores.
print(f1_score(true,pred, average='macro'))
# Label spelling fixed (was "Accuarcy:").
print("Accuracy:")
print(accuracy_score(true,pred) )
# average=None yields one score per class instead of a single aggregate.
print("Precision:")
print(precision_score(true,pred, average=None))
print("Recall:")
print(recall_score(true,pred, average=None))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       198
     neutral       0.00      0.00      0.00       196
    positive       0.33      1.00      0.49       193

    accuracy                           0.33       587
   macro avg       0.11      0.33      0.16       587
weighted avg       0.11      0.33      0.16       587

f1-score:
0.16495726495726495
Accuarcy:
0.3287904599659284
Precision:
[0.         0.         0.32879046]
Recall:
[0. 0. 1.]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
