Adapted, with modifications, from [AfriSenti SemEval](https://github.com/afrisenti-semeval/afrisent-semeval-2023). Originally created *by* [Idris Abdulmumin](https://www.hausanlp.org/author/idris-abdulmuminu/), [David Adelani](https://dadelani.github.io/) and [Shamsuddeen Hassan Muhammad](https://www.hausanlp.org/author/shamsuddeen-hassan-muhammad/).



In [1]:
# Mount Google Drive at /content/drive; the following cells keep the project
# checkout, data and models under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

# Please do not edit anything here.
# Language codes recognised by this notebook. The same list is repeated in the
# model-configuration cell, where LANGUAGE_CODE is checked against it.
languages = ['am', 'dz', 'ha', 'ig', 'ma', 'pcm', 'pt', 'sw', 'yo', 'twi', 'kr', 'ts']

In [3]:
%cd /content/drive/MyDrive

import os

PROJECT_DIR = '/content/drive/MyDrive/afrisent-semeval-2023'
PROJECT_GITHUB_URL = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023.git'

if not os.path.isdir(PROJECT_DIR):
  !git clone {PROJECT_GITHUB_URL}
else:
  %cd {PROJECT_DIR}
  !git pull {PROJECT_GITHUB_URL}

/content/drive/MyDrive
/content/drive/MyDrive/afrisent-semeval-2023
From https://github.com/afrisenti-semeval/afrisent-semeval-2023
 * branch            HEAD       -> FETCH_HEAD
Already up to date.


In [None]:
!pip install -r /content/drive/MyDrive/afrisent-semeval-2023/starter_kit/requirements.txt

In [5]:
# Training Data Paths
# Resolve where the raw training files live for the selected subtask and make
# sure the folder for the re-formatted copies exists.

TASK = 'SubtaskA'
# SubtaskB reads directly from the task folder; any other task reads from its
# 'train' sub-folder.
TRAINING_DATA_DIR = (
  os.path.join(PROJECT_DIR, TASK)
  if TASK == 'SubtaskB'
  else os.path.join(PROJECT_DIR, TASK, 'train')
)
FORMATTED_TRAIN_DATA = os.path.join(TRAINING_DATA_DIR, 'formatted-train-data')

if not os.path.isdir(TRAINING_DATA_DIR):
  print(TRAINING_DATA_DIR + ' is not a valid directory or does not exist!')
else:
  print('Data directory found.')
  if not os.path.isdir(FORMATTED_TRAIN_DATA):
    print('Creating directory to store formatted data.')
    os.mkdir(FORMATTED_TRAIN_DATA)

Data directory found.


In [None]:
%cd {TRAINING_DATA_DIR}

training_files = os.listdir()

if len(training_files) > 0:
  for training_file in training_files:
    if training_file.endswith('.tsv'):

      data = training_file.split('_')[0]
      if not os.path.isdir(os.path.join(FORMATTED_TRAIN_DATA, data)):
        print(data, 'Creating directory to store train, dev and test splits.')
        os.mkdir(os.path.join(FORMATTED_TRAIN_DATA, data))
      
      df = pd.read_csv(training_file, sep='\t', names=['ID', 'text', 'label'], header=0)
      df[['text', 'label']].sample(frac=1, random_state=42).to_csv(os.path.join(FORMATTED_TRAIN_DATA, data, 'train.tsv'), sep='\t', index=False)
    else:
      print(training_file + ' skipped!')
else:
  print('No files are found in this directory!')

In [8]:
from sklearn.model_selection import train_test_split

if os.path.isdir(FORMATTED_TRAIN_DATA):
  print('Data directory found.')
  SPLITTED_DATA = os.path.join(TRAINING_DATA_DIR, 'splitted-train-dev')
  if not os.path.isdir(SPLITTED_DATA):
    print('Creating directory to store train, dev and test splits.')
    os.mkdir(SPLITTED_DATA)
else:
  print(FORMATTED_TRAIN_DATA + ' is not a valid directory or does not exist!')

%cd {FORMATTED_TRAIN_DATA}
formatted_training_files = os.listdir()

if len(formatted_training_files) > 0:
  for data_name in formatted_training_files:
    formatted_training_file = os.path.join(data_name, 'train.tsv')
    if os.path.isfile(formatted_training_file):
      labeled_tweets = pd.read_csv(formatted_training_file, sep='\t', names=['text', 'label'], header=0)
      train, dev = train_test_split(labeled_tweets, test_size=0.3, shuffle=True, random_state=42)
      if not os.path.isdir(os.path.join(SPLITTED_DATA, data_name)):
        print(data_name, 'Creating directory to store train, dev and test splits.')
        os.mkdir(os.path.join(SPLITTED_DATA, data_name))

      train.sample(frac=1).to_csv(os.path.join(SPLITTED_DATA, data_name, 'train.tsv'), sep='\t', index=False)
      dev.sample(frac=1).to_csv(os.path.join(SPLITTED_DATA, data_name, 'dev.tsv'), sep='\t', index=False)
    else:
      print(training_file + ' is not a supported file!')
else:
  print('No files are found in this directory!')

Data directory found.
/content/drive/MyDrive/afrisent-semeval-2023/SubtaskA/train/formatted-train-data


In [9]:
%cd {PROJECT_DIR}
languages = ['am', 'dz', 'ha', 'ig', 'ma', 'pcm', 'pt', 'sw', 'yo', 'twi', 'kr', 'ts']
# Language to train sentiment classifier for
LANGUAGE_CODE = 'ha'

#To avoid overwriting existing models, the model version should be edited
MODEL_VERSION = '3'

if LANGUAGE_CODE in languages:
  # Model Training Parameters
  MODEL_NAME_OR_PATH = 'Davlan/afro-xlmr-large'
  BATCH_SIZE = 16
  LEARNING_RATE = 1e-5
  NUMBER_OF_TRAINING_EPOCHS = 5
  WEIGHT_DECAY = 0.01
  MAXIMUM_SEQUENCE_LENGTH = 128
  SAVE_STEPS = -1
  OVERWRITE_OUPTUT_DIR = True
  LRS = 'linear'
  print('Everything set. You can now start model training.')
#(choose from 'adamw_hf', 'adamw_torch', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad')
else:
  print("Invalid language code/Dataset not released. Please input any of the following released data\n\n\t- 'am'\n\t- 'dz'\n\t- 'ha'\n\t- 'ig'\n\t- 'ma'\n\t- 'pcm'\n\t- 'pt'\n\t- 'sw'\n\t- 'yo'")

/content/drive/MyDrive/afrisent-semeval-2023
Everything set. You can now start model training.


In [10]:
DATA_DIR = os.path.join(TRAINING_DATA_DIR, 'splitted-train-dev', LANGUAGE_CODE)
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'models', LANGUAGE_CODE, 'v_' + MODEL_VERSION)

!CUDA_VISIBLE_DEVICES=0 python starter_kit/run_textclass.py \
  --model_name_or_path {MODEL_NAME_OR_PATH} \
  --data_dir {DATA_DIR} \
  --do_train \
  --do_eval \
  --overwrite_output_dir yes\
  --greater_is_better  no\
  --per_device_train_batch_size {BATCH_SIZE} \
  --weight_decay {WEIGHT_DECAY} \
  --learning_rate {LEARNING_RATE} \
  --num_train_epochs {NUMBER_OF_TRAINING_EPOCHS} \
  --max_seq_length {MAXIMUM_SEQUENCE_LENGTH} \
  --output_dir {OUTPUT_DIR} \
  --save_steps {SAVE_STEPS} \
  --lr_scheduler_type {LRS} \
  --optim {'adafactor'}

INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_m

In [None]:
%cd {PROJECT_DIR}

OUTPUT_DIR = os.path.join(PROJECT_DIR, 'models', LANGUAGE_CODE, 'v_' + MODEL_VERSION)
FILE_NAME = os.path.join(PROJECT_DIR, TASK, 'test', LANGUAGE_CODE + '_test_participants.tsv')
TEXT_COLUMN = 'tweet'

!python starter_kit/run_predict.py \
  --model_path {OUTPUT_DIR} \
  --file_name {FILE_NAME} \
  --text_column {TEXT_COLUMN} \
  --lang_code {LANGUAGE_CODE}