In [1]:
import os
from google.colab import drive, userdata

In [None]:
# Mount Google Drive, then switch the working directory to the thesis source
# folder so that files there can be referenced by relative path.
drive.mount('/content/drive')
os.chdir(os.path.join('/content/drive', 'MyDrive', 'thesis_src'))
# Last expression: display the directory we ended up in.
os.getcwd()

In [None]:
# Install requirements
!pip install -r requirements.txt

In [None]:
# =====================================================================
# Experiment configuration. The cells further down read these names when
# building the command lines for the training / evaluation scripts.
# =====================================================================

# Shared Google Drive locations used to derive the paths below.
_DRIVE_ROOT = '/content/drive/MyDrive'
_DATA_DIR = f'{_DRIVE_ROOT}/thesis_data'

# --- Model ---
BASE_MODEL_NAME = 'facebook/m2m100_418M'
MODEL_TYPE = 'bart'
# Language codes are needed when using BART family models; source and target
# are both set to 'ha'.
SOURCE_LANG_CODE = 'ha'
TARGET_LANG_CODE = 'ha'
# Model id on the Hugging Face Hub.
HUB_MODEL_ID = 'ahmadmwali/m2m100_418M_Hausa'

# --- Optimisation ---
NUM_EPOCHS = 5
LR = 2e-5
WEIGHT_DECAY = 1e-3
TRAIN_BATCH_SIZE = 1
EVAL_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 4

# --- LoRA ---
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
LORA_LR = 2e-4

# --- Number of examples per split ---
TRAIN_SIZE = 5000
VAL_SIZE = 1000
TEST_SIZE = 1000

# --- Input files ---
TRAIN_FILE = f'{_DATA_DIR}/train.tsv'
VAL_FILE = f'{_DATA_DIR}/validation.tsv'
TEST_FILE = f'{_DATA_DIR}/test.tsv'

# --- Output locations ---
# Training output goes under a folder named after the model type.
OUTPUT_DIR_BASE = f'{_DRIVE_ROOT}/train_results/results_{MODEL_TYPE}'
RESULTS_DIR_BASE = f'{_DRIVE_ROOT}/test_results'

# --- Hugging Face token ---
# Read from the Colab secret named HUGGING_FACE_HUB_TOKEN rather than being
# hard-coded. Tokens are created at https://huggingface.co/settings/tokens
token = userdata.get('HUGGING_FACE_HUB_TOKEN')

In [None]:
# Run the training script.
!python train.py \
    --model_type $MODEL_TYPE \
    --base_model_name $BASE_MODEL_NAME \
    --train_size $TRAIN_SIZE \
    --val_size $VAL_SIZE \
    --train_file $TRAIN_FILE \
    --val_file $VAL_FILE \
    --learning_rate $LR \
    --output_dir_base $OUTPUT_DIR_BASE \
    --hub_model_id_prefix $HUB_MODEL_ID \
    --num_epochs $NUM_EPOCHS \
    --lora_learning_rate $LORA_LR \
    --lora_r $LORA_R \
    --lora_alpha $LORA_ALPHA \
    --lora_dropout $LORA_DROPOUT \
    --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
    --train_batch_size $TRAIN_BATCH_SIZE \
    --eval_batch_size $EVAL_BATCH_SIZE \
    --weight_decay $WEIGHT_DECAY \
    --use_lora \
    --hf_token $token \
    --push_to_hub

In [None]:
# Manually push the model to HF Hub. This is because the --push_to_hub
# training arg pushes only the readme file without the model and tokenizer.
from huggingface_hub import HfApi, login, create_repo
from huggingface_hub.utils import HfHubHTTPError

# Local folder whose contents get uploaded.
# NOTE(review): the checkpoint sub-path is hard-coded for one particular run;
# confirm it matches the directory train.py actually produced.
best_checkpoint_path = os.path.join(OUTPUT_DIR_BASE, 'm2m100_418M-full/checkpoint-186')
api = HfApi()
commit_message = 'Manual upload of Model and tokenizer from best checkpoint'

# Create and upload to the SAME repository: the one named by HUB_MODEL_ID,
# which is also what the status messages below report. Previously the create
# and upload calls used two different hard-coded ids, neither of which was
# HUB_MODEL_ID.

# Attempt to create the repository, ignore if it already exists
try:
    create_repo(repo_id=HUB_MODEL_ID, repo_type="model", exist_ok=True, token=token)
    print(f"Repository {HUB_MODEL_ID} ensured to exist.")
except HfHubHTTPError as e:
    # Best effort: report the problem and still attempt the upload below.
    print(f"Error creating repository (or it might be private and inaccessible): {e}")

# Upload the folder contents
try:
    api.upload_folder(
        folder_path=best_checkpoint_path,
        repo_id=HUB_MODEL_ID,
        repo_type="model",
        commit_message=commit_message,
        token=token
    )
    print(f"Manually uploaded contents of {best_checkpoint_path} to {HUB_MODEL_ID}")
except Exception as e:
    # Report instead of raising so the remaining cells can still be run.
    print(f"An error occurred during upload: {e}")

In [None]:
# Evaluate the model on the test set.
!python test.py \
    --model_type $MODEL_TYPE \
    --hub_model_id $HUB_MODEL_ID \
    --test_file $TEST_FILE \
    --results_dir_base $RESULTS_DIR_BASE \
    --source_lang_code $SOURCE_LANG_CODE \
    --target_lang_code $TARGET_LANG_CODE \
    --hf_token $token \
    --test_size $TEST_SIZE \
    --is_lora_adapter

In [None]:
# Visualize train and eval logs using Tensorboard. Wandb may be used as well.
# Colab comes with Tensorboard out of the box, it may need to be installed
# if working on a different environment using !pip install tensorboard.
log_dir_path = os.path.join(OUTPUT_DIR_BASE, BASE_MODEL_NAME.split('/')[1], 'logs')

%load_ext tensorboard
%tensorboard --logdir $log_dir_path

In [None]:
# Run this on any dataframe you want to clean.
!python predict_on_dataframe.py \
    --model_type bart \
    --hub_model_id $HUB_MODEL_ID \
    --is_lora_adapter \
    --base_model_name_for_lora $BASE_MODEL_NAME \
    --input_file $TEST_FILE \
    --output_file "/content/drive/MyDrive/thesis_data/downstream.tsv" \
    --input_text_column "tweet" \
    --prediction_column_name "new_tweet" \
    --prediction_batch_size 16 \
    --max_seq_length 256 \
    --max_new_tokens 256 \
    --num_beams 4 \
    --file_separator "\t"

In [None]:
!python evaluate_downstream.py \
    --file-path "/content/drive/MyDrive/thesis_data/downstream.tsv" \
    --tweet-col "tweet" \
    --cleaned-col "new_tweet" \
    --label-col 'label' \
    --learning-rate 1e-5 \
    --model-checkpoint "castorini/afriberta_base" \
    --num-epochs 5 \
    --batch-size 8 \
    --output-dir "/content/drive/MyDrive/Results/downstream_m2m_400M_results"