In [1]:
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
import pandas
import pandas as pd
from datasets import load_dataset

import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModelForPreTraining,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    DataCollatorForWholeWordMask,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from datasets import Features, Value, ClassLabel, load_dataset, Dataset

In [2]:
# Fail fast if the installed `datasets` package is older than this notebook expects.
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

# Shared run configuration read by the fine-tuning cells further down.
LANGUAGE_CODE = "am"       # language sub-directory used when building data/model paths
LEARNING_RATE = 5e-5
EPOCHS = 5
MAX_SEQUENCE_LENGTH = 128  # passed to the tokenizer as `max_length`
# Defined here (rather than only in a late cell) so that every cell calling
# set_seed(SEED) works after "Restart Kernel -> Run All".
SEED = 42

In [14]:
# Paths and base checkpoint for the language-model fine-tuning stage.
# NOTE: a later cell in this notebook rebinds OUTPUT_DIR / DATA_DIR / MODEL_NAME
# for the sentiment stage. finetune_lm() reads these names when it is *called*,
# so call it before executing that later cell.
OUTPUT_DIR = f"../models/{LANGUAGE_CODE}"
DATA_DIR = f"../data/raw/language_model/{LANGUAGE_CODE}"
MODEL_NAME = "Davlan/afro-xlmr-small"  # plain string: there is nothing to interpolate
def finetune_lm(seed: int = 42):
    """Continue masked-language-model training of ``MODEL_NAME`` on ``DATA_DIR/train.tsv``.

    Reads the module-level names ``OUTPUT_DIR``, ``DATA_DIR``, ``MODEL_NAME``,
    ``LEARNING_RATE``, ``EPOCHS`` and ``MAX_SEQUENCE_LENGTH`` at call time, tokenizes
    the ``text`` column of the training TSV, trains with a whole-word-mask collator
    and saves the model, tokenizer, train metrics and trainer state to ``OUTPUT_DIR``.

    Args:
        seed: Value passed to ``set_seed`` before the model is created. It is a
            parameter (default 42) because no ``SEED`` global exists yet at the
            point where this cell is defined.

    Raises:
        ValueError: If ``overwrite_output_dir`` is disabled and ``OUTPUT_DIR``
            exists, is non-empty and holds no checkpoint to resume from.
    """
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        do_train=True,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        # NOTE(review): presumably meant to disable intermediate checkpoints — confirm
        # against the installed transformers version.
        save_steps=-1,
        per_device_train_batch_size=4,
    )

    # Only look for something to resume when overwriting is disabled. Previously the
    # guard ignored `overwrite_output_dir`, so re-running after a finished run raised
    # "Use --overwrite_output_dir" even though that flag was already set.
    last_checkpoint = None
    if (
        os.path.isdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            print(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(seed)

    # Rows with any missing value are dropped before building the dataset.
    train_df = pd.read_csv(DATA_DIR + '/train.tsv', sep='\t').dropna()
    train_dataset = Dataset.from_pandas(train_df)

    config = AutoConfig.from_pretrained(MODEL_NAME)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForPreTraining.from_pretrained(MODEL_NAME, config=config)

    def preprocess_function(examples):
        """Tokenize one batch of rows, padding/truncating to MAX_SEQUENCE_LENGTH."""
        # The result must be returned: a mapped function that returns None leaves
        # the dataset without any tokenizer columns.
        return tokenizer(
            examples['text'],
            padding="max_length",
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
        )

    train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=True,
        desc="Running tokenizer on train dataset",
    )

    # NOTE(review): the Transformers docs describe this collator as relying on
    # BERT-style "##" sub-word prefixes, while the logged model class for this
    # checkpoint is XLMRobertaForMaskedLM — confirm the masking is what you want.
    data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer)

    # No `compute_metrics` is passed: this function never runs evaluation, so the
    # accuracy metric that used to be loaded here was never invoked.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        metrics["train_samples"] = len(train_dataset)

        trainer.save_model()  # Saves the tokenizer too for easy upload

        # Persist the train metrics instead of computing and discarding them.
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

In [None]:
# Launch the language-model fine-tuning; the training log appears in this cell's output.
finetune_lm()

Running tokenizer on train dataset:   0%|          | 0/103 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `XLMRobertaForMaskedLM.forward` and have been ignored: Unnamed: 0, text. If Unnamed: 0, text are not expected by `XLMRobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 102966
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 128710


Step,Training Loss
500,1.059
1000,1.0495
1500,1.0739
2000,1.0474
2500,1.0502
3000,1.0528
3500,1.0294
4000,1.0029
4500,1.0175
5000,1.033


In [None]:
# Configuration for the sentiment-classification stage.
# These assignments rebind MODEL_NAME / OUTPUT_DIR / DATA_DIR, which the
# language-model cell earlier in the notebook also defines.
SEED = 42
DATA_DIR = f"../data/raw/train/splitted-train-dev-test/{LANGUAGE_CODE}"
OUTPUT_DIR = f"../models/sentiment/{LANGUAGE_CODE}"
MODEL_NAME = f"../models/{LANGUAGE_CODE}"  # local model directory used as the starting checkpoint
def finetune_sentiment():
    """Fine-tune ``MODEL_NAME`` as a sequence classifier on the train/dev/test TSVs.

    Reads the module-level names ``OUTPUT_DIR``, ``DATA_DIR``, ``MODEL_NAME``,
    ``SEED``, ``LEARNING_RATE``, ``EPOCHS`` and ``MAX_SEQUENCE_LENGTH`` at call time.
    Each split is loaded from ``DATA_DIR/{train,dev,test}.tsv`` and must provide
    ``text`` and ``label`` columns. After training, the function evaluates on the
    dev split, predicts on the test split and writes ``predictions.txt`` (one
    ``index<TAB>label`` row per test example) into ``OUTPUT_DIR``.

    Raises:
        ValueError: If the dev or test split contains a label that never occurs in
            the training split, or if ``overwrite_output_dir`` is disabled and
            ``OUTPUT_DIR`` is non-empty without a checkpoint to resume from.
    """
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        # NOTE(review): presumably meant to disable intermediate checkpoints — confirm
        # against the installed transformers version.
        save_steps=-1,
        per_device_train_batch_size=32,
    )

    # Only look for something to resume when overwriting is disabled. Previously the
    # guard ignored `overwrite_output_dir`, so re-running after a finished run raised
    # "Use --overwrite_output_dir" even though that flag was already set.
    last_checkpoint = None
    if (
        os.path.isdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            print(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(SEED)

    def load_split(split_name):
        """Read one TSV split and drop rows that contain missing values."""
        return pd.read_csv(DATA_DIR + '/' + split_name + '.tsv', sep='\t').dropna()

    train_df = load_split("train")
    dev_df = load_split("dev")
    test_df = load_split("test")

    # The label set is defined by the TRAINING split and sorted so the label->id
    # mapping is deterministic. (It used to be overwritten by dev and then test,
    # so the final mapping depended on which labels the test file happened to
    # contain and in what order.)
    label_list = sorted(train_df['label'].unique().tolist())
    known_labels = set(label_list)
    for split_name, split_df in (("dev", dev_df), ("test", test_df)):
        # -1 is tolerated because preprocess_function passes it through unmapped.
        unseen = set(split_df['label'].unique().tolist()) - known_labels - {-1}
        if unseen:
            raise ValueError(
                f"Labels {sorted(unseen, key=str)} in the {split_name} split do not occur in the training split."
            )

    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(dev_df)
    predict_dataset = Dataset.from_pandas(test_df)

    # Labels
    num_labels = len(label_list)
    print(label_list)

    config = AutoConfig.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        config=config,
    )

    padding = "max_length"

    # Store both directions of the mapping on the model's own config, both built
    # from the same dict. (id2label used to be derived from `config.label2id`,
    # not from the mapping that had just been assigned to `model.config`.)
    label_to_id = {label: idx for idx, label in enumerate(label_list)}
    model.config.label2id = label_to_id
    model.config.id2label = {idx: label for label, idx in label_to_id.items()}

    def preprocess_function(examples):
        """Tokenize a batch of rows and convert its labels to integer ids."""
        result = tokenizer(examples['text'], padding=padding, max_length=MAX_SEQUENCE_LENGTH, truncation=True)
        if "label" in examples:
            # -1 is passed through unchanged instead of being looked up.
            result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
        return result

    train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=True,
        desc="Running tokenizer on train dataset",
    )
    eval_dataset = eval_dataset.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=True,
        desc="Running tokenizer on validation dataset",
    )
    predict_dataset = predict_dataset.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=False,
        desc="Running tokenizer on prediction dataset",
    )

    # Get the metric function
    metric = evaluate.load("accuracy")

    def compute_metrics(p: EvalPrediction):
        """Return accuracy of the arg-max class against the reference label ids."""
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=p.label_ids)

    # Every example is already padded to max_length, so the default collator is enough.
    data_collator = default_data_collator

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        metrics["train_samples"] = len(train_dataset)

        trainer.save_model()  # Saves the tokenizer too for easy upload

        # Persist the train metrics the same way the eval/predict branches do.
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        print("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)
        metrics["eval_samples"] = len(eval_dataset)

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        print("*** Predict ***")
        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
        metrics["predict_samples"] = len(predict_dataset)

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        predictions = np.argmax(predictions, axis=1)
        output_predict_file = os.path.join(training_args.output_dir, "predictions.txt")
        if trainer.is_world_process_zero():
            # Explicit encoding so label text is written the same on every platform.
            with open(output_predict_file, "w", encoding="utf-8") as writer:
                writer.write("index\tprediction\n")
                for index, class_id in enumerate(predictions):
                    # Map the predicted class id back to its original label value.
                    writer.write(f"{index}\t{label_list[class_id]}\n")

In [None]:
# Launch sentiment fine-tuning, then evaluation and prediction.
finetune_sentiment()