In [None]:
import pandas as pd

In [None]:
!pip install -U transformers
!pip install datasets
!pip install -U accelerate

# Pre-processing

In [None]:
# Read the train / validation / test CSVs into one raw DataFrame each.
raw_train_df, raw_val_df, raw_test_df = map(
    pd.read_csv,
    (
        "/content/train_df.csv",
        "/content/validation_df.csv",
        "/content/test_df.csv",
    ),
)

In [None]:
# Remove the 'category_No-error' column from every split.
# errors="ignore" keeps the cell idempotent: re-running it after the column has
# already been removed is a no-op instead of a KeyError.
raw_train_df = raw_train_df.drop(columns=['category_No-error'], errors="ignore")
raw_val_df = raw_val_df.drop(columns=['category_No-error'], errors="ignore")
raw_test_df = raw_test_df.drop(columns=['category_No-error'], errors="ignore")

In [None]:
# Names of all training-frame columns that carry the 'category_' prefix.
error_columns = list(
    filter(lambda name: name.startswith('category_'), raw_train_df.columns)
)

In [None]:
# One-hot encode every category column; each new column is named
# '<original column>_<value>'.
train_df, val_df, test_df = [
    pd.get_dummies(frame, columns=error_columns, prefix=error_columns)
    for frame in (raw_train_df, raw_val_df, raw_test_df)
]

In [None]:
# Columns of the training frame whose name ends in '0'; the same list is
# removed in place from all three splits.
columns_to_drop = [name for name in train_df.columns if name.endswith('0')]

for frame in (train_df, val_df, test_df):
    frame.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Fixed, ordered list of label columns. The position of a name in this list is
# the class index used by `id2label` / `label2id` below.
errors = [
    'category_Accuracy/Addition_1',
    'category_Accuracy/Addition_5',
    'category_Accuracy/Mistranslation_1',
    'category_Accuracy/Mistranslation_5',
    'category_Accuracy/Omission_1',
    'category_Accuracy/Omission_5',
    'category_Accuracy/Source language fragment_1',
    'category_Accuracy/Source language fragment_5',
    'category_Fluency/Character encoding_1',
    'category_Fluency/Character encoding_5',
    'category_Fluency/Grammar_1',
    'category_Fluency/Grammar_5',
    'category_Fluency/Inconsistency_1',
    'category_Fluency/Inconsistency_5',
    'category_Fluency/Punctuation_0.1',
    'category_Fluency/Register_1',
    'category_Fluency/Register_5',
    'category_Fluency/Spelling_1',
    'category_Fluency/Spelling_5',
    'category_Locale convention/Currency format_1',
    'category_Locale convention/Currency format_5',
    'category_Locale convention/Date format_1',
    'category_Locale convention/Date format_5',
    'category_Locale convention/Time format_1',
    'category_Other_1',
    'category_Other_5',
    'category_Source error_1',
    'category_Source error_5',
    'category_Style/Awkward_1',
    'category_Style/Awkward_5',
    'category_Terminology/Inappropriate for context_1',
    'category_Terminology/Inappropriate for context_5',
    'category_Terminology/Inconsistent_1',
    'category_Terminology/Inconsistent_5',
    'category_Locale convention/Name format_1',
    'category_Locale convention/Name format_5',
    'category_Non-translation!_25',
    'category_Locale convention/Address format_1',
    'category_Locale convention/Address format_5',
    'category_Locale convention/Telephone format_1',
]

# Index <-> label-name lookups in both directions.
id2label = dict(enumerate(errors))
label2id = {name: position for position, name in id2label.items()}

In [None]:
# Make sure every split has a column for every label in `errors`;
# a label column that is absent from a split is added and filled with 0.
for frame in (train_df, val_df, test_df):
  for missing in (label for label in errors if label not in frame.columns):
    frame[missing] = 0

In [None]:
# Split sizes, as printed by
# print(len(ende_train), len(ende_vld), len(ende_test), len(zhen_train), len(zhen_vld), len(zhen_test)):
# 16368 5457 5457 27881 9294 9294

In [None]:
# Each split stores the En-De rows first, followed by the Zh-En rows.
# Number of En-De rows per split (Zh-En sizes are 27881 / 9294 / 9294).
ENDE_TRAIN_ROWS = 16368
ENDE_VAL_ROWS = 5457
ENDE_TEST_ROWS = 5457

# `iloc[:n]` already excludes row n, so the same boundary is used for both
# halves. The previous `[:n-1]` slices silently dropped the last En-De row of
# each split, and the validation boundary was mistyped as 5657, dropping
# 200 Zh-En validation rows.
ende_train_df = train_df.iloc[:ENDE_TRAIN_ROWS]
zhen_train_df = train_df.iloc[ENDE_TRAIN_ROWS:]
ende_val_df = val_df.iloc[:ENDE_VAL_ROWS]
zhen_val_df = val_df.iloc[ENDE_VAL_ROWS:]
ende_test_df = test_df.iloc[:ENDE_TEST_ROWS]
zhen_test_df = test_df.iloc[ENDE_TEST_ROWS:]

In [None]:
# `Dataset` must be in scope before this cell runs; importing it here keeps
# "Restart Kernel -> Run All" working (the other `datasets` import sits in a
# later cell).
from datasets import Dataset

# Wrap each per-language DataFrame in a Hugging Face Dataset.
raw_ende_train_ds = Dataset.from_pandas(ende_train_df)
raw_zhen_train_ds = Dataset.from_pandas(zhen_train_df)
raw_ende_val_ds = Dataset.from_pandas(ende_val_df)
raw_zhen_val_ds = Dataset.from_pandas(zhen_val_df)
raw_ende_test_ds = Dataset.from_pandas(ende_test_df)
raw_zhen_test_ds = Dataset.from_pandas(zhen_test_df)

# Tokenisation

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
import torch
import numpy as np

In [None]:
# Checkpoint name passed to both the tokenizer and the classification models.
BASE_MODEL = "xlm-roberta-base"
# Learning rate handed to TrainingArguments.
LEARNING_RATE = 2e-5
# Tokenizer `max_length`; inputs are truncated / padded to this many tokens.
MAX_LEN = 512
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 32
# Number of training epochs.
EPOCHS = 4

In [None]:
# Tokenizer matching BASE_MODEL; used by `preprocess_data` to encode source/target pairs.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

In [None]:
# Group the three splits of each language pair under their split name.
ende_ds = dict(
    train=raw_ende_train_ds,
    validation=raw_ende_val_ds,
    test=raw_ende_test_ds,
)
zhen_ds = dict(
    train=raw_zhen_train_ds,
    validation=raw_zhen_val_ds,
    test=raw_zhen_test_ds,
)

In [None]:
def preprocess_data(df):
  """Tokenise one example and attach its multi-hot label vector.

  Args:
    df: a single example (mapping of column name -> value) that contains at
      least "source" and "target", plus the label columns named in `errors`.

  Returns:
    The tokenizer encoding for the (source, target) pair, truncated / padded to
    MAX_LEN, with an added "labels" entry: a float array of length
    ``len(errors)`` in which position ``i`` holds the value of column
    ``errors[i]``.
  """
  encoding = tokenizer(df["source"], df["target"], truncation=True, padding="max_length", max_length=MAX_LEN)

  # Fill the vector by label *name*, not by the order in which the columns
  # happen to appear in the example. Column order can differ between splits
  # (columns that were back-filled are appended at the end of a frame), and a
  # positional copy would then assign values to the wrong class indices.
  available = set(df.keys())
  labels = np.zeros(len(errors))  # labels missing from the example stay 0
  for position, label in enumerate(errors):
    if label in available:
      labels[position] = df[label]
  encoding["labels"] = labels

  return encoding

In [None]:
# Tokenise every En-De split, keeping only the columns produced by
# `preprocess_data`, and have the datasets return torch tensors.
for split_name, split_ds in list(ende_ds.items()):
  tokenised = split_ds.map(preprocess_data, remove_columns=split_ds.column_names)
  tokenised.set_format("torch")
  ende_ds[split_name] = tokenised

Map:   0%|          | 0/16367 [00:00<?, ? examples/s]

Map:   0%|          | 0/5456 [00:00<?, ? examples/s]

Map:   0%|          | 0/5456 [00:00<?, ? examples/s]

In [None]:
# Tokenise every Zh-En split, keeping only the columns produced by
# `preprocess_data`, and have the datasets return torch tensors.
for split_name, split_ds in list(zhen_ds.items()):
  tokenised = split_ds.map(preprocess_data, remove_columns=split_ds.column_names)
  tokenised.set_format("torch")
  zhen_ds[split_name] = tokenised

Map:   0%|          | 0/27881 [00:00<?, ? examples/s]

Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

Map:   0%|          | 0/9294 [00:00<?, ? examples/s]

# En-De Training

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login widget; the later `push_to_hub` calls
# presumably rely on this authentication — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import XLMRobertaForSequenceClassification

# En-De classifier: BASE_MODEL with a multi-label classification head, one
# output per entry of `errors`.
ende_model = XLMRobertaForSequenceClassification.from_pretrained(
    BASE_MODEL,
    # task definition
    problem_type="multi_label_classification",
    num_labels=len(errors),
    # label index <-> name mappings stored in the model config
    id2label=id2label,
    label2id=label2id,
    # dropout overrides
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
from scipy.stats import pearsonr

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute multi-label evaluation metrics from raw model outputs.

    Args:
        predictions: array of logits, shape (n_samples, num_labels).
        labels: array of 0/1 targets with the same shape as `predictions`.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with keys 'f1' (micro-averaged F1), 'roc_auc' (micro-averaged
        ROC AUC), 'accuracy' (exact-match accuracy over full label vectors)
        and 'pearson_corr' (Pearson correlation between the flattened logits
        and the flattened labels).
    """
    logits = np.asarray(predictions)
    y_true = np.asarray(labels)

    # Sigmoid maps each logit to an independent per-label probability.
    probs = torch.sigmoid(torch.as_tensor(logits, dtype=torch.float32)).numpy()

    # Hard 0/1 decisions, used by the threshold-dependent metrics.
    y_pred = (probs >= threshold).astype(float)

    pearson_corr, _ = pearsonr(logits.flatten(), y_true.flatten())

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric and must be given scores, not thresholded
    # decisions: with 0/1 inputs it collapses to a single operating point and
    # is exactly 0.5 whenever no label crosses the threshold.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average, 'roc_auc': roc_auc, 'accuracy': accuracy, 'pearson_corr': pearson_corr}

def compute_metrics(p: EvalPrediction):
    """Adapter between `Trainer` and `multi_label_metrics`.

    If `p.predictions` is a tuple, only its first element is used as the
    predictions; `p.label_ids` supplies the labels.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration for the En-De model.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
ende_args = TrainingArguments(
    # where checkpoints are written
    output_dir="../models/xlmr_ende_classification",
    # optimisation
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # evaluate and checkpoint once per epoch, keeping at most two checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # reload the checkpoint with the best "f1" when training ends
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    remove_unused_columns=False,
)

In [None]:
# Trainer for the En-De model: trains on the En-De train split and evaluates
# on the En-De validation split using `compute_metrics`.
ende_trainer = Trainer(
    model=ende_model,
    args=ende_args,
    compute_metrics=compute_metrics,
    eval_dataset=ende_ds["validation"],
    train_dataset=ende_ds["train"],
)

In [None]:
# Fine-tune the En-De model; the returned TrainOutput is displayed as the cell result.
ende_trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy,Pearson Corr
1,0.1288,0.071772,0.0,0.5,0.403959,0.137532
2,0.0576,0.071503,0.0,0.5,0.403959,0.145175
3,0.053,0.068057,0.0,0.5,0.403959,0.181505
4,0.0504,0.067947,0.0,0.5,0.403959,0.183854


TrainOutput(global_step=2048, training_loss=0.07194932794664055, metrics={'train_runtime': 1619.0778, 'train_samples_per_second': 40.435, 'train_steps_per_second': 1.265, 'total_flos': 1.7231231633620992e+16, 'train_loss': 0.07194932794664055, 'epoch': 4.0})

In [None]:
# Evaluate the En-De model on the En-De test split.
# The dataset is passed to `evaluate` directly instead of overwriting
# `ende_trainer.eval_dataset`, so the trainer keeps its validation set and no
# later `evaluate()` call silently runs on this split.
ende_trainer.evaluate(eval_dataset=ende_ds["test"])

{'eval_loss': 0.0712401419878006,
 'eval_f1': 0.0,
 'eval_roc_auc': 0.5,
 'eval_accuracy': 0.4094574780058651,
 'eval_pearson_corr': 0.1376422439127843,
 'eval_runtime': 37.5782,
 'eval_samples_per_second': 145.191,
 'eval_steps_per_second': 4.551,
 'epoch': 4.0}

In [None]:
# Upload the trained En-De model to the Hugging Face Hub under the repo name "xlmr_ende_classification".
ende_trainer.model.push_to_hub("xlmr_ende_classification")

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aningddd/xlmr_ende_classification/commit/323f1fe8e2c511d480a9b5bad4b79af10130d4f4', commit_message='Upload XLMRobertaForSequenceClassification', commit_description='', oid='323f1fe8e2c511d480a9b5bad4b79af10130d4f4', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Cross-lingual check: evaluate the En-De model on the Zh-En test split.
# The dataset is passed to `evaluate` directly instead of overwriting
# `ende_trainer.eval_dataset`, so the trainer's own state is left untouched.
ende_trainer.evaluate(eval_dataset=zhen_ds["test"])

{'eval_loss': 0.07354683429002762,
 'eval_f1': 0.0,
 'eval_roc_auc': 0.5,
 'eval_accuracy': 0.3204217774908543,
 'eval_pearson_corr': 0.19981063699818735,
 'eval_runtime': 63.7313,
 'eval_samples_per_second': 145.831,
 'eval_steps_per_second': 4.566,
 'epoch': 4.0}

# Zh-En Training

In [None]:
# Zh-En classifier: a fresh copy of BASE_MODEL with a multi-label
# classification head, one output per entry of `errors`.
zhen_model = XLMRobertaForSequenceClassification.from_pretrained(
    BASE_MODEL,
    # task definition
    problem_type="multi_label_classification",
    num_labels=len(errors),
    # label index <-> name mappings stored in the model config
    id2label=id2label,
    label2id=label2id,
    # dropout overrides
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration for the Zh-En model. Unlike the En-De configuration
# in this notebook, it also sets push_to_hub=True.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
zhen_args = TrainingArguments(
    # where checkpoints are written
    output_dir="../models/xlmr_zhen_classification",
    push_to_hub=True,
    # optimisation
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # evaluate and checkpoint once per epoch, keeping at most two checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # reload the checkpoint with the best "f1" when training ends
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    remove_unused_columns=False,
)

In [None]:
# Trainer for the Zh-En model: trains on the Zh-En train split and evaluates
# on the Zh-En validation split using `compute_metrics`.
zhen_trainer = Trainer(
    model=zhen_model,
    args=zhen_args,
    compute_metrics=compute_metrics,
    eval_dataset=zhen_ds["validation"],
    train_dataset=zhen_ds["train"],
)

In [None]:
# Fine-tune the Zh-En model; the returned TrainOutput is displayed as the cell result.
zhen_trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy,Pearson Corr
1,0.1317,0.064869,0.0,0.5,0.328018,0.258549
2,0.0587,0.059786,0.0,0.5,0.328018,0.289604
3,0.0516,0.053927,0.489751,0.731098,0.643171,0.320819


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy,Pearson Corr
1,0.1317,0.064869,0.0,0.5,0.328018,0.258549
2,0.0587,0.059786,0.0,0.5,0.328018,0.289604
3,0.0516,0.053927,0.489751,0.731098,0.643171,0.320819
4,0.0472,0.052478,0.504161,0.736137,0.649329,0.332804


TrainOutput(global_step=3488, training_loss=0.06466711715820732, metrics={'train_runtime': 2722.9167, 'train_samples_per_second': 40.958, 'train_steps_per_second': 1.281, 'total_flos': 2.9353208845664256e+16, 'train_loss': 0.06466711715820732, 'epoch': 4.0})

In [None]:
# Evaluate the Zh-En model on the Zh-En test split.
# The dataset is passed to `evaluate` directly instead of overwriting
# `zhen_trainer.eval_dataset`, so the trainer keeps its validation set and no
# later `evaluate()` call silently runs on this split.
zhen_trainer.evaluate(eval_dataset=zhen_ds["test"])

{'eval_loss': 0.0664999932050705,
 'eval_f1': 0.19481940897482672,
 'eval_roc_auc': 0.578713977890194,
 'eval_accuracy': 0.5064149560117303,
 'eval_pearson_corr': 0.2137110646228094,
 'eval_runtime': 37.5837,
 'eval_samples_per_second': 145.169,
 'eval_steps_per_second': 4.55,
 'epoch': 4.0}

In [None]:
# Upload the trained Zh-En model to the Hugging Face Hub under the repo name "xlmr_zhen_classification".
zhen_trainer.model.push_to_hub("xlmr_zhen_classification")

CommitInfo(commit_url='https://huggingface.co/aningddd/xlmr_zhen_classification/commit/8917336d24d931f0a5d6935d793b44b15c523a60', commit_message='Upload XLMRobertaForSequenceClassification', commit_description='', oid='8917336d24d931f0a5d6935d793b44b15c523a60', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Cross-lingual check: evaluate the Zh-En model on the En-De test split.
# The dataset is passed to `evaluate` directly instead of overwriting
# `zhen_trainer.eval_dataset`, so the trainer's own state is left untouched.
zhen_trainer.evaluate(eval_dataset=ende_ds["test"])

{'eval_loss': 0.0664999932050705,
 'eval_f1': 0.19481940897482672,
 'eval_roc_auc': 0.578713977890194,
 'eval_accuracy': 0.5064149560117303,
 'eval_pearson_corr': 0.2137110646228094,
 'eval_runtime': 37.5198,
 'eval_samples_per_second': 145.416,
 'eval_steps_per_second': 4.558,
 'epoch': 4.0}