In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
!pip install -U transformers
!pip install datasets
!pip install -U accelerate

# Tokenisation

In [None]:
from transformers import AutoTokenizer
import torch
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import StandardScaler

In [None]:
BASE_MODEL = "xlm-roberta-base"
LEARNING_RATE = 2e-5
MAX_LEN = 512
BATCH_SIZE = 32
EPOCHS = 4

In [None]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

In [None]:
raw_train_df = pd.read_csv("/content/train_df.csv")
raw_val_df = pd.read_csv("/content/validation_df.csv")
raw_test_df = pd.read_csv("/content/test_df.csv")

In [None]:
ende_train_df = raw_train_df.iloc[:16367]
zhen_train_df = raw_train_df.iloc[16368:]
ende_val_df = raw_val_df.iloc[:5456]
zhen_val_df = raw_val_df.iloc[5657:]
ende_test_df = raw_test_df.iloc[:5456]
zhen_test_df = raw_test_df.iloc[5457:]

In [None]:
ende_mean = ende_train_df["mqm_score"].mean()
ende_sd = ende_train_df["mqm_score"].std()

zhen_mean = zhen_train_df["mqm_score"].mean()
zhen_sd = zhen_train_df["mqm_score"].std()

ende_mean, ende_sd, zhen_mean, zhen_sd

(0.9864911101606892,
 1.5411897967214492,
 1.9939672178185859,
 2.2663194655370646)

In [None]:
ende_train_df['s_mqm'] = (ende_train_df['mqm_score'] - ende_mean) / ende_sd
ende_val_df['s_mqm'] = (ende_val_df['mqm_score'] - ende_mean) / ende_sd
ende_test_df['s_mqm'] = (ende_test_df['mqm_score'] - ende_mean) / ende_sd

zhen_train_df['s_mqm'] = (zhen_train_df['mqm_score'] - zhen_mean) / zhen_sd
zhen_val_df['s_mqm'] = (zhen_val_df['mqm_score'] - zhen_mean) / zhen_sd
zhen_test_df['s_mqm'] = (zhen_test_df['mqm_score'] - zhen_mean) / zhen_sd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ende_train_df['s_mqm'] = (ende_train_df['mqm_score'] - ende_mean) / ende_sd
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ende_val_df['s_mqm'] = (ende_val_df['mqm_score'] - ende_mean) / ende_sd
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ende_test_df['s_mqm'] = (ende_test_df['mqm_score'] - ende

In [None]:
raw_ende_train_ds = Dataset.from_pandas(ende_train_df)
raw_zhen_train_ds = Dataset.from_pandas(zhen_train_df)
raw_ende_val_ds = Dataset.from_pandas(ende_val_df)
raw_zhen_val_ds = Dataset.from_pandas(zhen_val_df)
raw_ende_test_ds = Dataset.from_pandas(ende_test_df)
raw_zhen_test_ds = Dataset.from_pandas(zhen_test_df)

In [None]:
ende_ds = {"train": raw_ende_train_ds, "validation": raw_ende_val_ds, "test": raw_ende_test_ds}
zhen_ds = {"train": raw_zhen_train_ds, "validation": raw_zhen_val_ds, "test": raw_zhen_test_ds}

In [None]:
def preprocess_function(df):
    score = df['s_mqm']
    df = tokenizer(df["source"], df["target"], truncation=True, padding="max_length", max_length=MAX_LEN)
    df["label"] = float(score)
    return df

In [None]:
for split in ende_ds:
    ende_ds[split] = ende_ds[split].map(preprocess_function, remove_columns=['Unnamed: 0', 'source', 'seg_id', 'target', 'doc_id', 'system', 'doc',
       'rater', 'severity', 'category_Accuracy/Addition',
       'category_Accuracy/Mistranslation', 'category_Accuracy/Omission',
       'category_Accuracy/Source language fragment',
       'category_Fluency/Character encoding', 'category_Fluency/Grammar',
       'category_Fluency/Inconsistency', 'category_Fluency/Punctuation',
       'category_Fluency/Register', 'category_Fluency/Spelling',
       'category_Locale convention/Currency format',
       'category_Locale convention/Date format',
       'category_Locale convention/Time format', 'category_No-error',
       'category_Other', 'category_Source error', 'category_Style/Awkward',
       'category_Terminology/Inappropriate for context',
       'category_Terminology/Inconsistent',
       'category_Locale convention/Name format', 'category_Non-translation!',
       'category_Locale convention/Address format',
       'category_Locale convention/Telephone format', 'mqm_score', 's_mqm'])

Map:   0%|          | 0/16367 [00:00<?, ? examples/s]

Map:   0%|          | 0/5456 [00:00<?, ? examples/s]

Map:   0%|          | 0/5456 [00:00<?, ? examples/s]

In [None]:
for split in zhen_ds:
    zhen_ds[split] = zhen_ds[split].map(preprocess_function, remove_columns=['Unnamed: 0', 'source', 'seg_id', 'target', 'doc_id', 'system', 'doc',
       'rater', 'severity', 'category_Accuracy/Addition',
       'category_Accuracy/Mistranslation', 'category_Accuracy/Omission',
       'category_Accuracy/Source language fragment',
       'category_Fluency/Character encoding', 'category_Fluency/Grammar',
       'category_Fluency/Inconsistency', 'category_Fluency/Punctuation',
       'category_Fluency/Register', 'category_Fluency/Spelling',
       'category_Locale convention/Currency format',
       'category_Locale convention/Date format',
       'category_Locale convention/Time format', 'category_No-error',
       'category_Other', 'category_Source error', 'category_Style/Awkward',
       'category_Terminology/Inappropriate for context',
       'category_Terminology/Inconsistent',
       'category_Locale convention/Name format', 'category_Non-translation!',
       'category_Locale convention/Address format',
       'category_Locale convention/Telephone format', 'mqm_score', 's_mqm'])

Map:   0%|          | 0/27881 [00:00<?, ? examples/s]

Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

Map:   0%|          | 0/9294 [00:00<?, ? examples/s]

# General Training

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr

def compute_metrics_for_regression(eval_pred):
    # print("eval_pred: ", eval_pred)
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    if np.all(labels == labels[0]) or np.all(logits == logits[0]):
      print(np.all(labels == labels[0]), np.all(logits == logits[0]))
      print(labels[0], logits[0])

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    pearson_corr, _ = pearsonr(logits.flatten(), labels.flatten())

    return {"mse": mse, "rmse": rmse, "mae": mae, "pearson_corr": pearson_corr}

In [None]:
class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

# En-De Train

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
training_args_ende = TrainingArguments(
    output_dir="../models/xlmr_ende_reg",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="mse",
    load_best_model_at_end=True,
    weight_decay=0.01,
    remove_unused_columns=False
)

In [None]:
from transformers import XLMRobertaForSequenceClassification

model_ende = XLMRobertaForSequenceClassification.from_pretrained(BASE_MODEL,
                                                                 num_labels = 1,
                                                                 hidden_dropout_prob=0.2,
                                                                 attention_probs_dropout_prob=0.2)

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer_ende = RegressionTrainer(
    model=model_ende,
    args=training_args_ende,
    train_dataset= ende_ds["train"],
    eval_dataset= ende_ds["validation"],
    compute_metrics=compute_metrics_for_regression,
)

In [None]:
trainer_ende.train()

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson Corr
1,0.7467,0.636189,0.636189,0.797615,0.563303,0.623034
2,0.6244,0.566395,0.566395,0.752592,0.49572,0.685584
3,0.5217,0.531853,0.531853,0.729282,0.421628,0.726853
4,0.4553,0.503686,0.503686,0.709709,0.381389,0.745667


TrainOutput(global_step=2048, training_loss=0.5846737958490849, metrics={'train_runtime': 1655.7951, 'train_samples_per_second': 39.539, 'train_steps_per_second': 1.237, 'total_flos': 1.7225199912800256e+16, 'train_loss': 0.5846737958490849, 'epoch': 4.0})

In [None]:
trainer_ende.eval_dataset = ende_ds["test"]
trainer_ende.evaluate()

{'eval_loss': 0.6272925138473511,
 'eval_mse': 0.6272925138473511,
 'eval_rmse': 0.7920179963111877,
 'eval_mae': 0.5576898455619812,
 'eval_pearson_corr': 0.6325335501757472,
 'eval_runtime': 40.2324,
 'eval_samples_per_second': 135.612,
 'eval_steps_per_second': 4.25,
 'epoch': 4.0}

In [None]:
trainer_ende.eval_dataset = zhen_ds["test"]
trainer_ende.evaluate()

{'eval_loss': 0.6563699841499329,
 'eval_mse': 0.6563699841499329,
 'eval_rmse': 0.8101666569709778,
 'eval_mae': 0.5669180154800415,
 'eval_pearson_corr': 0.650859173696751,
 'eval_runtime': 68.5543,
 'eval_samples_per_second': 135.571,
 'eval_steps_per_second': 4.245,
 'epoch': 4.0}

In [None]:
trainer_ende.model.push_to_hub("xlmr_ende_reg")

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aningddd/xlmr_ende_reg/commit/d96d5a3a7431f3cd44f72d7ca39054ea9c764ead', commit_message='Upload XLMRobertaForSequenceClassification', commit_description='', oid='d96d5a3a7431f3cd44f72d7ca39054ea9c764ead', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import math

nb_batches = math.ceil(len(raw_ende_test_ds)/BATCH_SIZE)
y_preds = []

for i in range(nb_batches):
    input_source = raw_ende_test_ds[i * BATCH_SIZE: (i+1) * BATCH_SIZE]["source"]
    input_target = raw_ende_test_ds[i * BATCH_SIZE: (i+1) * BATCH_SIZE]["target"]
    input_labels = raw_ende_test_ds[i * BATCH_SIZE: (i+1) * BATCH_SIZE]["s_mqm"]
    encoded = tokenizer(input_source, input_target, truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt").to("cuda")
    y_preds += trainer_ende.model(**encoded).logits.reshape(-1).tolist()

In [None]:
pd.set_option('display.max_rows', 500)
df = pd.DataFrame([raw_ende_test_ds["source"], raw_ende_test_ds["target"], raw_ende_test_ds["mqm_score"], (np.array(y_preds).reshape(-1, 1) * ende_sd) + ende_mean], ["Source", "Target", "Score", "Prediction"]).T
df

Unnamed: 0,Source,Target,Score,Prediction
0,Iran reports lowest number of daily COVID-19 c...,Iran meldet<v> niedrigste</v> Zahl an tägliche...,1.0,[1.7549371709714348]
1,If the book still has issues to open after syn...,Falls das Buch nach der Synchronisation im E-R...,0.0,[-0.2086925739790022]
2,"""Russia should be in no doubt that further mil...","""Russland sollte keinen Zweifel daran haben, d...",0.1,[-0.347927914031291]
3,"I do apologise about this, as the account hold...","Ich entschuldige mich dafür, da der Kontoinhab...",1.0,[0.9035340627761467]
4,"Beside 'Repair your #PRS_ORG# account', tap Re...","Neben ""Reparieren Sie Ihr # PRS _ ORG # -Konto...",1.0,[0.878510312343185]
...,...,...,...,...
5451,Germany's regulator has suspended the approval...,Deutschlands Regulierungsbehörde hat das Geneh...,0.0,[-0.3023619387164025]
5452,Move the trailer!,Bewegen Sie den Anhänger!,0.0,[-0.048207883534013396]
5453,A rollercoaster first half ended with Munster ...,Eine Achterbahnfahrt der ersten Halbzeit endet...,5.0,[3.162410149655704]
5454,"Best way to lose a new young worker, by shocki...",Der beste Weg einen neuen jungen Arbeiter zu v...,0.0,[-0.158438693489245]


# Zh-En Training

In [None]:
training_args_zhen = TrainingArguments(
    output_dir="../models/xlmr_zhen_reg",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="mse",
    load_best_model_at_end=True,
    weight_decay=0.01,
    remove_unused_columns=False
)

In [None]:
model_zhen = XLMRobertaForSequenceClassification.from_pretrained(BASE_MODEL,
                                                                 num_labels = 1,
                                                                 hidden_dropout_prob=0.2,
                                                                 attention_probs_dropout_prob=0.2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer_zhen = RegressionTrainer(
    model=model_zhen,
    args=training_args_zhen,
    train_dataset= zhen_ds["train"],
    eval_dataset= zhen_ds["validation"],
    compute_metrics=compute_metrics_for_regression,
)

In [None]:
trainer_zhen.train()

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson Corr
1,0.6741,0.510453,0.510453,0.71446,0.485809,0.719936
2,0.5013,0.503737,0.503737,0.709744,0.45471,0.745066
3,0.4547,0.445416,0.445416,0.667395,0.422724,0.765152
4,0.445,0.422725,0.422725,0.650173,0.416542,0.773731


TrainOutput(global_step=3488, training_loss=0.5005029844581534, metrics={'train_runtime': 2778.5105, 'train_samples_per_second': 40.138, 'train_steps_per_second': 1.255, 'total_flos': 2.934293387723981e+16, 'train_loss': 0.5005029844581534, 'epoch': 4.0})

In [None]:
trainer_zhen.eval_dataset = zhen_ds["test"]
trainer_zhen.evaluate()

{'eval_loss': 0.506644070148468,
 'eval_mse': 0.506644070148468,
 'eval_rmse': 0.7117893695831299,
 'eval_mae': 0.4949461817741394,
 'eval_pearson_corr': 0.7176014455660258,
 'eval_runtime': 68.7157,
 'eval_samples_per_second': 135.253,
 'eval_steps_per_second': 4.235,
 'epoch': 4.0}

In [None]:
trainer_zhen.eval_dataset = ende_ds["test"]
trainer_zhen.evaluate()

{'eval_loss': 0.6930940747261047,
 'eval_mse': 0.6930940747261047,
 'eval_rmse': 0.8325226902961731,
 'eval_mae': 0.6367790699005127,
 'eval_pearson_corr': 0.5815990768762231,
 'eval_runtime': 40.2884,
 'eval_samples_per_second': 135.423,
 'eval_steps_per_second': 4.244,
 'epoch': 4.0}

In [None]:
trainer_zhen.model.push_to_hub("xlmr_zhen_reg")

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aningddd/xlmr_zhen_reg/commit/6a353631034a5f1cf8ae5c9b33cf97fd5506218c', commit_message='Upload XLMRobertaForSequenceClassification', commit_description='', oid='6a353631034a5f1cf8ae5c9b33cf97fd5506218c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
zhen_batches = math.ceil(len(raw_zhen_test_ds)/BATCH_SIZE)
y_preds_zhen = []

for i in range(zhen_batches):
    input_source = raw_zhen_test_ds[i * BATCH_SIZE: (i+1) * BATCH_SIZE]["source"]
    input_target = raw_zhen_test_ds[i * BATCH_SIZE: (i+1) * BATCH_SIZE]["target"]
    input_labels = raw_zhen_test_ds[i * BATCH_SIZE: (i+1) * BATCH_SIZE]["s_mqm"]
    encoded = tokenizer(input_source, input_target, truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt").to("cuda")
    y_preds_zhen += trainer_zhen.model(**encoded).logits.reshape(-1).tolist()

In [None]:
pd.set_option('display.max_rows', 500)
zhen_pred_df = pd.DataFrame([raw_zhen_test_ds["source"], raw_zhen_test_ds["target"], raw_zhen_test_ds["mqm_score"], (np.array(y_preds_zhen).reshape(-1, 1) * zhen_sd) + zhen_mean], ["Source", "Target", "Score", "Prediction"]).T
zhen_pred_df

Unnamed: 0,Source,Target,Score,Prediction
0,但是，时代变了。,But times have changed.,0.0,[-0.1747280484004725]
1,有意思的是，现在字节的TT 是Meta 主要的竞争对手之一，但是前两年Meta 从字节身上可...,"Interestingly, TT is one of the main competito...",5.0,[4.553132399759497]
2,我们召开党的十九届六中全会，总结党的百年奋斗重大成就和历史经验，通过百年党史上第三个历史决议...,We convened the Sixth Plenary Session of the 1...,5.0,[4.684735553910096]
3,奶奶的除夕夜说学逗唱二十四节气清冬见远山爷爷是个笨小孩我的奶奶住在古里古怪镇饺子和汤圆一块巧...,Grandma's New Year's Eve Sing and learn to sin...,1.0,[4.380058173102128]
4,新华社利雅得12月12日电（记者王海洲胡冠）沙特阿拉伯首届当代艺术双年展— — 迪里耶当代艺...,Xinhua News Agency report of December 12 from ...,1.0,[3.630804259736438]
...,...,...,...,...
9289,“最好原则”的最大问题在于门槛太高，它往往超越了普通人的能力极限，让绝大多数人“望门兴叹” 。,The biggest problem with the “best principle” ...,0.1,[0.04167207772549886]
9290,去之前看了店里的拍摄作品，更多的是户外婚纱，拍的很漂亮，涉及到的拍摄风格也很多。,"Before I went there, I saw the shooting works ...",5.0,[4.55305594268715]
9291,马三立先生那段著名的《从明天开始》的相声，就是最传神地表现了这一“普遍人性”的经典。,Mr. Ma Sanli's famous comic “Starting Tomorrow...,5.0,[4.029008362167177]
9292,他同时认为，数字化转型需驱动向纵深发展和可持续发展，参与新冠肺炎疫情防控、经济复苏和发展、应...,He also believes that digital transformation n...,5.0,[3.677204246829425]
