[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/alina-dima/nlp/blob/main/Editing_effort_prediction.ipynb)

In [None]:
!pip install datasets pandas
from sklearn import linear_model
import numpy as np
from datasets import load_dataset
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr
from sklearn.model_selection import KFold

# Predict editing effort (edit time or HTER) based on different features

## Load dataset

In [3]:
data = load_dataset("GroNLP/divemt")
print(data)
data["train"].to_pandas().head()
data_p = data["train"].to_pandas()
data_p_pe = data_p.loc[data_p['task_type'] != "ht"].reset_index()

Downloading builder script:   0%|          | 0.00/6.43k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.7k [00:00<?, ?B/s]



Downloading and preparing dataset divemt/main to /root/.cache/huggingface/datasets/GroNLP___divemt/main/1.0.0/c875a08adbd66d5d9c6d8c79f7816092829b41f855641161190c69e19b4f11b1...


Downloading data:   0%|          | 0.00/85.5M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset divemt downloaded and prepared to /root/.cache/huggingface/datasets/GroNLP___divemt/main/1.0.0/c875a08adbd66d5d9c6d8c79f7816092829b41f855641161190c69e19b4f11b1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['unit_id', 'flores_id', 'item_id', 'subject_id', 'lang_id', 'doc_id', 'task_type', 'translation_type', 'src_len_chr', 'mt_len_chr', 'tgt_len_chr', 'src_len_wrd', 'mt_len_wrd', 'tgt_len_wrd', 'edit_time', 'k_total', 'k_letter', 'k_digit', 'k_white', 'k_symbol', 'k_nav', 'k_erase', 'k_copy', 'k_cut', 'k_paste', 'k_do', 'n_pause_geq_300', 'len_pause_geq_300', 'n_pause_geq_1000', 'len_pause_geq_1000', 'event_time', 'num_annotations', 'last_modification_time', 'n_insert', 'n_delete', 'n_substitute', 'n_shift', 'tot_shifted_words', 'tot_edits', 'hter', 'cer', 'bleu', 'chrf', 'time_s', 'time_m', 'time_h', 'time_per_char', 'time_per_word', 'key_per_char', 'words_per_hour', 'words_per_minute', 'per_subject_visit_order', 'src_text', 'mt_text', 'tgt_text', 'aligned_edit', 'src_tokens', 'src_annotations', 'mt_tokens', 'mt_annotations', 'tgt_tokens', 'tgt_annotations', 'src_wmt22_qe', 'mt_wmt22_qe'],
        num_rows: 7740
    })
})


## Functions to compute features

In [4]:
# HEAD related features
def get_mean_head_dist(data, level="src"):
    heads = data[f"{level}_annotations"]["head"]
    head_dists = np.asarray([abs(index - int(head)) for index, head in enumerate(heads)])
    return np.sum(head_dists)/len(heads)

def get_median_head_dist(data, level="src"):
    heads = data[f"{level}_annotations"]["head"]
    head_dists = np.asarray([abs(index - int(head)) for index, head in enumerate(heads)])
    return np.median(head_dists)


# POS related features
def get_pos_rate(data, more_pos=["NOUN"]):
    all_pos_src = data["src_annotations"]["upos"]
    all_pos_mt = data["mt_annotations"]["upos"]
    all_pos_tgt = data["tgt_annotations"]["upos"]
    
    this_pos_src = [np.count_nonzero(all_pos_src == pos) for pos in more_pos]
    this_pos_mt = [np.count_nonzero(all_pos_mt == pos) for pos in more_pos]
    this_pos_tgt = [np.count_nonzero(all_pos_tgt == pos) for pos in more_pos]

    ratios_src = [this_count / len(all_pos_src) for this_count in this_pos_src]
    ratios_mt = [this_count / len(all_pos_mt) for this_count in this_pos_mt]
    ratios_tgt = [this_count / len(all_pos_tgt) for this_count in this_pos_tgt]

    ratio1 = [r_mt / r_src if r_src > 0 else 0 for r_mt, r_src in zip(ratios_mt, ratios_src)]
    ratio2 = [r_mt / r_tgt if r_tgt > 0 else 0 for r_mt, r_tgt in zip(ratios_mt, ratios_tgt)]

    ratio = [val for pair in zip(ratio1, ratio2) for val in pair]
    return ratio


# features related to BAD-OK labels
def get_bad_rate(data_p_pe, level="src"):
    return data_p_pe.apply(lambda x: np.count_nonzero(x[f"{level}_wmt22_qe"] == "BAD") / len(x[f"{level}_wmt22_qe"]), axis=1)

def get_bad_ratio(data_p_pe):
    return data_p_pe.apply(lambda x: np.count_nonzero(x["mt_wmt22_qe"] == "BAD") / np.count_nonzero(x["src_wmt22_qe"] == "BAD") if np.count_nonzero(x["src_wmt22_qe"] == "BAD") != 0 else 1, axis=1)


#features related to tokens
def get_diff_tokens(data_p_pe, levels=["mt", "tgt"]):
    return (len(data_p_pe[f"{levels[0]}_tokens"]) - len(data_p_pe[f"{levels[1]}_tokens"])) / data_p_pe[f"{levels[1]}_len_wrd"]

def get_tokens_ratio(data_p_pe, levels=["mt", "tgt"]):
    return len(data_p_pe[f"{levels[0]}_tokens"]) / len(data_p_pe[f"{levels[1]}_tokens"]) if len(data_p_pe[f"{levels[1]}_tokens"]) != 0 else 0


## Add features to dataset

In [5]:
data_p_pe["src_bad_rate"] =  get_bad_rate(data_p_pe, level="src")
data_p_pe["mt_bad_rate"] =  get_bad_rate(data_p_pe, level="mt")
data_p_pe["bad_rate_ratio"] = get_bad_ratio(data_p_pe)

data_p_pe["median_head_distance"] = data_p_pe.apply(get_median_head_dist, axis=1)
data_p_pe["mean_head_distance"] = data_p_pe.apply(get_mean_head_dist, axis=1)

languages = np.unique(data_p_pe["lang_id"])
data_p_pe["lang_id_enc"] = data_p_pe.apply(lambda x: np.where(languages == x["lang_id"])[0][0], axis=1)

# data_p_pe["diff_tokens_tgt"] = get_diff_tokens(data_p_pe, levels=["mt", "tgt"])

data_p_pe["ratio_tokens_tgt"] = get_tokens_ratio(data_p_pe, levels=["mt", "tgt"])
data_p_pe["diff_tokens_src"] = get_diff_tokens(data_p_pe, levels=["mt", "src"])
data_p_pe["ratio_tokens_src"] = get_tokens_ratio(data_p_pe, levels=["mt", "src"])

pos_features = ["NOUN"]
pos_features_names = [f'{feature}_{i}' for feature in pos_features for i in range(1, 3)]
data_p_pe[pos_features_names] = data_p_pe.apply(lambda data: get_pos_rate(data, pos_features), axis=1, result_type="expand")


## Select which features to use

In [12]:
response = "hter" # time_s or hter

# source complexity and target fluency features
features_baseline = ["src_len_wrd", "mt_len_wrd", "tgt_len_wrd"]

# adding adequacy features
features_1 = ["src_len_wrd", "mt_len_wrd", "tgt_len_wrd", 
            "src_bad_rate", "mt_bad_rate", "bad_rate_ratio",
            "diff_tokens_src", "ratio_tokens_src", "ratio_tokens_tgt"]

# adding POS + head
features_2 = ["src_len_wrd", "mt_len_wrd", "tgt_len_wrd", 
            "src_bad_rate", "mt_bad_rate", "bad_rate_ratio",
            "diff_tokens_src", "ratio_tokens_src", "ratio_tokens_tgt", "mean_head_distance"] + pos_features_names

In [7]:
# get the correlation coefficient and p-value for time and hter
corr_coef, p_value = pearsonr(data_p_pe['time_s'], data_p_pe['hter'])

print("Correlation coefficient and p-value between time and hter:")
print(corr_coef, p_value)

Correlation coefficient and p-value between time and hter:
0.12833260405435876 2.150545006559888e-20


## Train models with 10-fold CV

In [24]:
num_folds = 10

all_features = [features_baseline, features_1, features_2]
all_models_names = ["Baseline", "Baseline + AD", "Baseline + AD + dataset-specific features"]

print(f"Predicting: {response}\n")

for features, name in zip(all_features, all_models_names):
    mse_list = []
    r2_list = []

    # Perform 5-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True)
    for train_index, val_index in kf.split(data_p_pe):
        train_fold = data_p_pe.iloc[train_index]
        val_fold = data_p_pe.iloc[val_index]

        response_train_fold = train_fold[response]
        response_val_fold = val_fold[response]

        predictors_train_fold = train_fold[features]
        predictors_val_fold = val_fold[features]

        lin_reg_model = linear_model.LinearRegression()
        reg = lin_reg_model.fit(predictors_train_fold, response_train_fold)
        pred = lin_reg_model.predict(predictors_val_fold)

        mse = mean_squared_error(response_val_fold, pred, squared=False)
        r2= r2_score(response_val_fold, pred)

        mse_list.append(mse)
        r2_list.append(r2)

    # Calculate average performance across all folds
    avg_mse = sum(mse_list) / num_folds
    avg_r2 = sum(r2_list) / num_folds
    std_mse = np.std(mse_list)

    print(name)
    print(f"Average Mean squared error: {avg_mse:.3f} +- {std_mse:.3f}")
    print(f"Average R-squared: {avg_r2:.3f}\n")


Predicting: hter

Baseline
Average Mean squared error: 24.068 +- 0.460
Average R-squared: 0.043

Baseline + AD
Average Mean squared error: 7.900 +- 0.827
Average R-squared: 0.896

Baseline + AD + dataset-specific features
Average Mean squared error: 7.860 +- 1.054
Average R-squared: 0.897

