#Data Cleaning

####Move to Repo

In [None]:
#moving to repo
from google.colab import drive
import os

# Mount Drive
drive.mount('/content/drive', force_remount=True)

# Repo info
MYDRIVE = "/content/drive/MyDrive"
REPO_NAME = "chineseproverbs"
REPO_PATH = os.path.join(MYDRIVE, REPO_NAME)

# Go to MyDrive
os.chdir(MYDRIVE)

# Clone if missing, else pull
if not os.path.exists(REPO_PATH):
    print("Cloning repo...")
    !git clone https://github.com/art3misxmoon/chineseproverbs.git
else:
    print("Repo exists, pulling latest updates...")
    os.chdir(REPO_PATH)
    !git pull

# Move to repo folder
os.chdir(REPO_PATH)
print("Current working directory:", os.getcwd())
!ls


Mounted at /content/drive
Repo exists, pulling latest updates...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects: 100% (5/5), 6.47 MiB | 5.66 MiB/s, done.
From https://github.com/art3misxmoon/chineseproverbs
   e4ca79e..f3b552c  main       -> origin/main
[33mhint: You have divergent branches and need to specify how to reconcile them.[m
[33mhint: You can do so by running one of the following commands sometime before[m
[33mhint: your next pull:[m
[33mhint: [m
[33mhint:   git config pull.rebase false  # merge (the default strategy)[m
[33mhint:   git config pull.rebase true   # rebase[m
[33mhint:   git config pull.ff only       # fast-forward only[m
[33mhint: [m
[33mhint: You can replace "git config" with "git config --global" to set a default[m
[33mhint: preference for all repositories. You c

##General Data

In [None]:
import os
import tarfile

# --- Paths to your split files ---
part1 = "UNv1.0.en-zh.tar.gz.00"
part2 = "UNv1.0.en-zh.tar.gz.01"

# --- Path for combined archive ---
combined_tar = "UNv1.0.en-zh.tar.gz"

# --- Concatenate the split files ---
os.system(f"cat {part1} {part2} > {combined_tar}")
print(f"Combined archive saved to: {combined_tar}")

# --- Inspect contents of the tar.gz without extracting ---
with tarfile.open(combined_tar, 'r:gz') as tar:
    print("Files inside the combined tar.gz:")
    for member in tar.getmembers()[:20]:  # just show first 20 files
        print(member.name)


Combined archive saved to: UNv1.0.en-zh.tar.gz
Files inside the combined tar.gz:
en-zh
en-zh/UNv1.0.en-zh.ids
en-zh/UNv1.0.pdf
en-zh/README
en-zh/DISCLAIMER
en-zh/UNv1.0.en-zh.zh
en-zh/UNv1.0.en-zh.en


In [None]:
import tarfile
import os

tar_path = "UNv1.0.en-zh.tar.gz"
extract_path = "UNv1.0_en-zh"

os.makedirs(extract_path, exist_ok=True)

with tarfile.open(tar_path, "r:gz") as tar:
    tar.extractall(path=extract_path)

print("Extraction complete. Files:")
print(os.listdir(extract_path))


  tar.extractall(path=extract_path)


Extraction complete. Files:
['en-zh']


In [None]:
ch_file = os.path.join(extract_path, "en-zh", "UNv1.0.en-zh.zh")
en_file = os.path.join(extract_path, "en-zh", "UNv1.0.en-zh.en")

# Peek at first 5 sentences
with open(ch_file, "r", encoding="utf-8") as f_ch, open(en_file, "r", encoding="utf-8") as f_en:
    for i, (c, e) in enumerate(zip(f_ch, f_en)):
        if i >= 5:
            break
        print(f"CH: {c.strip()}")
        print(f"EN: {e.strip()}")
        print("---")


CH: 第918(1994)号决议
EN: RESOLUTION 918 (1994)
---
CH: 1994年5月17日安全理事会第3377次会议通过
EN: Adopted by the Security Council at its 3377th meeting, on 17 May 1994
---
CH: 安全理事会，
EN: The Security Council,
---
CH: 重申其以往关于卢旺达局势的所有决议，特别是成立联合国卢旺达援助团(联卢援助团)的1993年10月5日第872(1993)号决议，延长联卢援助团任务期限至1994年7月29日的1994年4月5日第909(1994)号决议，以及调整联卢援助团的任务规定的1994年4月21日第912(1994)号决议，
EN: Reaffirming all its previous resolutions on the situation in Rwanda, in particular its resolution 872 (1993) of 5 October 1993 by which it established the United Nations Assistance Mission for Rwanda (UNAMIR), its resolution 909 (1994) of 5 April 1994 which extended the mandate of UNAMIR until 29 July 1994, and its resolution 912 (1994) of 21 April 1994 by which it adjusted the mandate of UNAMIR,
---
CH: 回顾安理会主席以安理会名义在1994年4月7日发表的声明(S/PRST/ 1994/16)和在1994年4月30日发表的声明(S/PRST/1994/21)，
EN: Recalling the statements made by the President of the Council on 7 April 1994 (S/PRST/1994/16) and 30 April 1994 (S/PRST/1994/21),
---


In [None]:
import pandas as pd

with open(ch_file, "r", encoding="utf-8") as f_ch, open(en_file, "r", encoding="utf-8") as f_en:
    ch_lines = [line.strip() for line in f_ch]
    en_lines = [line.strip() for line in f_en]

df_un = pd.DataFrame({
    "chinese": ch_lines,
    "english": en_lines
})

print(df_un.head())
print(f"Total sentence pairs: {len(df_un)}")


                                             chinese  \
0                                      第918(1994)号决议   
1                          1994年5月17日安全理事会第3377次会议通过   
2                                             安全理事会，   
3  重申其以往关于卢旺达局势的所有决议，特别是成立联合国卢旺达援助团(联卢援助团)的1993年1...   
4  回顾安理会主席以安理会名义在1994年4月7日发表的声明(S/PRST/ 1994/16)和...   

                                             english  
0                              RESOLUTION 918 (1994)  
1  Adopted by the Security Council at its 3377th ...  
2                              The Security Council,  
3  Reaffirming all its previous resolutions on th...  
4  Recalling the statements made by the President...  
Total sentence pairs: 15886041


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import os
import tarfile
import random

# --- Paths ---
tar_path = "UNv1.0.en-zh.tar.gz"
extract_path = "UNv1.0_en-zh"
os.makedirs(extract_path, exist_ok=True)

# --- Extract tar.gz ---
with tarfile.open(tar_path, "r:gz") as tar:
    tar.extractall(path=extract_path)

ch_file = os.path.join(extract_path, "en-zh", "UNv1.0.en-zh.zh")
en_file = os.path.join(extract_path, "en-zh", "UNv1.0.en-zh.en")

# --- Load sentences ---
with open(ch_file, "r", encoding="utf-8") as f_ch, open(en_file, "r", encoding="utf-8") as f_en:
    ch_lines = [line.strip() for line in f_ch]
    en_lines = [line.strip() for line in f_en]

df_un = pd.DataFrame({"chinese": ch_lines, "english": en_lines})
print(f"Total sentence pairs: {len(df_un)}")

# --- Cleaning ---
def is_valid_sentence(s):
    if len(s.split()) < 3:  # very short
        return False
    if len(s.split()) > 100:  # very long
        return False
    if re.fullmatch(r'[\d\(\)\-/\s]+', s):  # headings/numbers only
        return False
    return True

df_un["english"] = df_un["english"].str.lower().str.strip()
df_un["chinese"] = df_un["chinese"].str.strip()
df_un = df_un[df_un["english"].apply(is_valid_sentence)].reset_index(drop=True)
print(f"Sentence pairs after filtering: {len(df_un)}")

# --- Sample a reasonable subset for testing ---
subset_size = 50000  # adjust as needed
if len(df_un) > subset_size:
    df_un = df_un.sample(subset_size, random_state=42).reset_index(drop=True)
print(f"Subset size for testing: {len(df_un)}")

df_un.to_csv("UN_cleaned.csv", index=False, encoding="utf-8-sig")

  tar.extractall(path=extract_path)


Total sentence pairs: 15886041
Sentence pairs after filtering: 14614571
Subset size for testing: 50000


##IdiomKB

In [None]:
# ==============================
# Load IdiomKB JSON, remove duplicates, save cleaned dataset
# ==============================
!pip install zhconv
import pandas as pd
import json
from zhconv import convert
import os

# --- Step 0: Ensure we're in the repo folder ---
# Adjust if your notebook is opened elsewhere
REPO_PATH = "/content/drive/MyDrive/chineseproverbs"
os.chdir(REPO_PATH)
print("Current working directory:", os.getcwd())

# --- Step 1: Load JSON dataset (IdiomKB) ---
with open('zh_idiom_meaning.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

df_json = pd.DataFrame(json_data)
df_json = df_json[['idiom', 'en_meaning']]
df_json.rename(columns={'idiom': 'chinese', 'en_meaning': 'english'}, inplace=True)
df_json['source'] = 'JSON'  # mark source

print(f"IdiomKB JSON dataset loaded: {len(df_json)} rows")

# --- Step 2: Normalize Chinese characters (Traditional -> Simplified) ---
df_json['chinese'] = df_json['chinese'].apply(lambda x: convert(x, 'zh-cn'))

# --- Step 3: Find and show duplicates ---
duplicates = df_json[df_json.duplicated(subset='chinese', keep=False)]
if not duplicates.empty:
    print("\nFound duplicates (before dropping):")
    print(duplicates.sort_values('chinese'))
    print(f"Total duplicates found: {len(duplicates)}")
else:
    print("\nNo duplicates found.")

# --- Step 4: Remove duplicates ---
df_json.drop_duplicates(subset='chinese', keep='first', inplace=True)
print(f"Dataset after removing duplicates: {len(df_json)} rows")

# --- Step 5: Save cleaned dataset ---
df_json.to_csv('idiomkb_cleaned.csv', index=False, encoding='utf-8-sig')
print("\nCleaned dataset saved to 'idiomkb_cleaned.csv'")


Current working directory: /content/drive/MyDrive/chineseproverbs
IdiomKB JSON dataset loaded: 8643 rows

Found duplicates (before dropping):
     chinese                                            english source
48      一干二净                          completely and thoroughly   JSON
5566    一干二净  completely and thoroughly, leaving nothing behind   JSON
316     不遗余力            spare no effort; do everything possible   JSON
7953    不遗余力                   spare no effort, do one's utmost   JSON
725     前仆后继  succeeding each other in a continuous and unbr...   JSON
727     前仆后继             successors stepping forward one by one   JSON
869     反复无常  being unpredictable or changing one's mind fre...   JSON
4254    反复无常  being inconsistent and unpredictable, changing...   JSON
924     固执己见  being stubborn and insisting on one's own opin...   JSON
1085    固执己见            holding onto one's own views stubbornly   JSON
2583    无济于事                               ineffective, useless   JSON
4950  

In [None]:
# ==============================
# Further clean English references and save refs_list
# ==============================
import pandas as pd
import re

# --- Step 1: Load previously cleaned CSV ---
df = pd.read_csv('idiomkb_cleaned.csv')
print(f"Loaded cleaned dataset: {len(df)} rows")

# --- Step 2: Further clean English references ---
def clean_refs(text):
    """
    Returns a list of cleaned English references:
    - Lowercase and strip
    - Replace first '(' with ',' and remove all ')'
    - Strip leading/trailing quotes
    - If quotes exist, only keep quoted strings
    - Else if semicolons exist, split by semicolons
    - Do NOT include original string if multiple references extracted
    - Only include original string if nothing else extracted
    """
    text = str(text).lower().strip()

    # --- Minimal change: handle parentheses ---
    text = re.sub(r'\(', ',', text, count=1)  # first '(' -> ','
    text = text.replace(')', '')              # remove all ')'

    # 1️⃣ Extract quoted alternatives
    quote_pattern = re.findall(r'"([^"]+)"', text)
    if quote_pattern:
        parts = [q.strip().strip('"').strip("'") for q in quote_pattern]
    # 2️⃣ Else split by semicolons
    elif ';' in text:
        parts = [p.strip().strip('"').strip("'") for p in text.split(';') if p.strip()]
    # 3️⃣ Fallback: keep full original
    else:
        parts = [text.strip().strip('"').strip("'")]

    # 4️⃣ Remove duplicates while preserving order
    seen = set()
    cleaned = []
    for p in parts:
        if p not in seen:
            cleaned.append(p)
            seen.add(p)

    return cleaned

# --- Step 3: Apply to all rows ---
df['refs_list'] = df['english'].apply(clean_refs)

# --- Step 4: Save new cleaned dataset with references ---
df.to_csv('idiomkb_cleaned_refs.csv', index=False, encoding='utf-8-sig')
print("Further cleaned dataset with reference lists saved to 'idiomkb_cleaned_refs.csv'")


Loaded cleaned dataset: 8632 rows
Further cleaned dataset with reference lists saved to 'idiomkb_cleaned_refs.csv'


### Split IdiomKB data (80:10:10)- Train: 6904, Validation: 864, Test: 864


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load cleaned dataset
df = pd.read_csv("idiomkb_cleaned_refs.csv")
print(f"Total dataset size: {len(df)}")

# Split: 80% train, 10% validation, 10% test
train_val, test = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)
train, val = train_test_split(train_val, test_size=0.1111, random_state=42)  # 0.1111*0.9 ≈ 0.1 total

print(f"Train: {len(train)}, Validation: {len(val)}, Test: {len(test)}")

# Save to separate CSVs
train.to_csv("idiomkb_train.csv", index=False, encoding="utf-8")
val.to_csv("idiomkb_val.csv", index=False, encoding="utf-8")
test.to_csv("idiomkb_test.csv", index=False, encoding="utf-8")

print("Saved train, validation, and test CSVs successfully.")

Total dataset size: 8632
Train: 6904, Validation: 864, Test: 864
Saved train, validation, and test CSVs successfully.


### Load opus100 dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load cleaned OPUS100 dataset under a different name
opus_df = pd.read_csv("opus100_cleaned.csv")
print(f"Total OPUS100 dataset size: {len(opus_df)}")

Total OPUS100 dataset size: 2000


# Load Model

In [None]:
!pip install -U transformers



In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

## Translate some text

In [None]:
input_text = "一举两得"  # Chinese text you want to translate
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs)
translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(translated)

I'll do it both ways.


Testing Model Behavior for Traditional Characters

Notes: same translation for both traditional & normal characters; neither captures meaningful idiom info.

In [None]:
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

examples = ["畫蛇添足", "画蛇添足"]  # Traditional vs simplified

for text in examples:
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(**inputs)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"{text} -> {translation}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

畫蛇添足 -> Draw the snake's feet.
画蛇添足 -> Draw the snake's feet.


#Model Evaluation (before Finetuning)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

# Tokenize Chinese input sentences
test_inputs = tokenizer(list(test['chinese']), return_tensors='pt', padding=True, truncation=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



generate some translations of the test set to check behavior

In [None]:
# ==============================
# Fast BLEU evaluation on test set (SacreBLEU + batching)
# ==============================
!pip install sacrebleu

import pandas as pd
import ast
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sacrebleu

# --- Load test split (already cleaned CSV) ---
test = pd.read_csv('idiomkb_cleaned_refs.csv')
test['refs_list'] = test['refs_list'].apply(ast.literal_eval)

# --- Setup GPU ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# --- Load model & tokenizer ---
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model.to(device)

# --- Prepare test sentences ---
test_sentences = list(test['chinese'])

# --- Generate translations in batches ---
batch_size = 64
translations = []

for i in range(0, len(test_sentences), batch_size):
    batch_texts = test_sentences[i:i+batch_size]
    batch_inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
    batch_inputs = {k:v.to(device) for k,v in batch_inputs.items()}
    outputs = model.generate(**batch_inputs, max_length=64)
    translations.extend([tokenizer.decode(t, skip_special_tokens=True).lower().strip() for t in outputs])

print(f"Generated {len(translations)} translations.")

# --- Prepare references for SacreBLEU ---
# SacreBLEU expects list of predictions and list of reference lists (one per reference)
references_clean = [[r.strip().strip('"').strip("'").lower() for r in ref_list]
                    for ref_list in test['refs_list']]

# SacreBLEU expects refs as list of lists per reference
# If multiple references per sentence: [[ref1_sent1, ref1_sent2], [ref2_sent1, ref2_sent2], ...]
refs_for_sacrebleu = list(zip(*references_clean))  # transpose to match SacreBLEU format

# --- Inspect first 5 translations ---
for src, refs, pred in zip(test['chinese'][:5], references_clean[:5], translations[:5]):
    print("\nSRC:", src)
    print("REFS:", refs)
    print("PRED:", pred)
    print("---")


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1
Using device: cuda




Generated 8632 translations.

SacreBLEU score on test set: 1.10

SRC: 一波未平，一波又起
REFS: ['as soon as one problem is solved, another arises']
PRED: a wave of twilight and a wave of twilight.
---

SRC: 一败涂地
REFS: ['being utterly defeated or experiencing a complete failure']
PRED: you're a loser.
---

SRC: 一般见识
REFS: ['limited knowledge or experience']
PRED: it's common knowledge.
---

SRC: 一板三眼
REFS: ["meticulous and strict in one's work or behavior"]
PRED: it's three-eyes.
---

SRC: 一本正经
REFS: ['to be serious and earnest']
PRED: it's a serious book.
---


##Bleu score

In [9]:
import pandas as pd
import ast
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
!pip install sacrebleu
import sacrebleu

def evaluate_bleu(df, source_col='chinese', refs_col='refs_list',
                  model_name="Helsinki-NLP/opus-mt-zh-en",
                  batch_size=64, device=None, max_length=64):
    """
    Evaluate SacreBLEU for a given dataset and seq2seq model.

    Args:
        df: pd.DataFrame containing source sentences and reference translations
        source_col: column name of source sentences
        refs_col: column name containing references (as list of strings)
        model_name: Hugging Face model name
        batch_size: batch size for generation
        device: 'cuda', 'cpu', or None (auto-detect)
        max_length: max length of generated sequences

    Returns:
        bleu_score: float BLEU score
        translations: list of generated predictions
    """
    # Setup device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.to(device)

    # Prepare test sentences
    test_sentences = list(df[source_col])

    # Generate translations in batches
    translations = []
    for i in range(0, len(test_sentences), batch_size):
        batch_texts = test_sentences[i:i+batch_size]
        batch_inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
        batch_inputs = {k: v.to(device) for k, v in batch_inputs.items()}
        outputs = model.generate(**batch_inputs, max_length=max_length)
        translations.extend([tokenizer.decode(t, skip_special_tokens=True).lower().strip() for t in outputs])

    print(f"Generated {len(translations)} translations.")

    # Prepare references for SacreBLEU
    # Convert stringified lists if necessary
    refs_lists = []
    for r in df[refs_col]:
        if isinstance(r, str):
            refs_lists.append(ast.literal_eval(r))
        else:
            refs_lists.append(r)

    references_clean = [[ref.strip().strip('"').strip("'").lower() for ref in ref_list]
                        for ref_list in refs_lists]

    # Transpose to match SacreBLEU expected input: list of lists per reference
    refs_for_sacrebleu = list(zip(*references_clean))

    # Compute BLEU
    bleu = sacrebleu.corpus_bleu(translations, refs_for_sacrebleu)
    print(f"SacreBLEU score: {bleu.score:.2f}")

    return bleu.score, translations



Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [None]:
# --- Compute BLEU ---
bleu = sacrebleu.corpus_bleu(translations, refs_for_sacrebleu)
print(f"\nSacreBLEU score on test set: {bleu.score:.2f}")


SacreBLEU score on test set: 1.10


In [10]:
bleu_score_opus, preds_opus = evaluate_bleu(opus_df, source_col='chinese', refs_col='refs_list')

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

Generated 2000 translations.
SacreBLEU score: 46.26


In [12]:
import pandas as pd
import ast
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sacrebleu
!pip install bert_score
import bert_score

def evaluate_bleu_bertscore(df, source_col='chinese', refs_col='refs_list',
                             model_name="Helsinki-NLP/opus-mt-zh-en",
                             batch_size=64, device=None, max_length=64,
                             bertscore_model="microsoft/deberta-xlarge-mnli"):
    """
    Generate translations and evaluate SacreBLEU and BERTScore in one pass.

    Args:
        df: pd.DataFrame with source sentences and references
        source_col: column name for source sentences
        refs_col: column name containing references (list of strings)
        model_name: Hugging Face seq2seq model name
        batch_size: batch size for generation
        device: 'cuda', 'cpu', or None (auto-detect)
        max_length: max length of generated sequences
        bertscore_model: Hugging Face model for BERTScore

    Returns:
        metrics: dict with keys 'sacrebleu' and 'bertscore_f1'
        translations: list of generated translations
    """
    # Setup device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.to(device)

    # Prepare source sentences
    source_texts = list(df[source_col])

    # Generate translations in batches
    translations = []
    for i in range(0, len(source_texts), batch_size):
        batch_texts = source_texts[i:i+batch_size]
        batch_inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
        batch_inputs = {k: v.to(device) for k, v in batch_inputs.items()}
        outputs = model.generate(**batch_inputs, max_length=max_length)
        translations.extend([tokenizer.decode(t, skip_special_tokens=True).strip().lower() for t in outputs])
    print(f"Generated {len(translations)} translations.")

    # Prepare references
    refs_lists = []
    for r in df[refs_col]:
        if isinstance(r, str):
            refs_lists.append(ast.literal_eval(r))
        else:
            refs_lists.append(r)
    references_clean = [[ref.strip().strip('"').strip("'").lower() for ref in ref_list] for ref_list in refs_lists]

    # SacreBLEU
    refs_for_sacrebleu = list(zip(*references_clean))
    bleu = sacrebleu.corpus_bleu(translations, refs_for_sacrebleu)
    sacrebleu_score = bleu.score

    # BERTScore (use first reference per sentence)
    first_refs = [refs[0] for refs in references_clean]
    P, R, F1 = bert_score.score(translations, first_refs,
                                model_type=bertscore_model,
                                lang="en",  # or 'zh' if evaluating Chinese
                                rescale_with_baseline=True,
                                device=device)
    bertscore_f1 = F1.mean().item()

    metrics = {
        "sacrebleu": sacrebleu_score,
        "bertscore_f1": bertscore_f1
    }

    return metrics, translations


Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [13]:
# Idiom dataset
test_df = pd.read_csv('idiomkb_test.csv')
metrics, preds = evaluate_bleu_bertscore(test_df)
print(metrics)

# OPUS100 dataset
opus_df = pd.read_csv('opus100_cleaned.csv')
metrics_opus, preds_opus = evaluate_bleu_bertscore(opus_df)
print(metrics_opus)

Using device: cuda




Generated 864 translations.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

{'sacrebleu': 1.2303728360168207, 'bertscore_f1': 0.042263977229595184}
Using device: cuda




Generated 2000 translations.
{'sacrebleu': 46.256060254251814, 'bertscore_f1': 0.6907729506492615}
