# Data Cleaning

#### Move to Repo

In [5]:
# Mount Google Drive and make the project repo the working directory (Colab only).
from google.colab import drive
import os

# Mount Drive; force_remount=True re-mounts even if /content/drive is already mounted
drive.mount('/content/drive', force_remount=True)

# Repo info: the repo is kept directly under MyDrive
MYDRIVE = "/content/drive/MyDrive"
REPO_NAME = "chineseproverbs"
REPO_PATH = os.path.join(MYDRIVE, REPO_NAME)

# Go to MyDrive so that a fresh `git clone` lands at REPO_PATH
os.chdir(MYDRIVE)

# Clone if missing, else pull
if not os.path.exists(REPO_PATH):
    print("Cloning repo...")
    !git clone https://github.com/art3misxmoon/chineseproverbs.git
else:
    print("Repo exists, pulling latest updates...")
    # `git pull` has to run from inside the existing checkout
    os.chdir(REPO_PATH)
    !git pull

# Move to repo folder (required after a fresh clone, a no-op after the pull branch)
os.chdir(REPO_PATH)
print("Current working directory:", os.getcwd())
!ls


MessageError: Error: credential propagation was unsuccessful

## General Data

In [None]:
import os
import tarfile

# --- Paths to the split archive parts (relative to the current working directory) ---
part1 = "UNv1.0.en-zh.tar.gz.00"
part2 = "UNv1.0.en-zh.tar.gz.01"

# --- Path for combined archive ---
combined_tar = "UNv1.0.en-zh.tar.gz"

# --- Concatenate the split files ---
# os.system returns the shell's exit status; check it so a missing part or a
# full disk is reported instead of printing a misleading success message.
cat_status = os.system(f"cat {part1} {part2} > {combined_tar}")
if cat_status != 0:
    raise RuntimeError(
        f"Concatenating {part1} and {part2} into {combined_tar} failed (exit status {cat_status})"
    )
print(f"Combined archive saved to: {combined_tar}")

# --- Inspect contents of the tar.gz without extracting ---
# Iterating the TarFile yields members one at a time, so we can stop after the
# first 20 entries; getmembers() would first scan the entire compressed archive.
with tarfile.open(combined_tar, 'r:gz') as tar:
    print("Files inside the combined tar.gz:")
    for shown, member in enumerate(tar, start=1):
        print(member.name)
        if shown == 20:  # just show first 20 files
            break


In [None]:
import tarfile
import os

# Combined archive and the folder it is unpacked into (both relative to the cwd).
tar_path, extract_path = "UNv1.0.en-zh.tar.gz", "UNv1.0_en-zh"

# Create the destination first; exist_ok lets the cell be re-run safely.
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the gzip-compressed archive into extract_path.
# NOTE(review): extractall trusts the member paths stored in the archive -
# only run this on an archive from a trusted source.
with tarfile.open(tar_path, "r:gz") as archive:
    archive.extractall(path=extract_path)

# Show the top-level entries that now exist in the destination folder.
print("Extraction complete. Files:")
print(os.listdir(extract_path))


In [None]:
# Both sides of the corpus sit in the "en-zh" folder of the extracted archive.
corpus_dir = os.path.join(extract_path, "en-zh")
ch_file = os.path.join(corpus_dir, "UNv1.0.en-zh.zh")
en_file = os.path.join(corpus_dir, "UNv1.0.en-zh.en")

# Peek at first 5 sentences: the two files are read in lockstep, line by line.
with open(ch_file, "r", encoding="utf-8") as f_ch, open(en_file, "r", encoding="utf-8") as f_en:
    for pair_idx, (zh_line, en_line) in enumerate(zip(f_ch, f_en)):
        if pair_idx == 5:
            break
        print(f"CH: {zh_line.strip()}", f"EN: {en_line.strip()}", "---", sep="\n")


In [None]:
import pandas as pd

# Read each side of the corpus fully into memory, one whitespace-stripped string per line.
with open(ch_file, "r", encoding="utf-8") as f_ch:
    ch_lines = list(map(str.strip, f_ch))
with open(en_file, "r", encoding="utf-8") as f_en:
    en_lines = list(map(str.strip, f_en))

# One row per line number; Chinese column first, English second.
df_un = pd.DataFrame(dict(chinese=ch_lines, english=en_lines))

print(df_un.head())
print(f"Total sentence pairs: {len(df_un)}")


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import os
import tarfile
import random

# --- Paths ---
tar_path = "UNv1.0.en-zh.tar.gz"
extract_path = "UNv1.0_en-zh"
os.makedirs(extract_path, exist_ok=True)

ch_file = os.path.join(extract_path, "en-zh", "UNv1.0.en-zh.zh")
en_file = os.path.join(extract_path, "en-zh", "UNv1.0.en-zh.en")

# --- Extract tar.gz (only when the corpus files are not on disk yet) ---
# An earlier cell already unpacks this archive; extracting it again on every
# run is slow and redundant, so skip it when both corpus files exist.
if not (os.path.exists(ch_file) and os.path.exists(en_file)):
    with tarfile.open(tar_path, "r:gz") as tar:
        tar.extractall(path=extract_path)

# --- Load sentences ---
with open(ch_file, "r", encoding="utf-8") as f_ch, open(en_file, "r", encoding="utf-8") as f_en:
    ch_lines = [line.strip() for line in f_ch]
    en_lines = [line.strip() for line in f_en]

# pandas raises ValueError here if the two files have different line counts,
# which would mean the sentence pairs are no longer aligned.
df_un = pd.DataFrame({"chinese": ch_lines, "english": en_lines})
print(f"Total sentence pairs: {len(df_un)}")
# --- Cleaning ---
def is_valid_sentence(s):
    """Return True when `s` looks like a sentence worth keeping.

    A string is rejected when it has fewer than 3 or more than 100
    whitespace-separated tokens, or when it consists only of digits,
    parentheses, hyphens, slashes and whitespace (numbering / headings).
    """
    token_count = len(s.split())
    if not 3 <= token_count <= 100:
        return False
    # e.g. "(1) - 2/3": enough tokens, but no actual text
    only_numbering = re.fullmatch(r'[\d\(\)\-/\s]+', s) is not None
    return not only_numbering

# Normalise text: lower-case the English side, trim surrounding whitespace on both sides.
english_normalised = df_un["english"].str.lower().str.strip()
chinese_trimmed = df_un["chinese"].str.strip()
df_un["english"], df_un["chinese"] = english_normalised, chinese_trimmed

# Keep only the rows whose English side passes the sentence filter.
keep_mask = df_un["english"].apply(is_valid_sentence)
df_un = df_un.loc[keep_mask].reset_index(drop=True)
print(f"Sentence pairs after filtering: {len(df_un)}")

# --- Sample a reasonable subset for testing ---
subset_size = 50000  # adjust as needed
needs_sampling = len(df_un) > subset_size
if needs_sampling:
    # Fixed seed so the same subset is drawn on every run.
    df_un = df_un.sample(n=subset_size, random_state=42).reset_index(drop=True)
print(f"Subset size for testing: {len(df_un)}")

df_un.to_csv("UN_cleaned.csv", index=False, encoding="utf-8-sig")

## IdiomKB

In [None]:
# ==============================
# Load IdiomKB JSON, remove duplicates, save cleaned dataset
# ==============================
!pip install zhconv
import pandas as pd
import json
from zhconv import convert
import os

# --- Step 0: Ensure we're in the repo folder ---
# Adjust if your notebook is opened elsewhere
REPO_PATH = "/content/drive/MyDrive/chineseproverbs"
os.chdir(REPO_PATH)
print("Current working directory:", os.getcwd())

# --- Step 1: Load JSON dataset (IdiomKB) ---
with open('zh_idiom_meaning.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Keep only the idiom and its English meaning, renamed to the column names used downstream.
df_json = (
    pd.DataFrame(json_data)[['idiom', 'en_meaning']]
    .rename(columns={'idiom': 'chinese', 'en_meaning': 'english'})
)
df_json['source'] = 'JSON'  # mark source

print(f"IdiomKB JSON dataset loaded: {len(df_json)} rows")

# --- Step 2: Normalize Chinese characters (Traditional -> Simplified) ---
# Done before de-duplication so both script variants of an idiom collapse to one key.
df_json['chinese'] = df_json['chinese'].map(lambda idiom: convert(idiom, 'zh-cn'))

# --- Step 3: Find and show duplicates ---
# keep=False flags every row that shares its idiom with another row.
dup_mask = df_json.duplicated(subset='chinese', keep=False)
duplicates = df_json[dup_mask]
if duplicates.empty:
    print("\nNo duplicates found.")
else:
    print("\nFound duplicates (before dropping):")
    print(duplicates.sort_values('chinese'))
    print(f"Total duplicates found: {len(duplicates)}")

# --- Step 4: Remove duplicates ---
# The first occurrence of each idiom is the one that survives.
df_json = df_json.drop_duplicates(subset='chinese', keep='first')
print(f"Dataset after removing duplicates: {len(df_json)} rows")

# --- Step 5: Save cleaned dataset ---
df_json.to_csv('idiomkb_cleaned.csv', index=False, encoding='utf-8-sig')
print("\nCleaned dataset saved to 'idiomkb_cleaned.csv'")


In [None]:
# ==============================
# Further clean English references and save refs_list
# ==============================
import pandas as pd
import re

# --- Step 1: Load previously cleaned CSV ---
# Reads the de-duplicated file written by the IdiomKB cleaning cell (relative to the cwd).
df = pd.read_csv('idiomkb_cleaned.csv')
print(f"Loaded cleaned dataset: {len(df)} rows")

# --- Step 2: Further clean English references ---
def clean_refs(text):
    """
    Turn one raw English meaning into a list of unique reference strings.

    Processing order:
    1. Convert to str, lower-case, and trim surrounding whitespace.
    2. Replace the first '(' with ',' and delete every ')'.
    3. If the text contains double-quoted spans, return only those spans.
    4. Otherwise, if it contains ';', return the non-blank ';'-separated pieces.
    5. Otherwise return the whole text as the single reference.
    Every piece has surrounding whitespace and quote characters stripped, and
    repeated pieces are dropped while keeping first-seen order.
    """
    normalized = str(text).lower().strip()

    # Parentheses: only the first '(' becomes a comma; all ')' are removed.
    normalized = normalized.replace('(', ',', 1).replace(')', '')

    def _unquote(piece):
        # Trim whitespace, then any wrapping double quotes, then single quotes.
        return piece.strip().strip('"').strip("'")

    quoted_spans = re.findall(r'"([^"]+)"', normalized)
    if quoted_spans:
        candidates = map(_unquote, quoted_spans)
    elif ';' in normalized:
        candidates = (_unquote(piece) for piece in normalized.split(';') if piece.strip())
    else:
        candidates = [_unquote(normalized)]

    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    return list(dict.fromkeys(candidates))

# --- Step 3: Apply to all rows ---
# Each cell of 'refs_list' holds a Python list of reference strings.
df['refs_list'] = df['english'].apply(clean_refs)

# --- Step 4: Save new cleaned dataset with references ---
# CSV has no list type: each list is written as its string form, so cells that
# reload this file parse the column back with ast.literal_eval.
df.to_csv('idiomkb_cleaned_refs.csv', index=False, encoding='utf-8-sig')
print("Further cleaned dataset with reference lists saved to 'idiomkb_cleaned_refs.csv'")


### Split IdiomKB data (80:10:10) — Train: 6904, Validation: 864, Test: 864


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load cleaned dataset
df = pd.read_csv("idiomkb_cleaned_refs.csv")
print(f"Total dataset size: {len(df)}")

# Split: 80% train, 10% validation, 10% test
# Stage 1 holds out 10% as the test set; stage 2 takes 0.1111 of the remaining
# 90% (0.1111 * 0.9 ≈ 0.1 of the total) as the validation set.
train_val, test = train_test_split(df, shuffle=True, random_state=42, test_size=0.1)
train, val = train_test_split(train_val, random_state=42, test_size=0.1111)

print(f"Train: {len(train)}, Validation: {len(val)}, Test: {len(test)}")

# Save to separate CSVs: idiomkb_train.csv, idiomkb_val.csv, idiomkb_test.csv
for split_name, split_frame in (("train", train), ("val", val), ("test", test)):
    split_frame.to_csv(f"idiomkb_{split_name}.csv", index=False, encoding="utf-8")

print("Saved train, validation, and test CSVs successfully.")

### Load opus100 dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load cleaned OPUS100 dataset under a different name (keeps the IdiomKB `df` untouched)
# NOTE(review): 'opus100_cleaned.csv' is not written by any cell shown in this notebook -
# confirm it exists in the working directory before running.
opus_df = pd.read_csv("opus100_cleaned.csv")
print(f"Total OPUS100 dataset size: {len(opus_df)}")

# Load Model

In [None]:
!pip install -U transformers

In [None]:
# Use a pipeline as a high-level helper
# NOTE(review): `pipe` is not referenced by any later cell shown in this notebook.
from transformers import pipeline

pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")

In [None]:
# Load model directly
# Defines the notebook-level `tokenizer` and `model` used by the "Translate some text" cell.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

## Translate some text

In [None]:
# Translate one idiom with the `tokenizer` / `model` loaded in the previous cell.
input_text = "一举两得"  # Chinese text you want to translate
inputs = tokenizer(input_text, return_tensors="pt")  # PyTorch tensors for generate()
outputs = model.generate(**inputs)
# outputs[0] is the generated sequence for the single input; drop special tokens when decoding
translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(translated)

Testing Model Behavior for Traditional Characters

Notes: the model produces the same translation for the traditional and the simplified characters; neither translation captures the idiom's meaning.

In [None]:
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

examples = ["畫蛇添足", "画蛇添足"]  # Traditional vs simplified


def _translate_one(text):
    """Translate a single Chinese string with the Marian tokenizer/model loaded above."""
    encoded = tokenizer(text, return_tensors="pt")
    generated = model.generate(**encoded)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


# Print "source -> translation" for each script variant so the outputs can be compared.
for text in examples:
    print(f"{text} -> {_translate_one(text)}")


# Model Evaluation (before Finetuning)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

# Tokenize Chinese input sentences
# Relies on the `test` DataFrame left in memory by the 80:10:10 split cell - run that cell first.
# All sentences are tokenized as one padded batch.
# NOTE(review): `test_inputs` is not used by any later cell shown in this notebook.
test_inputs = tokenizer(list(test['chinese']), return_tensors='pt', padding=True, truncation=True)


## Behavior checking

In [None]:
# ==============================
# Fast BLEU evaluation on test set (SacreBLEU + batching)
# ==============================
!pip install sacrebleu
!pip install bleurt-pytorch

import pandas as pd
import ast
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sacrebleu

from itertools import zip_longest

# --- Load test split (already cleaned CSV) ---
# Use the held-out split written by the 80:10:10 split cell; loading
# 'idiomkb_cleaned_refs.csv' here would score the train/validation rows as well.
test = pd.read_csv('idiomkb_test.csv')
# refs_list was stored in the CSV as the string form of a list; parse it back
test['refs_list'] = test['refs_list'].apply(ast.literal_eval)

# --- Setup GPU ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# --- Load model & tokenizer ---
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model.to(device)

# --- Prepare test sentences ---
test_sentences = list(test['chinese'])

# --- Generate translations in batches ---
batch_size = 64
translations = []

for i in range(0, len(test_sentences), batch_size):
    batch_texts = test_sentences[i:i+batch_size]
    batch_inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
    # inputs must live on the same device as the model
    batch_inputs = {k: v.to(device) for k, v in batch_inputs.items()}
    outputs = model.generate(**batch_inputs, max_length=64)
    # lower-case predictions to match the lower-cased references
    translations.extend([tokenizer.decode(t, skip_special_tokens=True).lower().strip() for t in outputs])

print(f"Generated {len(translations)} translations.")

# --- Prepare references for SacreBLEU ---
# One list of cleaned reference strings per test sentence
references_clean = [[r.strip().strip('"').strip("'").lower() for r in ref_list]
                    for ref_list in test['refs_list']]

# SacreBLEU expects one "stream" per reference position:
# [[ref1_sent1, ref1_sent2, ...], [ref2_sent1, ref2_sent2, ...], ...]
# Sentences have different numbers of references, so pad the shorter ones with
# None (which SacreBLEU skips). A plain zip() would silently cut every sentence
# down to the smallest reference count in the dataset.
refs_for_sacrebleu = [list(stream) for stream in zip_longest(*references_clean)]

# --- Inspect first 5 translations ---
for src, refs, pred in zip(test['chinese'][:5], references_clean[:5], translations[:5]):
    print("\nSRC:", src)
    print("REFS:", refs)
    print("PRED:", pred)
    print("---")


## BLEU score

In [None]:
import pandas as pd
import ast
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
!pip install sacrebleu
import sacrebleu

def evaluate_bleu(df, source_col='chinese', refs_col='refs_list',
                  model_name="Helsinki-NLP/opus-mt-zh-en",
                  batch_size=64, device=None, max_length=64):
    """
    Translate a dataset with a seq2seq model and score it with corpus-level SacreBLEU.

    Args:
        df: pd.DataFrame containing source sentences and reference translations
        source_col: column name of source sentences
        refs_col: column name containing references; each cell is either a list
            of strings or the string form of such a list (as read back from CSV)
        model_name: Hugging Face model name
        batch_size: batch size for generation
        device: 'cuda', 'cpu', or None (auto-detect)
        max_length: max length of generated sequences

    Returns:
        (bleu_score, translations):
            bleu_score: float SacreBLEU score
            translations: list of generated predictions (lower-cased, stripped),
                in the same order as the rows of `df`
    """
    # Local import so the function does not depend on another cell's imports.
    from itertools import zip_longest

    # Setup device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.to(device)

    # Prepare test sentences
    test_sentences = list(df[source_col])

    # Generate translations in batches
    translations = []
    for i in range(0, len(test_sentences), batch_size):
        batch_texts = test_sentences[i:i+batch_size]
        batch_inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
        # inputs must live on the same device as the model
        batch_inputs = {k: v.to(device) for k, v in batch_inputs.items()}
        outputs = model.generate(**batch_inputs, max_length=max_length)
        translations.extend([tokenizer.decode(t, skip_special_tokens=True).lower().strip() for t in outputs])

    print(f"Generated {len(translations)} translations.")

    # Prepare references for SacreBLEU
    # Convert stringified lists if necessary
    refs_lists = [ast.literal_eval(r) if isinstance(r, str) else r for r in df[refs_col]]

    references_clean = [[ref.strip().strip('"').strip("'").lower() for ref in ref_list]
                        for ref_list in refs_lists]

    # SacreBLEU wants one stream per reference position, each as long as the
    # corpus. Rows can have different numbers of references, so pad the shorter
    # rows with None (which SacreBLEU skips). A plain zip() would silently
    # truncate every row to the smallest reference count.
    refs_for_sacrebleu = [list(stream) for stream in zip_longest(*references_clean)]

    # Compute BLEU
    bleu = sacrebleu.corpus_bleu(translations, refs_for_sacrebleu)
    print(f"SacreBLEU score: {bleu.score:.2f}")

    return bleu.score, translations



In [None]:
# --- Compute BLEU ---
# Corpus-level SacreBLEU over the `translations` and `refs_for_sacrebleu` globals
# created by the "Behavior checking" cell - run that cell first.
bleu = sacrebleu.corpus_bleu(translations, refs_for_sacrebleu)
print(f"\nSacreBLEU score on test set: {bleu.score:.2f}")

In [None]:
# Score the base model on OPUS100 with the same BLEU routine.
# NOTE(review): assumes `opus_df` has 'chinese' and 'refs_list' columns - the contents of
# opus100_cleaned.csv are not shown in this notebook; confirm before running.
bleu_score_opus, preds_opus = evaluate_bleu(opus_df, source_col='chinese', refs_col='refs_list')

## BLEURT

In [None]:
from bleurt_pytorch import BleurtConfig, BleurtForSequenceClassification, BleurtTokenizer

# Load BLEURT: config, model and tokenizer all come from the same checkpoint.
# NOTE(review): the original comment quotes a ~1.6GB download on first run - size not verified here.
bleurt_checkpoint = 'lucadiliello/BLEURT-20-D12'
bleurt_config = BleurtConfig.from_pretrained(bleurt_checkpoint)
bleurt_model = BleurtForSequenceClassification.from_pretrained(bleurt_checkpoint)
bleurt_tokenizer = BleurtTokenizer.from_pretrained(bleurt_checkpoint)

# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
bleurt_model.to(device)
bleurt_model.eval()  # scoring only: put the module in evaluation mode

print(f"BLEURT model loaded on {device}")

In [None]:
import torch
from tqdm import tqdm

def evaluate_bleurt(predictions, references, batch_size=16, device=None):
    """
    Evaluate BLEURT scores between predictions and references.

    Uses the notebook-level `bleurt_model` and `bleurt_tokenizer` created by the
    BLEURT loading cell. Inputs are moved to `device`; the model is NOT moved,
    so `device` must be the device the model was placed on by that cell.

    Args:
        predictions: sequence of predicted translations
        references: sequence of reference translations (one per prediction)
        batch_size: batch size for BLEURT scoring
        device: 'cuda', 'cpu', or None (auto-detect)

    Returns:
        (mean_bleurt, bleurt_scores):
            mean_bleurt: average BLEURT score (NaN when there is nothing to score)
            bleurt_scores: list of individual BLEURT scores, one per prediction

    Raises:
        ValueError: if `predictions` and `references` differ in length.
    """
    # Materialise as plain lists so slicing is positional and the tokenizer
    # receives lists even if a pandas Series or a tuple was passed in.
    predictions = list(predictions)
    references = list(references)

    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )

    # Nothing to score: the mean is undefined, so report NaN instead of
    # failing with a ZeroDivisionError.
    if not predictions:
        print("No predictions to score; BLEURT is undefined.")
        return float("nan"), []

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    bleurt_scores = []

    # Scoring only: no gradients needed.
    with torch.no_grad():
        for i in tqdm(range(0, len(predictions), batch_size), desc="Computing BLEURT"):
            batch_preds = predictions[i:i+batch_size]
            batch_refs = references[i:i+batch_size]

            # Tokenize (reference, prediction) pairs; the reference is passed first
            inputs = bleurt_tokenizer(
                batch_refs,
                batch_preds,
                padding='longest',
                return_tensors='pt',
                max_length=512,
                truncation=True
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Get BLEURT scores: flatten the logits to one float per pair
            outputs = bleurt_model(**inputs)
            scores = outputs.logits.flatten().tolist()
            bleurt_scores.extend(scores)

    mean_bleurt = sum(bleurt_scores) / len(bleurt_scores)
    print(f"Mean BLEURT score: {mean_bleurt:.4f}")

    return mean_bleurt, bleurt_scores

## Evaluation (BLEU, BERTScore, BLEURT)

In [None]:
import pandas as pd
import ast
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sacrebleu
from bert_score import score as bert_score
from tqdm import tqdm

def evaluate_all_metrics(df, source_col='chinese', refs_col='refs_list',
                         model_name="Helsinki-NLP/opus-mt-zh-en",
                         batch_size=64, device=None, max_length=64,
                         bertscore_model="microsoft/deberta-xlarge-mnli",
                         bleurt_batch_size=16):
    """
    Generate translations and evaluate SacreBLEU, BERTScore, and BLEURT.

    SacreBLEU uses every reference of each row; BERTScore and BLEURT use only
    the first reference of each row. BLEURT is computed by `evaluate_bleurt`,
    defined in an earlier cell of this notebook.

    Args:
        df: pd.DataFrame with source sentences and reference translations
        source_col: column name of source sentences
        refs_col: column name of references; each cell is a list of strings or
            the string form of such a list (as read back from CSV)
        model_name: Hugging Face translation model name
        batch_size: batch size for translation
        device: 'cuda', 'cpu', or None (auto-detect)
        max_length: max length of generated sequences
        bertscore_model: model_type passed to BERTScore
        bleurt_batch_size: batch size for BLEURT scoring

    Returns:
        (metrics, translations, bleurt_scores):
            metrics: dict with keys 'sacrebleu', 'bertscore_f1', 'bleurt'
            translations: list of generated translations (lower-cased, stripped)
            bleurt_scores: list of per-sentence BLEURT scores
    """
    # Local import so the function does not depend on another cell's imports.
    from itertools import zip_longest

    # Setup device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    # Load translation model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.to(device)

    # Prepare source sentences
    source_texts = list(df[source_col])

    # Generate translations in batches
    translations = []
    print("Generating translations...")
    for i in tqdm(range(0, len(source_texts), batch_size)):
        batch_texts = source_texts[i:i+batch_size]
        batch_inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
        # inputs must live on the same device as the model
        batch_inputs = {k: v.to(device) for k, v in batch_inputs.items()}
        outputs = model.generate(**batch_inputs, max_length=max_length)
        translations.extend([tokenizer.decode(t, skip_special_tokens=True).strip().lower() for t in outputs])

    print(f"Generated {len(translations)} translations.")

    # Prepare references (parse stringified lists when the frame came from CSV)
    refs_lists = [ast.literal_eval(r) if isinstance(r, str) else r for r in df[refs_col]]

    references_clean = [[ref.strip().strip('"').strip("'").lower() for ref in ref_list]
                       for ref_list in refs_lists]

    # SacreBLEU
    print("\nComputing SacreBLEU...")
    # One stream per reference position, each as long as the corpus. Rows with
    # fewer references are padded with None (which SacreBLEU skips). A plain
    # zip() would silently truncate every row to the smallest reference count.
    refs_for_sacrebleu = [list(stream) for stream in zip_longest(*references_clean)]
    bleu = sacrebleu.corpus_bleu(translations, refs_for_sacrebleu)
    sacrebleu_score = bleu.score

    # BERTScore (use first reference per sentence)
    print("Computing BERTScore...")
    first_refs = [refs[0] for refs in references_clean]
    P, R, F1 = bert_score(translations, first_refs,
                         model_type=bertscore_model,
                         lang="en",
                         rescale_with_baseline=True,
                         device=device)
    bertscore_f1 = F1.mean().item()

    # BLEURT (use first reference per sentence)
    print("Computing BLEURT...")
    bleurt_mean, bleurt_scores = evaluate_bleurt(
        translations,
        first_refs,
        batch_size=bleurt_batch_size,
        device=device
    )

    metrics = {
        "sacrebleu": sacrebleu_score,
        "bertscore_f1": bertscore_f1,
        "bleurt": bleurt_mean
    }

    return metrics, translations, bleurt_scores

In [None]:
banner = "=" * 50

# Idiom dataset: score the base model on the held-out idiom test split
print(banner)
print("Evaluating on Idiom dataset")
print(banner)
test_df = pd.read_csv('idiomkb_test.csv')
metrics, preds, bleurt_scores = evaluate_all_metrics(test_df)
print("\n--- Idiom Dataset Results ---")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

# OPUS100 dataset: same evaluation on the general-domain data for comparison
print("\n" + banner)
print("Evaluating on OPUS100 dataset")
print(banner)
opus_df = pd.read_csv('opus100_cleaned.csv')
metrics_opus, preds_opus, bleurt_scores_opus = evaluate_all_metrics(opus_df)
print("\n--- OPUS100 Dataset Results ---")
for metric_name, metric_value in metrics_opus.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def _plot_bleurt_hist(position, scores, title):
    """Draw one BLEURT histogram, with a dashed red mean line, in slot `position` of a 1x2 grid."""
    plt.subplot(1, 2, position)
    plt.hist(scores, bins=50, edgecolor='black', alpha=0.7)
    plt.xlabel('BLEURT Score')
    plt.ylabel('Frequency')
    plt.title(title)
    mean_score = np.mean(scores)
    plt.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.3f}')
    plt.legend()


def _print_idiom_examples(indices):
    """Print score, source, first reference and prediction for the given idiom test rows."""
    for idx in indices:
        row = test_df.iloc[idx]
        print(f"\nBLEURT: {bleurt_scores[idx]:.3f}")
        print(f"Source: {row['chinese']}")
        # refs_list is stored as the string form of a list; show its first entry
        print(f"Reference: {ast.literal_eval(row['refs_list'])[0]}")
        print(f"Prediction: {preds[idx]}")


# Plot BLEURT score distribution: idiom test set on the left, OPUS100 on the right
plt.figure(figsize=(12, 4))
_plot_bleurt_hist(1, bleurt_scores, 'Idiom Dataset BLEURT Distribution')
_plot_bleurt_hist(2, bleurt_scores_opus, 'OPUS100 BLEURT Distribution')
plt.tight_layout()
plt.show()

# Show examples with lowest and highest BLEURT scores
# argsort is ascending, so the first five are the worst and the last five the best
score_order = np.argsort(bleurt_scores)

print("\n=== Idiom Dataset: Lowest BLEURT Scores ===")
lowest_indices = score_order[:5]
_print_idiom_examples(lowest_indices)

print("\n=== Idiom Dataset: Highest BLEURT Scores ===")
highest_indices = score_order[-5:]
_print_idiom_examples(highest_indices)