In [None]:
# Install required packages
!pip install -q transformers sentencepiece pandas torch scikit-learn rouge-score evaluate

import json
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    get_cosine_schedule_with_warmup,
    pipeline,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from rouge_score import rouge_scorer
import evaluate
from tqdm import tqdm
import re

# Configuration
RANDOM_SEED = 24
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128
BATCH_SIZE = 4
NUM_EPOCHS = 20
LEARNING_RATE = 1e-4
MODEL_SAVE_PATH = "bangla_t5_privacy_model"
PATIENCE = 3
MIN_DELTA = 0.001

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Custom Bangla tokenizer class for ROUGE
class BanglaTokenizer:
    def tokenize(self, text):
        """Simple word tokenizer for Bangla text"""
        return re.findall(r"[\w'-]+|[^\s\w]", text)

# Initialize ROUGE scorer
rouge = rouge_scorer.RougeScorer(
    ['rouge1', 'rougeLsum'],
    use_stemmer=False,
    tokenizer=BanglaTokenizer()  # Using class instance
)

# Load dataset from JSON files
def load_json_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Load and combine training data
print("Loading training data...")
train_df = load_json_data('/content/train.json')
synth_df = load_json_data('/content/bn_cb.json')
train_df = pd.concat([train_df, synth_df], ignore_index=True)

# Load validation and test data
val_df = load_json_data('/content/valid.json')
test_df = load_json_data('/content/test.json')

# Preprocessing functions
def preprocess_text(text):
    """Clean and normalize Bangla text"""
    if not isinstance(text, str):
        return ""
    text = text.replace('\n', ' ').replace('\t', ' ').strip()
    return ' '.join(text.split())

# Apply preprocessing
for df in [train_df, val_df, test_df]:
    df['sentence'] = df['sentence'].apply(preprocess_text)
    df['personinfo'] = df['personinfo'].apply(preprocess_text)
    df['r_o'] = df['r_o'].apply(preprocess_text)
    df['r_d'] = df['r_d'].apply(preprocess_text)

# Create input-output pairs
for df in [train_df, val_df, test_df]:
    df['input_text'] = "SENTENCE: " + df['sentence'] + " PERSONAL_INFO: " + df['personinfo']
    df['target_text'] = df['r_o'] + " | " + df['r_d']

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m108.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m86.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m986.0 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.8 MB/s[0m eta [36

In [None]:
# Initialize tokenizer
tokenizer = T5Tokenizer.from_pretrained("csebuetnlp/banglat5_banglaparaphrase")

# Dataset class
class BanglaPrivacyDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_input_len, max_target_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = str(self.data.iloc[idx]['input_text'])
        target_text = str(self.data.iloc[idx]['target_text'])

        input_encoding = tokenizer(
            input_text,
            max_length=self.max_input_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target_encoding = tokenizer(
            target_text,
            max_length=self.max_target_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': target_encoding['input_ids'].flatten()
        }

# Create datasets
train_dataset = BanglaPrivacyDataset(train_df, tokenizer, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH)
val_dataset = BanglaPrivacyDataset(val_df, tokenizer, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH)
test_dataset = BanglaPrivacyDataset(test_df, tokenizer, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Load T5 model
model = T5ForConditionalGeneration.from_pretrained("csebuetnlp/banglat5_banglaparaphrase")
model = model.to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_steps
)

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
# Training function
def train_epoch(model, data_loader, optimizer, scheduler):
    model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
    return total_loss / len(data_loader)

# Validation function
def eval_epoch(model, data_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(data_loader)

In [None]:
# Training loop with early stopping
print("Starting training...")
best_val_loss = float('inf')
epochs_without_improvement = 0

for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, scheduler)
    val_loss = eval_epoch(model, val_loader)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS}")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss - MIN_DELTA:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        model.save_pretrained(MODEL_SAVE_PATH)
        tokenizer.save_pretrained(MODEL_SAVE_PATH)
        print(f"Validation loss improved. Saving model...")
    else:
        epochs_without_improvement += 1
        print(f"No improvement for {epochs_without_improvement}/{PATIENCE} epochs")

    if epochs_without_improvement >= PATIENCE:
        print(f"Early stopping triggered after {epoch+1} epochs!")
        break

Starting training...
Epoch 1/20
Train Loss: 0.7756 | Val Loss: 1.2400
Validation loss improved. Saving model...
Epoch 2/20
Train Loss: 0.5397 | Val Loss: 1.0639
Validation loss improved. Saving model...
Epoch 3/20
Train Loss: 0.4460 | Val Loss: 0.9068
Validation loss improved. Saving model...
Epoch 4/20
Train Loss: 0.3707 | Val Loss: 0.7879
Validation loss improved. Saving model...
Epoch 5/20
Train Loss: 0.3116 | Val Loss: 0.7050
Validation loss improved. Saving model...
Epoch 6/20
Train Loss: 0.2661 | Val Loss: 0.6688
Validation loss improved. Saving model...
Epoch 7/20
Train Loss: 0.2304 | Val Loss: 0.6159
Validation loss improved. Saving model...
Epoch 8/20
Train Loss: 0.2019 | Val Loss: 0.5877
Validation loss improved. Saving model...
Epoch 9/20
Train Loss: 0.1811 | Val Loss: 0.5666
Validation loss improved. Saving model...
Epoch 10/20
Train Loss: 0.1629 | Val Loss: 0.5578
Validation loss improved. Saving model...
Epoch 11/20
Train Loss: 0.1487 | Val Loss: 0.5583
No improvement for

In [None]:
# Load best model
print("Loading best model for evaluation...")
model = T5ForConditionalGeneration.from_pretrained(MODEL_SAVE_PATH).to(device)

# Load mDeBERTa for privacy evaluation
print("Loading mDeBERTa for privacy evaluation...")
nli_tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7")
nli_model = AutoModelForSequenceClassification.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7").to(device)
nli_model.eval()

def get_privacy_nli_score(text, personal_info):
    """Calculate privacy score using mDeBERTa (1 - entailment probability)"""
    if not personal_info.strip():
        return 1.0  # Maximally private when no personal info exists
    inputs = nli_tokenizer(text,personal_info,return_tensors="pt",truncation=True).to(device)

    with torch.no_grad():
        outputs = nli_model(**inputs)

    probs = torch.softmax(outputs.logits, dim=1)[0]
    return 1 - probs[0].item()  # Privacy score = 1 - entailment probability

def evaluate_privacy_metrics(original_text, rewritten_text, personal_info):
    """Evaluate all privacy metrics"""
    results = {}

    # Privacy NLI score (higher = more private)
    results['privacy_nli'] = get_privacy_nli_score(rewritten_text, personal_info)

    # ROUGE scores
    rouge_scores = rouge.score(original_text, rewritten_text)
    results['rouge1'] = rouge_scores['rouge1'].fmeasure
    results['rougeLsum'] = rouge_scores['rougeLsum'].fmeasure

    return results

Loading best model for evaluation...
Loading mDeBERTa for privacy evaluation...


tokenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
# Test evaluation
def evaluate_test_set():
    model.eval()
    privacy_metrics = {'r_o': [], 'r_d': []}

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating test set"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=MAX_TARGET_LENGTH,
                num_beams=4,
                early_stopping=True
            )

            original_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
            rewritten_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            personal_infos = []
            original_sentences = []
            for text in original_texts:
                parts = text.split("PERSONAL_INFO:")
                if len(parts) > 1:
                    personal_info = parts[1].strip()
                    sentence_part = parts[0].split("SENTENCE:")
                    sentence = sentence_part[1].strip() if len(sentence_part) > 1 else sentence_part[0].strip()
                else:
                    personal_info = ""
                    sentence = text.split("SENTENCE:")[-1].strip()

                personal_infos.append(personal_info)
                original_sentences.append(sentence)

            for orig, rewritten, personal_info in zip(original_sentences, rewritten_texts, personal_infos):
                parts = rewritten.split(' | ')
                obscured = parts[0] if len(parts) > 0 else ""
                deleted = parts[1] if len(parts) > 1 else ""

                if obscured:
                    metrics_o = evaluate_privacy_metrics(orig, obscured, personal_info)
                    privacy_metrics['r_o'].append(metrics_o)

                if deleted:
                    metrics_d = evaluate_privacy_metrics(orig, deleted, personal_info)
                    privacy_metrics['r_d'].append(metrics_d)

    avg_metrics = {}
    for version in ['r_o', 'r_d']:
        if privacy_metrics[version]:
            avg_metrics[version] = {
                'privacy_nli': np.mean([m['privacy_nli'] for m in privacy_metrics[version]]),
                'rouge1': np.mean([m['rouge1'] for m in privacy_metrics[version]]),
                'rougeLsum': np.mean([m['rougeLsum'] for m in privacy_metrics[version]])
            }

    return avg_metrics

# Run evaluation
print("\nEvaluating on test set...")
test_metrics = evaluate_test_set()

print("\nTest Set Metrics:")
print("Obscured Rewrites (r_o):")
print(f"  Privacy NLI: {test_metrics['r_o']['privacy_nli']:.4f} (higher = more private)")
print(f"  ROUGE-1: {test_metrics['r_o']['rouge1']:.4f}")
print(f"  ROUGE-LSum: {test_metrics['r_o']['rougeLsum']:.4f}")

print("\nDeleted Rewrites (r_d):")
print(f"  Privacy NLI: {test_metrics['r_d']['privacy_nli']:.4f} (higher = more private)")
print(f"  ROUGE-1: {test_metrics['r_d']['rouge1']:.4f}")
print(f"  ROUGE-LSum: {test_metrics['r_d']['rougeLsum']:.4f}")


Evaluating on test set...


Evaluating test set: 100%|██████████| 70/70 [02:06<00:00,  1.81s/it]


Test Set Metrics:
Obscured Rewrites (r_o):
  Privacy NLI: 0.9143 (higher = more private)
  ROUGE-1: 0.8102
  ROUGE-LSum: 0.8097

Deleted Rewrites (r_d):
  Privacy NLI: 0.9475 (higher = more private)
  ROUGE-1: 0.6952
  ROUGE-LSum: 0.6952





In [None]:
# Interactive inference
def rewrite_text(sentence, personinfo):
    input_text = f"SENTENCE: {sentence} PERSONAL_INFO: {personinfo}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    outputs = model.generate(input_ids, max_length=MAX_TARGET_LENGTH)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    parts = decoded_output.split(' | ')
    return {
        'r_o': parts[0] if len(parts) > 0 else '',
        'r_d': parts[1] if len(parts) > 1 else ''
    }

# Interactive demo
while True:
    print("\nEnter a Bangla sentence and personal information:")
    sentence = input("Sentence: ")
    if sentence.lower() in ['quit', 'exit']:
        break
    personinfo = input("Personal Info: ")

    results = rewrite_text(sentence, personinfo)
    print("\nGenerated Rewrites:")
    print(f"Obscured (r_o): {results['r_o']}")
    print(f"Deleted (r_d): {results['r_d']}")

    if results['r_o']:
        metrics_o = evaluate_privacy_metrics(sentence, results['r_o'], personinfo)
        print("\nObscured Rewrite Metrics:")
        print(f"  Privacy NLI: {metrics_o['privacy_nli']:.4f} (higher = more private)")
        print(f"  ROUGE-1: {metrics_o['rouge1']:.4f}")
        print(f"  ROUGE-LSum: {metrics_o['rougeLsum']:.4f}")

    if results['r_d']:
        metrics_d = evaluate_privacy_metrics(sentence, results['r_d'], personinfo)
        print("\nDeleted Rewrite Metrics:")
        print(f"  Privacy NLI: {metrics_d['privacy_nli']:.4f} (higher = more private)")
        print(f"  ROUGE-1: {metrics_d['rouge1']:.4f}")
        print(f"  ROUGE-LSum: {metrics_d['rougeLsum']:.4f}")


Enter a Bangla sentence and personal information:
Sentence: আমি ২০২১ সালে কেমব্রিজ বিশ্ববিদ্যালয় থেকে কৃত্রিম বুদ্ধিমত্তায় মাস্টার্স সম্পন্ন করেছি।
Personal Info: আমি কেমব্রিজ বিশ্ববিদ্যালয়ে পড়াশোনা করেছি।

Generated Rewrites:
Obscured (r_o): আমি ২০২১ সালে একটি মর্যাদাপূর্ণ বিশ্ববিদ্যালয় থেকে কৃত্রিম বুদ্ধিমত্তায় মাস্টার্স সম্পন্ন করেছি।
Deleted (r_d): আমি ২০২১ সালে কৃত্রিম বুদ্ধিমত্তায় মাস্টার্স সম্পন্ন করেছি।

Obscured Rewrite Metrics:
  Privacy NLI: 0.9994 (higher = more private)
  ROUGE-1: 0.8571
  ROUGE-LSum: 0.8429

Deleted Rewrite Metrics:
  Privacy NLI: 0.9881 (higher = more private)
  ROUGE-1: 0.7928
  ROUGE-LSum: 0.7748

Enter a Bangla sentence and personal information:
Sentence: আমি গত মাসে নিউ ইয়র্কে আমার প্রথম আর্ট এক্সিবিশন আয়োজন করেছি।
Personal Info: আমি নিউ ইয়র্কে একটি আর্ট ইভেন্ট করেছি

Generated Rewrites:
Obscured (r_o): আমি গত মাসে একটি বড় আর্ট এক্সিবিশন আয়োজন করেছি।
Deleted (r_d): আমি গত মাসে একটি আর্ট এক্সিবিশন আয়োজন করেছি।

Obscured Rewrite Metrics:
  Priva