## Load Data and Tokenizer

In [1]:
from transformers import BertTokenizer
from datasets import load_dataset

ds = load_dataset('openpecha/tagged_cleaned_MT_v1.0.3')

tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

ds['train'][0]

{'Source': 'ཐུབ་པས་རྟག་ཏུ་དེ་བཞིན་སྤྱད།།',
 'Target': 'The aspirant should move in such a way at all times.',
 'File_Name': 'TM2382',
 'Machine Aligned': True,
 '__index_level_0__': 0,
 'Tag': 'Prophecies, Rituals'}

## Preprocess Data

### Remove Blank Tags

In [2]:
def condition(example):
    return example['Tag'] != ''

ds = ds.filter(condition)

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['Source', 'Target', 'File_Name', 'Machine Aligned', '__index_level_0__', 'Tag'],
        num_rows: 1163105
    })
    test: Dataset({
        features: ['Source', 'Target', 'File_Name', 'Machine Aligned', '__index_level_0__', 'Tag'],
        num_rows: 0
    })
})

In [4]:
ds['train'][0]

{'Source': 'ཐུབ་པས་རྟག་ཏུ་དེ་བཞིན་སྤྱད།།',
 'Target': 'The aspirant should move in such a way at all times.',
 'File_Name': 'TM2382',
 'Machine Aligned': True,
 '__index_level_0__': 0,
 'Tag': 'Prophecies, Rituals'}

### Collapse Labels

In [5]:
buddhist_labels = ['Mantras',
                    'Dzogchen',
                    'Astrology',
                    'Monastery',
                    'Mahamudra',
                    'Mind',
                    'Meditation',
                    'Self, Logic, Aggregates',
                    'Tantra',
                    'Emptiness',
                    'Dreams',
                    'Education, Teaching',
                    'Ethics, Enlightenment, Wisdom',
                    'Prophecies, Rituals',
                    'Lama',
                    'Samsara, Nirvana',
                    'Milarepa, Realization, Biography',
                    'Kayas',
                    'Intrinsic Existence, Conventional Existence',
                    'Time, Causality, Perception',
                    'Natural State',
                    'Karma, Consequences',
                    'Dharma']

In [6]:
def collapse_labels(example):
    if example['Tag'] in buddhist_labels:
        example['Tag'] = 'Buddhist'
    else:
        example['Tag'] = 'Non-Buddhist'
    return example

# Apply the function to the dataset
ds = ds.map(collapse_labels)

In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['Source', 'Target', 'File_Name', 'Machine Aligned', '__index_level_0__', 'Tag'],
        num_rows: 1163105
    })
    test: Dataset({
        features: ['Source', 'Target', 'File_Name', 'Machine Aligned', '__index_level_0__', 'Tag'],
        num_rows: 0
    })
})

### Convert Labels to Id Numbers

In [8]:
all_tags = list(set(ds['train']['Tag']))

# Create a label-to-index mapping
label2id = {label: idx for idx, label in enumerate(all_tags)}
id2label = {idx: label for label, idx in label2id.items()}

# Save label mappings for future use
import json
with open("bin_op_label_mapping.json", "w") as f:
    json.dump(label2id, f)


In [9]:
all_tags

['Non-Buddhist', 'Buddhist']

In [10]:
def preprocess(examples):
    tokens = tokenizer(examples["Target"], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] = [label2id[label] for label in examples["Tag"]]    
    return tokens

encoded_dataset = ds.map(preprocess, batched=True)


In [11]:
encoded_dataset = encoded_dataset.remove_columns(['Source', 'Target', 'File_Name', 'Machine Aligned', '__index_level_0__', 'Tag'])

In [12]:
encoded_dataset = encoded_dataset['train'].train_test_split(.15)

## Train Model

In [13]:
from transformers import BertForSequenceClassification

# Load tokenizer and model
model = BertForSequenceClassification.from_pretrained("bert-large-cased", num_labels=len(label2id))

# Resize embeddings to match the new tokenizer
model.resize_token_embeddings(len(tokenizer))

# Move model to GPU
model = model.to('cuda:0')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    predictions, references = eval_pred
    
    # Get predicted class indices
    predictions = np.argmax(predictions, axis=1)
    
    # Compute metrics
    accuracy = accuracy_score(references, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(references, predictions, average="weighted")
    
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }


In [15]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="en-lg-bin-op-bert-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,  # Set a maximum number of epochs
    weight_decay=0.01,
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Save the model at the end of every epoch
    load_best_model_at_end=True,  # Load the best model after training
    metric_for_best_model="accuracy",  # Metric to monitor
    greater_is_better=True,  # Higher accuracy is better
    logging_dir="./logs"
)

# Add the EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3  # Stop training if the metric does not improve for 3 evaluation steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]  # Add the early stopping callback
)

# Start training
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/6179000 [00:00<?, ?it/s]

{'loss': 0.3397, 'grad_norm': 7.9321184158325195, 'learning_rate': 1.9998381615148082e-05, 'epoch': 0.01}
{'loss': 0.3096, 'grad_norm': 3.769512176513672, 'learning_rate': 1.9996763230296166e-05, 'epoch': 0.02}
{'loss': 0.3059, 'grad_norm': 3.2693333625793457, 'learning_rate': 1.9995144845444247e-05, 'epoch': 0.02}
{'loss': 0.2897, 'grad_norm': 4.172903537750244, 'learning_rate': 1.999352646059233e-05, 'epoch': 0.03}
{'loss': 0.2876, 'grad_norm': 3.770603895187378, 'learning_rate': 1.9991908075740415e-05, 'epoch': 0.04}
{'loss': 0.2864, 'grad_norm': 13.307938575744629, 'learning_rate': 1.9990289690888496e-05, 'epoch': 0.05}
{'loss': 0.2908, 'grad_norm': 4.294888973236084, 'learning_rate': 1.9988671306036576e-05, 'epoch': 0.06}
{'loss': 0.2745, 'grad_norm': 2.7505578994750977, 'learning_rate': 1.998705292118466e-05, 'epoch': 0.06}
{'loss': 0.285, 'grad_norm': 4.877462863922119, 'learning_rate': 1.998543453633274e-05, 'epoch': 0.07}
{'loss': 0.2738, 'grad_norm': 6.4428629875183105, 'lear

  0%|          | 0/10905 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.7154206037521362, 'eval_accuracy': 0.38903282014833834, 'eval_f1': 0.2179164278298431, 'eval_precision': 0.15134653515256938, 'eval_recall': 0.38903282014833834, 'eval_runtime': 1184.4032, 'eval_samples_per_second': 147.303, 'eval_steps_per_second': 9.207, 'epoch': 1.0}
{'loss': 0.6612, 'grad_norm': 1.3529378175735474, 'learning_rate': 1.9799320278362196e-05, 'epoch': 1.0}
{'loss': 0.6672, 'grad_norm': 2.4305572509765625, 'learning_rate': 1.979770189351028e-05, 'epoch': 1.01}
{'loss': 0.6642, 'grad_norm': 4.355087757110596, 'learning_rate': 1.979608350865836e-05, 'epoch': 1.02}
{'loss': 0.6629, 'grad_norm': 1.5011831521987915, 'learning_rate': 1.9794465123806445e-05, 'epoch': 1.03}
{'loss': 0.6661, 'grad_norm': 1.5247712135314941, 'learning_rate': 1.9792846738954525e-05, 'epoch': 1.04}
{'loss': 0.6616, 'grad_norm': 1.2828189134597778, 'learning_rate': 1.9791228354102606e-05, 'epoch': 1.04}
{'loss': 0.6654, 'grad_norm': 1.2788029909133911, 'learning_rate': 1.978960996925

KeyboardInterrupt: 