## Load Data and Tokenizer

In [1]:
from transformers import BertTokenizer
from datasets import load_dataset

ds = load_dataset('billingsmoore/tagged-tibetan-to-english-translation-dataset')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

## Preprocess Data

### Use just first two tags

In [2]:
def just_one_tag(examples):
    tags = [tag[0] for tag in examples['Tags']]
    examples['Tags'] = tags
    return examples

ds = ds.map(just_one_tag, batched=True)

Map:   0%|          | 0/107525 [00:00<?, ? examples/s]

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer()
labels = mlb.fit(ds['train']['Tags'])  # Fit all unique Tags

# Save label mappings
import json
with open("en_lh_label_mapping.json", "w") as f:
    json.dump(mlb.classes_.tolist(), f)


In [4]:
def preprocess(examples):
    tokens = tokenizer(examples["English"], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] =  mlb.transform(examples['Tags']).astype(float).tolist() # Convert labels to multi-hot
    return tokens

encoded_dataset = ds.map(preprocess, batched=True)


Map:   0%|          | 0/107525 [00:00<?, ? examples/s]

In [5]:
encoded_dataset = encoded_dataset.remove_columns(['Tibetan', 'Phonetic', 'English', 'Tags'])

In [6]:
encoded_dataset = encoded_dataset['train'].train_test_split(.15)

In [7]:
"""enc = tokenizer(ds['train'][0]['Tibetan'])
tokenizer.decode(enc.input_ids)"""

"enc = tokenizer(ds['train'][0]['Tibetan'])\ntokenizer.decode(enc.input_ids)"

## Train Model

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load tokenizer and model
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(mlb.classes_))

# Resize embeddings to match the new tokenizer
model.resize_token_embeddings(len(tokenizer))

# Move model to GPU
model = model.to('cuda:0')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    predictions, references = eval_pred
    
    # Get predicted class indices
    predictions = np.argmax(predictions, axis=1)
    
    # Compute metrics
    accuracy = accuracy_score(references, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(references, predictions, average="weighted")
    
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [10]:
from transformers import TrainingArguments, Trainer

from transformers import EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="en-lh-single-bert-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,  # Set a maximum number of epochs
    weight_decay=0.01,
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Save the model at the end of every epoch
    load_best_model_at_end=True,  # Load the best model after training
    metric_for_best_model="accuracy",  # Metric to monitor
    greater_is_better=True,  # Higher accuracy is better
    logging_dir="./logs"
)

# Add the EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3  # Stop training if the metric does not improve for 3 evaluation steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]  # Add the early stopping callback
)

# Start training
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/571300 [00:00<?, ?it/s]

{'loss': 0.3155, 'grad_norm': 0.23054976761341095, 'learning_rate': 1.9982496061613864e-05, 'epoch': 0.09}
{'loss': 0.2658, 'grad_norm': 0.3172425925731659, 'learning_rate': 1.996499212322773e-05, 'epoch': 0.18}
{'loss': 0.2568, 'grad_norm': 0.34036630392074585, 'learning_rate': 1.994748818484159e-05, 'epoch': 0.26}
{'loss': 0.2533, 'grad_norm': 0.3668232560157776, 'learning_rate': 1.9929984246455454e-05, 'epoch': 0.35}
{'loss': 0.249, 'grad_norm': 0.5497780442237854, 'learning_rate': 1.9912480308069317e-05, 'epoch': 0.44}
{'loss': 0.247, 'grad_norm': 1.0625371932983398, 'learning_rate': 1.9894976369683182e-05, 'epoch': 0.53}
{'loss': 0.2364, 'grad_norm': 0.7028191685676575, 'learning_rate': 1.987747243129704e-05, 'epoch': 0.61}
{'loss': 0.2366, 'grad_norm': 0.8484417200088501, 'learning_rate': 1.9859968492910907e-05, 'epoch': 0.7}
{'loss': 0.229, 'grad_norm': 0.7681417465209961, 'learning_rate': 1.984246455452477e-05, 'epoch': 0.79}
{'loss': 0.2262, 'grad_norm': 0.5539542436599731, 'l

  0%|          | 0/1009 [00:00<?, ?it/s]

ValueError: Classification metrics can't handle a mix of multilabel-indicator and multiclass targets



## Test Outputs

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import json

# Load the trained model and tokenizer
model = BertForSequenceClassification.from_pretrained("/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/bert-classifier/checkpoint-34275")
tokenizer = BertTokenizer.from_pretrained("./tibetan_tokenizer")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load label mapping
with open("label_mapping.json", "r") as f:
    label_mapping = json.load(f)

# Example input text
input_text = "བླ་མ་དང་ལྷག་པའི་ལྷ་ལ་ཕྱག་འཚལ་ལོ།།"

# Tokenize input
encoded_input = tokenizer(
    input_text, 
    padding="max_length", 
    truncation=True, 
    max_length=128, 
    return_tensors="pt"
)
encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

# Get predictions
with torch.no_grad():
    outputs = model(**encoded_input)
    logits = outputs.logits
    probabilities = torch.sigmoid(logits).cpu().numpy()
    predictions = (probabilities > 0.5).astype(int)

# Decode predictions
predicted_tags = [label_mapping[i] for i, val in enumerate(predictions[0]) if val == 1]

print("Predicted Tags:", predicted_tags)
