## Load Data and Tokenizer

In [1]:
from transformers import BertTokenizer
from datasets import load_dataset

ds = load_dataset('billingsmoore/tagged-tibetan-to-english-translation-dataset')

tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

## Preprocess Data

### Use just first two tags

In [2]:
def just_two_tags(examples):
    tags = [tag[:2] for tag in examples['Tags']]
    examples['Tags'] = tags
    return examples

ds = ds.map(just_two_tags, batched=True)

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer()
labels = mlb.fit(ds['train']['Tags'])  # Fit all unique Tags

# Save label mappings
import json
with open("en_lh_label_mapping.json", "w") as f:
    json.dump(mlb.classes_.tolist(), f)


In [4]:
def preprocess(examples):
    tokens = tokenizer(examples["English"], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] =  mlb.transform(examples['Tags']).astype(float).tolist() # Convert labels to multi-hot
    return tokens

encoded_dataset = ds.map(preprocess, batched=True)


Map:   0%|          | 0/107525 [00:00<?, ? examples/s]

In [5]:
encoded_dataset = encoded_dataset.remove_columns(['Tibetan', 'Phonetic', 'English', 'Tags'])

In [6]:
encoded_dataset = encoded_dataset['train'].train_test_split(.15)

In [7]:
"""enc = tokenizer(ds['train'][0]['Tibetan'])
tokenizer.decode(enc.input_ids)"""

"enc = tokenizer(ds['train'][0]['Tibetan'])\ntokenizer.decode(enc.input_ids)"

## Train Model

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load tokenizer and model
model = BertForSequenceClassification.from_pretrained("bert-large-cased", num_labels=len(mlb.classes_))

# Resize embeddings to match the new tokenizer
model.resize_token_embeddings(len(tokenizer))

# Move model to GPU
model = model.to('cuda:0')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    predictions, references = eval_pred
    
    # Apply a threshold to convert logits to binary predictions
    predictions = (predictions > 0.5).astype(int)
    
    # Compute metrics
    accuracy = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions, average="micro")
    precision = precision_score(references, predictions, average="micro")
    recall = recall_score(references, predictions, average="micro")

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision_1": precision,
        "recall": recall,
    }


In [10]:
from transformers import TrainingArguments, Trainer

from transformers import EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="en-lh-lg-bert-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,  # Set a maximum number of epochs
    weight_decay=0.01,
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Save the model at the end of every epoch
    load_best_model_at_end=True,  # Load the best model after training
    metric_for_best_model="accuracy",  # Metric to monitor
    greater_is_better=True,  # Higher accuracy is better
    logging_dir="./logs"
)

# Add the EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3  # Stop training if the metric does not improve for 3 evaluation steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]  # Add the early stopping callback
)

# Start training
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/571300 [00:00<?, ?it/s]

{'loss': 0.1355, 'grad_norm': 0.06919872015714645, 'learning_rate': 1.9982496061613864e-05, 'epoch': 0.09}
{'loss': 0.0542, 'grad_norm': 0.06292929500341415, 'learning_rate': 1.996499212322773e-05, 'epoch': 0.18}
{'loss': 0.0524, 'grad_norm': 0.07311862707138062, 'learning_rate': 1.994748818484159e-05, 'epoch': 0.26}
{'loss': 0.0518, 'grad_norm': 0.06628433614969254, 'learning_rate': 1.9929984246455454e-05, 'epoch': 0.35}
{'loss': 0.0506, 'grad_norm': 0.0655488520860672, 'learning_rate': 1.9912480308069317e-05, 'epoch': 0.44}
{'loss': 0.0495, 'grad_norm': 0.0706687867641449, 'learning_rate': 1.9894976369683182e-05, 'epoch': 0.53}
{'loss': 0.0482, 'grad_norm': 0.07596486061811447, 'learning_rate': 1.987747243129704e-05, 'epoch': 0.61}
{'loss': 0.0474, 'grad_norm': 0.10036158561706543, 'learning_rate': 1.9859968492910907e-05, 'epoch': 0.7}
{'loss': 0.0472, 'grad_norm': 0.11987017095088959, 'learning_rate': 1.984246455452477e-05, 'epoch': 0.79}
{'loss': 0.045, 'grad_norm': 0.1358622312545

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.04201948642730713, 'eval_accuracy': 0.0883501767003534, 'eval_f1': 0.18714687299952684, 'eval_precision_1': 0.915826750204304, 'eval_recall': 0.10422220844441689, 'eval_runtime': 112.7539, 'eval_samples_per_second': 143.046, 'eval_steps_per_second': 8.949, 'epoch': 1.0}
{'loss': 0.0425, 'grad_norm': 0.20416289567947388, 'learning_rate': 1.978995273936636e-05, 'epoch': 1.05}
{'loss': 0.0411, 'grad_norm': 0.13094212114810944, 'learning_rate': 1.9772448800980222e-05, 'epoch': 1.14}
{'loss': 0.0399, 'grad_norm': 0.16791507601737976, 'learning_rate': 1.9754944862594084e-05, 'epoch': 1.23}
{'loss': 0.0384, 'grad_norm': 0.16840516030788422, 'learning_rate': 1.973744092420795e-05, 'epoch': 1.31}
{'loss': 0.038, 'grad_norm': 0.1695626676082611, 'learning_rate': 1.9719936985821812e-05, 'epoch': 1.4}
{'loss': 0.0372, 'grad_norm': 0.16167773306369781, 'learning_rate': 1.9702433047435675e-05, 'epoch': 1.49}
{'loss': 0.0365, 'grad_norm': 0.20023199915885925, 'learning_rate': 1.968492

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.03160368651151657, 'eval_accuracy': 0.19430838861677724, 'eval_f1': 0.4533149805001147, 'eval_precision_1': 0.8718672785033533, 'eval_recall': 0.30628061256122513, 'eval_runtime': 112.836, 'eval_samples_per_second': 142.942, 'eval_steps_per_second': 8.942, 'epoch': 2.0}
{'loss': 0.0327, 'grad_norm': 0.27825242280960083, 'learning_rate': 1.9597409417118852e-05, 'epoch': 2.01}
{'loss': 0.0294, 'grad_norm': 0.23852816224098206, 'learning_rate': 1.9579905478732717e-05, 'epoch': 2.1}
{'loss': 0.029, 'grad_norm': 0.31421583890914917, 'learning_rate': 1.956240154034658e-05, 'epoch': 2.19}
{'loss': 0.0311, 'grad_norm': 0.17516115307807922, 'learning_rate': 1.9544897601960442e-05, 'epoch': 2.28}
{'loss': 0.0285, 'grad_norm': 0.25422823429107666, 'learning_rate': 1.9527393663574304e-05, 'epoch': 2.36}
{'loss': 0.0276, 'grad_norm': 0.2110826075077057, 'learning_rate': 1.950988972518817e-05, 'epoch': 2.45}
{'loss': 0.0276, 'grad_norm': 0.21295394003391266, 'learning_rate': 1.949238

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.025138769298791885, 'eval_accuracy': 0.3063426126852254, 'eval_f1': 0.5989313537545377, 'eval_precision_1': 0.8752980448259419, 'eval_recall': 0.4552049104098208, 'eval_runtime': 112.5446, 'eval_samples_per_second': 143.312, 'eval_steps_per_second': 8.965, 'epoch': 3.0}
{'loss': 0.0226, 'grad_norm': 0.2705877125263214, 'learning_rate': 1.938736215648521e-05, 'epoch': 3.06}
{'loss': 0.0218, 'grad_norm': 0.27206090092658997, 'learning_rate': 1.9369858218099072e-05, 'epoch': 3.15}
{'loss': 0.0213, 'grad_norm': 0.3895599842071533, 'learning_rate': 1.9352354279712938e-05, 'epoch': 3.24}
{'loss': 0.0213, 'grad_norm': 0.28284987807273865, 'learning_rate': 1.93348503413268e-05, 'epoch': 3.33}
{'loss': 0.0211, 'grad_norm': 0.22725263237953186, 'learning_rate': 1.9317346402940666e-05, 'epoch': 3.41}
{'loss': 0.021, 'grad_norm': 0.285044401884079, 'learning_rate': 1.9299842464554525e-05, 'epoch': 3.5}
{'loss': 0.021, 'grad_norm': 0.2570587396621704, 'learning_rate': 1.928233852616

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.02075696736574173, 'eval_accuracy': 0.41279682559365116, 'eval_f1': 0.6905366037026596, 'eval_precision_1': 0.8769752231823172, 'eval_recall': 0.5694711389422779, 'eval_runtime': 112.7791, 'eval_samples_per_second': 143.014, 'eval_steps_per_second': 8.947, 'epoch': 4.0}
{'loss': 0.0185, 'grad_norm': 0.194328173995018, 'learning_rate': 1.9194818834237705e-05, 'epoch': 4.03}
{'loss': 0.0167, 'grad_norm': 0.3178427219390869, 'learning_rate': 1.9177314895851568e-05, 'epoch': 4.11}
{'loss': 0.0164, 'grad_norm': 0.32872244715690613, 'learning_rate': 1.9159810957465433e-05, 'epoch': 4.2}
{'loss': 0.0165, 'grad_norm': 0.24874894320964813, 'learning_rate': 1.9142307019079292e-05, 'epoch': 4.29}
{'loss': 0.0168, 'grad_norm': 0.20611819624900818, 'learning_rate': 1.9124803080693158e-05, 'epoch': 4.38}
{'loss': 0.0162, 'grad_norm': 0.18036417663097382, 'learning_rate': 1.910729914230702e-05, 'epoch': 4.46}
{'loss': 0.0163, 'grad_norm': 0.3178583085536957, 'learning_rate': 1.9089795

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.017427338287234306, 'eval_accuracy': 0.5041230082460165, 'eval_f1': 0.7473903217706352, 'eval_precision_1': 0.8867466802860061, 'eval_recall': 0.6458862917725835, 'eval_runtime': 112.3636, 'eval_samples_per_second': 143.543, 'eval_steps_per_second': 8.98, 'epoch': 5.0}
{'loss': 0.0133, 'grad_norm': 0.2691614627838135, 'learning_rate': 1.8984771573604063e-05, 'epoch': 5.08}
{'loss': 0.0127, 'grad_norm': 0.11879652738571167, 'learning_rate': 1.8967267635217926e-05, 'epoch': 5.16}
{'loss': 0.0131, 'grad_norm': 0.17571920156478882, 'learning_rate': 1.8949763696831788e-05, 'epoch': 5.25}
{'loss': 0.013, 'grad_norm': 0.22276969254016876, 'learning_rate': 1.8932259758445654e-05, 'epoch': 5.34}
{'loss': 0.0129, 'grad_norm': 0.7894952297210693, 'learning_rate': 1.8914755820059513e-05, 'epoch': 5.43}
{'loss': 0.0128, 'grad_norm': 0.34514617919921875, 'learning_rate': 1.8897251881673378e-05, 'epoch': 5.51}
{'loss': 0.0129, 'grad_norm': 0.21411694586277008, 'learning_rate': 1.88797

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.016099855303764343, 'eval_accuracy': 0.5570091140182281, 'eval_f1': 0.7750425490849754, 'eval_precision_1': 0.8929047907822923, 'eval_recall': 0.6846673693347387, 'eval_runtime': 112.7569, 'eval_samples_per_second': 143.042, 'eval_steps_per_second': 8.948, 'epoch': 6.0}
{'loss': 0.0115, 'grad_norm': 0.35580867528915405, 'learning_rate': 1.8792228251356555e-05, 'epoch': 6.04}
{'loss': 0.0103, 'grad_norm': 0.27472078800201416, 'learning_rate': 1.877472431297042e-05, 'epoch': 6.13}
{'loss': 0.0107, 'grad_norm': 0.3970605432987213, 'learning_rate': 1.8757220374584283e-05, 'epoch': 6.21}
{'loss': 0.0106, 'grad_norm': 0.2053329199552536, 'learning_rate': 1.8739716436198146e-05, 'epoch': 6.3}
{'loss': 0.0105, 'grad_norm': 0.24324451386928558, 'learning_rate': 1.8722212497812008e-05, 'epoch': 6.39}
{'loss': 0.0104, 'grad_norm': 0.26518112421035767, 'learning_rate': 1.8704708559425874e-05, 'epoch': 6.48}
{'loss': 0.0101, 'grad_norm': 0.289683073759079, 'learning_rate': 1.8687204

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014534314163029194, 'eval_accuracy': 0.612003224006448, 'eval_f1': 0.8023419525428385, 'eval_precision_1': 0.8992072044334976, 'eval_recall': 0.7243164486328972, 'eval_runtime': 112.2723, 'eval_samples_per_second': 143.66, 'eval_steps_per_second': 8.987, 'epoch': 7.0}
{'loss': 0.0105, 'grad_norm': 0.46232694387435913, 'learning_rate': 1.859968492910905e-05, 'epoch': 7.0}
{'loss': 0.0084, 'grad_norm': 0.20556584000587463, 'learning_rate': 1.8582180990722913e-05, 'epoch': 7.09}
{'loss': 0.0085, 'grad_norm': 0.4391337037086487, 'learning_rate': 1.8564677052336776e-05, 'epoch': 7.18}
{'loss': 0.0087, 'grad_norm': 0.2782006561756134, 'learning_rate': 1.854717311395064e-05, 'epoch': 7.26}
{'loss': 0.0087, 'grad_norm': 0.13461683690547943, 'learning_rate': 1.8529669175564504e-05, 'epoch': 7.35}
{'loss': 0.0087, 'grad_norm': 0.2574334144592285, 'learning_rate': 1.8512165237178366e-05, 'epoch': 7.44}
{'loss': 0.0088, 'grad_norm': 0.39042460918426514, 'learning_rate': 1.849466129

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013621049001812935, 'eval_accuracy': 0.6554653109306219, 'eval_f1': 0.8221167648995997, 'eval_precision_1': 0.9029864589130031, 'eval_recall': 0.7545415090830182, 'eval_runtime': 112.6342, 'eval_samples_per_second': 143.198, 'eval_steps_per_second': 8.958, 'epoch': 8.0}
{'loss': 0.0078, 'grad_norm': 0.42857035994529724, 'learning_rate': 1.838963766847541e-05, 'epoch': 8.05}
{'loss': 0.0072, 'grad_norm': 0.19721662998199463, 'learning_rate': 1.837213373008927e-05, 'epoch': 8.14}
{'loss': 0.0073, 'grad_norm': 0.322400838136673, 'learning_rate': 1.8354629791703137e-05, 'epoch': 8.23}
{'loss': 0.0072, 'grad_norm': 0.1977987140417099, 'learning_rate': 1.8337125853316996e-05, 'epoch': 8.31}
{'loss': 0.0073, 'grad_norm': 0.202492818236351, 'learning_rate': 1.831962191493086e-05, 'epoch': 8.4}
{'loss': 0.0073, 'grad_norm': 0.32967957854270935, 'learning_rate': 1.8302117976544724e-05, 'epoch': 8.49}
{'loss': 0.0073, 'grad_norm': 2.301284074783325, 'learning_rate': 1.828461403815

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013210223987698555, 'eval_accuracy': 0.6863413726827454, 'eval_f1': 0.8322421165152325, 'eval_precision_1': 0.9022235098138626, 'eval_recall': 0.7723355446710893, 'eval_runtime': 112.4781, 'eval_samples_per_second': 143.397, 'eval_steps_per_second': 8.971, 'epoch': 9.0}
{'loss': 0.007, 'grad_norm': 0.3003098666667938, 'learning_rate': 1.8197094346227905e-05, 'epoch': 9.01}
{'loss': 0.0061, 'grad_norm': 0.1199743002653122, 'learning_rate': 1.8179590407841764e-05, 'epoch': 9.1}
{'loss': 0.0063, 'grad_norm': 0.17040960490703583, 'learning_rate': 1.816208646945563e-05, 'epoch': 9.19}
{'loss': 0.0063, 'grad_norm': 0.3128323256969452, 'learning_rate': 1.814458253106949e-05, 'epoch': 9.28}
{'loss': 0.0065, 'grad_norm': 0.09931075572967529, 'learning_rate': 1.8127078592683357e-05, 'epoch': 9.36}
{'loss': 0.0063, 'grad_norm': 0.1867365688085556, 'learning_rate': 1.8109574654297216e-05, 'epoch': 9.45}
{'loss': 0.0063, 'grad_norm': 0.19756628572940826, 'learning_rate': 1.809207071

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012515860609710217, 'eval_accuracy': 0.7162254324508649, 'eval_f1': 0.8481602697609838, 'eval_precision_1': 0.908498583569405, 'eval_recall': 0.7953375906751814, 'eval_runtime': 112.607, 'eval_samples_per_second': 143.233, 'eval_steps_per_second': 8.96, 'epoch': 10.0}
{'loss': 0.0056, 'grad_norm': 0.2813168466091156, 'learning_rate': 1.798704708559426e-05, 'epoch': 10.06}
{'loss': 0.0053, 'grad_norm': 0.18704338371753693, 'learning_rate': 1.7969543147208125e-05, 'epoch': 10.15}
{'loss': 0.0053, 'grad_norm': 0.3618831932544708, 'learning_rate': 1.7952039208821987e-05, 'epoch': 10.24}
{'loss': 0.0057, 'grad_norm': 0.19652719795703888, 'learning_rate': 1.793453527043585e-05, 'epoch': 10.33}
{'loss': 0.0058, 'grad_norm': 0.15256543457508087, 'learning_rate': 1.7917031332049712e-05, 'epoch': 10.41}
{'loss': 0.0058, 'grad_norm': 0.39395377039909363, 'learning_rate': 1.7899527393663578e-05, 'epoch': 10.5}
{'loss': 0.0057, 'grad_norm': 0.11713112145662308, 'learning_rate': 1.78

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012281239032745361, 'eval_accuracy': 0.7283154566309132, 'eval_f1': 0.8530143509136144, 'eval_precision_1': 0.9103569544575347, 'eval_recall': 0.8024676049352099, 'eval_runtime': 112.5951, 'eval_samples_per_second': 143.248, 'eval_steps_per_second': 8.961, 'epoch': 11.0}
{'loss': 0.0055, 'grad_norm': 0.2359459400177002, 'learning_rate': 1.7794503763346755e-05, 'epoch': 11.03}
{'loss': 0.0046, 'grad_norm': 0.1466279923915863, 'learning_rate': 1.7776999824960617e-05, 'epoch': 11.12}
{'loss': 0.0048, 'grad_norm': 0.5298412442207336, 'learning_rate': 1.775949588657448e-05, 'epoch': 11.2}
{'loss': 0.0051, 'grad_norm': 0.11904683709144592, 'learning_rate': 1.7741991948188345e-05, 'epoch': 11.29}
{'loss': 0.0049, 'grad_norm': 0.08445987105369568, 'learning_rate': 1.7724488009802207e-05, 'epoch': 11.38}
{'loss': 0.0052, 'grad_norm': 0.27659353613853455, 'learning_rate': 1.770698407141607e-05, 'epoch': 11.47}
{'loss': 0.0054, 'grad_norm': 0.11452889442443848, 'learning_rate': 1.

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012300258502364159, 'eval_accuracy': 0.7363134726269452, 'eval_f1': 0.8559358253608179, 'eval_precision_1': 0.9110792273236282, 'eval_recall': 0.8070866141732284, 'eval_runtime': 112.4875, 'eval_samples_per_second': 143.385, 'eval_steps_per_second': 8.97, 'epoch': 12.0}
{'loss': 0.0045, 'grad_norm': 0.04768279567360878, 'learning_rate': 1.7584456502713113e-05, 'epoch': 12.08}
{'loss': 0.0042, 'grad_norm': 0.3201231360435486, 'learning_rate': 1.7566952564326975e-05, 'epoch': 12.17}
{'loss': 0.0044, 'grad_norm': 0.3760961890220642, 'learning_rate': 1.754944862594084e-05, 'epoch': 12.25}
{'loss': 0.0046, 'grad_norm': 0.43150830268859863, 'learning_rate': 1.75319446875547e-05, 'epoch': 12.34}
{'loss': 0.0048, 'grad_norm': 0.3589419722557068, 'learning_rate': 1.7514440749168565e-05, 'epoch': 12.43}
{'loss': 0.0046, 'grad_norm': 0.1075017899274826, 'learning_rate': 1.7496936810782428e-05, 'epoch': 12.52}
{'loss': 0.0048, 'grad_norm': 0.07792709767818451, 'learning_rate': 1.74

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012023263610899448, 'eval_accuracy': 0.7575795151590303, 'eval_f1': 0.8643234145464647, 'eval_precision_1': 0.9138878330280936, 'eval_recall': 0.8198586397172795, 'eval_runtime': 112.8973, 'eval_samples_per_second': 142.864, 'eval_steps_per_second': 8.937, 'epoch': 13.0}
{'loss': 0.0045, 'grad_norm': 0.22377435863018036, 'learning_rate': 1.739191318046561e-05, 'epoch': 13.04}
{'loss': 0.0039, 'grad_norm': 0.09689246863126755, 'learning_rate': 1.7374409242079467e-05, 'epoch': 13.13}
{'loss': 0.004, 'grad_norm': 0.6155219674110413, 'learning_rate': 1.7356905303693333e-05, 'epoch': 13.22}
{'loss': 0.0041, 'grad_norm': 0.19249451160430908, 'learning_rate': 1.7339401365307195e-05, 'epoch': 13.3}
{'loss': 0.0042, 'grad_norm': 0.08826547861099243, 'learning_rate': 1.732189742692106e-05, 'epoch': 13.39}
{'loss': 0.0044, 'grad_norm': 0.2834056317806244, 'learning_rate': 1.730439348853492e-05, 'epoch': 13.48}
{'loss': 0.0045, 'grad_norm': 0.17663374543190002, 'learning_rate': 1.7

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.01242041029036045, 'eval_accuracy': 0.7525575051150102, 'eval_f1': 0.8589186441233867, 'eval_precision_1': 0.9118066919675499, 'eval_recall': 0.8118296236592473, 'eval_runtime': 112.4533, 'eval_samples_per_second': 143.428, 'eval_steps_per_second': 8.973, 'epoch': 14.0}
{'loss': 0.0043, 'grad_norm': 0.09326950460672379, 'learning_rate': 1.71993698582181e-05, 'epoch': 14.0}
{'loss': 0.0037, 'grad_norm': 0.23660404980182648, 'learning_rate': 1.7181865919831963e-05, 'epoch': 14.09}
{'loss': 0.0037, 'grad_norm': 0.38568469882011414, 'learning_rate': 1.716436198144583e-05, 'epoch': 14.18}
{'loss': 0.004, 'grad_norm': 0.05990004912018776, 'learning_rate': 1.714685804305969e-05, 'epoch': 14.27}
{'loss': 0.0039, 'grad_norm': 0.20772579312324524, 'learning_rate': 1.7129354104673553e-05, 'epoch': 14.35}
{'loss': 0.0041, 'grad_norm': 0.1929357945919037, 'learning_rate': 1.7111850166287416e-05, 'epoch': 14.44}
{'loss': 0.004, 'grad_norm': 0.05685802549123764, 'learning_rate': 1.709

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012408880516886711, 'eval_accuracy': 0.7676855353710708, 'eval_f1': 0.8653524239814078, 'eval_precision_1': 0.9094728931096915, 'eval_recall': 0.8253146506293012, 'eval_runtime': 112.7759, 'eval_samples_per_second': 143.018, 'eval_steps_per_second': 8.947, 'epoch': 15.0}
{'loss': 0.0038, 'grad_norm': 0.07515055686235428, 'learning_rate': 1.698932259758446e-05, 'epoch': 15.05}
{'loss': 0.0036, 'grad_norm': 0.12372808158397675, 'learning_rate': 1.697181865919832e-05, 'epoch': 15.14}
{'loss': 0.0038, 'grad_norm': 0.12526224553585052, 'learning_rate': 1.6954314720812183e-05, 'epoch': 15.23}
{'loss': 0.0036, 'grad_norm': 0.09534534066915512, 'learning_rate': 1.693681078242605e-05, 'epoch': 15.32}
{'loss': 0.0037, 'grad_norm': 0.12991882860660553, 'learning_rate': 1.691930684403991e-05, 'epoch': 15.4}
{'loss': 0.0039, 'grad_norm': 0.0604887530207634, 'learning_rate': 1.6901802905653773e-05, 'epoch': 15.49}
{'loss': 0.0036, 'grad_norm': 0.2742388844490051, 'learning_rate': 1.6

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012159086763858795, 'eval_accuracy': 0.7774195548391096, 'eval_f1': 0.8710250775594622, 'eval_precision_1': 0.9096523793452582, 'eval_recall': 0.8355446710893422, 'eval_runtime': 112.376, 'eval_samples_per_second': 143.527, 'eval_steps_per_second': 8.979, 'epoch': 16.0}
{'loss': 0.0038, 'grad_norm': 0.025794517248868942, 'learning_rate': 1.679677927533695e-05, 'epoch': 16.02}
{'loss': 0.0032, 'grad_norm': 0.17027345299720764, 'learning_rate': 1.6779275336950816e-05, 'epoch': 16.1}
{'loss': 0.0034, 'grad_norm': 0.29429617524147034, 'learning_rate': 1.676177139856468e-05, 'epoch': 16.19}
{'loss': 0.0033, 'grad_norm': 0.18505752086639404, 'learning_rate': 1.674426746017854e-05, 'epoch': 16.28}
{'loss': 0.0036, 'grad_norm': 0.5442038178443909, 'learning_rate': 1.6726763521792403e-05, 'epoch': 16.37}
{'loss': 0.0036, 'grad_norm': 0.05492652580142021, 'learning_rate': 1.670925958340627e-05, 'epoch': 16.45}
{'loss': 0.0035, 'grad_norm': 0.050366293638944626, 'learning_rate': 1

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012542661279439926, 'eval_accuracy': 0.7778535557071115, 'eval_f1': 0.8676126008851861, 'eval_precision_1': 0.9130762380984999, 'eval_recall': 0.8264616529233059, 'eval_runtime': 113.0078, 'eval_samples_per_second': 142.725, 'eval_steps_per_second': 8.929, 'epoch': 17.0}
{'loss': 0.003, 'grad_norm': 0.08213008940219879, 'learning_rate': 1.6586732014703312e-05, 'epoch': 17.07}
{'loss': 0.0031, 'grad_norm': 0.1905675083398819, 'learning_rate': 1.656922807631717e-05, 'epoch': 17.15}
{'loss': 0.0033, 'grad_norm': 0.10864297300577164, 'learning_rate': 1.6551724137931037e-05, 'epoch': 17.24}
{'loss': 0.0034, 'grad_norm': 0.1000591591000557, 'learning_rate': 1.65342201995449e-05, 'epoch': 17.33}
{'loss': 0.0036, 'grad_norm': 0.07200787961483002, 'learning_rate': 1.651671626115876e-05, 'epoch': 17.42}
{'loss': 0.0037, 'grad_norm': 0.09751130640506744, 'learning_rate': 1.6499212322772624e-05, 'epoch': 17.5}
{'loss': 0.0032, 'grad_norm': 0.06270167231559753, 'learning_rate': 1.64

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012337646447122097, 'eval_accuracy': 0.7840535681071362, 'eval_f1': 0.8721084473777864, 'eval_precision_1': 0.9119079837618403, 'eval_recall': 0.8356376712753425, 'eval_runtime': 113.5503, 'eval_samples_per_second': 142.043, 'eval_steps_per_second': 8.886, 'epoch': 18.0}
{'loss': 0.0034, 'grad_norm': 0.059207718819379807, 'learning_rate': 1.6394188692455804e-05, 'epoch': 18.03}
{'loss': 0.0028, 'grad_norm': 0.07195138186216354, 'learning_rate': 1.6376684754069667e-05, 'epoch': 18.12}
{'loss': 0.0031, 'grad_norm': 0.31195545196533203, 'learning_rate': 1.6359180815683532e-05, 'epoch': 18.2}
{'loss': 0.0033, 'grad_norm': 0.08465807139873505, 'learning_rate': 1.634167687729739e-05, 'epoch': 18.29}
{'loss': 0.0036, 'grad_norm': 0.12053372710943222, 'learning_rate': 1.6324172938911257e-05, 'epoch': 18.38}
{'loss': 0.0035, 'grad_norm': 0.4177282154560089, 'learning_rate': 1.630666900052512e-05, 'epoch': 18.47}
{'loss': 0.0034, 'grad_norm': 0.08894557505846024, 'learning_rate':

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.01274662185460329, 'eval_accuracy': 0.784239568479137, 'eval_f1': 0.8685562845415468, 'eval_precision_1': 0.907169862273832, 'eval_recall': 0.8330956661913324, 'eval_runtime': 113.3949, 'eval_samples_per_second': 142.237, 'eval_steps_per_second': 8.898, 'epoch': 19.0}
{'loss': 0.0029, 'grad_norm': 0.08018404990434647, 'learning_rate': 1.6184141431822162e-05, 'epoch': 19.08}
{'loss': 0.003, 'grad_norm': 0.08708932250738144, 'learning_rate': 1.6166637493436024e-05, 'epoch': 19.17}
{'loss': 0.0032, 'grad_norm': 0.05443791300058365, 'learning_rate': 1.6149133555049887e-05, 'epoch': 19.25}
{'loss': 0.003, 'grad_norm': 0.03450365737080574, 'learning_rate': 1.6131629616663753e-05, 'epoch': 19.34}
{'loss': 0.0032, 'grad_norm': 0.18414118885993958, 'learning_rate': 1.6114125678277615e-05, 'epoch': 19.43}
{'loss': 0.0031, 'grad_norm': 0.044097475707530975, 'learning_rate': 1.6096621739891477e-05, 'epoch': 19.52}
{'loss': 0.0033, 'grad_norm': 0.09567022323608398, 'learning_rate': 

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012371249496936798, 'eval_accuracy': 0.7867195734391469, 'eval_f1': 0.872744294153727, 'eval_precision_1': 0.9103919719827587, 'eval_recall': 0.8380866761733523, 'eval_runtime': 113.5648, 'eval_samples_per_second': 142.025, 'eval_steps_per_second': 8.885, 'epoch': 20.0}
{'loss': 0.0031, 'grad_norm': 0.035520296543836594, 'learning_rate': 1.5991598109574654e-05, 'epoch': 20.04}
{'loss': 0.003, 'grad_norm': 0.3665737807750702, 'learning_rate': 1.597409417118852e-05, 'epoch': 20.13}
{'loss': 0.003, 'grad_norm': 0.025458533316850662, 'learning_rate': 1.5956590232802382e-05, 'epoch': 20.22}
{'loss': 0.0031, 'grad_norm': 0.01922805793583393, 'learning_rate': 1.5939086294416245e-05, 'epoch': 20.3}
{'loss': 0.0031, 'grad_norm': 0.18952970206737518, 'learning_rate': 1.5921582356030107e-05, 'epoch': 20.39}
{'loss': 0.0032, 'grad_norm': 0.03059622459113598, 'learning_rate': 1.5904078417643973e-05, 'epoch': 20.48}
{'loss': 0.0032, 'grad_norm': 0.08143120259046555, 'learning_rate': 

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012958060018718243, 'eval_accuracy': 0.7870915741831483, 'eval_f1': 0.8692397056925262, 'eval_precision_1': 0.9063867016622922, 'eval_recall': 0.8350176700353401, 'eval_runtime': 113.7188, 'eval_samples_per_second': 141.832, 'eval_steps_per_second': 8.873, 'epoch': 21.0}
{'loss': 0.0031, 'grad_norm': 0.05561451241374016, 'learning_rate': 1.579905478732715e-05, 'epoch': 21.0}
{'loss': 0.0026, 'grad_norm': 0.16680364310741425, 'learning_rate': 1.5781550848941016e-05, 'epoch': 21.09}
{'loss': 0.0029, 'grad_norm': 0.10206746309995651, 'learning_rate': 1.5764046910554875e-05, 'epoch': 21.18}
{'loss': 0.0027, 'grad_norm': 0.1462629735469818, 'learning_rate': 1.574654297216874e-05, 'epoch': 21.27}
{'loss': 0.0031, 'grad_norm': 0.1820955127477646, 'learning_rate': 1.5729039033782603e-05, 'epoch': 21.35}
{'loss': 0.0031, 'grad_norm': 0.32860615849494934, 'learning_rate': 1.5711535095396465e-05, 'epoch': 21.44}
{'loss': 0.0029, 'grad_norm': 0.07196151465177536, 'learning_rate': 1

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012736617587506771, 'eval_accuracy': 0.7874635749271498, 'eval_f1': 0.8719093851132687, 'eval_precision_1': 0.9119897095660415, 'eval_recall': 0.8352036704073408, 'eval_runtime': 113.8574, 'eval_samples_per_second': 141.66, 'eval_steps_per_second': 8.862, 'epoch': 22.0}
{'loss': 0.0026, 'grad_norm': 0.07129199057817459, 'learning_rate': 1.5589007526693508e-05, 'epoch': 22.05}
{'loss': 0.0028, 'grad_norm': 0.030295355245471, 'learning_rate': 1.557150358830737e-05, 'epoch': 22.14}
{'loss': 0.0029, 'grad_norm': 0.1133953183889389, 'learning_rate': 1.5553999649921233e-05, 'epoch': 22.23}
{'loss': 0.0028, 'grad_norm': 0.3489568829536438, 'learning_rate': 1.5536495711535095e-05, 'epoch': 22.32}
{'loss': 0.0028, 'grad_norm': 0.15032409131526947, 'learning_rate': 1.551899177314896e-05, 'epoch': 22.41}
{'loss': 0.0029, 'grad_norm': 0.24968084692955017, 'learning_rate': 1.5501487834762823e-05, 'epoch': 22.49}
{'loss': 0.0029, 'grad_norm': 0.03027733974158764, 'learning_rate': 1.5

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012513984926044941, 'eval_accuracy': 0.7862235724471449, 'eval_f1': 0.8738566331495297, 'eval_precision_1': 0.916298211006054, 'eval_recall': 0.8351726703453407, 'eval_runtime': 113.9494, 'eval_samples_per_second': 141.545, 'eval_steps_per_second': 8.855, 'epoch': 23.0}
{'loss': 0.0031, 'grad_norm': 0.053791116923093796, 'learning_rate': 1.5396464204446004e-05, 'epoch': 23.02}
{'loss': 0.0028, 'grad_norm': 0.06200553849339485, 'learning_rate': 1.5378960266059866e-05, 'epoch': 23.11}
{'loss': 0.0027, 'grad_norm': 0.06717050820589066, 'learning_rate': 1.5361456327673728e-05, 'epoch': 23.19}
{'loss': 0.0027, 'grad_norm': 0.03196967393159866, 'learning_rate': 1.534395238928759e-05, 'epoch': 23.28}
{'loss': 0.0028, 'grad_norm': 0.054560039192438126, 'learning_rate': 1.5326448450901453e-05, 'epoch': 23.37}
{'loss': 0.0032, 'grad_norm': 0.4048727750778198, 'learning_rate': 1.530894451251532e-05, 'epoch': 23.46}
{'loss': 0.0028, 'grad_norm': 0.13920627534389496, 'learning_rate'

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013048234395682812, 'eval_accuracy': 0.7921755843511687, 'eval_f1': 0.8720482897384306, 'eval_precision_1': 0.9069541634579972, 'eval_recall': 0.839729679459359, 'eval_runtime': 113.4697, 'eval_samples_per_second': 142.144, 'eval_steps_per_second': 8.892, 'epoch': 24.0}
{'loss': 0.0025, 'grad_norm': 0.020054319873452187, 'learning_rate': 1.5186416943812358e-05, 'epoch': 24.07}
{'loss': 0.0027, 'grad_norm': 0.05007627233862877, 'learning_rate': 1.5168913005426222e-05, 'epoch': 24.16}
{'loss': 0.0025, 'grad_norm': 0.03048541396856308, 'learning_rate': 1.5151409067040086e-05, 'epoch': 24.24}
{'loss': 0.0028, 'grad_norm': 0.12620504200458527, 'learning_rate': 1.5133905128653947e-05, 'epoch': 24.33}
{'loss': 0.0029, 'grad_norm': 0.10131364315748215, 'learning_rate': 1.511640119026781e-05, 'epoch': 24.42}
{'loss': 0.0027, 'grad_norm': 1.1029716730117798, 'learning_rate': 1.5098897251881675e-05, 'epoch': 24.51}
{'loss': 0.0026, 'grad_norm': 0.437293142080307, 'learning_rate': 

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012843534350395203, 'eval_accuracy': 0.7952135904271809, 'eval_f1': 0.871871678423036, 'eval_precision_1': 0.9072596862850248, 'eval_recall': 0.8391406782813565, 'eval_runtime': 113.3802, 'eval_samples_per_second': 142.256, 'eval_steps_per_second': 8.899, 'epoch': 25.0}
{'loss': 0.0027, 'grad_norm': 0.09436175227165222, 'learning_rate': 1.4993873621564854e-05, 'epoch': 25.03}
{'loss': 0.0024, 'grad_norm': 0.028055889531970024, 'learning_rate': 1.4976369683178718e-05, 'epoch': 25.12}
{'loss': 0.0026, 'grad_norm': 0.012291422113776207, 'learning_rate': 1.4958865744792578e-05, 'epoch': 25.21}
{'loss': 0.0029, 'grad_norm': 0.19876140356063843, 'learning_rate': 1.4941361806406442e-05, 'epoch': 25.29}
{'loss': 0.0028, 'grad_norm': 0.04795808345079422, 'learning_rate': 1.4923857868020306e-05, 'epoch': 25.38}
{'loss': 0.0028, 'grad_norm': 0.030622493475675583, 'learning_rate': 1.490635392963417e-05, 'epoch': 25.47}
{'loss': 0.0027, 'grad_norm': 0.13271339237689972, 'learning_ra

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.012929117307066917, 'eval_accuracy': 0.7975695951391902, 'eval_f1': 0.8742945596439743, 'eval_precision_1': 0.9109879032258065, 'eval_recall': 0.8404426808853618, 'eval_runtime': 113.4414, 'eval_samples_per_second': 142.179, 'eval_steps_per_second': 8.894, 'epoch': 26.0}
{'loss': 0.0024, 'grad_norm': 0.04990154877305031, 'learning_rate': 1.478382636093121e-05, 'epoch': 26.08}
{'loss': 0.0026, 'grad_norm': 0.27397915720939636, 'learning_rate': 1.4766322422545074e-05, 'epoch': 26.17}
{'loss': 0.0026, 'grad_norm': 0.01916823908686638, 'learning_rate': 1.4748818484158938e-05, 'epoch': 26.26}
{'loss': 0.0027, 'grad_norm': 0.1451733410358429, 'learning_rate': 1.4731314545772799e-05, 'epoch': 26.34}
{'loss': 0.0026, 'grad_norm': 0.1396797001361847, 'learning_rate': 1.4713810607386663e-05, 'epoch': 26.43}


KeyboardInterrupt: 

## Test Outputs

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import json

# Load the trained model and tokenizer
model = BertForSequenceClassification.from_pretrained("/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/bert-classifier/checkpoint-34275")
tokenizer = BertTokenizer.from_pretrained("./tibetan_tokenizer")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load label mapping
with open("label_mapping.json", "r") as f:
    label_mapping = json.load(f)

# Example input text
input_text = "བླ་མ་དང་ལྷག་པའི་ལྷ་ལ་ཕྱག་འཚལ་ལོ།།"

# Tokenize input
encoded_input = tokenizer(
    input_text, 
    padding="max_length", 
    truncation=True, 
    max_length=128, 
    return_tensors="pt"
)
encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

# Get predictions
with torch.no_grad():
    outputs = model(**encoded_input)
    logits = outputs.logits
    probabilities = torch.sigmoid(logits).cpu().numpy()
    predictions = (probabilities > 0.5).astype(int)

# Decode predictions
predicted_tags = [label_mapping[i] for i, val in enumerate(predictions[0]) if val == 1]

print("Predicted Tags:", predicted_tags)
