In [1]:
from datasets import load_dataset

# Load the 'train' split of the tagged, cleaned MT dataset (fetched from the
# Hugging Face Hub or taken from the local datasets cache).
ds = load_dataset('openpecha/tagged_cleaned_MT_v1.0.3', split='train')

In [None]:
# Preview the first 13 values of the 'Tag' column before any tags are filled in.
ds['Tag'][:13]

## add missing tags

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import json

# Directory holding both the fine-tuned classifier weights and its tokenizer files
topic_checkpoint = "/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/en-col-op-bert-classifier/checkpoint-308950"

# Restore the classifier, place it on the first GPU and switch it to inference mode
model = BertForSequenceClassification.from_pretrained(topic_checkpoint).to('cuda:0').eval()
tokenizer = BertTokenizer.from_pretrained(topic_checkpoint)

# Read the label mapping stored next to the classifier
label_mapping_path = "/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/simple_op_label_mapping.json"
with open(label_mapping_path, "r") as mapping_file:
    label_mapping = json.load(mapping_file)

In [None]:
# Define the tag generation function
def gen_tag(input_text):
    """Predict a single topic tag for one piece of text.

    The text is tokenized (padded/truncated to 128 tokens), passed through the
    module-level ``model``, and the index of the highest logit is used to pick
    a key from the module-level ``label_mapping``.

    Args:
        input_text: The text to classify. Only a single text is supported:
            a batch would produce several predictions and ``int(...)`` on the
            resulting multi-element tensor would raise.

    Returns:
        The ``label_mapping`` key found at the predicted class position.
    """
    # Tokenize input
    encoded_inputs = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Send the tensors to whichever device the model currently lives on rather
    # than a hard-coded 'cuda:0', so inputs and weights can never disagree.
    target_device = model.device
    encoded_inputs = {key: val.to(target_device) for key, val in encoded_inputs.items()}

    # Get predictions (inference only, so no gradient tracking is needed)
    with torch.no_grad():
        logits = model(**encoded_inputs).logits
        prediction = torch.argmax(logits, dim=1)

    # Decode predictions: keys are looked up by position.
    # NOTE(review): assumes the JSON key order matches the class ids used in
    # training — confirm against the training notebook.
    predicted_tag = list(label_mapping.keys())[int(prediction)]

    return predicted_tag

# Use the map function to modify the dataset
def update_tag(example):
    """Fill in a predicted tag for rows whose ``Tag`` is the empty string.

    Rows that already carry a non-empty tag are handed back untouched;
    otherwise the tag is generated from the row's ``Target`` text via
    ``gen_tag``. The row is modified in place and returned.
    """
    # Nothing to do when a tag is already present
    if example['Tag'] != '':
        return example

    example['Tag'] = gen_tag(example['Target'])
    return example

# Apply update_tag to every row: rows with an empty 'Tag' get a predicted tag,
# all other rows are returned unchanged. `ds` is rebound to the mapped dataset.
ds = ds.map(update_tag)

In [None]:
# Preview the first 13 tags again, now that empty tags have been filled in.
ds['Tag'][:13]

## add Buddhist boolean flag

In [17]:
# Topic tags that are treated as Buddhist content when building the boolean flag
buddhist_labels = [
    'Mantras',
    'Dzogchen',
    'Astrology',
    'Monastery',
    'Mahamudra',
    'Mind',
    'Meditation',
    'Self, Logic, Aggregates',
    'Tantra',
    'Emptiness',
    'Dreams',
    'Education, Teaching',
    'Ethics, Enlightenment, Wisdom',
    'Prophecies, Rituals',
    'Lama',
    'Samsara, Nirvana',
    'Milarepa, Realization, Biography',
    'Kayas',
    'Intrinsic Existence, Conventional Existence',
    'Time, Causality, Perception',
    'Natural State',
    'Karma, Consequences',
    'Dharma',
    'Buddhist',
]

In [None]:
def bool_tag(example):
    """Turn a row's plain ``Tag`` into a structured ``{'Buddhist', 'Topic'}`` dict.

    Args:
        example: A dataset row (mapping) with a ``'Tag'`` entry.

    Returns:
        The same ``example`` object. For a not-yet-converted tag,
        ``example['Tag']`` becomes ``{'Buddhist': <bool>, 'Topic': <tag>}``
        where ``Buddhist`` says whether the tag is listed in the module-level
        ``buddhist_labels``. Rows whose tag is already a dict are returned
        unchanged.
    """
    topic_tag = example['Tag']

    # Idempotence guard: if this map is run a second time the tag is already a
    # dict; converting it again would nest it under 'Topic' and set
    # 'Buddhist' to False for every row.
    if isinstance(topic_tag, dict):
        return example

    buddhist_bool = topic_tag in buddhist_labels
    example['Tag'] = {'Buddhist': buddhist_bool, 'Topic': topic_tag}
    return example

# Convert every row's 'Tag' into the {'Buddhist', 'Topic'} structure and rebind `ds`.
ds = ds.map(bool_tag)
        

In [None]:
# Preview the first 13 tags after the conversion to the structured form.
ds['Tag'][:13]

In [None]:
# Checkpoint the work-in-progress dataset to the local 'wip-ds' directory.
ds.save_to_disk('wip-ds')

In [1]:
from datasets import load_from_disk

# Reload the work-in-progress dataset previously written to 'wip-ds'.
ds = load_from_disk('wip-ds')

In [2]:
# Sanity check: show the first 13 structured tags of the reloaded dataset.
ds['Tag'][:13]

[{'Buddhist': True, 'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'Topic': 'Emptiness'},
 {'Buddhist': True, 'Topic': 'Intrinsic Existence, Conventional Existence'},
 {'Buddhist': True, 'Topic': 'Meditation'},
 {'Buddhist': True, 'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'Topic': 'Prophecies, Rituals'},
 {'Buddhist': False, 'Topic': 'History, Politics, Law'},
 {'Buddhist': False, 'Topic': 'History, Politics, Law'}]

# multilabel tagging for Buddhist texts

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import json

# Directory holding both the multi-label classifier weights and its tokenizer files
lh_checkpoint = "/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/en-lh-bert-classifier/checkpoint-148538"

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore the classifier on that device in inference mode, plus its tokenizer
model = BertForSequenceClassification.from_pretrained(lh_checkpoint).to(device).eval()
tokenizer = BertTokenizer.from_pretrained(lh_checkpoint)

# Read the label mapping that belongs to this classifier
label_mapping_path = "/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/en_lh_label_mapping.json"
with open(label_mapping_path, "r") as mapping_file:
    label_mapping = json.load(mapping_file)

def multilab(input_text):
    """Return every label whose sigmoid score exceeds 0.5 for ``input_text``.

    The text is tokenized (padded/truncated to 128 tokens), scored by the
    module-level ``model`` on ``device``, and each position of the first
    output row that clears the 0.5 threshold is translated through the
    module-level ``label_mapping``. The result may be an empty list.
    """
    # Tokenize and move the tensors next to the model
    batch = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # Score without gradient tracking; independent sigmoid per label
    with torch.no_grad():
        scores = torch.sigmoid(model(**batch).logits).cpu().numpy()

    # Keep the labels of the first (only) row that pass the threshold
    selected = []
    for index, score in enumerate(scores[0]):
        if score > 0.5:
            selected.append(label_mapping[index])
    return selected

In [5]:
def multilab_example(example):
    """Attach multi-label predictions to rows flagged as Buddhist.

    When ``example['Tag']['Buddhist']`` is falsy the row is returned as is;
    otherwise ``example['Tag']['LH labels']`` is set to the labels predicted
    by ``multilab`` for the row's ``Target`` text. The row is modified in
    place and returned.
    """
    tag = example['Tag']
    if not tag['Buddhist']:
        return example

    tag['LH labels'] = multilab(example['Target'])
    return example

# Add 'LH labels' to every row flagged as Buddhist; one model forward pass per such row.
ds = ds.map(multilab_example)

Map:   0%|          | 0/1429192 [00:00<?, ? examples/s]

In [6]:
# Preview the first 13 tags, now including the 'LH labels' entry.
ds['Tag'][:13]

[{'Buddhist': True,
  'LH labels': ['Akṣobhya', 'Longsal Dorje Nyingpo'],
  'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True,
  'LH labels': ['Aspiration Prayers', 'Dzogchen'],
  'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True,
  'LH labels': ['Tibetan Masters'],
  'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True,
  'LH labels': ['Tibetan Masters'],
  'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'LH labels': [], 'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True,
  'LH labels': ['Advice', 'Songs and Poems', 'Tsok'],
  'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'LH labels': ['Longchen Nyingtik'], 'Topic': 'Emptiness'},
 {'Buddhist': True,
  'LH labels': ['Dzogchen'],
  'Topic': 'Intrinsic Existence, Conventional Existence'},
 {'Buddhist': True, 'LH labels': [], 'Topic': 'Meditation'},
 {'Buddhist': True,
  'LH labels': ['Aspiration Prayers', 'Tibetan Masters'],
  'Topic': 'Prophecies, Rituals'},
 {'Buddhist': True, 'LH labels': ['Praise'], 'Topic': 'Propheci

In [8]:
from datasets import DatasetDict, load_dataset

# Keep a handle on the tagged dataset built so far; it becomes the train split.
ds_train = ds
# Load the 'test' split of the same dataset repository.
ds_test = load_dataset('openpecha/tagged_cleaned_MT_v1.0.3', split='test')
# Rebind `ds` to a new, empty DatasetDict (the tagged data stays reachable via ds_train).
ds = DatasetDict()

Using the latest cached version of the dataset since openpecha/tagged_cleaned_MT_v1.0.3 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/j/.cache/huggingface/datasets/openpecha___tagged_cleaned_mt_v1.0.3/default/0.0.0/fac96d7b2c2af320e7e569f1b06f3ffeb0155a2c (last modified on Thu Jan 16 16:51:52 2025).


In [9]:
# Register both splits in the DatasetDict.
ds['train'] = ds_train
ds['test'] = ds_test

# Show the first train row to check that the structured tag is present.
ds['train'][0]

{'Source': 'ཐུབ་པས་རྟག་ཏུ་དེ་བཞིན་སྤྱད།།',
 'Target': 'The aspirant should move in such a way at all times.',
 'File_Name': 'TM2382',
 'Machine Aligned': True,
 '__index_level_0__': 0,
 'Tag': {'Buddhist': True,
  'LH labels': ['Akṣobhya', 'Longsal Dorje Nyingpo'],
  'Topic': 'Prophecies, Rituals'}}

In [12]:
# Show the first test row for comparison with the train row.
ds['test'][0]

{'Source': 'ཚད་མེད་བཏང་སྙོམས་གསུམ་ལས།',
 'Target': '3. Immeasureable equanimity ',
 'File_Name': 'TM2203',
 'Machine Aligned': True,
 '__index_level_0__': 0,
 'Tag': ''}

In [15]:
def no_tag(example):
    """Blank out a row's ``Tag`` (set it to the empty string) and return the row."""
    empty_tag = ''
    example['Tag'] = empty_tag
    return example

# Reset 'Tag' to the empty string on every test row.
ds['test'] = ds['test'].map(no_tag)

Map:   0%|          | 0/9066 [00:00<?, ? examples/s]

In [16]:
# Write both splits to the local 'wip-ds' directory.
ds.save_to_disk('wip-ds')

Saving the dataset (0/3 shards):   0%|          | 0/1429192 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9066 [00:00<?, ? examples/s]

## tag test data

In [18]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import json

# Directory holding both the topic classifier weights and its tokenizer files
topic_checkpoint = "/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/en-col-op-bert-classifier/checkpoint-308950"

# Restore the topic classifier, place it on the first GPU and switch it to inference mode
topic_model = BertForSequenceClassification.from_pretrained(topic_checkpoint).to('cuda:0').eval()
topic_tokenizer = BertTokenizer.from_pretrained(topic_checkpoint)

# Read the label mapping that belongs to the topic classifier
topic_mapping_path = "/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/simple_op_label_mapping.json"
with open(topic_mapping_path, "r") as mapping_file:
    topic_label_mapping = json.load(mapping_file)

# Define the tag generation function
def gen_tag(input_text):
    """Predict a single topic tag for one piece of text.

    The text is tokenized with ``topic_tokenizer`` (padded/truncated to 128
    tokens), passed through ``topic_model``, and the index of the highest
    logit is used to pick a key from ``topic_label_mapping``.

    Args:
        input_text: The text to classify. Only a single text is supported:
            a batch would produce several predictions and ``int(...)`` on the
            resulting multi-element tensor would raise.

    Returns:
        The ``topic_label_mapping`` key found at the predicted class position.
    """
    # Tokenize input
    encoded_inputs = topic_tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Send the tensors to whichever device the model currently lives on rather
    # than a hard-coded 'cuda:0', so inputs and weights can never disagree.
    target_device = topic_model.device
    encoded_inputs = {key: val.to(target_device) for key, val in encoded_inputs.items()}

    # Get predictions (inference only, so no gradient tracking is needed)
    with torch.no_grad():
        logits = topic_model(**encoded_inputs).logits
        prediction = torch.argmax(logits, dim=1)

    # Decode predictions: keys are looked up by position.
    # NOTE(review): assumes the JSON key order matches the class ids used in
    # training — confirm against the training notebook.
    predicted_tag = list(topic_label_mapping.keys())[int(prediction)]

    return predicted_tag

In [19]:
# Directory holding both the multi-label classifier weights and its tokenizer files
lh_checkpoint = "/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/en-lh-bert-classifier/checkpoint-148538"

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore the classifier on that device in inference mode, plus its tokenizer
lh_model = BertForSequenceClassification.from_pretrained(lh_checkpoint).to(device).eval()
lh_tokenizer = BertTokenizer.from_pretrained(lh_checkpoint)

# Read the label mapping that belongs to this classifier
lh_mapping_path = "/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/en_lh_label_mapping.json"
with open(lh_mapping_path, "r") as mapping_file:
    lh_label_mapping = json.load(mapping_file)

def multilab(input_text):
    """Return every label whose sigmoid score exceeds 0.5 for ``input_text``.

    The text is tokenized with ``lh_tokenizer`` (padded/truncated to 128
    tokens) and scored by ``lh_model`` on ``device``. Each position of the
    first output row that clears the threshold is decoded through
    ``lh_label_mapping``.

    Args:
        input_text: The text to classify; only the first row of the model
            output is decoded.

    Returns:
        A list of labels, possibly empty when no score exceeds 0.5.
    """
    # Tokenize input
    encoded_input = lh_tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

    # Get predictions (independent sigmoid per label, thresholded at 0.5)
    with torch.no_grad():
        outputs = lh_model(**encoded_input)
        logits = outputs.logits
        probabilities = torch.sigmoid(logits).cpu().numpy()
        predictions = (probabilities > 0.5).astype(int)

    # Decode with the mapping loaded for this model. The previous code read the
    # unrelated global `label_mapping`, which only worked because an earlier
    # cell had left a compatible value in the kernel.
    predicted_tags = [lh_label_mapping[i] for i, val in enumerate(predictions[0]) if val == 1]
    return predicted_tags

In [20]:
def full_tag(example):
    """Run the complete tagging pipeline on one row.

    Stages:
      1. domain tag: an empty ``Tag`` is replaced by ``gen_tag(Target)``;
      2. bool tag: the tag becomes ``{'Buddhist': <bool>, 'Topic': <tag>}``,
         with ``Buddhist`` decided by membership in ``buddhist_labels``;
      3. multilab tag: rows flagged Buddhist get ``'LH labels'`` from
         ``multilab(Target)``.

    Args:
        example: A dataset row (mapping) with ``'Tag'`` and ``'Target'``.

    Returns:
        The same ``example`` object with its ``Tag`` in structured form.
        A row whose tag is already a dict skips stages 1 and 2.
    """
    tag = example['Tag']

    # Idempotence guard: if this map is run a second time the tag is already a
    # dict; re-running stages 1-2 would nest it under 'Topic' and set
    # 'Buddhist' to False for every row.
    if not isinstance(tag, dict):
        # domain tag
        if tag == '':  # Check if the tag is empty
            tag = gen_tag(example['Target'])

        # bool tag
        example['Tag'] = {'Buddhist': tag in buddhist_labels, 'Topic': tag}

    # multilab tag
    if example['Tag']['Buddhist']:
        example['Tag']['LH labels'] = multilab(example['Target'])

    return example

In [21]:
# Run the full tagging pipeline (topic, Buddhist flag, LH labels) over the test split.
ds['test'] = ds['test'].map(full_tag)

Map:   0%|          | 0/9066 [00:00<?, ? examples/s]

In [22]:
# Publish both splits to the Hub under the same repository id the data was loaded from.
ds.push_to_hub('openpecha/tagged_cleaned_MT_v1.0.3')

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/477 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/477 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/477 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/openpecha/tagged_cleaned_MT_v1.0.3/commit/f2a2e6444786ae3d5bee169d67dcb32100ea7cf7', commit_message='Upload dataset', commit_description='', oid='f2a2e6444786ae3d5bee169d67dcb32100ea7cf7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/openpecha/tagged_cleaned_MT_v1.0.3', endpoint='https://huggingface.co', repo_type='dataset', repo_id='openpecha/tagged_cleaned_MT_v1.0.3'), pr_revision=None, pr_num=None)