In [20]:
%%capture
!pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; the notebook later uploads the trained
# model with `model.save_to_hub(...)`.
login()

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [23]:
from sentence_transformers import SentenceTransformer, models

## Step 1: use an existing language model to produce token embeddings
word_embedding_model = models.Transformer('sbastola/muril-base-cased-sentence-transformer-snli')

## Step 2: use a pool function over the token embeddings
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

## Join steps 1 and 2 using the modules argument.
# Only `modules` is passed here: when a model name is given as well,
# SentenceTransformer loads that checkpoint itself and the modules built in
# steps 1 and 2 are not used.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [24]:
%%capture
!pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
from datasets import load_dataset


# Load the two CSV files as the 'train' and 'validation' splits of one dataset.
snli_nepali_files = {
    'train': "/kaggle/input/stanfordnlp-snli-nepali/train.csv",
    'validation': "/kaggle/input/stanfordnlp-snli-nepali/validation.csv",
}
dataset = load_dataset('csv', data_files=snli_nepali_files)


In [26]:
# Re-encode the "label" column as class ids (later cells read integer ids from it).
dataset = dataset.class_encode_column(column="label")

In [27]:
# Copy the feature schema of the train split; the next cell replaces its 'label' entry.
dataset_features = dataset['train'].features.copy()

In [28]:
from datasets import ClassLabel, Value
# Describe the 'label' column of the copied schema as a three-class NLI label.
nli_label_names = ['entailment', 'neutral', 'contradiction']
dataset_features['label'] = ClassLabel(num_classes=3, names=nli_label_names, id=None)
# Display the edited schema.
dataset_features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}

In [29]:
# NOTE(review): the dataset returned by cast() is only displayed, never assigned,
# so `dataset['train']` is presumably left unchanged by this cell (the example
# printed in the following cell still shows a label id of 126). Confirm whether the
# result was meant to be stored; the later cells rely on the ids 125-127.
dataset['train'].cast(dataset_features)

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 110140
})

In [30]:
# Report the size of the training split and show its first record.
train_split = dataset['train']
print(f"- The dataset has {train_split.num_rows} examples.")
print(f"- Examples look like this: {train_split[0]}")

- The dataset has 110140 examples.
- Examples look like this: {'premise': ' घोडामा सवार एक व्यक्ति भाँचिएको हवाइजहाजमाथि हाम फाल्दै छ।', 'hypothesis': ' एक व्यक्ति प्रतियोगिताको लागि आफ्नो घोडालाई तालिम दिइरहेका छन्।', 'label': 126}


In [31]:
from sentence_transformers import InputExample

# Train Dataset: sentence pairs wrapped as InputExample objects.
train_data = dataset['train']
# For agility only the first half of the available rows is used.
n_examples = train_data.num_rows // 2

# Rows whose encoded label id is 125, 126 or 127 are kept and the id is shifted
# down to 0, 1 or 2; rows with any other label id are skipped.
train_examples = []
for row_idx in range(n_examples):
    row = train_data[row_idx]
    label_id = row['label']
    if label_id in (125, 126, 127):
        sentence_pair = [row['premise'], row['hypothesis']]
        train_examples.append(InputExample(texts=sentence_pair, label=label_id - 125))


In [32]:
# Evaluation Dataset: sentence pairs from the validation split as InputExample objects.
evaluation_examples = []
evaluation_data = dataset['validation']
# For agility only the first quarter of the validation rows is used.
n_examples = evaluation_data.num_rows // 4

# NOTE(review): the label ids 125-127 are the ones used for the train split above;
# verify that the validation split's encoded 'label' column uses the same ids,
# otherwise rows may be dropped or mislabeled here.
for i in range(n_examples):
    # Rows must come from the validation split (not `train_data`), so that the
    # evaluation examples are actually held-out data.
    example = evaluation_data[i]
    if example['label'] not in [125, 126, 127]:
        # Skip rows whose label id is not one of the three expected classes.
        continue
    # Shift the encoded ids 125..127 down to the 0..2 range.
    example['label'] -= 125
    evaluation_examples.append(InputExample(texts=[example['premise'], example['hypothesis']], label=example['label']))

In [33]:
# Sanity check: container type, number of training examples and element type.
container_type = type(train_examples)
element_type = type(train_examples[0])
print(f"We have a {container_type} of length {len(train_examples)} containing {element_type}'s.")

We have a <class 'list'> of length 54960 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


In [34]:
from torch.utils.data import DataLoader

# Shuffled batches of the training examples for the training objective.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)
# The evaluator reads the evaluation examples (not the training examples), in
# fixed order, so the reported accuracy is not measured on the training set.
evaluator_dataloader = DataLoader(evaluation_examples, shuffle=False, batch_size=64)

In [35]:
from sentence_transformers import losses
from sentence_transformers import evaluation

# Softmax loss over 3 labels, sized by the embedding dimension of the transformer module.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=embedding_dim,
    num_labels=3,
)
# Label-accuracy evaluator that uses the loss object above as its softmax model.
evaluator = evaluation.LabelAccuracyEvaluator(
    dataloader=evaluator_dataloader,
    softmax_model=train_loss,
)

In [36]:
num_epochs = 2

# Warmup steps: 10% of the total number of training batches (batches per epoch x epochs).
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_training_steps * 0.1)

In [39]:
# Train with the single (dataloader, loss) objective, passing the evaluator to fit().
training_objective = (train_dataloader, train_loss)
model.fit(
    train_objectives=[training_objective],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/859 [00:00<?, ?it/s]

Iteration:   0%|          | 0/859 [00:00<?, ?it/s]

In [40]:
# Upload the trained model to the Hugging Face Hub under the "sbastola" organization.
# `exist_ok=True` is passed so an already existing repository is accepted; the
# returned value (a commit URL in the recorded output) is displayed by the cell.
model.save_to_hub(
    "muril-base-cased-sentence-transformer-snli-nepali-2", 
    organization="sbastola",
    train_datasets=["stanfordnlp/snli"],
    exist_ok=True, 
    )

model.safetensors:   0%|          | 0.00/950M [00:00<?, ?B/s]

'https://huggingface.co/sbastola/muril-base-cased-sentence-transformer-snli-nepali-2/commit/d2e02dcb07ca8ac9ec9222a4823dc0a5dfbb3214'