## Installing Libraries

In [1]:
! pip install datasets evaluate

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m


## Load Dataset

In [2]:
from datasets import load_dataset

squad = load_dataset("squad", split="train[:30000]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [3]:
squad = squad.train_test_split(test_size=0.2)

In [4]:
squad["train"][0]

{'id': '570964e1200fba1400367f47',
 'title': 'Himachal_Pradesh',
 'context': 'Other religions that form a small percentage are Buddhism and Sikhism. The Lahaulis of Lahaul and Spiti region are mainly Buddhists. Sikhs mostly live in towns and cities and constitute 1.16% of the state population. For example, they form 10% of the population in Una District adjoining the state of Punjab and 17% in Shimla, the state capital. The Buddhists constitute 1.15% are mainly natives and tribals from Lahaul and Spiti, where they form majority of 60% and Kinnaur where they form 40%, however the bulk are refugees from Tibet. The Muslims constitute slightly 2.18% of the population of Himachal Pradesh.',
 'question': 'What is the Muslim population in Himachal Pradesh?',
 'answers': {'text': ['slightly 2.18%'], 'answer_start': [561]}}

## Data Preprocessing

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [8]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

## Train The Model

In [9]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,1.4233,1.226573
2,1.0478,1.117594
3,0.8075,1.145341


TrainOutput(global_step=4500, training_loss=1.277330362955729, metrics={'train_runtime': 2917.1674, 'train_samples_per_second': 24.681, 'train_steps_per_second': 1.543, 'total_flos': 7055267401728000.0, 'train_loss': 1.277330362955729, 'epoch': 3.0})

In [11]:
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')

In [12]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
pt_model = AutoModelForQuestionAnswering.from_pretrained('./model')
pt_tokenizer = AutoTokenizer.from_pretrained('./model')

In [13]:
question = "Natural Language Processing (NLP) is a branch of artificial intelligence aimed at facilitating interaction between computers and humans through natural language. The primary objective of NLP is to enable computers to understand, interpret, and generate human language meaningfully and usefully. Techniques in NLP allow machines to process language by learning patterns from large datasets."
context = "What is the ultimate goal of Natural Language Processing (NLP)?"

In [17]:
inputs = pt_tokenizer(question, context, return_tensors="pt")

In [18]:
import torch

with torch.no_grad():
    outputs = pt_model(**inputs)

In [19]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [20]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
pt_tokenizer.decode(predict_answer_tokens)

'natural language processing ( nlp ) is a branch of artificial intelligence aimed at facilitating interaction between computers and humans through natural language.'

In [21]:
! zip -r model.zip model

  adding: model/ (stored 0%)
  adding: model/config.json (deflated 44%)
  adding: model/tokenizer.json (deflated 71%)
  adding: model/tokenizer_config.json (deflated 76%)
  adding: model/vocab.txt (deflated 53%)
  adding: model/model.safetensors (deflated 8%)
  adding: model/special_tokens_map.json (deflated 42%)
