In [None]:
import json
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Load the data from JSON files
train_file = '/content/sample_data/train.json'
eval_file = '/content/sample_data/test.json'

with open(train_file, 'r') as f:
    train_data = json.load(f)

with open(eval_file, 'r') as f:
    eval_data = json.load(f)

# Custom Dataset Class for Handling JSON Data
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # Return the total number of Q&A pairs in the dataset
        return sum(len(item['qas']) for item in self.data)

    def __getitem__(self, idx):
        # Find the corresponding context and Q&A pair based on the index
        for item in self.data:
            if idx < len(item['qas']):
                qa_pair = item['qas'][idx]
                break
            idx -= len(item['qas'])

        context = item['context']
        question = qa_pair['question']
        answers = qa_pair['answers']

        # Tokenize input
        inputs = self.tokenizer(
            question, context,
            max_length=self.max_length, truncation=True, padding="max_length", return_offsets_mapping=True, return_tensors="pt"
        )

        # Get offset mappings for start and end positions
        offset_mapping = inputs.pop("offset_mapping")[0]

        # We assume only one answer per Q&A pair, so we handle only the first answer
        start_char = answers[0]['answer_start']
        end_char = start_char + len(answers[0]['text'])

        # Find the start and end token positions
        token_start_index = None
        token_end_index = None

        for i, (start, end) in enumerate(offset_mapping):
            if start <= start_char < end:
                token_start_index = i
            if start < end_char <= end:
                token_end_index = i
                break

        # Handle cases where start or end token indices are not found
        if token_start_index is None:
            token_start_index = 0  # Set to beginning of context if start not found
        if token_end_index is None:
            token_end_index = inputs['input_ids'].shape[0] - 1  # Set to end of context if end not found

        # Add start and end positions to inputs
        inputs.update({
            'start_positions': torch.tensor(token_start_index, dtype=torch.long),
            'end_positions': torch.tensor(token_end_index, dtype=torch.long)
        })

        # Flatten the tensors as this is not a batched input
        for key in inputs:
            inputs[key] = inputs[key].squeeze(0)

        return inputs

# Load the datasets
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
train_dataset = QADataset(train_data, tokenizer)
eval_dataset = QADataset(eval_data, tokenizer)

# Create DataLoaders for the datasets
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)

# Initialize the model
model = BertForQuestionAnswering.from_pretrained(model_name)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,  # Add batch size for evaluation
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=5_000,
    save_total_limit=3,
    logging_dir='./logs',  # Directory for storing logs
    report_to="none",  # Disable reporting to W&B or other platforms
    do_eval=True,  # Enable evaluation
    eval_steps=250,  # More frequent evaluations
    warmup_steps=500,  # Consider adding warmup steps
    fp16=True  # Use mixed precision if supported # Evaluate every 500 steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset  # Add the evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine-tuned-bert")
tokenizer.save_pretrained("./fine-tuned-bert")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,5.922781


In [2]:
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering

# Load the fine-tuned model and tokenizer
model_path = "./fine-tuned-bert"
model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

context = ("Small healthcare organizations (SHCO) under Aravinda Eyecare System are located in Tirupur, Dindigul, Tuticorin, and Udumalpet. Larger healthcare organizations (HCO) are located in Madurai, Tirunelveli, Coimbatore, Pondicherry, Theni, and Salem.")
questions = [
    "Where are Small healthcare organizations located?",
]

# Function to post-process the answer for more complete sentences
def post_process_answer(answer, context):
    # Expand the answer to the end of the nearest sentence or word
    while answer and not answer.endswith(('.', '!', '?')):
        answer_end_index = context.find('.', len(answer))
        if answer_end_index == -1:  # No more sentences found
            break
        answer = context[:answer_end_index + 1]

    # Remove any leading/trailing whitespace or incomplete punctuation
    return answer.strip()

# Function to get the answer from a given context
def get_answer_from_context(question, context):
    inputs = tokenizer(
        question,
        context,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        return_offsets_mapping=True
    )

    # Get offset mapping
    offset_mapping = inputs.pop("offset_mapping")

    # Make predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the start and end logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Convert logits to probabilities and find the start and end positions with the maximum score
    start_probs = torch.softmax(start_logits, dim=-1)
    end_probs = torch.softmax(end_logits, dim=-1)

    # Apply heuristic: ensure the start index is less than or equal to the end index
    start_idx = torch.argmax(start_probs)
    end_idx = torch.argmax(end_probs)

    # Adjust end index to improve completeness of the answer
    if start_idx > end_idx:
        end_idx = start_idx + torch.argmax(end_probs[0, start_idx:start_idx + 15])

    # Ensure we are capturing the full answer by extending the range
    for i in range(end_idx + 1, len(end_probs[0])):
        if end_probs[0][i] > 0.5:  # Threshold to continue expanding the end index
            end_idx = i

    # Get the answer from the context using the start and end positions
    answer_start = offset_mapping[0][start_idx][0].item()
    answer_end = offset_mapping[0][end_idx][1].item()
    answer = context[answer_start:answer_end]

    # Post-process the answer for more complete sentences
    return post_process_answer(answer, context)

# Iterate over each question and find the best context chunk to answer it
for question in questions:
    answer = get_answer_from_context(question, context)

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

Question: Where are Small healthcare organizations located?
Answer: Small healthcare organizations (SHCO) under Aravinda Eyecare System are located in Tirupur, Dindigul, Tuticorin, and Udumalpet. Larger healthcare organizations (HCO) are located in Madurai, Tirunelveli, Coimbatore, Pondicherry, Theni, and Salem.



In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, AdamW, get_linear_schedule_with_warmup

# Load training and test data
with open('/content/sample_data/train.json', 'r') as f:
    train_data = json.load(f)

with open('/content/sample_data/test.json', 'r') as f:
    test_data = json.load(f)

# Define Dataset Class
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['context']
        question = item['qas'][0]['question']
        answers = item['qas'][0]['answers'][0]

        # Tokenize the input text with offset mapping
        inputs = self.tokenizer(
            question, context,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        offset_mapping = inputs['offset_mapping'].squeeze(0)
        input_ids = inputs['input_ids'].squeeze(0)
        start_char = answers['answer_start']
        end_char = start_char + len(answers['text'])

        # Convert character positions to token positions
        token_start_index = 0
        token_end_index = 0

        for i, (start, end) in enumerate(offset_mapping):
            if start <= start_char < end:
                token_start_index = i
            if start < end_char <= end:
                token_end_index = i
                break

        inputs.update({
            'start_positions': torch.tensor(token_start_index, dtype=torch.long),
            'end_positions': torch.tensor(token_end_index, dtype=torch.long)
        })

        # Remove offset_mapping as it's not needed for the model
        inputs.pop('offset_mapping')

        # Flatten the tensors as this is not a batched input
        for key in inputs:
            inputs[key] = inputs[key].squeeze(0)

        return inputs

# Load the tokenizer and dataset
model_name = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
train_dataset = QADataset(train_data, tokenizer)
test_dataset = QADataset(test_data, tokenizer)

# Create DataLoader for training and testing
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Initialize the model
model = RobertaForQuestionAnswering.from_pretrained(model_name)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3  # Assuming 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):  # Train for 3 epochs
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            start_positions=batch['start_positions'],
            end_positions=batch['end_positions']
        )
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss:.4f}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine-tuned-roberta-qa")
tokenizer.save_pretrained("./fine-tuned-roberta-qa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 5.8828
Epoch 2, Loss: 5.6540
Epoch 3, Loss: 5.4870


('./fine-tuned-roberta-qa/tokenizer_config.json',
 './fine-tuned-roberta-qa/special_tokens_map.json',
 './fine-tuned-roberta-qa/vocab.json',
 './fine-tuned-roberta-qa/merges.txt',
 './fine-tuned-roberta-qa/added_tokens.json',
 './fine-tuned-roberta-qa/tokenizer.json')

In [4]:
!pip install transformers datasets torch



In [None]:
!pip install huggingface_hub



In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset

model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name, low_cpu_mem_usage=True).to('cuda')

train_dataset = load_dataset('json', data_files={'train': '/content/sample_data/train.json'})['train']
test_dataset = load_dataset('json', data_files={'test': '/content/sample_data/test.json'})['test']

def preprocess_function(examples):
    questions = [q['question'] for q in examples['qas']]
    contexts = [examples['context'] for _ in examples['qas']]
    answers = [q['answers'][0]['text'] for q in examples['qas']]
    start_positions = [q['answers'][0]['answer_start'] for q in examples['qas']]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation=True,
        padding='max_length',
        max_length=256  # Reduced max length
    )

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = [
        start + len(tokenizer.encode(answer, add_special_tokens=False)) for start, answer in zip(start_positions, answers)
    ]

    return tokenized_examples

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True  # Enable mixed precision
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()
trainer.save_model("./fine-tuned-model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ValueError: Unrecognized configuration class <class 'transformers.models.mistral.configuration_mistral.MistralConfig'> for this kind of AutoModel: AutoModelForQuestionAnswering.
Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, CamembertConfig, CanineConfig, ConvBertConfig, Data2VecTextConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, ElectraConfig, ErnieConfig, ErnieMConfig, FalconConfig, FlaubertConfig, FNetConfig, FunnelConfig, GPT2Config, GPTNeoConfig, GPTNeoXConfig, GPTJConfig, IBertConfig, LayoutLMv2Config, LayoutLMv3Config, LEDConfig, LiltConfig, LlamaConfig, LongformerConfig, LukeConfig, LxmertConfig, MarkupLMConfig, MBartConfig, MegaConfig, MegatronBertConfig, MobileBertConfig, MPNetConfig, MptConfig, MraConfig, MT5Config, MvpConfig, NezhaConfig, NystromformerConfig, OPTConfig, QDQBertConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, SplinterConfig, SqueezeBertConfig, T5Config, UMT5Config, XLMConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, YosoConfig.

In [2]:
!pip install transformers datasets torch



In [46]:
import json
import torch
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Load the data from JSON files
train_file = '/content/sample_data/train.json'
eval_file = '/content/sample_data/test.json'

with open(train_file, 'r') as f:
    train_data = json.load(f)

with open(eval_file, 'r') as f:
    eval_data = json.load(f)

# Custom Dataset Class for Handling JSON Data
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # Return the total number of Q&A pairs in the dataset
        return sum(len(item['qas']) for item in self.data)

    def __getitem__(self, idx):
        # Find the corresponding context and Q&A pair based on the index
        for item in self.data:
            if idx < len(item['qas']):
                qa_pair = item['qas'][idx]
                break
            idx -= len(item['qas'])

        context = item['context']
        question = qa_pair['question']
        answers = qa_pair['answers']

        # Tokenize input
        inputs = self.tokenizer(
            question, context,
            max_length=self.max_length, truncation=True, padding="max_length", return_offsets_mapping=True, return_tensors="pt"
        )

        # Get offset mappings for start and end positions
        offset_mapping = inputs.pop("offset_mapping")[0]

        # We assume only one answer per Q&A pair, so we handle only the first answer
        start_char = answers[0]['answer_start']
        end_char = start_char + len(answers[0]['text'])

        # Find the start and end token positions
        token_start_index = None
        token_end_index = None

        for i, (start, end) in enumerate(offset_mapping):
            if start <= start_char < end:
                token_start_index = i
            if start < end_char <= end:
                token_end_index = i
                break

        # Handle cases where start or end token indices are not found
        if token_start_index is None:
            token_start_index = 0  # Set to beginning of context if start not found
        if token_end_index is None:
            token_end_index = inputs['input_ids'].shape[1] - 1  # Set to end of context if end not found

        # Add start and end positions to inputs
        inputs.update({
            'start_positions': torch.tensor(token_start_index, dtype=torch.long),
            'end_positions': torch.tensor(token_end_index, dtype=torch.long)
        })

        # Flatten the tensors as this is not a batched input
        for key in inputs:
            inputs[key] = inputs[key].squeeze(0)

        return inputs

# Load the datasets
model_name = "distilbert/distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
train_dataset = QADataset(train_data, tokenizer)
eval_dataset = QADataset(eval_data, tokenizer)

# Create DataLoaders for the datasets
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)

# Initialize the model
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    eval_steps=500,  # Evaluate every 500 steps
    learning_rate=2e-5,  # Adjusted for DistilBERT
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.0005,
    save_steps=5_000,
    save_total_limit=3,
    logging_dir='./logs',
    report_to="none",
    warmup_steps=500,
    fp16=True  # Use mixed precision if supported
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine-tuned-distilbert")
tokenizer.save_pretrained("./fine-tuned-distilbert")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Evaluation results: {'eval_loss': 6.156252384185791, 'eval_runtime': 3.1685, 'eval_samples_per_second': 1.578, 'eval_steps_per_second': 0.316, 'epoch': 3.0}


('./fine-tuned-distilbert/tokenizer_config.json',
 './fine-tuned-distilbert/special_tokens_map.json',
 './fine-tuned-distilbert/vocab.txt',
 './fine-tuned-distilbert/added_tokens.json',
 './fine-tuned-distilbert/tokenizer.json')

In [47]:
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering
import torch

# Load the fine-tuned model and tokenizer
model = DistilBertForQuestionAnswering.from_pretrained("./fine-tuned-distilbert")
tokenizer = DistilBertTokenizerFast.from_pretrained("./fine-tuned-distilbert")


In [52]:
def answer_question(question, context):
    # Tokenize the input question and context
    inputs = tokenizer(
        question, context,
        max_length=512, truncation=True, padding="max_length", return_tensors="pt"
    )

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the start and end logits from the model outputs
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Find the start and end token positions with the highest scores
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)

    # Ensure that the start_index is not beyond end_index
    if start_index > end_index:
        end_index = start_index

    # Convert token IDs to actual words
    input_ids = inputs['input_ids'].tolist()[0] # Convert tensor to list
    answer_tokens = input_ids[start_index:end_index + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    # Post-process to remove any overlap with the question
    question_tokens = tokenizer.tokenize(question)
    answer_tokens = tokenizer.tokenize(answer)

    # Remove question tokens from the beginning of the answer
    for q_token in question_tokens:
        if q_token in answer_tokens and answer_tokens[0] == q_token:
            answer_tokens.pop(0)

    # Convert tokens back to the final answer
    final_answer = tokenizer.convert_tokens_to_string(answer_tokens)

    # Further clean up the answer by removing any [SEP] tokens or extraneous text
    final_answer = final_answer.split('[SEP]')[0].strip()

    return final_answer

In [62]:
# Define a sample context and question
context = (
    #"Aravinda Eyecare System's vision is to eliminate needless blindness."
   "The mission of Aravinda Eyecare System is to eliminate needless blindness by providing compassionate and quality eye care affordable to all."
)
question = "Mission of aravind eye care system"

# Get the model's answer
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")


Question: Mission of aravind eye care system
Answer: the mission of aravinda eyecare system is to eliminate needless blindness by providing compassionate and quality eye care affordable to all.
