###PDFMINER

In [None]:
#!pip install pdfminer.six

import requests
from pdfminer.high_level import extract_text

DOWNLOAD_PATH = "/content/Glaucoma.pdf"
# URL = "https://www.bkconnection.com/static/Infinite_Vision_EXCERPT.pdf"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Accept-Language': 'en-GB,en;q=0.5',
    'Referer': 'https://google.com',
    'DNT': '1',
}

def extract_text_from_pages(pdf_path, start_page, end_page):
    text = ''
    # extract_text has a page_numbers parameter to specify the pages
    for page_num in range(start_page-1, end_page):
        page_text = extract_text(pdf_path, page_numbers=[page_num])
        text += page_text
    return text

start_page = 9
end_page = 24

raw_data = extract_text_from_pages(DOWNLOAD_PATH, start_page, end_page)

with open("/content/infinite_vision.txt", "w") as f:
    f.write(raw_data)
f.close()


In [None]:
!pip install pdfminer.six

###ROBERTA

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, AdamW, get_linear_schedule_with_warmup

# Load training and test data
with open('train.json', 'r') as f:
    train_data = json.load(f)

with open('test.json', 'r') as f:
    test_data = json.load(f)

class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['context']
        question = item['qas'][0]['question']
        answers = item['qas'][0]['answers'][0]

        # Tokenize the input text with offset mapping
        inputs = self.tokenizer(
            question, context,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        offset_mapping = inputs['offset_mapping'].squeeze(0)
        input_ids = inputs['input_ids'].squeeze(0)
        start_char = answers['answer_start']
        end_char = start_char + len(answers['text'])

        token_start_index = 0
        token_end_index = 0

        for i, (start, end) in enumerate(offset_mapping):
            if start <= start_char < end:
                token_start_index = i
            if start < end_char <= end:
                token_end_index = i
                break

        inputs.update({
            'start_positions': torch.tensor(token_start_index, dtype=torch.long),
            'end_positions': torch.tensor(token_end_index, dtype=torch.long)
        })

        inputs.pop('offset_mapping')
        for key in inputs:
            inputs[key] = inputs[key].squeeze(0)

        return inputs

model_name = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
train_dataset = QADataset(train_data, tokenizer)
test_dataset = QADataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model = RobertaForQuestionAnswering.from_pretrained(model_name)

optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3 
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            start_positions=batch['start_positions'],
            end_positions=batch['end_positions']
        )
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss:.4f}")

model.save_pretrained("./fine-tuned-roberta-qa")
tokenizer.save_pretrained("./fine-tuned-roberta-qa")


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 5.7724
Epoch 2, Loss: 5.3937
Epoch 3, Loss: 5.2544


('./fine-tuned-roberta-qa/tokenizer_config.json',
 './fine-tuned-roberta-qa/special_tokens_map.json',
 './fine-tuned-roberta-qa/vocab.json',
 './fine-tuned-roberta-qa/merges.txt',
 './fine-tuned-roberta-qa/added_tokens.json',
 './fine-tuned-roberta-qa/tokenizer.json')

In [None]:
import torch
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering

model_path = "./fine-tuned-roberta-qa"
model = RobertaForQuestionAnswering.from_pretrained(model_path)
tokenizer = RobertaTokenizerFast.from_pretrained(model_path)

model.eval()

context = ("Aravinda Eyecare System's vision is to eliminate needless blindness. The mission of Aravinda Eyecare, system is to eliminate needless blindness by providing compassionate and quality eye care affordable to all. NABH stands for National Accreditation Board for Hospitals and Healthcare providers.")
questions = [
    "Where are Small healthcare organizations located?",
    "What is the vision of Aravinda Eyecare System?",
    # "What are the constituents of accreditation?",
    # "Where are Larger healthcare organizations located?",
    "What is the mission of Aravinda Eyecare System?",
    # "What is the NABH process?",
    # "What are the constituents of NABH?",
    # "What is a Healthcare organization?",
    # "What types of organizations does NABH accredit?",
]

def post_process_answer(answer, context):
    while answer and not answer.endswith(('.', '!', '?')):
        answer_end_index = context.find('.', len(answer))
        if answer_end_index == -1:
            break
        answer = context[:answer_end_index + 1]

    return answer.strip()

def get_answer_from_context(question, context):
    inputs = tokenizer(
        question,
        context,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        return_offsets_mapping=True
    )

    offset_mapping = inputs.pop("offset_mapping")

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    start_probs = torch.softmax(start_logits, dim=-1)
    end_probs = torch.softmax(end_logits, dim=-1)

    start_idx = torch.argmax(start_probs)
    end_idx = torch.argmax(end_probs)

    if start_idx > end_idx:
        end_idx = start_idx + torch.argmax(end_probs[0, start_idx:start_idx + 15])

    for i in range(end_idx + 1, len(end_probs[0])):
        if end_probs[0][i] > 0.5:
            end_idx = i

    answer_start = offset_mapping[0][start_idx][0].item()
    answer_end = offset_mapping[0][end_idx][1].item()
    answer = context[answer_start:answer_end]

    return post_process_answer(answer, context)

for question in questions:
    answer = get_answer_from_context(question, context)

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")


Question: Where are Small healthcare organizations located?
Answer: Aravinda Eyecare System's vision is to eliminate needless blindness. The mission of Aravinda Eyecare, system is to eliminate needless blindness by providing compassionate and quality eye care affordable to all.

Question: What is the vision of Aravinda Eyecare System?
Answer: Aravinda Eyecare System's vision is to eliminate needless blindness. The mission of Aravinda Eyecare, system is to eliminate needless blindness by providing compassionate and quality eye care affordable to all.

Question: What is the mission of Aravinda Eyecare System?
Answer: Aravinda Eyecare System's vision is to eliminate needless blindness. The mission of Aravinda Eyecare, system is to eliminate needless blindness by providing compassionate and quality eye care affordable to all.



###LLAMA-2-7B

In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_

In [None]:
from huggingface_hub import login
login(token='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')  # Replace YOUR_TOKEN with your actual token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LlamaTokenizer, LlamaForQuestionAnswering, AdamW, get_linear_schedule_with_warmup

with open('train.json', 'r') as f:
    train_data = json.load(f)

with open('test.json', 'r') as f:
    test_data = json.load(f)

class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['context']
        question = item['qas'][0]['question']
        answers = item['qas'][0]['answers'][0]

        inputs = self.tokenizer(
            question, context,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        offset_mapping = inputs['offset_mapping'].squeeze(0)
        input_ids = inputs['input_ids'].squeeze(0)
        start_char = answers['answer_start']
        end_char = start_char + len(answers['text'])

        token_start_index = 0
        token_end_index = 0

        for i, (start, end) in enumerate(offset_mapping):
            if start <= start_char < end:
                token_start_index = i
            if start < end_char <= end:
                token_end_index = i
                break

        inputs.update({
            'start_positions': torch.tensor(token_start_index, dtype=torch.long),
            'end_positions': torch.tensor(token_end_index, dtype=torch.long)
        })

        inputs.pop('offset_mapping')

        for key in inputs:
            inputs[key] = inputs[key].squeeze(0)

        return inputs

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
train_dataset = QADataset(train_data, tokenizer)
test_dataset = QADataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

model = LlamaForQuestionAnswering.from_pretrained(model_name)

optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            start_positions=batch['start_positions'],
            end_positions=batch['end_positions']
        )
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss:.4f}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine-tuned-llama-qa")
tokenizer.save_pretrained("./fine-tuned-llama-qa")


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like meta-llama/Llama-2-7b-chat-hf is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cl

In [None]:
import torch
from transformers import LlamaTokenizer, LlamaForQuestionAnswering

model_path = "./fine-tuned-llama-qa"
model = LlamaForQuestionAnswering.from_pretrained(model_path)
tokenizer = LlamaTokenizer.from_pretrained(model_path)

model.eval()

context = ("Aravinda Eyecare System's vision is to eliminate needless blindness. The mission of Aravinda Eyecare. System is to eliminate needless blindness by providing compassionate and quality eye care affordable to all. NABH stands for National Accreditation Board for Hospitals and Healthcare providers. It is a constituent board of the Quality Council of India (QCI) and was set up to establish and operate an accreditation programme for healthcare organizations in India. Initially, it was a voluntary programme but now includes mandated entry-level standards. NABH accredits all types of healthcare delivery organizations, including blood banks, imaging centers, AYUSH, nursing homes, clinics, and more. The NABH accreditation process involves three levels: Pre-accreditation entry level, Pre-accreditation progressive level, and Full accreditation. Healthcare organizations can be classified into two main categories: Small healthcare organizations (SHCO), which have less than 50 beds, and larger healthcare organizations (HCO), which have more than 50 beds. Small healthcare organizations (SHCO) under Aravinda Eyecare System are located in Tirupur, Dindigul, Tuticorin, and Udumalpet. Larger healthcare organizations (HCO) are located in Madurai, Tirunelveli, Coimbatore, Pondicherry, Theni, and Salem. The constituents of accreditation include Structure, Process, and Outcome.")

questions = [
    "Where are Small healthcare organizations located?",
    "What is the vision of Aravinda Eyecare System?",
    "What are the constituents of accreditation?",
    "Where are Larger healthcare organizations located?",
    "What is the mission of Aravinda Eyecare System?",
    "What is the NABH process?",
    "What are the constituents of NABH?",
    "What is a Healthcare organization?",
    "What types of organizations does NABH accredit?",
]

def post_process_answer(answer, context):
    while answer and not answer.endswith(('.', '!', '?')):
        answer_end_index = context.find('.', len(answer))
        if answer_end_index == -1:  # No more sentences found
            break
        answer = context[:answer_end_index + 1]

    return answer.strip()

def get_answer_from_context(question, context):
    inputs = tokenizer(
        question,
        context,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        return_offsets_mapping=True
    )

    offset_mapping = inputs.pop("offset_mapping")

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    start_probs = torch.softmax(start_logits, dim=-1)
    end_probs = torch.softmax(end_logits, dim=-1)

    start_idx = torch.argmax(start_probs)
    end_idx = torch.argmax(end_probs)

    if start_idx > end_idx:
        end_idx = start_idx + torch.argmax(end_probs[0, start_idx:start_idx + 15])

    for i in range(end_idx + 1, len(end_probs[0])):
        if end_probs[0][i] > 0.5:
            end_idx = i

    answer_start = offset_mapping[0][start_idx][0].item()
    answer_end = offset_mapping[0][end_idx][1].item()
    answer = context[answer_start:answer_end]

    return post_process_answer(answer, context)

for question in questions:
    answer = get_answer_from_context(question, context)

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")


###GPT2

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item['question']
        answer = item['answer']

        input_text = f"Q: {question} A:"
        target_text = answer

        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)
        targets = self.tokenizer(target_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        input_ids = inputs['input_ids'].squeeze(0)
        target_ids = targets['input_ids'].squeeze(0)

        return {
            'input_ids': input_ids,
            'labels': target_ids
        }

def load_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    data = []
    for i in range(0, len(lines), 2):
        if i + 1 < len(lines):
            question = lines[i].strip().replace("Q: ", "")
            answer = lines[i + 1].strip().replace("A: ", "")
            data.append({'question': question, 'answer': answer})

    return data
)

data = load_data('train.txt')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
dataset = QADataset(data, tokenizer)
)

model = GPT2LMHeadModel.from_pretrained("gpt2")
)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

model.save_pretrained("./fine-tuned-gpt2-qa")
tokenizer.save_pretrained("./fine-tuned-gpt2-qa")




ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

###FALCON

In [None]:
!pip install transformers torch datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset, load_dataset

class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['context']
        qas = item['qas'][0]
        question = qas['question']
        answer = qas['answers'][0]['text']
        answer_start = qas['answers'][0]['answer_start']

        inputs = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        start_position = inputs.char_to_token(answer_start)
        end_position = start_position + len(self.tokenizer.tokenize(answer)) - 1

        if start_position is None:
            start_position = self.max_length

        if end_position is None:
            end_position = self.max_length

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'start_positions': torch.tensor(start_position, dtype=torch.long),
            'end_positions': torch.tensor(end_position, dtype=torch.long)
        }

def load_data(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

data = load_data('train.json')
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")  # Replace with the specific Falcon model you are using

model = AutoModelForQuestionAnswering.from_pretrained("tiiuae/falcon-7b")  # Replace with the specific Falcon model you are using

dataset = QADataset(data, tokenizer)

def convert_to_dict(dataset):
    return {
        'input_ids': [d['input_ids'].tolist() for d in dataset],
        'attention_mask': [d['attention_mask'].tolist() for d in dataset],
        'start_positions': [d['start_positions'].item() for d in dataset],
        'end_positions': [d['end_positions'].item() for d in dataset]
    }

hf_dataset = HFDataset.from_dict(convert_to_dict(dataset))

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset,
    eval_dataset=hf_dataset,
)

trainer.train()

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model.resize_token_embeddings(len(tokenizer))
model.save_pretrained("./fine-tuned-falcon-qa")
tokenizer.save_pretrained("./fine-tuned-falcon-qa")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
!pip uninstall -y pyarrow
!pip install pyarrow --no-cache-dir

Found existing installation: pyarrow 17.0.0
Uninstalling pyarrow-17.0.0:
  Successfully uninstalled pyarrow-17.0.0
Collecting pyarrow
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m293.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed pyarrow-17.0.0


###DistilBERT

In [None]:
!pip install datasets transformers torch



In [None]:
import json
import torch
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

train_file = 'train_2.json'
eval_file = 'test_2.json'

with open(train_file, 'r') as f:
    train_data = json.load(f)

with open(eval_file, 'r') as f:
    eval_data = json.load(f)

class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return sum(len(item['qas']) for item in self.data)

    def __getitem__(self, idx):
        for item in self.data:
            if idx < len(item['qas']):
                qa_pair = item['qas'][idx]
                break
            idx -= len(item['qas'])

        context = item['context']
        question = qa_pair['question']
        answers = qa_pair['answers']

        inputs = self.tokenizer(
            question, context,
            max_length=self.max_length, truncation=True, padding="max_length", return_offsets_mapping=True, return_tensors="pt"
        )

        offset_mapping = inputs.pop("offset_mapping")[0]

        start_char = answers[0]['answer_start']
        end_char = start_char + len(answers[0]['text'])

        token_start_index = None
        token_end_index = None

        for i, (start, end) in enumerate(offset_mapping):
            if start <= start_char < end:
                token_start_index = i
            if start < end_char <= end:
                token_end_index = i
                break

        if token_start_index is None:
            token_start_index = 0 
        if token_end_index is None:
            token_end_index = inputs['input_ids'].shape[1] - 1

        inputs.update({
            'start_positions': torch.tensor(token_start_index, dtype=torch.long),
            'end_positions': torch.tensor(token_end_index, dtype=torch.long)
        })

        for key in inputs:
            inputs[key] = inputs[key].squeeze(0)

        return inputs

model_name = "distilbert/distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
train_dataset = QADataset(train_data, tokenizer)
eval_dataset = QADataset(eval_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)

model = DistilBertForQuestionAnswering.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.0005,
    save_steps=5_000,
    save_total_limit=3,
    logging_dir='./logs',
    report_to="none",
    warmup_steps=500,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()

eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

model.save_pretrained("./fine-tuned-distilbert")
tokenizer.save_pretrained("./fine-tuned-distilbert")


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Evaluation results: {'eval_loss': 6.010899543762207, 'eval_runtime': 13.542, 'eval_samples_per_second': 0.665, 'eval_steps_per_second': 0.074, 'epoch': 3.0}


('./fine-tuned-distilbert/tokenizer_config.json',
 './fine-tuned-distilbert/special_tokens_map.json',
 './fine-tuned-distilbert/vocab.txt',
 './fine-tuned-distilbert/added_tokens.json',
 './fine-tuned-distilbert/tokenizer.json')

In [None]:

from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering
import torch

model = DistilBertForQuestionAnswering.from_pretrained("./fine-tuned-distilbert")
tokenizer = DistilBertTokenizerFast.from_pretrained("./fine-tuned-distilbert")


def answer_question(question, context):
    inputs = tokenizer(
        question, context,
        max_length=512, truncation=True, padding="max_length", return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)

    if start_index > end_index:
        end_index = start_index

    input_ids = inputs['input_ids'].tolist()[0]
    answer_tokens = input_ids[start_index:end_index + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    question_tokens = tokenizer.tokenize(question)
    answer_tokens = tokenizer.tokenize(answer)

    for q_token in question_tokens:
        if q_token in answer_tokens and answer_tokens[0] == q_token:
            answer_tokens.pop(0)

    final_answer = tokenizer.convert_tokens_to_string(answer_tokens)

    # Further clean up the answer by removing any [SEP] tokens or extraneous text
    final_answer = final_answer.split('[SEP]')[0].strip()

    return final_answer


In [None]:

context = (
   "To handle mercury spill management: Step 1 is to isolate and ventilate the area by closing doors or blocking access. Avoid walking through the area and turn off HVAC (Heating, Ventilation, Air Conditioning) systems. Step 2 is to prepare to respond by removing jewelry, watches, and any other clothing with metal, and wearing gloves, a mask, and other appropriate clothing during the response. Step 3 involves clean-up. Bio Medical Waste is categorized into two main types: Infected Waste and Non-Infected Waste. Chemical Waste includes expired drugs, cytotoxic drugs, narcotic drugs, and radioactive drugs. Anatomical waste consists of human body parts, tissues, lab cultures and specimens, cotton and bandages, dressings, swabs, and solid plaster. To make a complaint at Aravind, contact the manager or coordinator in the department you consulted or the Patient Care Manager at 9443953561. The mission of Aravinda Eyecare System is to eliminate needless blindness by providing compassionate and quality eye care affordable to all. The quality policy of Aravinda Eye Care System is to ensure quality in every aspect of eye care, delivered in a timely manner with utmost safety to the patient, thus achieving a high level of patient satisfaction. Healthcare organizations can be classified into two main categories: Small healthcare organizations (SHCO), which have less than 50 beds, and larger healthcare organizations (HCO), which have more than 50 beds. Sharps include needles/blades, scalpels, lancets, and anything that can cut or puncture."
)
question = "mercury spill"

answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: mercury spill
Answer: into two main types : infected waste and non - infected waste. chemical waste includes expired drugs, cytotoxic drugs, narcotic drugs, and radioactive drugs. anatomical waste consists of human body parts, tissues, lab cultures and specimens, cotton and bandages, dressings, swabs, and solid plaster. to make a complaint at aravind, contact the manager or coordinator in the department you consulted or the patient care manager at 9443953561. the mission of aravinda eyecare system is to eliminate needless blindness by providing compassionate and quality eye care affordable to
