<a href="https://colab.research.google.com/github/Zhu-Pengming/Plant/blob/main/QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall accelerate -y
!pip install accelerate>=0.21.0

[0m

In [None]:
!pip install accelerate --upgrade




In [None]:
!pip uninstall transformers -y
!pip install transformers[torch]

import accelerate
print(accelerate.__version__)


Found existing installation: transformers 4.40.2
Uninstalling transformers-4.40.2:
  Successfully uninstalled transformers-4.40.2
Collecting transformers[torch]
  Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
Successfully installed transformers-4.40.2
0.30.1


In [None]:
import pandas as pd
from transformers import BertForQuestionAnswering, BertTokenizerFast, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset, DataLoader
import os
from google.colab import drive

# Debugging aid: ask CUDA to launch kernels synchronously so errors are reported in sync
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Step 1: mount Google Drive and read every sheet of the workbook
drive.mount('/content/drive', force_remount=True)

excel_path = '/content/drive/My Drive/modified_hhhhh.xlsx'
xls = pd.ExcelFile(excel_path)
sheet_data = dict((name, xls.parse(name)) for name in xls.sheet_names)

# Step 2: turn each row of 'Sheet1' into one templated question and one answer
plant_data = sheet_data['Sheet1']
questions, answers = [], []

for _row_label, record in plant_data.iterrows():
    # The question names the aspect and species; the answer is the information
    # text followed by the measure in parentheses
    prompt = f"What is the {record['Aspect']} of {record['Species']}?"
    reply = f"{record['Information']} ({record['Measure']})"
    questions.append(prompt)
    answers.append(reply)


# Tokenizer and sequence length shared by the dataset class below
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
max_length = 512  # BERT's maximum input length

class PlantQADataset(Dataset):
    """Extractive-QA training examples built from parallel question/answer lists.

    For every pair, the context is the question, a space, and the answer. The
    (question, context) pair is encoded with the module-level ``tokenizer``,
    padded/truncated to the module-level ``max_length`` (only the context is
    truncated), and the answer's character span inside the context is mapped
    to token indices.

    Only tokens belonging to the context segment are considered when mapping
    the span, because question, special and padding tokens carry offsets that
    refer to a different string (or are ``(0, 0)``). When the answer span
    cannot be fully located among the surviving context tokens (for example
    because truncation removed its end), the example is labelled ``(0, 0)``
    instead of an inconsistent pair such as ``(start, 0)``.

    Args:
        questions: iterable of question strings.
        answers: iterable of answer strings, aligned with ``questions``.
    """

    def __init__(self, questions, answers):
        self.encodings = []
        self.start_positions = []
        self.end_positions = []
        for question, answer in zip(questions, answers):
            context = f"{question} {answer}"
            encoding = tokenizer(
                question,
                context,
                truncation='only_second',
                padding='max_length',
                max_length=max_length,
                return_offsets_mapping=True,
            )
            self.encodings.append(encoding)

            # Character span of the answer inside the context string
            answer_start = context.find(answer)
            answer_end = answer_start + len(answer)

            start_position, end_position = self._locate_answer_tokens(
                encoding['offset_mapping'],
                encoding.sequence_ids(),
                answer_start,
                answer_end,
            )
            self.start_positions.append(int(start_position))
            self.end_positions.append(int(end_position))

    @staticmethod
    def _locate_answer_tokens(offsets, sequence_ids, answer_start, answer_end):
        """Map a character span in the context to (start_token, end_token).

        ``offsets`` and ``sequence_ids`` are per-token lists of equal length;
        a token is treated as part of the context when its sequence id is 1.
        Returns ``(0, 0)`` when either boundary is missing or the span would
        be inverted.
        """
        start_position = None
        end_position = None
        for i, ((tok_start, tok_end), seq_id) in enumerate(zip(offsets, sequence_ids)):
            if seq_id != 1:
                # Skip question tokens, special tokens and padding
                continue
            if start_position is None and tok_start <= answer_start < tok_end:
                start_position = i
            if tok_start < answer_end <= tok_end:
                end_position = i

        if start_position is None or end_position is None or end_position < start_position:
            return 0, 0
        return start_position, end_position

    def __getitem__(self, idx):
        """Return the tensors for example ``idx`` (offset mapping excluded) plus its labels."""
        encoding = self.encodings[idx]
        # offset_mapping is only needed while building labels; the model does not accept it
        item = {key: torch.tensor(val) for key, val in encoding.items() if key != 'offset_mapping'}
        item['start_positions'] = torch.tensor(self.start_positions[idx])
        item['end_positions'] = torch.tensor(self.end_positions[idx])
        return item

    def __len__(self):
        """Number of encoded examples."""
        return len(self.encodings)


# Encode all question/answer pairs and wrap them in a batched loader
train_dataset = PlantQADataset(questions, answers)
train_dataloader = DataLoader(train_dataset, batch_size=4)

# Model and Training: use the GPU when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model = model.to(device)

# Hyperparameters handed to the Hugging Face Trainer
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Train the model
trainer.train()

# Step 3: Evaluate and predict
def answer_question(question, context):
    """Return the answer span the module-level ``model`` predicts for a question.

    The question/context pair is encoded with the module-level ``tokenizer``
    (only the context is truncated if the pair is too long), segment ids are
    derived from the position of the first separator token, and the tokens
    between the arg-max start and end logits are decoded back to a string.

    Args:
        question: question text.
        context: passage in which to look for the answer.

    Returns:
        The decoded answer string; empty when the predicted end precedes the
        predicted start.
    """
    input_ids = tokenizer.encode(question, context, truncation='only_second')
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Everything up to and including the first [SEP] is segment A (the question)
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # Inputs must live on the same device as the model, otherwise the forward
    # pass fails once the model has been moved to the GPU
    target_device = model.device
    input_tensor = torch.tensor([input_ids]).to(target_device)
    segment_tensor = torch.tensor([segment_ids]).to(target_device)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        outputs = model(input_tensor, token_type_ids=segment_tensor, return_dict=True)

    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits))
    return tokenizer.convert_tokens_to_string(tokens[start_index:end_index + 1])

import numpy as np
from sklearn.metrics import accuracy_score

# Function to compute accuracy
def compute_accuracy(model, dataset):
    """Exact-match accuracy of predicted answer spans against labelled spans.

    Puts ``model`` in eval mode, iterates ``dataset`` in batches of 4 on the
    module-level ``device``, decodes both the predicted span (arg-max of the
    start/end logits) and the labelled span with the module-level
    ``tokenizer``, and scores the decoded strings with ``accuracy_score``.

    Args:
        model: callable returning an object with ``start_logits`` and ``end_logits``.
        dataset: items providing ``input_ids``, ``attention_mask``,
            ``token_type_ids``, ``start_positions`` and ``end_positions``.

    Returns:
        The value returned by ``accuracy_score`` for the decoded strings.
    """
    model.eval()
    predicted_texts = []
    reference_texts = []

    def _decode(token_ids):
        # ids -> word pieces -> plain string
        pieces = tokenizer.convert_ids_to_tokens(token_ids)
        return tokenizer.convert_tokens_to_string(pieces)

    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=4):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            segments = batch['token_type_ids'].to(device)
            gold_starts = batch['start_positions'].to(device)
            gold_ends = batch['end_positions'].to(device)

            result = model(ids, attention_mask=mask, token_type_ids=segments)
            guess_starts = result.start_logits.argmax(dim=-1)
            guess_ends = result.end_logits.argmax(dim=-1)

            # Decode each example's predicted and labelled span for string comparison
            rows = zip(ids, guess_starts, guess_ends, gold_starts, gold_ends)
            for sequence, guess_lo, guess_hi, gold_lo, gold_hi in rows:
                predicted_texts.append(_decode(sequence[guess_lo:guess_hi + 1]))
                reference_texts.append(_decode(sequence[gold_lo:gold_hi + 1]))

    return accuracy_score(reference_texts, predicted_texts)

# Score the model on train_dataset, the same data the Trainer was fitted on, so
# this figure measures fit to the training examples rather than held-out accuracy
accuracy = compute_accuracy(model, train_dataset)
print(f"Model Accuracy: {accuracy:.2f}")




Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.8413


Model Accuracy: 1.00


In [None]:
def answer_question(question, context):
    """Return the answer span the module-level ``model`` predicts for a question.

    The question/context pair is encoded with the module-level ``tokenizer``
    (only the context is truncated if the pair is too long), segment ids are
    derived from the position of the first separator token, and the tokens
    between the arg-max start and end logits are decoded back to a string.

    Args:
        question: question text.
        context: passage in which to look for the answer.

    Returns:
        The decoded answer string; empty when the predicted end precedes the
        predicted start.
    """
    # Truncate the context rather than letting an over-long pair reach the model
    input_ids = tokenizer.encode(question, context, truncation='only_second')
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Everything up to and including the first [SEP] is segment A (the question)
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # Move tensors to the same device as the model
    device = model.device
    input_tensor = torch.tensor([input_ids]).to(device)
    segment_tensor = torch.tensor([segment_ids]).to(device)

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        outputs = model(input_tensor, token_type_ids=segment_tensor, return_dict=True)

    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits))
    return tokenizer.convert_tokens_to_string(tokens[start_index:end_index + 1])

# Testing the model on a sample question
def test_model_on_example(model, tokenizer, species, aspect, plant_data):
    question = f"What is the {aspect} of {species}?"

    row = plant_data[
        (plant_data['Species'].str.strip().str.lower() == species.strip().lower()) &
        (plant_data['Aspect'].str.strip().str.lower() == aspect.strip().lower())
    ]

    if not row.empty:
        context = f"{row.iloc[0]['Information']} ({row.iloc[0]['Measure']})"
        answer = answer_question(question, context)
        print(f"Question: {question}")
        print(f"Context: {context}")
        print(f"Answer: {answer}")
    else:
        print("No matching row found.")

# Sample example for testing: look up the "Benefits" row for "Dandelion" in
# plant_data and print the question, context and predicted answer
species = "Dandelion"
aspect = "Benefits"
test_model_on_example(model, tokenizer, species, aspect, plant_data)


Question: What is the Benefits of Dandelion?
Context: Dandelion has been used to treat women's health issues, such as breast inflammation and lymph node inflammation. It can also help to promote lactation in nursing mothers. (Dandelion can be used to treat women's health issues such as breast inflammation and lymph node inflammation. It can also help to promote lactation in nursing mothers. However, it should be used under the guidance of a healthcare professional.)
Answer: dandelion has been used to treat women ' s health issues, such as breast inflammation and lymph node inflammation. it can also help to promote lactation in nursing mothers. ( dandelion can be used to treat women ' s health issues such as breast inflammation and lymph node inflammation. it can also help to promote lactation in nursing mothers. however, it should be used under the guidance of a healthcare professional. )


In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer
import tensorflow as tf
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Define the path for saving the model in Google Drive
save_directory = '/content/drive/My Drive/bert_qa_model'

# Save the fine-tuned `model` and its `tokenizer` from the training cell.
# Re-loading 'bert-base-uncased' here would rebind `model` to a checkpoint whose
# QA head (qa_outputs.*) is newly initialised, and that untrained model would be
# written to Drive instead of the trained weights.
if 'model' not in globals() or 'tokenizer' not in globals():
    raise RuntimeError(
        "No trained model in this session: run the training cell before saving."
    )

# Save the model and tokenizer to your Drive
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print("Model and tokenizer have been saved to your Google Drive.")


Mounted at /content/drive


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer have been saved to your Google Drive.


In [None]:
from google.colab import drive
import tensorflow as tf

# Mount Google Drive (force a re-mount) so the Drive paths below resolve
drive.mount('/content/drive', force_remount=True)

# Directory of the TensorFlow SavedModel to convert.
# NOTE(review): no cell visible in this notebook writes to this directory (the
# save cell writes to 'bert_qa_model' via save_pretrained) — confirm where this
# SavedModel export comes from and that it contains the fine-tuned weights.
model_directory = '/content/drive/My Drive/bert_qa_model_tf/saved_model/1'

# Convert the SavedModel to TFLite and write the result to Drive. Any exception
# raised during conversion or writing is caught and printed, not re-raised.
try:
    converter = tf.lite.TFLiteConverter.from_saved_model(model_directory)
    # Apply the converter's default optimization setting
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_model = converter.convert()

    # Destination on Drive for the converted model; written in binary mode
    tflite_model_path = '/content/drive/My Drive/bert_qa_model.tflite'
    with open(tflite_model_path, 'wb') as f:
        f.write(tflite_model)
    print("TFLite model is saved to Google Drive.")
except Exception as e:
    print("An error occurred:", e)


Mounted at /content/drive
TFLite model is saved to Google Drive.
