<a href="https://colab.research.google.com/github/Zhu-Pengming/Flora-Talks/blob/main/3320.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from google.colab import drive

# Mount Google Drive so the dataset file below can be read from it
drive.mount('/content/drive')

# Load the pretrained 'gpt2' tokenizer and language-modeling model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Fall back to EOS as the PAD token when the tokenizer defines none
# (the dataset pads every example, which requires a PAD token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Path to the Excel file holding the question/answer pairs
excel_file_path = '/content/drive/My Drive/dataset.xlsx'

# Load data from Excel: the first column is used as the question and the
# second column as the answer
df = pd.read_excel(excel_file_path)

# Blank cells are read as NaN and would otherwise be formatted into the
# training text as the literal string "nan"; keep only complete pairs.
qa_pairs = df.iloc[:, :2].dropna()
texts = [
    f"Q: {q}\nA: {a}"
    for q, a in zip(qa_pairs.iloc[:, 0].tolist(), qa_pairs.iloc[:, 1].tolist())
]

# Define the TextDataset class
class TextDataset(Dataset):
    """Tokenized texts prepared for causal language-model fine-tuning.

    All texts are tokenized once in the constructor, truncated and padded to
    ``max_length``. Each item exposes the tokenizer's fields as tensors plus
    a ``labels`` tensor for the language-modeling loss.

    Args:
        texts: List of strings to tokenize.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding='max_length', max_length=max_length)``; it must return a
            mapping of field name to per-example lists that includes
            ``input_ids``.
        max_length: Length every sequence is truncated/padded to.
    """

    # Label value that the cross-entropy loss skips (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, texts, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
        # When PAD is the same token as EOS, the first padding slot after the
        # text doubles as the end-of-text marker and is kept as a target.
        pad_id = getattr(tokenizer, 'pad_token_id', None)
        eos_id = getattr(tokenizer, 'eos_token_id', None)
        self._pad_is_eos = pad_id is not None and pad_id == eos_id

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return the tensors for example ``idx`` together with its ``labels``.

        ``labels`` copies ``input_ids`` but sets padded positions
        (``attention_mask == 0``) to ``IGNORE_INDEX`` so padding does not
        contribute to the loss. If the tokenizer pads with its EOS token, the
        single padding position directly after the last real token is kept so
        the model still learns where the text ends.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        labels = item['input_ids'].clone()

        mask = item.get('attention_mask')
        if mask is not None:
            is_pad = mask == 0
            labels[is_pad] = self.IGNORE_INDEX
            if self._pad_is_eos:
                # A padding slot whose left neighbour is a real token is the
                # first EOS after the text (none exists with left padding).
                follows_real = torch.zeros_like(is_pad)
                follows_real[..., 1:] = ~is_pad[..., :-1]
                eos_slot = is_pad & follows_real
                labels[eos_slot] = item['input_ids'][eos_slot]

        item['labels'] = labels
        return item

# Pick the computation device: CUDA when available, otherwise the CPU,
# and move the model onto it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Tokenize the texts and serve them as shuffled mini-batches of 4 examples
dataset = TextDataset(texts, tokenizer)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

# Training function
def train(model, loader, epochs=3, lr=5e-5, log_every=10):
    """Fine-tune ``model`` in place on the batches produced by ``loader``.

    Batches are moved to the device the model's parameters already live on,
    so the function does not depend on a module-level ``device`` variable.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a scalar ``loss`` attribute.
        loader: Iterable of batches; each batch is a mapping with the tensor
            entries ``input_ids``, ``attention_mask`` and ``labels``.
        epochs: Number of full passes over ``loader``.
        lr: Learning rate handed to the Adam optimizer.
        log_every: Print the loss every ``log_every`` batches; a falsy value
            disables the progress output.
    """
    # Use the model's own device instead of relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        for batch_idx, batch in enumerate(loader):
            inputs = batch['input_ids'].to(target_device)
            masks = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)

            # Standard step: clear stale gradients, forward, backward, update.
            optimizer.zero_grad()
            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            if log_every and batch_idx % log_every == 0:
                print(f"Epoch {epoch + 1}, Batch {batch_idx}, Loss: {loss.item()}")

# Run the fine-tuning loop with its default settings
train(model, loader)

# Write the fine-tuned model and the tokenizer into the same Drive folder
finetuned_dir = '/content/drive/My Drive/3320/finetuned_gpt2'
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

# Import necessary libraries (repeated here; already imported at the top)
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Read the fine-tuned tokenizer and model back from that folder
tokenizer = GPT2Tokenizer.from_pretrained(finetuned_dir)
model = GPT2LMHeadModel.from_pretrained(finetuned_dir)

# Place the reloaded model on the computation device and switch it to
# evaluation mode
model.to(device)
model.eval()

# Function to generate text
def generate_text(prompt, max_length=100):
    """Sample a continuation of ``prompt`` from the module-level ``model``.

    Args:
        prompt: Text to continue.
        max_length: Value passed to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    # Tokenize the prompt and move every resulting tensor to the model's device
    encoded = tokenizer(prompt, return_tensors='pt', add_special_tokens=True)
    on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Sampling configuration for generation
    sampling_options = {
        'max_length': max_length,
        'num_return_sequences': 1,
        'pad_token_id': tokenizer.eos_token_id,
        'no_repeat_ngram_size': 2,  # never repeat the same 2-gram
        'top_p': 0.95,  # nucleus sampling threshold
        'top_k': 50,  # restrict sampling to the 50 most likely tokens
        'do_sample': True,  # sample instead of decoding greedily
    }
    sequences = model.generate(**on_device, **sampling_options)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# Sample question in the same "Q: ...\nA:" layout as the training texts
prompt = "Q: How to solve powdery mildew ?\nA:"

# Ask the fine-tuned model to complete the answer and display the result
generated_text = generate_text(prompt=prompt)
print("Generated Text:", generated_text)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1, Batch 0, Loss: 8.442889213562012
Epoch 1, Batch 10, Loss: 0.39113685488700867
Epoch 2, Batch 0, Loss: 0.4559739828109741
Epoch 2, Batch 10, Loss: 0.28849339485168457
Epoch 3, Batch 0, Loss: 0.3130795359611511
Epoch 3, Batch 10, Loss: 0.18613019585609436
Generated Text: Q: How to solve powdery mildew?
A: Remove the yellow and white leaves from the stems of the plants.
