In [None]:
import pandas as pd

# Load cleaned data
data = pd.read_csv('cleaned_pubmed_articles.csv')

# Check the first few rows of the dataset
print(data.head())

   PubMed ID                                           Abstract  \
0   38968619  The purpose of this study was to determine the...   
1   38968594  Diabetic nephropathy (DN) is a severe complica...   
2   38968565  Recent studies have revealed the benefits of s...   
3   38968507  The current first-line treatment for atheroscl...   
4   38968490  Ustekinumab has been shown to be effective in ...   

                                        cleaned_text  
0  the purpose of this study was to determine the...  
1  diabetic nephropathy dn is a severe complicati...  
2  recent studies have revealed the benefits of s...  
3  the current firstline treatment for atheroscle...  
4  ustekinumab has been shown to be effective in ...  


In [None]:
from transformers import AutoTokenizer

# Initialize the tokenizer for DialoGPT
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

# Set the pad_token to be the eos_token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the data
def tokenize_function(texts):
    return tokenizer(texts, truncation=True, padding='max_length', max_length=512)

# Ensure the data is in the correct format
texts = data['cleaned_text'].dropna().tolist()  # Drop any NaN values and convert to a list
print(f"Number of texts: {len(texts)}")

# Tokenize the entire dataset
encodings = tokenize_function(texts)
print(encodings.keys())


Number of texts: 9596
dict_keys(['input_ids', 'attention_mask'])


In [None]:
import torch

class PubMedDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create the dataset object
dataset = PubMedDataset(encodings)
print(f"Dataset size: {len(dataset)}")


Dataset size: 9596


In [None]:
############################ Same as Above   #############################333
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler

# Load the DialoGPT model and move it to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium").to(device)

# Set up the DataLoader with pinned memory for faster transfers
train_loader = DataLoader(dataset, batch_size=4, shuffle=True, pin_memory=True)  # Adjust batch size to your system's capacity

# Set up the optimizer, learning rate scheduler, and gradient scaler for mixed precision
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 3  # total steps = number of batches * number of epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
scaler = GradScaler()

# Training loop
model.train()
for epoch in range(3):  # Number of epochs
    for batch in train_loader:
        optimizer.zero_grad()

        # Move tensors to GPU and perform forward pass using mixed precision
        inputs = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)

        with autocast():
            outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
            loss = outputs.loss

        # Backward pass and optimization step with scaled gradients
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # Update learning rate

        print(f"Epoch: {epoch}, Loss: {loss.item()}")

    # Free up memory at the end of each epoch
    torch.cuda.empty_cache()

    # Optional: Early stopping condition could be added here


  scaler = GradScaler()
  with autocast():


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 0, Loss: 1.1868551969528198
Epoch: 0, Loss: 1.3956565856933594
Epoch: 0, Loss: 0.6123048663139343
Epoch: 0, Loss: 0.3887248933315277
Epoch: 0, Loss: 0.9494642019271851
Epoch: 0, Loss: 0.8536862134933472
Epoch: 0, Loss: 1.0573551654815674
Epoch: 0, Loss: 0.9795486927032471
Epoch: 0, Loss: 0.8344095945358276
Epoch: 0, Loss: 1.1826236248016357
Epoch: 0, Loss: 0.6024627089500427
Epoch: 0, Loss: 0.31862619519233704
Epoch: 0, Loss: 1.381318211555481
Epoch: 0, Loss: 0.7475625276565552
Epoch: 0, Loss: 0.6519586443901062
Epoch: 0, Loss: 0.4387052059173584
Epoch: 0, Loss: 0.5282917022705078
Epoch: 0, Loss: 0.7293857336044312
Epoch: 0, Loss: 0.6782596111297607
Epoch: 0, Loss: 0.5222831964492798
Epoch: 0, Loss: 1.11294686794281
Epoch: 0, Loss: 0.16322490572929382
Epoch: 0, Loss: 0.7817498445510864
Epoch: 0, Loss: 0.8199822306632996
Epoch: 0, Loss: 0.6584059596061707
Epoch: 0, Loss: 1.1170663833618164
Epoch: 0, Loss: 0.52577739

# Cell 2: Save the Fine-Tuned Model

In [3]:
# Saving the fine-tuned model
model_save_path = "fine_tuned_DialoGPT_model"
model.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

# Optionally, save the tokenizer as well if it was modified or fine-tuned
tokenizer.save_pretrained(model_save_path)


NameError: name 'model' is not defined

In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/fine_tuned_DialoGPT_model").to('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model.eval()  # Set the model to evaluation mode

# Function to ask a question to the chatbot
def ask_question(question, model, tokenizer, chat_history_ids=None):
    # Encode the new question with the chat history
    new_input_ids = tokenizer.encode(question + tokenizer.eos_token, return_tensors='pt').to(model.device)

    # Append the new input to the chat history
    if chat_history_ids is None:
        chat_history_ids = new_input_ids
    else:
        chat_history_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1)

    # Generate a response from the model
    reply_ids = model.generate(chat_history_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id, num_return_sequences=1)

    # Decode the response
    response = tokenizer.decode(reply_ids[:, chat_history_ids.shape[-1]:][0], skip_special_tokens=True)
    return response, reply_ids

# Initialize chat history
chat_history_ids = None

# Running the dialog
while True:
    question = input("Ask a question: ")
    if question.lower() == 'exit':
        print("Exiting the chat...")
        break
    response, chat_history_ids = ask_question(question, model, tokenizer, chat_history_ids)
    print(f"Chatbot: {response}")


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Ask a question: How can I know I am on border line of getting diabetes?


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Chatbot: You can check the blood sugar levels in the blood of the patients with diabetes and compare them with those of the control group


KeyboardInterrupt: Interrupted by user

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


[link text](https://)