In [9]:
from transformers import BioGptForSequenceClassification, BioGptTokenizer, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

In [21]:

# Load the dataset
df = pd.read_csv('/Users/cagriefe/Git_pull/Disease-Prediction-Using-Machine-Learning/data/treatment_plan/healifyLLM_answer_dataset.csv')

# Preprocess the dataset
df['text'] = df['label'] + ' ' + df['answer']
df = df[['text', 'disease']]

# Check for missing values
print("Missing values in dataset:")
print(df.isnull().sum())

# Drop rows with missing values
df = df.dropna()

# Display the first few rows of the dataframe
print(df.head())

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

Missing values in dataset:
text       0
disease    0
dtype: int64
                                                text   disease
0  diabetes definition Diabetes mellitus is a met...  diabetes
1  diabetes symptoms Diabetes symptoms are caused...  diabetes
2  diabetes causes Different causes are associate...  diabetes
3  diabetes risks Certain factors increase your r...  diabetes
4  diabetes complications High blood sugar damage...  diabetes


In [22]:
class MedicalDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.disease
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        target = str(self.targets[index])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': target
        }

In [23]:
# Load the tokenizer and model
tokenizer = BioGptTokenizer.from_pretrained('microsoft/BioGPT')
model = BioGptForSequenceClassification.from_pretrained('microsoft/BioGPT')

# Create DataLoader for training and validation sets
train_dataset = MedicalDataset(train_df, tokenizer, max_len=512)
val_dataset = MedicalDataset(val_df, tokenizer, max_len=512)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


Some weights of BioGptForSequenceClassification were not initialized from the model checkpoint at microsoft/BioGPT and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
import accelerate
import transformers
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

KeyError: 31