In [None]:
!pip install torch==2.0.1 transformers==4.28.1 --no-cache-dir


Collecting torch==2.0.1
  Downloading torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting transformers==4.28.1
  Downloading transformers-4.28.1-py3-none-any.whl.metadata (109 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/110.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1)
  Downloading nvidia_c

In [None]:
import torch
import pandas as pd

In [None]:
print("PyTorch version:", torch.__version__)  # Should print 2.0.1
print("CUDA available:", torch.cuda.is_available())  # Should be False

import transformers
print("Transformers version:", transformers.__version__)

PyTorch version: 2.0.1+cu117
CUDA available: True
Transformers version: 4.28.1


In [None]:
file_path = '/content/medquad.csv'
df = pd.read_csv(file_path)

In [None]:
# Check the first few rows
print(df.head())
# Check for missing values
print(df.isnull().sum())
df = df.dropna()
# Check data types
print(df.dtypes)

                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
2   Glaucoma  
3   Glaucoma  
4   Glaucoma  
question       0
answer         5
source         0
focus_area    14
dtype: int64
question      object
answer        object
source        object
focus_area    object
dtype: object


In [None]:
from transformers import DistilBertTokenizer
import torch

# Force CPU usage
device = torch.device("cpu")

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Test tokenizer with explicit CPU tensors
sample_text = "What are the symptoms of Glaucoma?"
inputs = tokenizer(sample_text, return_tensors="pt")  # Returns PyTorch tensors
inputs = {key: value.to(device) for key, value in inputs.items()}  # Move to CPU

print("Tokenized Inputs:")
print(inputs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Tokenized Inputs:
{'input_ids': tensor([[  101,  2054,  2024,  1996,  8030,  1997,  1043, 17298,  9006,  2050,
          1029,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# Check for GPU and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode focus_area labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["focus_area"])  # Converts focus_area to numerical labels

# Split data
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from torch.utils.data import Dataset

# Load data
df = pd.read_csv("medquad.csv")

# Encode labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["focus_area"])
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Dataset Class
class IntentDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare datasets
train_dataset = IntentDataset(train_df["question"].tolist(), train_df["label"].tolist())
val_dataset = IntentDataset(val_df["question"].tolist(), val_df["label"].tolist())

# Model (move to device)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(le.classes_)
).to(device)  # <--- KEY CHANGE: Move model to GPU/CPU

# Training arguments (remove `no_cuda=True`)
training_args = TrainingArguments(
    output_dir="./intent_classifier",
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",
    num_train_epochs=3,
    logging_dir="./logs"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train!
trainer.train()



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.we

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maryanator01[0m ([33maryanator01-stony-brook-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.




Epoch,Training Loss,Validation Loss
1,7.79,7.412205
2,6.0196,6.129959
3,4.9501,5.709927


TrainOutput(global_step=4926, training_loss=6.447179759463498, metrics={'train_runtime': 496.8219, 'train_samples_per_second': 79.278, 'train_steps_per_second': 9.915, 'total_flos': 478237030174458.0, 'train_loss': 6.447179759463498, 'epoch': 3.0})

In [None]:
from transformers import DistilBertTokenizer

# Reload the tokenizer from its original source
tokenizer1 = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Save it in the same directory as the trained model
tokenizer1.save_pretrained("./intent_classifier")

print("Tokenizer saved successfully!")




Tokenizer saved successfully!


In [None]:
import pickle
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Paths
model_path = "/content/intent_classifier/checkpoint-4500"  # The folder where your full model is saved
save_path = "./intent_classifier_minimal"  # Folder for minimal version

# Load trained model
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('./intent_classifier')

# Save only the essentials
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Minimal intent classifier saved at: {save_path}")


Minimal intent classifier saved at: ./intent_classifier_minimal


In [None]:
import shutil
zip_path = '/content/intent_classifier_minimal.zip'
shutil.make_archive(zip_path, 'zip', '/content/intent_classifier_minimal')

'/content/intent_classifier_minimal.zip.zip'

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Ensure no NaN values and convert to string
train_df = train_df.dropna(subset=["focus_area", "question", "answer"])
val_df = val_df.dropna(subset=["focus_area", "question", "answer"])

train_df["input_text"] = "Focus: " + train_df["focus_area"].astype(str) + " | Question: " + train_df["question"].astype(str)
val_df["input_text"] = "Focus: " + val_df["focus_area"].astype(str) + " | Question: " + val_df["question"].astype(str)

train_texts = train_df["input_text"].tolist()
train_answers = train_df["answer"].astype(str).tolist()
val_texts = val_df["input_text"].tolist()
val_answers = val_df["answer"].astype(str).tolist()

# Custom Dataset Class
class AnswerDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
        self.targets = tokenizer(targets, max_length=256, truncation=True, padding="max_length", return_tensors="pt")

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx]
        }

    def __len__(self):
        return len(self.inputs["input_ids"])

# Prepare datasets
train_gen_dataset = AnswerDataset(train_texts, train_answers)
val_gen_dataset = AnswerDataset(val_texts, val_answers)

# Load T5 model and move to device
model_t5 = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./answer_generator",
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs_gen"
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model_t5,
    args=training_args,
    train_dataset=train_gen_dataset,
    eval_dataset=val_gen_dataset
)

# Train the model
trainer.train()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["input_text"] = "Focus: " + train_df["focus_area"].astype(str) + " | Question: " + train_df["question"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df["input_text"] = "Focus: " + val_df["focus_area"].astype(str) + " | Question: " + val_df["question"].astype(str)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,2.044,1.828541
2,1.9112,1.752744
3,1.9015,1.733549


TrainOutput(global_step=9837, training_loss=2.0351242898404145, metrics={'train_runtime': 1686.0, 'train_samples_per_second': 23.336, 'train_steps_per_second': 5.835, 'total_flos': 1331255794728960.0, 'train_loss': 2.0351242898404145, 'epoch': 3.0})

In [None]:
# Save the trained model
trainer.save_model("./answer_generator")

# Save the tokenizer
tokenizer.save_pretrained("./answer_generator")

print("Model and tokenizer saved successfully!")


Model and tokenizer saved successfully!


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Path to the final checkpoint (for model)
model_path = "/content/answer_generator/checkpoint-9500"

# Path to the tokenizer (from main model directory)
tokenizer_path = "/content/answer_generator"

# Load the model from the checkpoint
model1 = T5ForConditionalGeneration.from_pretrained(model_path)

# Load the tokenizer from the main directory
tokenizer1 = T5Tokenizer.from_pretrained(tokenizer_path)

# Save only the essential files (model + tokenizer)
save_path = "/content/generator_minimal"
model1.save_pretrained(save_path)
tokenizer1.save_pretrained(save_path)

print(f"Minimal model saved at: {save_path}")


Minimal model saved at: /content/generator_minimal


In [None]:
import shutil
zip_path = '/content/generator_minimal.zip'
shutil.make_archive(zip_path, 'zip', '/content/generator_minimal')

'/content/generator_minimal.zip.zip'

In [None]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load minimal models
intent_model = DistilBertForSequenceClassification.from_pretrained("./intent_classifier_minimal").to(device)
intent_tokenizer = DistilBertTokenizer.from_pretrained("./intent_classifier_minimal")

answer_model = T5ForConditionalGeneration.from_pretrained("./generator_minimal").to(device)
answer_tokenizer = T5Tokenizer.from_pretrained("./generator_minimal")

# Load label encoder (ensure you saved it before)
import pickle
with open("label_encoder.pkl", "rb") as f:
    le = pickle.load(f)

# Function to predict the answer
def predict_answer(question):
    # Predict intent (focus area)
    inputs = intent_tokenizer(question, return_tensors="pt").to(device)  # Move inputs to device
    with torch.no_grad():
        logits = intent_model(**inputs).logits
    predicted_label = torch.argmax(logits).item()
    focus_area = le.inverse_transform([predicted_label])[0]  # Convert label to text

    # Generate answer
    input_text = f"Focus: {focus_area} | Question: {question}"
    inputs = answer_tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)
    outputs = answer_model.generate(**inputs, max_length=256, no_repeat_ngram_size=2)

    return answer_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test
question = "What are the symptoms of Glaucoma?"
print(predict_answer(question))


What are the signs and symptoms of Glaucoma? The Human Phenotype Ontology provides the following list of signs or symptoms for Glauscomoa. If the information is available, the table below includes how often the symptom is seen in people with this condition. You can use the MedlinePlus Medical Dictionary to look up the definitions for these medical terms. Signs and Symptoms Approximate number of patients (when available) Abnormality of the skin 90% Abdominal glaudom 90% Acute swollen hair 90% Hypertonia 90% - Hypertension / adolescence – Hyperplasia  Hyperglycemia : Hyperspherical fibrosis edema 90% Autosomal recessive inheritance 5% Absorption of ophthalmophilus 90% Hypoporosis 90% Short limb 90% Low-limb neoplasm 90% Decreased lateral lobe. The frequency of this phena


In [None]:
question = "why do i have headache?"
print(predict_answer(question))

How often is diabetes a cause of headache? The cause is unknown. The reason for the headache is that it is not causing the symptoms of diabetes. If the condition is caused by faulty blood glucose, the blood sugar is absorbed into the body. It is important to know if the cause causes the disease. You may have sex with your doctor - unless you have diabetes, you may need to have an autoimmune disease, or he or she may be able to get sedentary medications. Symptoms of bowel disease include :  – Having numbness, bloating, and recurrent headaches.


In [None]:
import torch
import transformers
import pandas as pd
import numpy as np
import sklearn
from sklearn import __version__ as sklearn_version

print("PyTorch version:", torch.__version__)
print("Transformers version:", transformers.__version__)
print("Pandas version:", pd.__version__)
print("NumPy version:", np.__version__)
print("Scikit-learn version:", sklearn_version)

PyTorch version: 2.0.1+cu117
Transformers version: 4.28.1
Pandas version: 2.2.2
NumPy version: 1.26.4
Scikit-learn version: 1.6.1
