In [None]:
from google.colab import drive
# Attach Google Drive under a fixed mount point; force_remount=True asks
# Colab to mount again even when a mount is already present.
mount_point = '/content/drive/'
drive.mount(mount_point, force_remount=True)

Mounted at /content/drive/


In [None]:
import pandas as pd

# Load the dataset
data_path = "/content/drive/My Drive/train.csv"
df = pd.read_csv(data_path)

# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() added a stray "None" line to the output.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16407 entries, 0 to 16406
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   qtype     16407 non-null  object
 1   Question  16407 non-null  object
 2   Answer    16407 non-null  object
dtypes: object(3)
memory usage: 384.7+ KB
None
             qtype                                           Question  \
0   susceptibility  Who is at risk for Lymphocytic Choriomeningiti...   
1         symptoms  What are the symptoms of Lymphocytic Choriomen...   
2   susceptibility  Who is at risk for Lymphocytic Choriomeningiti...   
3  exams and tests  How to diagnose Lymphocytic Choriomeningitis (...   
4        treatment  What are the treatments for Lymphocytic Chorio...   

                                              Answer  
0  LCMV infections can occur after exposure to fr...  
1  LCMV is most commonly recognized as causing ne...  
2  Individuals of all ages who come into contact ..

In [None]:
# Per-column count of missing values (isna is the canonical spelling of isnull).
df.isna().sum()

Unnamed: 0,0
qtype,0
Question,0
Answer,0


In [None]:
# Keep only the first occurrence of each fully identical row; the original
# index labels of the surviving rows are preserved.
is_repeat = df.duplicated(keep="first")
df = df[~is_repeat]

In [None]:
# Re-inspect the frame after duplicate removal (row count, index range, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16359 entries, 0 to 16406
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   qtype     16359 non-null  object
 1   Question  16359 non-null  object
 2   Answer    16359 non-null  object
dtypes: object(3)
memory usage: 511.2+ KB


In [None]:
import sentencepiece as spm

import os

# Train SentencePiece model
input_text = '/content/drive/My Drive/medicalqa.txt'  # Combine all text data into a single file

# The corpus file is not written by any visible cell of this notebook, so a
# fresh environment would fail in the trainer. Build it from the dataframe
# when it is missing; an existing file is left untouched.
if not os.path.exists(input_text):
    with open(input_text, 'w', encoding='utf-8') as corpus_file:
        for column in ('Question', 'Answer'):
            for text in df[column]:
                # The trainer reads one sentence per line, so multi-line
                # entries are split and blank lines are dropped.
                for line in str(text).splitlines():
                    line = line.strip()
                    if line:
                        corpus_file.write(line + '\n')

spm.SentencePieceTrainer.train(input=input_text, model_prefix='medicalqa', vocab_size=16000)

# Load the model
sp = spm.SentencePieceProcessor()
sp.load('medicalqa.model')

# Tokenize a sample sentence
# NOTE(review): this SentencePiece model is only used for this demonstration;
# the T5 fine-tuning cell loads the pretrained 't5-small' tokenizer instead.
sample_text = "Who is at risk for Lymphocytic Choriomeningitis?"
tokenized_text = sp.encode_as_pieces(sample_text)
print(tokenized_text)


['▁Who', '▁is', '▁at', '▁risk', '▁for', '▁Lymphocyt', 'ic', '▁Chorio', 'meningitis', '?']


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the pretrained tokenizer and the seq2seq model from the same checkpoint
checkpoint = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

# Tokenize the dataset
def tokenize_data(examples):
    """Tokenize a batch of question/answer pairs for seq2seq fine-tuning.

    Args:
        examples: A batch mapping with 'Question' and 'Answer' entries, each a
            sequence of strings (as passed by ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output for the questions, with an added 'labels' entry
        holding the answer token ids. Both sides are truncated and padded to
        512 tokens. Padding positions in 'labels' are set to -100.
    """
    inputs = list(examples['Question'])
    targets = list(examples['Answer'])
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding='max_length')

    # Padding must not contribute to the loss: -100 is the index the model's
    # cross-entropy loss ignores. Leaving pad ids in place makes the loss
    # average over (mostly) padding positions.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

# Create a dataset from the pandas dataframe
from datasets import Dataset

dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_data, batched=True)

# Hold out part of the data for validation. Evaluating on the very examples
# the model is trained on makes the reported "Validation Loss" a second
# training loss that cannot reveal overfitting.
# NOTE(review): the data contains repeated questions with different answers
# (see the df.head() output), so a random split can still share questions
# between the two parts; consider splitting by question if that matters.
HELD_OUT_FRACTION = 0.1
SPLIT_SEED = 42
dataset_splits = tokenized_dataset.train_test_split(test_size=HELD_OUT_FRACTION, seed=SPLIT_SEED)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits['train'],
    eval_dataset=dataset_splits['test'],
)

# Start training
trainer.train()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/16359 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,1.1887,1.053991
2,1.1148,0.971834
3,1.0439,0.926721
4,1.0209,0.896995
5,0.9989,0.876198
6,0.9978,0.862887
7,0.9741,0.854785
8,0.9636,0.851941


TrainOutput(global_step=16360, training_loss=1.06311292391826, metrics={'train_runtime': 7927.5657, 'train_samples_per_second': 16.508, 'train_steps_per_second': 2.064, 'total_flos': 1.7712452242243584e+16, 'train_loss': 1.06311292391826, 'epoch': 8.0})

In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=3635c3800985a61753031e7d7383bfd8e61bd9444eb3db068abeb982e7f1f14a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from datasets import load_metric

# Load the BLEU and ROUGE metric scripts (remote code execution is explicitly allowed)
bleu, rouge = (
    load_metric(metric_name, trust_remote_code=True)
    for metric_name in ('bleu', 'rouge')
)

# Function to calculate metrics
def preprocess_logits_for_metrics(logits, labels):
    """Reduce a batch of logits to predicted token ids before the Trainer stores it.

    Without this hook the Trainer keeps the full vocabulary-sized logits of
    every evaluation example in memory, which is what exhausted GPU memory.

    Args:
        logits: The model outputs for one batch; either the logits tensor or a
            tuple whose first element is the logits tensor.
        labels: The label ids of the batch (unused).

    Returns:
        A tensor of token ids, the argmax over the last (vocabulary) dimension.
    """
    if isinstance(logits, tuple):
        # presumably (logits, encoder hidden states, ...) for a seq2seq model
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Compute BLEU and ROUGE-1 F-measure for one evaluation run.

    Args:
        eval_pred: A (predictions, labels) pair of arrays from the Trainer.
            Predictions may be token ids or raw logits (optionally wrapped in
            a tuple); logits are reduced with an argmax.

    Returns:
        A dict with the keys 'bleu' and 'rouge'.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    if predictions.ndim == 3:
        # Raw logits: pick the most likely token at every position.
        predictions = predictions.argmax(axis=-1)

    # Positions that carry no target (pad id or the -100 ignore index) are
    # blanked in both arrays: -100 cannot be decoded, and the predictions at
    # those positions are meaningless.
    pad_id = tokenizer.pad_token_id
    ignored = (labels == -100) | (labels == pad_id)
    labels = labels.copy()
    labels[ignored] = pad_id
    predictions = predictions.copy()
    predictions[ignored] = pad_id

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The 'bleu' metric expects tokenized text: a token list per prediction
    # and a list of reference token lists per example. Its result is stored
    # under the key 'bleu'.
    bleu_score = bleu.compute(
        predictions=[pred.split() for pred in decoded_preds],
        references=[[label.split()] for label in decoded_labels],
    )
    rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {'bleu': bleu_score['bleu'], 'rouge': rouge_score['rouge1'].mid.fmeasure}

# Update the Trainer to include the compute_metrics function
# NOTE(review): predictions are teacher-forced (argmax given the reference
# prefix), not free-running generations, and tokenized_dataset contains
# examples the model was trained on, so these scores are optimistic.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Evaluate the model
results = trainer.evaluate()
print(results)


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 9.31 GiB. GPU 

In [None]:
import torch

# Generate text using the fine-tuned model
def generate_text(prompt, max_length=200, num_return_sequences=1, temperature=0.7, top_k=50, top_p=0.9, do_sample=False):
    """Generate an answer for `prompt` with the fine-tuned model.

    Args:
        prompt: The input question.
        max_length: Maximum length of the generated sequence, in tokens.
        num_return_sequences: Number of sequences requested from `generate`;
            only the first one is decoded and returned.
        temperature: Sampling temperature, used only when `do_sample` is True.
        top_k: Top-k cutoff, used only when `do_sample` is True.
        top_p: Nucleus-sampling cutoff, used only when `do_sample` is True.
        do_sample: When False (default) decoding is plain beam search, as
            before. When True the sampling parameters above take effect.

    Returns:
        The decoded text of the first generated sequence.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # make sure dropout is off while generating

    encoded = tokenizer(prompt, return_tensors='pt').to(device)
    generation_kwargs = dict(
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        num_beams=5,
        early_stopping=True,
    )
    if do_sample:
        # temperature / top_k / top_p are ignored by `generate` unless sampling
        # is enabled, so they are only forwarded together with do_sample=True.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_k=top_k, top_p=top_p)

    output = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        **generation_kwargs,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example generation
prompt = "What are the signs and symptoms of Lymphocytic Choriomeningitis?"
generated_text = generate_text(prompt)
print(generated_text)


What are the signs and symptoms of Lymphocytic Choriomeningitis? The Human Phenotype Ontology provides the following list of signs symptomatic and clinical signs of lymphocytes in the lymph nodes. If the information is available, the table below includes how often the symptômes are seen in people with this condition. You can use the MedlinePlus Medical Dictionary to look up the definitions for these medical terms. Signs and Symptoms Approximate number of patients (when available) Autosomal recessive inheritance - Cerebral cortical insufficiency of the fingernails and/or adolescence-induced hemorrhagic acidosis equinovarus 50% Hyperhidrosis 50% Muscular hypotonia 50% Hepatomegaly 50% Abnormality of blood clotting 7.5% Aplasia/H


In [None]:
import torch

# Generate text using the fine-tuned model
# NOTE(review): an earlier cell already defines generate_text; keeping a
# single definition and only calling it here would avoid silent shadowing.
def generate_text(prompt, max_length=200, num_return_sequences=1, temperature=0.7, top_k=50, top_p=0.9, do_sample=False):
    """Generate an answer for `prompt` with the fine-tuned model.

    Args:
        prompt: The input question.
        max_length: Maximum length of the generated sequence, in tokens.
        num_return_sequences: Number of sequences requested from `generate`;
            only the first one is decoded and returned.
        temperature: Sampling temperature, used only when `do_sample` is True.
        top_k: Top-k cutoff, used only when `do_sample` is True.
        top_p: Nucleus-sampling cutoff, used only when `do_sample` is True.
        do_sample: When False (default) decoding is plain beam search, as
            before. When True the sampling parameters above take effect.

    Returns:
        The decoded text of the first generated sequence.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # make sure dropout is off while generating

    encoded = tokenizer(prompt, return_tensors='pt').to(device)
    generation_kwargs = dict(
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        num_beams=5,
        early_stopping=True,
    )
    if do_sample:
        # temperature / top_k / top_p are ignored by `generate` unless sampling
        # is enabled, so they are only forwarded together with do_sample=True.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_k=top_k, top_p=top_p)

    output = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        **generation_kwargs,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example generation
prompt = "Who is at risk for Lymphocytic Choriomeningitis (LCM)?"
generated_text = generate_text(prompt)
print(generated_text)


Lymphocytic choriomeningitis (LCM) is a rare disease in which the body's immune system attacks and destroys its own cells. It is more common in people of African descent than in other parts of the world. People who are at higher risk for LCMC are more likely to develop the disease than others.


In [None]:
import torch

# Generate text using the fine-tuned model
# NOTE(review): an earlier cell already defines generate_text; keeping a
# single definition and only calling it here would avoid silent shadowing.
def generate_text(prompt, max_length=200, num_return_sequences=1, temperature=0.7, top_k=50, top_p=0.9, do_sample=False):
    """Generate an answer for `prompt` with the fine-tuned model.

    Args:
        prompt: The input question.
        max_length: Maximum length of the generated sequence, in tokens.
        num_return_sequences: Number of sequences requested from `generate`;
            only the first one is decoded and returned.
        temperature: Sampling temperature, used only when `do_sample` is True.
        top_k: Top-k cutoff, used only when `do_sample` is True.
        top_p: Nucleus-sampling cutoff, used only when `do_sample` is True.
        do_sample: When False (default) decoding is plain beam search, as
            before. When True the sampling parameters above take effect.

    Returns:
        The decoded text of the first generated sequence.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # make sure dropout is off while generating

    encoded = tokenizer(prompt, return_tensors='pt').to(device)
    generation_kwargs = dict(
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        num_beams=5,
        early_stopping=True,
    )
    if do_sample:
        # temperature / top_k / top_p are ignored by `generate` unless sampling
        # is enabled, so they are only forwarded together with do_sample=True.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_k=top_k, top_p=top_p)

    output = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        **generation_kwargs,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example generation
prompt = "what are marine toxins?"
generated_text = generate_text(prompt)
print(generated_text)


Key Points - Marine toxins are substances found in the body's tissues and organs, such as the liver, lungs, and the intestines. Marines are a group of substances that contain toxic substances called triglycerides, which are chemicals that build up in and out of the marine environment. These substances are found on the surface of marines and can be harmful to the environment and cause harm to other people.
