In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import DataLoader, Dataset

# Load and preprocess the dataset
def load_and_preprocess_data(filepath):
    """Load the structured clinical CSV and build the text-to-text target.

    Keeps only the ``transcription``, ``Age``, ``Disease`` and ``Symptoms``
    columns, drops rows where any of them is missing or where ``Age``,
    ``Disease`` or ``Symptoms`` holds the literal placeholder ``'Not found'``,
    rewrites every ``".,"`` in the transcription to ``". "``, and adds a
    ``Combined`` column of the form
    ``"Age: <age> Disease: <disease> Symptoms: <symptoms>"``.

    Args:
        filepath: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A new ``pandas.DataFrame`` with the four source columns plus
        ``Combined``. It may be empty if no row survives the filters.
    """
    columns = ['transcription', 'Age', 'Disease', 'Symptoms']
    placeholder = 'Not found'

    df = pd.read_csv(filepath)[columns]
    # Rows without a transcription are dropped as well: re.sub() below would
    # raise TypeError on the NaN that pandas uses for a missing cell.
    df = df.dropna(subset=columns)

    usable = (
        (df['Disease'] != placeholder)
        & (df['Age'] != placeholder)
        & (df['Symptoms'] != placeholder)
    )
    # Explicit copy so the column assignments below act on an independent
    # frame instead of a slice of the filtered one (SettingWithCopyWarning).
    df = df[usable].copy()

    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    df['transcription'] = df['transcription'].apply(
        lambda text: re.sub(r'\.,', ". ", text)
    )
    # A list comprehension (rather than df.apply(axis=1)) also works when the
    # frame is empty, where apply() would not return a Series.
    df['Combined'] = [
        f"Age: {age} Disease: {disease} Symptoms: {symptoms}"
        for age, disease, symptoms in zip(df['Age'], df['Disease'], df['Symptoms'])
    ]
    return df

# Define a PyTorch Dataset for T5
class T5Dataset(Dataset):
    """Map-style dataset that tokenizes (input text, target text) pairs on access.

    Each item is a dict with the raw ``source_text`` / ``target_text`` strings
    and the flattened ``*_input_ids`` / ``*_attention_mask`` tensors produced
    by the tokenizer for the source and the target.
    """

    def __init__(self, tokenizer, input_texts, target_texts, max_length):
        """Store the tokenizer and the parallel text sequences.

        Args:
            tokenizer: Callable tokenizer; it is invoked with the text plus
                ``max_length``, ``padding``, ``truncation``,
                ``return_attention_mask``, ``add_special_tokens`` and
                ``return_tensors`` keyword arguments and must return a mapping
                with ``'input_ids'`` and ``'attention_mask'`` tensors.
            input_texts: Indexable sequence of source strings.
            target_texts: Indexable sequence of target strings, aligned with
                ``input_texts``.
            max_length: Length every source and target is padded/truncated to.
        """
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of source texts."""
        return len(self.input_texts)

    def _encode(self, text):
        """Tokenize one string to a fixed-length encoding of PyTorch tensors."""
        # Use the tokenizer handed to __init__. Relying on a module-level
        # ``tokenizer`` name raises NameError when none is defined (the script
        # only creates the tokenizer as a local variable inside main()).
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tokenized source/target pair at position ``idx``."""
        source_text = self.input_texts[idx]
        target_text = self.target_texts[idx]
        source_encoding = self._encode(source_text)
        target_encoding = self._encode(target_text)

        # flatten() drops the leading batch axis added by return_tensors='pt'
        # so that the DataLoader can stack items into a batch.
        return {
            'source_text': source_text,
            'target_text': target_text,
            'source_input_ids': source_encoding['input_ids'].flatten(),
            'target_input_ids': target_encoding['input_ids'].flatten(),
            'source_attention_mask': source_encoding['attention_mask'].flatten(),
            'target_attention_mask': target_encoding['attention_mask'].flatten()
        }

# Train the T5 model
def train_model(model, dataloader, num_epochs=10, learning_rate=1e-4):
    """Fine-tune ``model`` in place with Adam on the batches of ``dataloader``.

    Args:
        model: Seq2seq model that is called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` and
            returns an object exposing a scalar ``loss`` tensor.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'source_input_ids'``, ``'source_attention_mask'`` and
            ``'target_input_ids'`` tensors.
        num_epochs: Number of full passes over ``dataloader``.
        learning_rate: Learning rate passed to ``torch.optim.Adam``.

    Returns:
        None. After every epoch the mean batch loss of that epoch is printed.
    """
    params = list(model.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)
    # Feed batches on the device that holds the weights so the function also
    # works when the caller has moved the model to an accelerator.
    device = params[0].device if params else None
    # Hugging Face models expose the padding id on their config; when it is
    # not available the labels are used unchanged.
    pad_token_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)
    model.train()

    for epoch in range(num_epochs):
        loss_sum = 0.0
        batch_count = 0

        for batch in dataloader:
            optimizer.zero_grad()
            input_ids = batch['source_input_ids'].to(device)
            attention_mask = batch['source_attention_mask'].to(device)
            labels = batch['target_input_ids'].to(device)
            if pad_token_id is not None:
                # -100 is the index ignored by the cross-entropy loss; without
                # it the loss is dominated by the padding that fills targets
                # up to max_length. masked_fill returns a new tensor, so the
                # caller's batch is left untouched.
                labels = labels.masked_fill(labels == pad_token_id, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            batch_count += 1

        if batch_count:
            # Report the epoch mean rather than the last batch's loss, which
            # is noisy and undefined when the loader yields nothing.
            print(f'Epoch {epoch+1}/{num_epochs} completed. Loss: {loss_sum / batch_count}')
        else:
            print(f'Epoch {epoch+1}/{num_epochs} completed. No batches were processed.')

# Generate predictions using the trained model
def generate_predictions(model, tokenizer, test_data, max_length=50, max_input_length=512):
    """Generate one decoded output string per input text.

    Args:
        model: Trained model exposing ``eval()`` and
            ``generate(input_ids=..., attention_mask=..., max_length=...)``.
        tokenizer: Tokenizer used to encode the inputs and to ``decode`` the
            generated ids.
        test_data: Iterable of input strings.
        max_length: Maximum length of the *generated* sequence.
        max_input_length: Length the *input* text is padded/truncated to.
            It is separate from ``max_length`` so that inputs are no longer
            cut to the (much shorter) generation length; the default matches
            the 512 tokens this script uses when building the training set.

    Returns:
        List of decoded predictions, in the order of ``test_data``.
    """
    model.eval()
    predictions = []

    # Run generation on the device that holds the weights (if any).
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else None

    # Inference only: no autograd graph is needed for any of the calls below.
    with torch.no_grad():
        for text in test_data:
            input_encoding = tokenizer(
                text,
                max_length=max_input_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                add_special_tokens=True,
                return_tensors='pt'
            )
            input_ids = input_encoding['input_ids']
            attention_mask = input_encoding['attention_mask']
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length
            )

            # One text is encoded per call, so the batch holds one sequence.
            predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    return predictions

# Calculate average ROUGE-L score
def calculate_average_rouge_l(gold_labels, predictions):
    """Average the ROUGE-L precision, recall and F-measure over all pairs.

    Args:
        gold_labels: Iterable of reference strings.
        predictions: Iterable of predicted strings, aligned with
            ``gold_labels``.

    Returns:
        Dict with the keys ``'precision'``, ``'recall'`` and ``'fmeasure'``
        holding the mean of each ROUGE-L component over the scored pairs.
        All three are ``0.0`` when there is nothing to score.
    """
    pairs = list(zip(gold_labels, predictions))
    totals = {'precision': 0.0, 'recall': 0.0, 'fmeasure': 0.0}
    if not pairs:
        # Nothing to average; avoids a ZeroDivisionError on empty input.
        return totals

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    for gold_label, prediction in pairs:
        score = scorer.score(gold_label, prediction)['rougeL']
        totals['precision'] += score.precision
        totals['recall'] += score.recall
        totals['fmeasure'] += score.fmeasure

    # Divide by the number of pairs actually scored: zip() stops at the
    # shorter input, so len(gold_labels) could overstate the sample count.
    return {key: val / len(pairs) for key, val in totals.items()}

# Main execution function
def main():
    """Run the full pipeline: load data, fine-tune T5, predict, score.

    Reads ``structured_data.csv``, holds out 30% of the rows for testing,
    fine-tunes ``t5-large`` on transcription -> ``Combined`` pairs for 10
    epochs, generates a prediction for every test transcription and prints
    the average ROUGE-L scores against the ``Combined`` references.
    """
    # Load and preprocess the data
    df = load_and_preprocess_data("structured_data.csv")
    train_df, test_df = train_test_split(df, test_size=0.30)
    # Work on an independent copy: the predictions column is added to this
    # frame below, and assigning to a split result can otherwise raise
    # pandas' SettingWithCopyWarning.
    test_df = test_df.copy()

    # Initialize tokenizer and dataset
    tokenizer = T5Tokenizer.from_pretrained('t5-large')
    train_inputs = train_df['transcription'].tolist()
    train_targets = train_df['Combined'].apply(str).tolist()
    dataset = T5Dataset(tokenizer, train_inputs, train_targets, max_length=512)
    loader = DataLoader(dataset, batch_size=4, shuffle=True)

    # Load and train the model
    model = T5ForConditionalGeneration.from_pretrained('t5-large')
    train_model(model, loader, num_epochs=10)

    # Generate predictions and add them to the test DataFrame
    test_inputs = test_df['transcription'].tolist()
    test_df['Predicted Output'] = generate_predictions(model, tokenizer, test_inputs)

    # Calculate ROUGE-L Score
    average_rouge_l_score = calculate_average_rouge_l(
        test_df['Combined'].astype(str), test_df['Predicted Output']
    )
    print("ROUGE-L Score:", average_rouge_l_score)


if __name__ == "__main__":
    main()

   


Epoch 1/10 completed. Loss: 23.721330642700195

Epoch 2/10 completed. Loss: 14.698890686035156

Epoch 3/10 completed. Loss: 12.633593559265137

Epoch 1/10 completed. Loss: 10.924788932256898

Epoch 2/10 completed. Loss: 8.9876482345697646

Epoch 3/10 completed. Loss: 7.6032560986229327

Epoch 1/10 completed. Loss: 5.3976321798732567

Epoch 2/10 completed. Loss: 2.0952670031356327

Epoch 3/10 completed. Loss: 1.9765299379027689

Epoch 1/10 completed. Loss: 0.7083568639389032

### Example:
    
gold_label = 'Age: 78.0 Disease: Prostate cancer with metastatic disease to bladder and skeletal system Symptoms: Hematuria, lower back pain, fatigue and weakness, no fever, no abdominal pain, no nausea or vomiting, no melena or hematochezia, no incontinence of urine or stool, no polyuria or polydipsia, no heat or cold intolerance, no easy bruising or bleeding, no focal weakness or numbness, no saddle paresthesia, no dizziness, syncope or near-syncope'
    
model_output = '78-year-old male has prostate cancer with metastatic disease to his bladder. he has been passing blood with very little urine. he denies any abdominal pain, no nausea or vomiting.'


###  ROUGE-L Score: {'precision': 0.26154411768649391, 'recall': 0.34594788676669863, 'fmeasure': 0.2978621295930462}

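Precision (0.26154): For ROUGE-L, precision is the length of the longest common subsequence (LCS) of words shared by the model's output and the gold label, divided by the number of words in the model's output. The reported value is the average over the test set, so a precision of 0.26154 means that, on average, about 26.15% of the words in the model's output are part of that in-order overlap with the gold label.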

Recall (0.3459): For ROUGE-L, recall is the length of the same longest common subsequence (LCS) divided by the number of words in the gold label. A recall of 0.3459 means that, on average across the test set, only about 34.59% of the words in the gold label are captured, in order, by the model's output.

F1 Score (0.2979): The F1 score is the harmonic mean of precision and recall. It provides a single metric that balances both precision and recall. The value reported here is the average of the per-example F1 scores over the test set. An F1 score of 0.2979 is quite low, indicating that overall, the overlap between the model's output and the gold label is not very high.