## Data augmentation with LLM 

### Plan:

1. Make summaries for answers

2. Split each context into chunks, then generate a question and an answer for each chunk
----

Models used: 

- Question generation: ```mrm8488/t5-base-finetuned-question-generation-ap```
- Answer generation: ```valhalla/t5-base-qa-qg-hl```

In [None]:
import pandas as pd
import numpy as np

# Load the MedQuAD dataset; later cells read its 'context', 'answer' and 'question' columns.
df = pd.read_csv('./MedQuAD.csv')

In [None]:
import locale
# Make locale.getpreferredencoding() report "UTF-8" for the rest of the session.
# NOTE(review): presumably a notebook workaround so the `!pip install` below does not
# fail under a non-UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip install --no-cache-dir transformers sentencepiece

## Part 1. Answer augmentation

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pretrained "t5-base" checkpoint and its tokenizer for summarization.
# `tokenizer` and `model` are module-level names read by generate_summary();
# later cells rebind them to other checkpoints, so re-run this cell before
# calling generate_summary() again.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Function to generate summaries
def generate_summary(text):
    """Summarize ``text`` with the module-level T5 ``model`` and ``tokenizer``.

    Args:
        text: Text to summarize. Values that are not strings (for example the
            NaN pandas uses for a missing cell) or that contain only
            whitespace are not sent to the model.

    Returns:
        The decoded summary string, or ``""`` when there is nothing to
        summarize.
    """
    # A missing cell arrives as float NaN; "summarize: " + NaN would raise TypeError.
    if not isinstance(text, str) or not text.strip():
        return ""

    # T5 selects the task through a text prefix; the prompt is cut to 2000 tokens.
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=2000,
        truncation=True,
    )
    # Beam search with 4 beams; the summary is capped at 500 tokens.
    summary_ids = model.generate(
        input_ids,
        max_length=500,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize every answer and store the result next to the original columns.
answer_summaries = df['answer'].apply(generate_summary)
df['answer_summary'] = answer_summaries

# Preview the source columns alongside the new summaries.
preview_columns = ['context', 'answer', 'question', 'answer_summary']
print(df[preview_columns])

# Persist the augmented dataset.
augmented_answers_path = './MedQuAD_with_augmented_answers.csv'
df.to_csv(augmented_answers_path)

## part 2. Generate questions

In [None]:
# generate questions

import torch
from transformers import AutoModelWithLMHead, AutoTokenizer

# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is the
# documented replacement for encoder-decoder checkpoints such as T5.
from transformers import AutoModelForSeq2SeqLM

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# These rebind the module-level `tokenizer` and `model` read by generate_question().
question_model_name = "mrm8488/t5-base-finetuned-question-generation-ap"
tokenizer = AutoTokenizer.from_pretrained(question_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(question_model_name)
model = model.to(device)

def generate_question(context, max_length=64, max_input_length=512):
    """Generate one question about ``context`` with the question-generation model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        context: Passage the question should be about.
        max_length: Maximum number of tokens in the generated question.
        max_input_length: Maximum number of prompt tokens; longer prompts are
            truncated before they are sent to the model.

    Returns:
        The generated text, decoded with special tokens removed.
    """
    # The tokenizer appends the end-of-sequence token itself, so spelling out
    # "</s>" in the prompt is redundant and can duplicate it.
    input_text = f"context: {context}"
    features = tokenizer(
        [input_text],
        return_tensors='pt',
        truncation=True,
        max_length=max_input_length,
    ).to(device)

    # Generate the question tokens from the encoded prompt.
    output_ids = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
data = []  # One (chunk, question, context index) tuple per context chunk
max_length = 512  # Chunk size in characters (plain string slices, not model tokens)

# Walk over the distinct contexts, skipping missing values, and generate one
# question for each chunk.
for index, context in enumerate(df['context'].dropna().unique()):
    # Split the context into consecutive slices of at most `max_length` characters.
    context_chunks = [context[i:i + max_length] for i in range(0, len(context), max_length)]

    for chunk in context_chunks:
        generated_question = generate_question(chunk)

        # Keep the position of the source context so chunks can be regrouped later.
        data.append((chunk, generated_question, index))


# Every row has three fields, so three column names are required; passing only
# two makes pandas raise ValueError.
columns = ['context_chunk', 'generated_question', 'context_index']
result_df = pd.DataFrame(data, columns=columns)
result_df.to_csv('./questions_generated.csv')

## Part 2 (continued). Generate answers

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the "valhalla/t5-base-qa-qg-hl" checkpoint and its tokenizer for answer generation.
# This rebinds the module-level `tokenizer` and `model` that generate_question() also
# reads, so generate_question() must not be called after this cell without reloading
# its own checkpoint. Unlike the question-generation model, this one is not moved to `device`.
model_name = "valhalla/t5-base-qa-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# The chunks and questions generated above are stored in the pandas DataFrame
# named 'result_df', in its 'context_chunk' and 'generated_question' columns

# Function to generate answers
def generate_answer(context_chunk, generated_question):
    """Answer ``generated_question`` from ``context_chunk`` with the QA model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        context_chunk: Passage that should contain the answer.
        generated_question: Question to answer.

    Returns:
        The decoded answer string with special tokens removed.
    """
    # The checkpoint's question-answering prompt is
    # "question: <question>  context: <context>". The previous prompt used an
    # "answer:" prefix and had the two fields swapped. With the question first,
    # truncation to 512 tokens trims the end of the context, not the question.
    input_text = f"question: {generated_question}  context: {context_chunk}"
    input_ids = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    # Beam search with 4 beams; the answer is capped at 150 tokens.
    answer_ids = model.generate(
        input_ids,
        max_length=150,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(answer_ids[0], skip_special_tokens=True)

# Answer every generated question from the chunk it was generated from.
generated_answers = result_df.apply(
    lambda record: generate_answer(record['context_chunk'], record['generated_question']),
    axis=1,
)
result_df['generated_answer'] = generated_answers

# Save chunks, questions and answers together.
result_df.to_csv('./MedQuAD_generated.csv')

## Translate into Russian

In [None]:
from deep_translator import GoogleTranslator

def translate(s):
    """Translate English text to Russian with Google Translate.

    Args:
        s: Text to translate.

    Returns:
        The Russian translation. Values that are not strings (such as the NaN
        of a missing DataFrame cell) and blank strings are returned unchanged
        without calling the translation service.
    """
    # Missing cells and empty text have nothing to translate; skip the network call.
    if not isinstance(s, str) or not s.strip():
        return s
    return GoogleTranslator(source='en', target='ru').translate(s)

# Translate the text columns of the answer-augmented dataset, one column at a time.
for column in ('context', 'answer_summary', 'question'):
    df[column] = df[column].apply(translate)

# Do the same for the generated chunk / question / answer dataset.
for column in ('context_chunk', 'generated_question', 'generated_answer'):
    result_df[column] = result_df[column].apply(translate)

# Save both translated datasets. The 'aigmented' spelling in the second file
# name is kept as-is so the output path does not change.
result_df.to_csv('./MedQuAD_qa_generated_russian.csv')
df.to_csv('./MedQuAD_answers_aigmented_russian.csv')