# Data Augmentation. Context Summarization


## I METHOD (for English) 
### Context summarization + Q-A pairs generation

- Context: abstractive summarization: t5-base 
- Question generation: t5-base-finetuned-question-generation-ap
- Question answering: t5-base
- Translation (English → Russian): Google Translate via the `deep_translator` package

In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from deep_translator import GoogleTranslator


# Source dataset; the code below reads its 'context' column.
df = pd.read_csv('./MedQuAD.csv')

# Load T5-base model and tokenizer for summarization
summarization_model_name = "t5-base"
summarization_tokenizer = T5Tokenizer.from_pretrained(summarization_model_name)
summarization_model = T5ForConditionalGeneration.from_pretrained(summarization_model_name)

# Load T5-base-finetuned-question-generation-ap model and tokenizer.
# NOTE(review): this checkpoint is published on the Hugging Face Hub under the
# "mrm8488" namespace; the previous "valhalla/..." id does not appear to exist
# there and would make from_pretrained() fail. Confirm against the Hub.
question_generation_model_name = "mrm8488/t5-base-finetuned-question-generation-ap"
question_generation_tokenizer = T5Tokenizer.from_pretrained(question_generation_model_name)
question_generation_model = T5ForConditionalGeneration.from_pretrained(question_generation_model_name)

# Answer generation uses the very same "t5-base" checkpoint as summarization,
# so reuse the objects loaded above instead of reading the weights a second
# time. Both are only used for inference here; if one of them is ever
# fine-tuned or otherwise mutated, load a separate copy instead.
answer_generation_model_name = summarization_model_name
answer_generation_tokenizer = summarization_tokenizer
answer_generation_model = summarization_model

# Function to generate summaries
def generate_summary(text, *, max_input_length=2000, max_output_length=2000):
    """Return an abstractive summary of ``text`` from the summarization model.

    Args:
        text: Passage to summarize. Anything that is not a non-blank string
            (``None``, a NaN float standing in for a missing cell, ``""``)
            yields an empty summary instead of raising.
        max_input_length: Token count the prefixed input is truncated to.
        max_output_length: Upper bound on the length of the generated sequence.

    Returns:
        The decoded summary with special tokens stripped, or ``""`` when
        there is nothing to summarize.
    """
    # Non-string values would raise TypeError on the concatenation below and
    # abort a whole ``Series.apply`` run; treat them as "nothing to do".
    if not isinstance(text, str) or not text.strip():
        return ""

    input_ids = summarization_tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Beam search with 4 beams; only the top-ranked sequence is decoded.
    summary_ids = summarization_model.generate(
        input_ids,
        max_length=max_output_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to generate questions
def generate_question(context, *, max_input_length=2000, max_output_length=2000):
    """Return a question generated from ``context`` by the question-generation model.

    Args:
        context: Passage the question should be about. Anything that is not a
            non-blank string (``None``, a NaN float standing in for a missing
            cell, ``""``) yields an empty question instead of raising.
        max_input_length: Token count the prefixed input is truncated to.
        max_output_length: Upper bound on the length of the generated sequence.

    Returns:
        The decoded question with special tokens stripped, or ``""`` when
        there is no usable context.
    """
    # Non-string values would raise TypeError on the concatenation below and
    # abort a whole ``Series.apply`` run; treat them as "nothing to do".
    if not isinstance(context, str) or not context.strip():
        return ""

    # NOTE(review): the "generate question: " prefix has to match the prompt
    # format the loaded checkpoint was fine-tuned with -- confirm against its
    # model card.
    input_ids = question_generation_tokenizer.encode(
        "generate question: " + context,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Beam search with 4 beams; only the top-ranked sequence is decoded.
    question_ids = question_generation_model.generate(
        input_ids,
        max_length=max_output_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return question_generation_tokenizer.decode(question_ids[0], skip_special_tokens=True)

# Function to generate answers
def generate_answer(question, *, max_input_length=2000, max_output_length=2000):
    """Return an answer generated for ``question`` by the answer-generation model.

    Only the question text is passed to the model; no supporting context is
    included in the prompt.

    Args:
        question: Question to answer. Anything that is not a non-blank string
            (``None``, a NaN float standing in for a missing cell, ``""``)
            yields an empty answer instead of raising.
        max_input_length: Token count the prefixed input is truncated to.
        max_output_length: Upper bound on the length of the generated sequence.

    Returns:
        The decoded answer with special tokens stripped, or ``""`` when there
        is no usable question.
    """
    # Non-string values would raise TypeError on the concatenation below and
    # abort a whole ``Series.apply`` run; treat them as "nothing to do".
    if not isinstance(question, str) or not question.strip():
        return ""

    input_ids = answer_generation_tokenizer.encode(
        "answer: " + question,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Beam search with 4 beams; only the top-ranked sequence is decoded.
    answer_ids = answer_generation_model.generate(
        input_ids,
        max_length=max_output_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return answer_generation_tokenizer.decode(answer_ids[0], skip_special_tokens=True)

# Derive the augmented columns, one (target, source, generator) step at a time.
# Order matters: 'generated_answer' is computed from 'generated_question',
# so that column has to exist before the last step runs.
augmentation_steps = (
    ('summary', 'context', generate_summary),
    ('generated_question', 'context', generate_question),
    ('generated_answer', 'generated_question', generate_answer),
)
for target_column, source_column, generator in augmentation_steps:
    df[target_column] = df[source_column].apply(generator)


def translate(s):
    """Translate ``s`` from English ('en') to Russian ('ru') with GoogleTranslator."""
    # A new translator object is built on every call, as before.
    en_to_ru = GoogleTranslator('en', 'ru')
    return en_to_ru.translate(s)

# Replace each generated column with its translation, in place, then write
# the whole frame (original and augmented columns alike) to disk.
for generated_column in ('summary', 'generated_question', 'generated_answer'):
    df[generated_column] = df[generated_column].apply(translate)

df.to_csv('./MedQA_generated.csv')
