In [1]:
from IPython.display import clear_output

In [None]:
# Mount Google Drive; the dataset and prompt paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install ruPrompts (transformers is installed together with the library)
#%pip install transformers[sentencepiece]
%pip install ruprompts
# Install datasets
%pip install datasets
# Install wandb for tracking the training run
%pip install wandb 
%pip install jsonlines
# Clear the cell output once the installs have finished.
clear_output()

In [2]:
# Configure CUDA through environment variables.
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (synchronous kernel launches)
# and is expected to slow training down — confirm it is still wanted for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Weights & Biases: log in so that the training run can be tracked.
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Set the wandb project name through the WANDB_PROJECT environment variable
%env WANDB_PROJECT=ruprompts_binary_qa_29_04_23

env: WANDB_PROJECT=ruprompts_binary_qa_29_04_23


### Training and Generation

In [None]:
# Load the training and validation splits from the JSON files on Google Drive
from datasets import load_dataset

data_files = {
    "train": "/content/drive/MyDrive/Diploma 2.0/DaNetQA 2.0/train_prompt_3.json",
    "validation": "/content/drive/MyDrive/Diploma 2.0/DaNetQA 2.0/val_prompt_3.json",
}
datasets = load_dataset("json", data_files=data_files)
train_dataset, valid_dataset = datasets["train"], datasets["validation"]

In [None]:
# Load the base language model and its tokenizer
from transformers import GPT2LMHeadModel, AutoTokenizer

# Hub id of the checkpoint shared by the model and the tokenizer.
rugpt3 = "sberbank-ai/rugpt3large_based_on_gpt2"

model = GPT2LMHeadModel.from_pretrained(rugpt3)
# "<pad>" is registered as both the padding token and the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(rugpt3, pad_token="<pad>", eos_token="<pad>")

In [None]:
# Measure how many tokens the questions, the passages and their concatenation occupy.
# Each of the three lists ends up holding one token count per training example.

def _token_count(text):
    """Return the number of input ids the tokenizer produces for `text`."""
    return len(tokenizer(text)['input_ids'])


questions_passages, questions_inputs, passages_inputs = [], [], []
for question, passage in zip(train_dataset['question'], train_dataset['passage']):
    questions_inputs.append(_token_count(question))
    passages_inputs.append(_token_count(passage))
    # The combined text is the question and the passage joined by a single space.
    questions_passages.append(_token_count("{} {}".format(question, passage)))

In [None]:
# Report the largest token count observed for each kind of text.
token_count_report = (
    ("Максимальное кол-во токенов среди question:", questions_inputs),
    ("Максимальное кол-во токенов среди passage:", passages_inputs),
    ("Максимальное кол-во токенов среди question+passage:", questions_passages),
)
for caption, lengths in token_count_report:
    print(caption, max(lengths))

Максимальное кол-во токенов среди question: 38
Максимальное кол-во токенов среди passage: 335
Максимальное кол-во токенов среди question+passage: 348


In [None]:
# Compute the 90th percentile of the token counts
import numpy as np

qp_90, q_90, p_90 = (
    np.percentile(lengths, 90)
    for lengths in (questions_passages, questions_inputs, passages_inputs)
)

# All three values are printed on a single line, separated by ", ".
print("90 квантиль question+passage:", qp_90, end=", ")
print("90 квантиль question:", q_90, end=", ")
print("90 квантиль passage:", p_90)

90 квантиль question+passage: 92.0, 90 квантиль question: 21.0, 90 квантиль passage: 78.0


In [None]:
# Define the prompt format and configure the prompt provider
from ruprompts import Prompt, PromptFormat, LSTMPromptProvider
from transformers import set_seed

# The {answer} and {passage} fields are surrounded by three <P*10> groups
# (presumably 10 trainable prompt tokens each — see the ruPrompts format docs).
prompt_format = PromptFormat("<P*10>{answer}<P*10>{passage}<P*10>")

# The seed is fixed before the provider is created.
set_seed(1)
prompt_provider = LSTMPromptProvider()

prompt = Prompt(prompt_format, prompt_provider)
# Attach the prompt to the model and tokenizer loaded earlier; this modifies them in place.
prompt.patch(model, tokenizer)

In [None]:
# Preprocessing: `target_field` names what the model should learn to generate,
# `truncation_field` names the input field that is shortened (presumably so that
# an example fits into `max_tokens`).

from ruprompts import Text2TextPreprocessor

preprocessor = Text2TextPreprocessor(
    tokenizer=tokenizer,
    prompt_format=prompt_format,
    truncation_field="passage",
    target_field="question",
    max_tokens=200,
)

# Apply the same preprocessor to both splits.
train_dataset, valid_dataset = (
    split.map(preprocessor) for split in (train_dataset, valid_dataset)
)

In [None]:
# Print the largest label id of each split, then display the model's vocabulary size for comparison.
for split in (train_dataset, valid_dataset):
    print(max(label for example in split for label in example["labels"]))
model.config.vocab_size

50253
50086


50257

In [None]:
# Additionally track perplexity, i.e. how plausible the generated examples look to a language model
import evaluate

# Full Hub id of the scoring model, in the same "sberbank-ai/" namespace as the
# checkpoint that is being tuned.
PERPLEXITY_MODEL_ID = "sberbank-ai/rugpt3small_based_on_gpt2"

# The metric is loaded on first use and then reused, instead of being reloaded
# on every evaluation step.
_perplexity_metric = None


def compute_metrics(eval_pred):
    """Compute the mean perplexity of the decoded predictions.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` may be
            token ids of shape (batch, seq), raw logits of shape
            (batch, seq, vocab), or a tuple whose first element is one of those.
            ``labels`` is unpacked but not used.

    Returns:
        dict with a single key ``"Mean perplexity"`` rounded to 2 decimals.
    """
    global _perplexity_metric
    import numpy as np

    if _perplexity_metric is None:
        _perplexity_metric = evaluate.load("perplexity", module_type="metric")

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # Logits cannot be decoded directly: reduce them to the most likely token ids first.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    results = _perplexity_metric.compute(model_id=PERPLEXITY_MODEL_ID, predictions=decoded_preds)
    return {"Mean perplexity": round(results["mean_perplexity"], 2)}

In [None]:
# Training hyper-parameters

from transformers import TrainingArguments

training_args = TrainingArguments(
    # Output locations and reporting
    output_dir=".",
    logging_dir="ruprompts_logs",
    report_to="wandb",
    # Batch sizes
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # Optimisation
    learning_rate=0.1,
    max_steps=2500,
    seed=1,
    # Evaluate, save and log every 100 steps
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    # Checkpoint selection
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [None]:
# Optimisation
# A cosine learning-rate schedule with warmup is used as well
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup

# Only the prompt provider's parameters are handed to the optimizer.
optimizer = AdamW(prompt_provider.parameters(), lr=training_args.learning_rate)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=training_args.max_steps,
)



In [None]:
# The training itself
from transformers import Trainer
from ruprompts.callbacks import (
    FreezeTransformerUnfreezePrompt,
    ReduceCheckpoint,
    SavePretrainedPrompt,
)

# The custom optimizer and scheduler created earlier are passed in explicitly.
# compute_metrics is currently disabled, so only the loss is reported during evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=preprocessor.collate_fn(),
    #compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
    callbacks=[FreezeTransformerUnfreezePrompt(), ReduceCheckpoint(), SavePretrainedPrompt(prompt)],
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33malexandra-fedorova1499[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,4.3437,2.265201
200,2.3768,2.080276
300,2.1714,1.86235
400,3.0848,3.259536
500,2.6117,2.016859
600,2.4425,1.944776
700,2.1841,1.863846
800,2.2835,1.883242
900,2.1162,1.900759
1000,2.1508,1.856413


There were missing keys in the checkpoint model loaded: ['transformer.wte.embedding.weight', 'transformer.wpe.weight', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.0.mlp.c_fc.bias', 'transformer.h.0.mlp.c_proj.weight', 'transformer.h.0.mlp.c_proj.bias', 'transformer.h.1.ln_1.weight', 'transformer.h.1.ln_1.bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.ln_2.weight', 'transformer.h.1.ln_2.bias', 'transformer.h.1.mlp.c_fc.weight', 'transformer.h.1.mlp.c_fc.bias', 'transf

TrainOutput(global_step=2500, training_loss=2.2382808044433595, metrics={'train_runtime': 8820.0415, 'train_samples_per_second': 4.535, 'train_steps_per_second': 0.283, 'total_flos': 2.6450905688064e+16, 'train_loss': 2.2382808044433595, 'epoch': 33.9})

In [None]:
# Close the wandb run that tracked the training.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▃▃▂█▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▄▅▄▄▃▄▃▅▆▃█▁▂▁▁▂▂▂▂▂▅▄▂▂▄
eval/samples_per_second,▅▃▅▅▆▅▆▄▃▆▁█▇██▇▇▇▇▇▄▅▇▇▅
eval/steps_per_second,▅▃▅▅▆▅▆▄▃▆▁█▇██▇▇▇▇▇▄▅▇▇▅
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,▂▄▅▇████▇▇▇▆▆▅▅▄▃▃▂▂▂▁▁▁▁
train/loss,█▂▂▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.73611
eval/runtime,38.2417
eval/samples_per_second,11.872
eval/steps_per_second,5.936
train/epoch,33.9
train/global_step,2500.0
train/learning_rate,0.0
train/loss,1.9531
train/total_flos,2.6450905688064e+16
train/train_loss,2.23828


In [None]:
# Start a new wandb run before the evaluation below (the previous run was finished above).
wandb.init()

In [None]:
# Evaluate on the validation split that was passed to the Trainer as eval_dataset.
eval_results = trainer.evaluate()

In [None]:
# Display the evaluation metrics.
eval_results

{'eval_loss': 1.570906162261963,
 'eval_runtime': 16.246,
 'eval_samples_per_second': 13.542,
 'eval_steps_per_second': 6.771,
 'epoch': 27.97}

In [None]:
# Save the trained prompt to Google Drive; the Pipeline section loads it from this same path.
prompt.save_pretrained("/content/drive/MyDrive/Diploma 2.0/Pretrained Prompt Third (GPT-3 LARGE + warmup steps 500 + lr=0.1 + 2500 steps + lstm (29.04.23)")

## Pipeline

In [4]:
import csv
from tqdm import tqdm
import pandas as pd

In [None]:
# Generates a question whose answer is "да" (yes)
def generate_positive_binary_question(passage, answer, pipe, beams=1, return_seq=1):
    """Generate candidate questions deterministically and return the longest one.

    Args:
        passage: text the question should be about.
        answer: answer string placed into the prompt.
        pipe: callable taking a dict with 'answer' and 'passage' plus generation
            keyword arguments, and returning a list of dicts that each carry a
            "generated_text" entry.
        beams: value forwarded as ``num_beams``.
        return_seq: value forwarded as ``num_return_sequences``.

    Returns:
        The longest candidate after every "<pad>" marker has been removed;
        when several candidates share the maximum length, the last of them wins.
    """
    generated = pipe(
        {'answer': answer, 'passage': passage},
        do_sample=False,
        num_beams=beams,
        num_return_sequences=return_seq,
    )

    candidates = []
    for item in generated:
        candidates.append(item["generated_text"].replace("<pad>", ""))

    # The sort is stable and ascending by length, so the last element is the longest.
    candidates.sort(key=len)
    return candidates[-1]


# Generates a question whose answer is "нет" (no)
def generate_negative_binary_question(passage, answer, pipe, return_seq=1):
    """Generate candidate questions and return the longest one.

    Unlike the positive variant, ``do_sample`` and ``num_beams`` are not passed,
    so the pipeline's own default decoding settings apply.

    Args:
        passage: text the question should be about.
        answer: answer string placed into the prompt.
        pipe: callable taking a dict with 'answer' and 'passage' plus generation
            keyword arguments, and returning a list of dicts that each carry a
            "generated_text" entry.
        return_seq: value forwarded as ``num_return_sequences``.

    Returns:
        The longest candidate after every "<pad>" marker has been removed;
        when several candidates share the maximum length, the last of them wins.
    """
    generated = pipe(
        {'answer': answer, 'passage': passage},
        num_return_sequences=return_seq,
    )

    candidates = []
    for item in generated:
        candidates.append(item["generated_text"].replace("<pad>", ""))

    # The sort is stable and ascending by length, so the last element is the longest.
    candidates.sort(key=len)
    return candidates[-1]

In [None]:
# Load the saved prompt from Google Drive and build the generation pipeline around it.
from transformers import pipeline
from ruprompts import Prompt

prompt_qa = Prompt.from_pretrained("/content/drive/MyDrive/Diploma 2.0/Pretrained Prompt Third (GPT-3 LARGE + warmup steps 500 + lr=0.1 + 2500 steps + lstm (29.04.23)")

# "text2text-generation-with-prompt" is not a stock transformers task
# (presumably registered when ruprompts is imported — confirm).
ppln = pipeline("text2text-generation-with-prompt", prompt=prompt_qa)

In [None]:
# Load the base checkpoint and its tokenizer again for the pipeline section.
# NOTE(review): in the cells visible here only `tokenizer` is used afterwards (to build
# bad_words_ids); confirm whether `model` needs to be loaded at this point.
from transformers import AutoTokenizer, AutoModelForCausalLM

rugpt3 = "sberbank-ai/rugpt3large_based_on_gpt2"

model = AutoModelForCausalLM.from_pretrained(rugpt3)
tokenizer = AutoTokenizer.from_pretrained(rugpt3, pad_token="<pad>", eos_token="<pad>")

In [None]:
# File with selected Wikipedia text fragments (each passage is at most 3 sentences long)
with open("/content/drive/MyDrive/Diploma 2.0/passages_to_generate.txt", "r") as fin:
    to_generate = []
    for line in fin:
        # One passage per line; drop the newline characters.
        to_generate.append(line.replace('\n', ''))


In [None]:
# Generate questions whose answer is "нет" (no) and store them as a TSV file.
# The file is opened with newline='' as the csv module requires for writer targets,
# and with an explicit UTF-8 encoding because the content is Cyrillic text.
with open('/content/drive/MyDrive/Diploma 2.0/generated_questions_no.tsv', 'w', newline='', encoding='utf-8') as fout:
    writer = csv.DictWriter(fout, delimiter="\t", fieldnames=["text", "question"])
    writer.writeheader()
    for text in tqdm(to_generate):
        # Five candidates are requested; the helper keeps the longest one.
        question = generate_negative_binary_question(passage=text, answer="нет", pipe=ppln, return_seq=5)
        writer.writerow({"text": text, "question": question})

In [None]:
# Generate questions whose answer is "да" (yes) and store them as a TSV file.
# This cell used to be a verbatim copy of the "нет" cell: it called the negative
# generator with answer="нет" and overwrote generated_questions_no.tsv. It now calls
# the positive generator with answer="да" and writes to its own file.
# The file is opened with newline='' as the csv module requires for writer targets,
# and with an explicit UTF-8 encoding because the content is Cyrillic text.
with open('/content/drive/MyDrive/Diploma 2.0/generated_questions_yes.tsv', 'w', newline='', encoding='utf-8') as fout:
    writer = csv.DictWriter(fout, delimiter="\t", fieldnames=["text", "question"])
    writer.writeheader()
    for text in tqdm(to_generate):
        # The positive generator decodes without sampling, so the number of beams
        # has to be at least the number of returned sequences.
        question = generate_positive_binary_question(passage=text, answer="да", pipe=ppln, beams=5, return_seq=5)
        writer.writerow({"text": text, "question": question})

In [None]:
# Ban these tokens for part of the "да" (yes) questions: without the ban they are
# what most generated questions start with.
banned_question_starts = [
    "Был", "Была", "Были", "Было", "Есть",
    "Входит", "Входила", "Входил", "Входило", "Входят",
]
bad_words_ids = tokenizer(banned_question_starts).input_ids

In [None]:
# Generate questions whose answer is "да" (yes) with a word restriction:
# the token sequences in bad_words_ids are excluded from generation.
# The file is opened with newline='' as the csv module requires for writer targets,
# and with an explicit UTF-8 encoding because the content is Cyrillic text.
with open('/content/drive/MyDrive/Diploma 2.0/generated_yes_questions_new.tsv', 'w', newline='', encoding='utf-8') as fout:
    writer = csv.DictWriter(fout, delimiter="\t", fieldnames=["text", "question"])
    writer.writeheader()
    for text in tqdm(to_generate):
        questions = ppln({'answer': "да", 'passage': text},
                         do_sample=False,
                         num_beams=3,
                         bad_words_ids=bad_words_ids)
        questions = [i["generated_text"].replace("<pad>", "") for i in questions]
        # Keep the first returned sequence.
        question = questions[0]
        writer.writerow({"text": text, "question": question})

In [5]:
# Install the `evaluate` library used for the perplexity metric, then clear the pip output.
%pip install evaluate
clear_output()

In [7]:
# Install transformers, then clear the pip output.
%pip install transformers
clear_output()

In [8]:
# Evaluating the predictions:
# additionally compute perplexity, i.e. how plausible the generated examples look to a language model
import evaluate

perplexity = evaluate.load("perplexity", module_type="metric")

In [10]:
# Show which arguments the perplexity metric accepts.
print(perplexity.inputs_description)


Args:
    model_id (str): model used for calculating Perplexity
            NOTE: Perplexity can only be calculated for causal language models.
                    This includes models such as gpt2, causal variations of bert,
                    causal versions of t5, and more (the full list can be found
                    in the AutoModelForCausalLM documentation here:
                    https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )

    predictions (list of str): input text, each separate text snippet
        is one list entry.
    batch_size (int): the batch size to run texts through the model. Defaults to 16.
    add_start_token (bool): whether to add the start token to the texts,
        so the perplexity can include the probability of the first word. Defaults to True.
    device (str): device to run on, defaults to 'cuda' when available
Returns:
    perplexity: dictionary containing the perplexity scores for the texts
   

In [17]:
# Read the generated "yes" and "no" questions, dropping rows that have no passage.
yes_generated_data = (
    pd.read_csv("/content/yes questions after p-tuning.tsv", sep="\t")
    .dropna(subset=['passage'])
)
no_generated_data = (
    pd.read_csv("/content/no questions after p-tuning.tsv", sep="\t")
    .dropna(subset=['passage'])
)

yes_questions = yes_generated_data['question'].tolist()
no_questions = no_generated_data['question'].tolist()
# "Yes" questions come first, followed by the "no" questions.
all_generated_questions = yes_questions + no_questions
print(len(all_generated_questions))

5967


In [19]:
# Score every generated question with the perplexity metric, two texts per batch.
# NOTE(review): model_id='gpt2' is the English GPT-2 checkpoint, whereas the questions appear to be
# Russian (answers "да"/"нет") and compute_metrics names a rugpt3small checkpoint — confirm this scorer is intended.
results = perplexity.compute(predictions=all_generated_questions, model_id='gpt2', batch_size=2)
print(results)

Using pad_token, but it is not set yet.


  0%|          | 0/2984 [00:00<?, ?it/s]

{'perplexities': [15.50999927520752, 15.699748039245605, 16.292715072631836, 17.564855575561523, 18.535154342651367, 13.7576904296875, 9.756593704223633, 37.240020751953125, 11.7330904006958, 17.102441787719727, 15.295357704162598, 10.015762329101562, 17.226882934570312, 15.82883358001709, 10.553041458129883, 11.990108489990234, 29.739973068237305, 16.257783889770508, 15.097912788391113, 9.555357933044434, 15.961468696594238, 14.10372257232666, 11.283199310302734, 15.050777435302734, 14.355112075805664, 16.770235061645508, 9.131406784057617, 34.28558349609375, 18.471078872680664, 12.702224731445312, 19.7761287689209, 12.548633575439453, 17.268409729003906, 14.96661376953125, 9.222796440124512, 11.212621688842773, 12.918171882629395, 9.51819133758545, 20.7960205078125, 13.89037036895752, 14.732189178466797, 14.438624382019043, 18.00894546508789, 15.482407569885254, 18.65067481994629, 11.857394218444824, 20.80646514892578, 14.356884956359863, 19.999631881713867, 21.719818115234375, 18.71

In [21]:
# Report the mean perplexity rounded to one decimal place.
print(f"Mean perplexity: {round(results['mean_perplexity'], 1)}")

Mean perplexity: 14.2
