Архитектура модели анализа кода

В данном файле проводится анализ архитектуры модели, токенизатора и подготовка к обучению модели

Импортируем необходимые модули

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import re

import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

from torch.utils.tensorboard import summary, writer, SummaryWriter
from tqdm import tqdm
import time
import datetime

Устанавливаем SEED

In [2]:
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

import warnings
warnings.filterwarnings("ignore")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Далее считываем исходный датасет и немного дорабатываем его

In [4]:
dataset_path = '/content/drive/MyDrive/upd_code_dataset.parquet'

In [5]:
code_dataset = pd.read_parquet(dataset_path)

In [6]:
code_dataset.head()

Unnamed: 0,response,focal_method,focal_cls,focal_method_ast,focal_cls_ast,focal_method_info,focal_cls_info,input_string_focal_method,input_string_focal_cls
0,"from microdot import Microdot, Response, abort...","<FUNC_TOKEN> def get(self, key, default=None):...",<CLS_TOKEN> <FUNC_TOKEN>,<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN>,<INFO_TOKEN>,<INFO_TOKEN>,"<FUNC_TOKEN> def get(self, key, default=None):...",<CLS_TOKEN> <FUNC_TOKEN> <INFO_TOKEN> <AST_TOKEN>
1,"from microdot import Microdot, Response, abort...","<FUNC_TOKEN> def get(self, url_pattern): retur...","<CLS_TOKEN> class Microdot: def route(self, ur...",<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN> Module( body=[ ClassDef( name='Mic...,<INFO_TOKEN> <DESCRIPTION_TOKEN> Decorator tha...,<INFO_TOKEN> Module( body=[ ClassDef( name='Mi...,"<FUNC_TOKEN> def get(self, url_pattern): retur...","<CLS_TOKEN> class Microdot: def route(self, ur..."
2,"from microdot import Microdot, Response, abort...","<FUNC_TOKEN> def post(self, url_pattern): retu...","<CLS_TOKEN> class Microdot: def route(self, ur...",<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN> Module( body=[ ClassDef( name='Mic...,<INFO_TOKEN> <DESCRIPTION_TOKEN> Decorator tha...,<INFO_TOKEN> Module( body=[ ClassDef( name='Mi...,"<FUNC_TOKEN> def post(self, url_pattern): retu...","<CLS_TOKEN> class Microdot: def route(self, ur..."
3,"from microdot import Microdot, Response, abort...","<FUNC_TOKEN> def mount(self, subapp, url_prefi...",<CLS_TOKEN> <FUNC_TOKEN>,<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN>,<INFO_TOKEN> <DESCRIPTION_TOKEN> Mount a sub-a...,<INFO_TOKEN>,"<FUNC_TOKEN> def mount(self, subapp, url_prefi...",<CLS_TOKEN> <FUNC_TOKEN> <INFO_TOKEN> <AST_TOKEN>
4,from pyner.named_entity.corpus import bio2bioe...,<FUNC_TOKEN> def iob2bio(tags): processed_tags...,<CLS_TOKEN> def split_tag(tag: str): if tag in...,<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN> Module( body=[ FunctionDef( name='...,<INFO_TOKEN> <DESCRIPTION_TOKEN> should be bio...,<INFO_TOKEN> Module( body=[ FunctionDef( name=...,<FUNC_TOKEN> def iob2bio(tags): processed_tags...,<CLS_TOKEN> def split_tag(tag: str): if tag in...


In [7]:
code_dataset = code_dataset.reset_index(drop=True)

Наконец, переходим к анализу архитектур нейросетей

Решено использовать подход, основанный на обучении (fine-tuning) нейросети CodeBERT, в основе которой лежит модель RoBERTa. Далее будем использовать метамодель в виде декодера (CodeGen или GPTBigCode)

In [8]:
from transformers import AutoTokenizer, AutoModel

Device:

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [10]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Dec 12 19:43:37 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   28C    P0              43W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

Токенизаторы:

In [11]:
tokenizer_code_bert = AutoTokenizer.from_pretrained("microsoft/codebert-base")
tokenizerGPT = AutoTokenizer.from_pretrained("gpt2")
tokenizerGPT.add_special_tokens({'pad_token': '<PAD>'})

1

Посмотрим как работает базовый токенизатор для CodeBERT

Перед этим добавим новые служебные токены:

In [12]:
new_special_tokens = ['<FUNC_TOKEN>',
            '<INFO_TOKEN>',
            '<CLS_TOKEN>',
            '<AST_TOKEN>',
            '<DESCRIPTION_TOKEN>',
            '<COMMENTS_TOKEN>']

special_tokens_dict = {
    'additional_special_tokens': new_special_tokens
}

tokenizer_code_bert.add_special_tokens(special_tokens_dict)
# model_code_bert.resize_token_embeddings(len(tokenizer_code_bert))

6

In [13]:
def tokenization_example(input_str: str):
	'''Функция отображения результатов токенизации'''
	code_bert_tokens_example = tokenizer_code_bert.tokenize(input_str)
	code_bert_tokens_ids = tokenizer_code_bert.convert_tokens_to_ids(code_bert_tokens_example)
	code_bert_decoded = tokenizer_code_bert.decode(code_bert_tokens_ids)
	print(f"Длина закодированной последовательности: {len(code_bert_tokens_example)}")
	print(f"Как выглядят токены исходной фразы: {code_bert_tokens_example}")
	print(f"Индексы токенов: {code_bert_tokens_ids}")
	print(f"Декодированная строка: {code_bert_decoded}")

# tokenization_example(code_dataset['input_string_focal_method'].values[0])

In [14]:
code_dataset.head()

Unnamed: 0,response,focal_method,focal_cls,focal_method_ast,focal_cls_ast,focal_method_info,focal_cls_info,input_string_focal_method,input_string_focal_cls
0,"from microdot import Microdot, Response, abort...","<FUNC_TOKEN> def get(self, key, default=None):...",<CLS_TOKEN> <FUNC_TOKEN>,<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN>,<INFO_TOKEN>,<INFO_TOKEN>,"<FUNC_TOKEN> def get(self, key, default=None):...",<CLS_TOKEN> <FUNC_TOKEN> <INFO_TOKEN> <AST_TOKEN>
1,"from microdot import Microdot, Response, abort...","<FUNC_TOKEN> def get(self, url_pattern): retur...","<CLS_TOKEN> class Microdot: def route(self, ur...",<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN> Module( body=[ ClassDef( name='Mic...,<INFO_TOKEN> <DESCRIPTION_TOKEN> Decorator tha...,<INFO_TOKEN> Module( body=[ ClassDef( name='Mi...,"<FUNC_TOKEN> def get(self, url_pattern): retur...","<CLS_TOKEN> class Microdot: def route(self, ur..."
2,"from microdot import Microdot, Response, abort...","<FUNC_TOKEN> def post(self, url_pattern): retu...","<CLS_TOKEN> class Microdot: def route(self, ur...",<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN> Module( body=[ ClassDef( name='Mic...,<INFO_TOKEN> <DESCRIPTION_TOKEN> Decorator tha...,<INFO_TOKEN> Module( body=[ ClassDef( name='Mi...,"<FUNC_TOKEN> def post(self, url_pattern): retu...","<CLS_TOKEN> class Microdot: def route(self, ur..."
3,"from microdot import Microdot, Response, abort...","<FUNC_TOKEN> def mount(self, subapp, url_prefi...",<CLS_TOKEN> <FUNC_TOKEN>,<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN>,<INFO_TOKEN> <DESCRIPTION_TOKEN> Mount a sub-a...,<INFO_TOKEN>,"<FUNC_TOKEN> def mount(self, subapp, url_prefi...",<CLS_TOKEN> <FUNC_TOKEN> <INFO_TOKEN> <AST_TOKEN>
4,from pyner.named_entity.corpus import bio2bioe...,<FUNC_TOKEN> def iob2bio(tags): processed_tags...,<CLS_TOKEN> def split_tag(tag: str): if tag in...,<AST_TOKEN> Module( body=[ FunctionDef( name='...,<AST_TOKEN> Module( body=[ FunctionDef( name='...,<INFO_TOKEN> <DESCRIPTION_TOKEN> should be bio...,<INFO_TOKEN> Module( body=[ FunctionDef( name=...,<FUNC_TOKEN> def iob2bio(tags): processed_tags...,<CLS_TOKEN> def split_tag(tag: str): if tag in...


Далее необхоимо описать класс Dataset для нашей модели

In [15]:
class Code2TestDataset(Dataset):
	'''Класс датасет для задачи генерации тестов'''

	def __init__(self, code_dataset, tokenizer_code_bert, tokenizer_gpt, max_length=512):
		'''
		Конструктор датасета

		Параметры:
		- code_dataset: датасет pd.DataFrame
		- tokenizer_code_bert: токенизатор code_bert
		- tokenizer_gpt: токенизатор gpt
		- max_length: максимальная длина последовательности (default: 512)
		'''
		self.code_dataset = code_dataset
		self.tokenizer_code_bert = tokenizer_code_bert
		self.tokenizer_gpt = tokenizer_gpt
		self.max_length = max_length

	def __getitem__(self, idx, idx_to_token=False):
		'''
		Get-метод - возвращает сэмпл по индексу

		Параметры:
		- idx: индекс
		- idx_to_token: флаг для отображения токенов из индексов (default: False)
		'''
		focal_method_input = self.code_dataset.at[idx, 'input_string_focal_method']
		focal_cls_input = self.code_dataset.at[idx, 'input_string_focal_cls']
		response = self.code_dataset.at[idx, 'response']

		def encode_text(text, tokenizer, tokenizer_flag = True):
			encoding = tokenizer.encode_plus(
				text,
				add_special_tokens=True,
				max_length=self.max_length if tokenizer_flag else self.max_length * 2,
				padding='max_length',
				truncation=True,
				return_attention_mask=True,
				return_tensors='pt',
			)
			input_ids = encoding['input_ids'].flatten()
			attention_mask = encoding['attention_mask'].flatten()
			return input_ids, attention_mask

		input_ids_focal_method, attention_mask_focal_method = encode_text(focal_method_input, self.tokenizer_code_bert)
		input_ids_focal_cls, attention_mask_focal_cls = encode_text(focal_cls_input, self.tokenizer_code_bert)
		input_ids_response, attention_mask_response = encode_text(response, self.tokenizer_gpt, tokenizer_flag = False)
		input_ids_focal_method_decoder, attention_mask_focal_method_decoder = encode_text(focal_method_input, self.tokenizer_gpt)
		input_ids_focal_cls_decoder, attention_mask_focal_cls_decoder = encode_text(focal_cls_input, self.tokenizer_gpt)

		if idx_to_token:
			return {
				'input_ids_focal_method': self.tokenizer_code_bert.convert_ids_to_tokens(input_ids_focal_method),
				'attention_mask_focal_method': attention_mask_focal_method,
				'input_ids_focal_cls': self.tokenizer_code_bert.convert_ids_to_tokens(input_ids_focal_cls),
				'attention_mask_focal_cls': attention_mask_focal_cls,
				'ids_response': self.tokenizer_gpt.convert_ids_to_tokens(input_ids_response),
				'attention_mask_response': attention_mask_response
			}
		return {
			'input_ids_focal_method': input_ids_focal_method,
			'attention_mask_focal_method': attention_mask_focal_method,
			'input_ids_focal_cls': input_ids_focal_cls,
			'attention_mask_focal_cls': attention_mask_focal_cls,
			'ids_response': input_ids_response,
			'attention_mask_response': attention_mask_response,
			'input_ids_focal_method_decoder': input_ids_focal_method_decoder,
			'attention_mask_focal_method_decoder': attention_mask_focal_method_decoder,
			'input_ids_focal_cls_decoder': input_ids_focal_cls_decoder,
			'attention_mask_focal_cls_decoder': attention_mask_focal_cls_decoder
		}

	def __len__(self):
		'''Функция возвращает длину датасета. В качестве длины берется размер датасета по axis = 0'''
		return self.code_dataset.shape[0]


Тестируем написанный класс

In [16]:
code2test_dataset = Code2TestDataset(code_dataset=code_dataset,
                                     tokenizer_code_bert=tokenizer_code_bert,
                                     tokenizer_gpt=tokenizerGPT)

In [17]:
print(f"Длина датасета составляет: {len(code2test_dataset)}")

Длина датасета составляет: 280458


Всё работает корректно! Следующим шагом необходимо разбить датасет на train и val

In [18]:
def get_datasets(dataset_cls = Code2TestDataset,
				max_length = 512,
				data = code_dataset,
				tokenizer_code_bert = tokenizer_code_bert,
				tokenizer_gpt = tokenizerGPT,
				train_size = 0.7):
	'''
	Функция get_datasets() возвращает train и val датасеты на основе конструктора AccentDataset, делая train_val_spilt

	Параметры:
	-dataset_cls: класс датасета, конструктор которого будет вызываться (default: Code2TestDataset)
	-max_length: максимальная статья последовательности токенов
	-data: датасает pd.DataFrame (default: code_dataset)
	-tokenizer_code_bert: токенизатор codeBERT (default: tokenizer_code_bert)
	-tokenizer_gpt: токенизатор GPT2 (default: tokenizer_gpt)
	-train_size: размер тренировочной выборки (default: 0.7)

	'''

	dataset = dataset_cls(code_dataset = data,
					   	tokenizer_code_bert = tokenizer_code_bert,
						tokenizer_gpt=tokenizer_gpt,
						max_length=max_length)

	train_size = int(train_size * len(dataset))
	val_size = len(dataset) - train_size
	train_dataset, test_dataset = random_split(dataset, [train_size, val_size])

	return train_dataset, test_dataset

train_dataset, val_dataset = get_datasets(train_size=0.9)

Проверяем полученные датасеты

In [19]:
print(f"Количество данных в train и val выборках соответственно: {len(train_dataset), len(val_dataset)}")

Количество данных в train и val выборках соответственно: (252412, 28046)


In [20]:
def decode_sequence(tokens_ids, tokenizer):
	'''Декодирование последовательности токенов'''
	code_bert_decoded = tokenizer.decode(tokens_ids)
	print(f"Декодированная строка: {code_bert_decoded}")

Далее получим DataLoader, по которому будем итерироваться

In [21]:
def get_loaders(train_dataset = train_dataset,
			val_dataset = val_dataset,
			shuffle_train = True,
			shuffle_val = False,
			batch_size = 32):

	'''
	Функция get_loaders() для получения train, val даталоадеров

	Параметры:
	-train_dataset: тренировочный датасет (default: train_dataset)
	-val_dataset: валидационный датасет (default: val_dataset)
	-shuffle_train: флаг перемешивания для train (default: True)
	-shuffle_val: флаг перемешивания для val (default: False)
	-batch_size: размер батча данных (default: 32)
	'''

	# train_dataloader
	train_dataloader = DataLoader(
			train_dataset,
			batch_size = batch_size,
			shuffle = shuffle_train,
		)

	# validation_dataloader
	validation_dataloader = DataLoader(
			val_dataset,
			batch_size = batch_size,
			shuffle = shuffle_val,
		)

	# Возвращаем даталоадеры
	return train_dataloader, validation_dataloader

train_dataloader, validation_dataloader = get_loaders(batch_size=4)

Проверка итерирования

In [22]:
for i, batch in enumerate(tqdm(train_dataloader)):
    if i == 0:
        break
    pass

  0%|          | 0/63103 [00:00<?, ?it/s]


Корректно отрабатывает!

Далее, собираем архитектуру и готовимся обучать

In [23]:
# model_code_bert = AutoModel.from_pretrained("microsoft/codebert-base", output_hidden_states= True).to(device)
# model_code_bert.resize_token_embeddings(len(tokenizer_code_bert))

Как работает модель codeBERT:

In [24]:
# for i, batch in enumerate(train_dataloader):

# 	# Проверка корректности работы
# 	b_input_ids = batch['input_ids_focal_method'].to(device)
# 	b_input_mask = batch['attention_mask_focal_method'].to(device)

# 	outputs_code_bert = model_code_bert(b_input_ids, attention_mask=b_input_mask)
# 	last_hidden_state_code_bert = outputs_code_bert['last_hidden_state']
# 	print(last_hidden_state_code_bert.size())
# 	break

Таким образом, для каждого токена мы получим свое закодированное значение размерности 768

Модель GPT2:

In [25]:
from transformers import AutoConfig

modelGPT2Path = "gpt2"
# config = AutoConfig.from_pretrained(modelGPT2Path, is_decoder=True, add_cross_attention= True)
# config.add_cross_attention = True  # Включение cross-attention

# modelGPT2 = AutoModel.from_pretrained(modelGPT2Path, config=config).to(device)
# modelGPT2.resize_token_embeddings(len(tokenizerGPT))

Как работает модель GPTBigCode

In [26]:
# for i, batch in enumerate(train_dataloader):

# 	b_input_ids = batch['input_ids_focal_method'].to(device)
# 	b_input_mask = batch['attention_mask_focal_method'].to(device)

# 	outputs_code_bert = model_code_bert(b_input_ids, attention_mask=b_input_mask)
# 	last_hidden_state_code_bert = outputs_code_bert['last_hidden_state']

# 	print(last_hidden_state_code_bert.size())

# 	# Проверка корректности работы
# 	response_input_ids = batch['ids_response'].to(device)
# 	response_input_mask = batch['attention_mask_response'].to(device)
# 	gpt_output = modelGPT2(input_ids=response_input_ids,
# 							  attention_mask=response_input_mask,
# 							  encoder_hidden_states = last_hidden_state_code_bert)
# 	print(gpt_output['last_hidden_state'].size())


# 	# outputs_code_bert = model_code_bert(b_input_ids, attention_mask=b_input_mask)
# 	# last_hidden_state_code_bert = outputs_code_bert['last_hidden_state']
# 	# print(last_hidden_state_code_bert.size())
# 	break

Ну, как-то худо-бедно всё это дело запускается. Пробуем строить модель

In [40]:
from transformers import GPT2LMHeadModel

class LargeCodeModel(nn.Module):
	'''Класс для сложной языковой модели, которая обрабатывает входной код'''
	def __init__(self, bert_model_name, gpt2_name):
		super(LargeCodeModel, self).__init__()

		self.bert1 = AutoModel.from_pretrained(bert_model_name, output_hidden_states= True)
		self.bert2 = AutoModel.from_pretrained(bert_model_name, output_hidden_states= True)
		self.tokenizer_code_bert = AutoTokenizer.from_pretrained(bert_model_name)

		self.new_special_tokens = ['<FUNC_TOKEN>',
            '<INFO_TOKEN>',
            '<CLS_TOKEN>',
            '<AST_TOKEN>',
            '<DESCRIPTION_TOKEN>',
            '<COMMENTS_TOKEN>']

		self.special_tokens_dict = {
			'additional_special_tokens': new_special_tokens
		}

		self.tokenizer_code_bert.add_special_tokens(self.special_tokens_dict)
		self.bert1.resize_token_embeddings(len(self.tokenizer_code_bert))
		self.bert2.resize_token_embeddings(len(self.tokenizer_code_bert))

		self.gpt2_config = AutoConfig.from_pretrained(gpt2_name, is_decoder=True, add_cross_attention = True)
		self.gpt2_config.add_cross_attention = True  # Включение cross-attention
		self.tokenizerGPT = AutoTokenizer.from_pretrained(gpt2_name)
		self.tokenizerGPT.add_special_tokens({'pad_token': '<PAD>'})
		self.gpt2 = GPT2LMHeadModel.from_pretrained(gpt2_name, config=self.gpt2_config)
		self.gpt2.resize_token_embeddings(len(self.tokenizerGPT))

		self.layer_norm = nn.LayerNorm(self.bert1.config.hidden_size)

		self.projection = nn.Linear(
            self.bert1.config.hidden_size + self.bert2.config.hidden_size,
            self.gpt2.config.hidden_size
        )

	# forward call
	def forward(self, focal_method_input_ids,
			 			focal_method_attention_masks,
						focal_cls_input_ids,
						focal_cls_attention_masks,
						response_ids, response_attention_masks,
						decoder_input_ids_focal_method,
						decoder_input_ids_focal_cls,
						attention_mask_focal_method_decoder,
						attention_mask_focal_cls_decoder):

		# print(focal_method_input_ids.size())
		# print(focal_method_attention_masks.size())
		# print(type(focal_method_input_ids))
		# print(type(focal_method_attention_masks))
		# print(type(focal_cls_input_ids))
		# print(type(focal_cls_attention_masks))
		# print(type(response_ids))
		# print(type(response_attention_masks))

		decoder_input_ids = torch.cat([decoder_input_ids_focal_method,
		                                  decoder_input_ids_focal_cls], dim=1)

		decoder_attention_masks = torch.cat([attention_mask_focal_method_decoder,
		                                  attention_mask_focal_cls_decoder], dim=1)

		# print(decoder_input_ids.size())

		bert1_outputs = self.bert1(focal_method_input_ids, focal_method_attention_masks)
		last_hidden_state_bert1 = bert1_outputs['last_hidden_state']

		bert2_outputs = self.bert2(focal_cls_input_ids, focal_cls_attention_masks)
		last_hidden_state_bert2 = bert2_outputs['last_hidden_state']

		# print(last_hidden_state_bert1.size())
		# print(last_hidden_state_bert2.size())

		concat_hidden_states = torch.cat([last_hidden_state_bert1,
		                                  last_hidden_state_bert2], dim=1)

		# print(concat_hidden_states.size())

		# LayerNormalization
		normalized_hidden_states = self.layer_norm(concat_hidden_states)

		# Для BatchNorm
		# batch_norm_input = concat_hidden_states.view(-1, 768)
		# normalized_hidden_states = self.batch_norm(batch_norm_input)
		# normalized_hidden_states = normalized_hidden_states.view(2, 1024, 768)
		# print(normalized_hidden_states.size())
		# print(torch.cat([focal_method_attention_masks, focal_cls_attention_masks], dim=1).size())
		# print(response_ids.size())
		# print(response_input_mask.size())

		# print(response_attention_masks.size())

		# print('No problems')

		gpt2_outputs = self.gpt2(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_masks,
            encoder_hidden_states=normalized_hidden_states,
            encoder_attention_mask=torch.cat([focal_method_attention_masks, focal_cls_attention_masks], dim=1),
						labels=response_ids
        )

		return gpt2_outputs



Отлаживаем модель

In [41]:
CodeModel = LargeCodeModel(bert_model_name="microsoft/codebert-base",
                           gpt2_name="gpt2").to(device)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.11.crossat

Далее необходимо объявить функцию train-val loop

Для начала необходимо объявить дополнительные функции для отображения времени и подсчёта метрик качества

In [42]:
def format_time(elapsed):
	'''Функция форматирования времени'''
	return str(datetime.timedelta(seconds=int(round((elapsed)))))

def token_accuracy_calc(logits, labels, attention_mask):
	'''Функция подсчета accuracy для данных'''
	pred_flat = np.argmax(logits, axis=-1).flatten()
	labels_flat = labels.flatten()
	mask_flat = attention_mask.flatten()
	accuracy = np.sum(pred_flat[mask_flat] == labels_flat[mask_flat]) / len(labels_flat[mask_flat])
	return accuracy

def token_bleu_calc(logits, labels, attention_mask):
	'''Функция подсчета BLEU для данных'''

	pred_flat = np.argmax(logits, axis=-1)
	labels_flat = labels
	mask_flat = attention_mask

	pred_tokens = []
	true_tokens = []

	for i in range(pred_flat.shape[0]):
		pred_seq = pred_flat[i][mask_flat[i] == 1]
		true_seq = labels_flat[i][mask_flat[i] == 1]
		pred_tokens.append(pred_seq)
		true_tokens.append(true_seq)

	pred_strings = tokenizerGPT.batch_decode(pred_tokens, skip_special_tokens=True)
	true_strings = tokenizerGPT.batch_decode(true_tokens, skip_special_tokens=True)

	# Вычисление BLEU
	bleu_scores = []
	smoothing_function = SmoothingFunction().method1

	for pred, true in zip(pred_strings, true_strings):
		pred_tokens = pred.split()
		true_tokens = [true.split()]
		bleu_score = sentence_bleu(true_tokens, pred_tokens, smoothing_function=smoothing_function)
		bleu_scores.append(bleu_score)

	average_bleu = np.mean(bleu_scores)
	return average_bleu



Как минимимум оно запускается

Далее реализуем саму функцию train-val-loop

Перед этим объями дополнительные настройки обучения

In [43]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(CodeModel.parameters(), lr=3e-5)
num_epochs = 1
train_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=train_steps
)
tensorboard_log_dir = 'runs/CodeModelLogs/'
tensorboard_path_accuracy = 'runs/model_accuracy_score_{:.2f}.pth'
tensorboard_path_loss = 'runs/model_val_loss_{:.2f}.pth'
tensorboard_path_bleu = 'runs/model_bleu_score_{:.2f}.pth'

И, наконец, функция:

In [50]:
def train_val_loop_codeLM(model = CodeModel,
						train_loader = train_dataloader,
						val_loader = validation_dataloader,
						optimizer = optimizer,
						scheduler = lr_scheduler,
						num_epochs = num_epochs,
						device = 'cuda',
						model_save_path_accuracy = tensorboard_path_accuracy,
						model_save_path_loss = tensorboard_path_loss,
						model_save_path_bleu = tensorboard_path_bleu,
						tensorboard_log_dir = tensorboard_log_dir,
						gradient_accumulation_steps = 2,
						eval_every = 1,
						test_step_only = False):
	'''
	Функция для реализации train-val loop обучения нашей модели

	Параметры:
	-model: модель нейронной сети
	-train_loader: тренировочный датасет
	-val_loader: валидационный датасет
	-optimizer: оптимизатор
	-scheduler: изменение для learning_rate (расписание)
	-num_epochs: число эпох для обучения
	-device: устройство
	-model_save_path_accuracy: путь для сохранения весов модели (с лучшей accuracy)
	-model_save_path_loss: путь для сохранения весов модели (с лучим val_loss)
	-model_save_path_bleu: путь для сохранения весов модели (с лучим val_bleu_score)
	-tensorboard_log_dir: путь для записи логов в TensorBoard,
	-gradient_accumulation_steps: число шагов для накопления градиентов
	-eval_every: число шагов, через которые делаем валидацию
	-test_step_only: вспомогательная логика для тестирования обучения небольшого числа шагов (default: False)
	'''

	writer = SummaryWriter(log_dir=tensorboard_log_dir)
	history = {
		'train_loss': [],
		'train_bleu': [],
		'train_accuracy': [],
		'val_accuracy': [],
		'val_loss': [],
		'val_bleu': []
	}
	best_val_loss = float('inf')
	best_val_bleu = 0.0
	best_val_accuracy = 0.0
	model.to(device)

	for epoch in range(num_epochs):
		print("")
		print('======== Epoch {:} / {:} ========'.format(epoch + 1, num_epochs))
		print('Training...')

		t0 = time.time()
		model.train()
		total_train_loss = 0
		total_train_bleu = 0
		total_train_accuracy = 0
		num_train_steps = 0

		for step, batch in enumerate(tqdm(train_loader)):

			if test_step_only and step >= 800:  # Прерываем после первого батча
				break


			if step % 1500 == 0 and not step == 0:
				# Calculate elapsed time in minutes.
				elapsed = format_time(time.time() - t0)
				# Report progress.
				print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

			optimizer.zero_grad()

			focal_method_input_ids = batch['input_ids_focal_method'].to(device)
			focal_method_attention_masks = batch['attention_mask_focal_method'].to(device)

			focal_cls_input_ids = batch['input_ids_focal_cls'].to(device)
			focal_cls_attention_masks = batch['attention_mask_focal_cls'].to(device)

			response_ids = batch['ids_response'].to(device)
			response_attention_masks = batch['attention_mask_response'].to(device)

			input_ids_focal_method_decoder = batch['input_ids_focal_method_decoder'].to(device)
			input_ids_focal_cls_decoder = batch['input_ids_focal_cls_decoder'].to(device)

			attention_mask_focal_method_decoder = batch['attention_mask_focal_method_decoder'].to(device)
			attention_mask_focal_cls_decoder = batch['attention_mask_focal_cls_decoder'].to(device)

			output_codeLM = model(focal_method_input_ids, focal_method_attention_masks,
						focal_cls_input_ids, focal_cls_attention_masks,
						response_ids, response_attention_masks,
						input_ids_focal_method_decoder, input_ids_focal_cls_decoder,
						attention_mask_focal_method_decoder, attention_mask_focal_cls_decoder)

			loss = output_codeLM.loss
			logits = output_codeLM.logits

			logits = logits.detach().cpu().numpy()
			response_ids = response_ids.cpu().numpy()
			response_attention_masks = response_attention_masks.cpu().numpy()

			accuracy_train = token_accuracy_calc(logits, response_ids, response_attention_masks)
			bleu_train = token_bleu_calc(logits, response_ids, response_attention_masks)

			total_train_accuracy += accuracy_train
			total_train_bleu += bleu_train

			total_train_loss += loss.item()
			num_train_steps += 1

			loss.backward()

			if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
				torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Клипаем накопленные градиенты
				optimizer.step()
				scheduler.step()

		avg_train_loss = total_train_loss / num_train_steps
		avg_train_accuracy = total_train_accuracy / num_train_steps
		avg_train_bleu_score = total_train_bleu / num_train_steps

		training_time = format_time(time.time() - t0)

		print("")
		print("  Average training loss: {0:.2f}".format(avg_train_loss))
		# print("  Average training accuracy: {0:.2f}".format(avg_train_accuracy))
		# print("  Average training BLEU score: {0:.2f}".format(avg_train_bleu_score))
		print("  Training epoch took: {:}".format(training_time))

		history['train_loss'].append(avg_train_loss)
		history['train_accuracy'].append(avg_train_accuracy)
		history['train_bleu'].append(avg_train_bleu_score)

		# Логирование в TensorBoard для обучения
		writer.add_scalar("Train/Loss", avg_train_loss, epoch + 1)
		writer.add_scalar("Train/Accuracy", avg_train_accuracy, epoch + 1)
		writer.add_scalar("Train/BLEU_score", avg_train_bleu_score, epoch + 1)

		print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {avg_train_accuracy:.4f}, Train BLEU score: {avg_train_bleu_score:.4f}")

		print("")
		print("Running Validation...")


		t0 = time.time()

		# Put the model in evaluation mode--the dropout layers behave differently
		# during evaluation.
		model.eval()

		if (epoch + 1) % eval_every == 0:
			model.eval()
			total_eval_loss = 0
			total_eval_accuracy = 0
			total_eval_bleu = 0
			num_eval_steps = 0

		with torch.no_grad():
			for batch in tqdm(val_loader):
				if test_step_only and num_eval_steps >= 200:  # Прерываем если хотим проконтроллировать
					break

				focal_method_input_ids = batch['input_ids_focal_method'].to(device)
				focal_method_attention_masks = batch['attention_mask_focal_method'].to(device)

				focal_cls_input_ids = batch['input_ids_focal_cls'].to(device)
				focal_cls_attention_masks = batch['attention_mask_focal_cls'].to(device)

				response_ids = batch['ids_response'].to(device)
				response_attention_masks = batch['attention_mask_response'].to(device)

				input_ids_focal_method_decoder = batch['input_ids_focal_method_decoder'].to(device)
				input_ids_focal_cls_decoder = batch['input_ids_focal_cls_decoder'].to(device)

				attention_mask_focal_method_decoder = batch['attention_mask_focal_method_decoder'].to(device)
				attention_mask_focal_cls_decoder = batch['attention_mask_focal_cls_decoder'].to(device)

				output_codeLM = model(focal_method_input_ids, focal_method_attention_masks,
						focal_cls_input_ids, focal_cls_attention_masks,
						response_ids, response_attention_masks,
						input_ids_focal_method_decoder, input_ids_focal_cls_decoder,
						attention_mask_focal_method_decoder, attention_mask_focal_cls_decoder)

				loss = output_codeLM.loss
				logits = output_codeLM.logits

				logits = logits.detach().cpu().numpy()
				response_ids = response_ids.cpu().numpy()
				response_attention_masks = response_attention_masks.cpu().numpy()

				accuracy_val = token_accuracy_calc(logits, response_ids, response_attention_masks)
				bleu_val = token_bleu_calc(logits, response_ids, response_attention_masks)

				total_eval_accuracy += accuracy_val
				total_eval_bleu += bleu_val

				total_eval_loss += loss.item()
				num_eval_steps += 1

		avg_val_loss = total_eval_loss / num_eval_steps
		avg_val_accuracy = total_eval_accuracy / num_eval_steps
		avg_val_bleu_score = total_eval_bleu / num_eval_steps

		history['val_loss'].append(avg_val_loss)
		history['val_accuracy'].append(avg_val_accuracy)
		history['val_bleu'].append(avg_val_bleu_score)

		# Логирование в TensorBoard для валидации
		writer.add_scalar("Validation/Loss", avg_val_loss, epoch + 1)
		writer.add_scalar("Validation/Accuracy", avg_val_accuracy, epoch + 1)

		print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_accuracy:.4f},  Validation BLEU score: {avg_val_bleu_score:.4f}")

		# Ну вот тут надо настроить, чтобы
		# Если точность выше, то сохраняем веса
		# if avg_val_accuracy > best_val_accuracy:
		# 	best_val_accuracy = avg_val_accuracy
		# 	torch.save(model.state_dict(), model_save_path_accuracy.format(best_val_accuracy))
		# 	print(f"Model saved to {model_save_path_accuracy.format(best_val_accuracy)}")

		# # Если лосс ниже, то сохраняем веса
		# if avg_val_loss < best_val_loss:
		# 	best_val_loss = avg_val_loss
		# 	torch.save(model.state_dict(), model_save_path_loss.format(best_val_loss))
		# 	print(f"Model saved to {model_save_path_loss.format(best_val_loss)}")

		# # Если BLEU выше, то сохраняем веса
		# if avg_val_bleu_score > best_val_bleu:
		# 	best_val_bleu = avg_val_bleu_score
		# 	torch.save(model.state_dict(), model_save_path_bleu.format(best_val_bleu))
		# 	print(f"Model saved to {model_save_path_bleu.format(best_val_bleu)}")

	writer.close()
	return history

Наконец, пробуем запустить обучение

In [51]:
training_results = train_val_loop_codeLM(device='cuda', test_step_only = True, num_epochs = 1)


Training...


  1%|▏         | 800/63103 [17:08<22:15:15,  1.29s/it]



  Average training loss: 4.88
  Training epoch took: 0:17:09
Train Loss: 4.8787, Train Accuracy: 0.0110, Train BLEU score: 0.0000

Running Validation...


  3%|▎         | 200/7012 [03:15<1:50:57,  1.02it/s]

Validation Loss: 2.7915, Validation Accuracy: 0.0113,  Validation BLEU score: 0.0009





In [56]:
with torch.no_grad():
    for i, batch in enumerate(tqdm(validation_dataloader)):

      # print(batch)

      focal_method_input_ids = batch['input_ids_focal_method'].to(device)
      focal_method_attention_masks = batch['attention_mask_focal_method'].to(device)

      focal_cls_input_ids = batch['input_ids_focal_cls'].to(device)
      focal_cls_attention_masks = batch['attention_mask_focal_cls'].to(device)

      response_ids = batch['ids_response'].to(device)
      response_attention_masks = batch['attention_mask_response'].to(device)

      input_ids_focal_method_decoder = batch['input_ids_focal_method_decoder'].to(device)
      input_ids_focal_cls_decoder = batch['input_ids_focal_cls_decoder'].to(device)

      attention_mask_focal_method_decoder = batch['attention_mask_focal_cls_decoder'].to(device)
      attention_mask_focal_cls_decoder = batch['attention_mask_focal_cls_decoder'].to(device)

      output_codeLM = CodeModel(focal_method_input_ids, focal_method_attention_masks,
          focal_cls_input_ids, focal_cls_attention_masks,
          response_ids, response_attention_masks,
          input_ids_focal_method_decoder, input_ids_focal_cls_decoder,
						attention_mask_focal_method_decoder, attention_mask_focal_cls_decoder)

      loss = output_codeLM.loss
      logits = output_codeLM.logits

      logits = logits.detach().cpu().numpy()
      response_ids = response_ids.cpu().numpy()
      response_attention_masks = response_attention_masks.cpu().numpy()

      pred_flat = np.argmax(logits, axis=-1)
      # print("Shape:")
      # print(pred_flat[0][:20])

      print()
      # print("Исходная строка:")
      decode_sequence(focal_method_input_ids[3], CodeModel.tokenizer_code_bert)
      decode_sequence(focal_cls_input_ids[3], CodeModel.tokenizer_code_bert)

      # decode_sequence(focal_method_input_ids[0], CodeModel.tokenizer_code_bert)

      # print("GT:")
      decode_sequence(input_ids_focal_method_decoder[3], CodeModel.tokenizerGPT)
      decode_sequence(input_ids_focal_cls_decoder[3], CodeModel.tokenizerGPT)

      print("Prediction:")
      decode_sequence(pred_flat[2], CodeModel.tokenizerGPT)

      break




  0%|          | 0/7012 [00:00<?, ?it/s]


Декодированная строка: <s><FUNC_TOKEN> def align_sequences(read_seq, read_qual, ref_seq, alignment, gap_char='-'): read, qual, ref = [], [], [] read_pos, ref_pos = 0, 0 errors_per_read_pos = [0] * len(read_seq) for c in alignment.cigar_parts: cigar_type = c[-1] cigar_size = int(c[:-1]) if cigar_type == 'M': read.append(read_seq[read_pos:read_pos+cigar_size]) qual.append(read_qual[read_pos:read_pos+cigar_size]) ref.append(ref_seq[ref_pos:ref_pos+cigar_size]) for i in range(cigar_size): if read_seq[read_pos+i] != ref_seq[ref_pos+i]: errors_per_read_pos[read_pos+i] += 1 read_pos += cigar_size ref_pos += cigar_size if cigar_type == 'I': read.append(read_seq[read_pos:read_pos+cigar_size]) qual.append(read_qual[read_pos:read_pos+cigar_size]) ref.append(gap_char * cigar_size) for i in range(cigar_size): errors_per_read_pos[read_pos+i] += 1 read_pos += cigar_size if cigar_type == 'D': read.append(gap_char * cigar_size) qual.append(gap_char * cigar_size) ref.append(ref_seq[ref_pos:ref_pos+ciga


