In [1]:
!pip install torch --quiet

!pip install transformers sentence-transformers --quiet
!pip install requests tqdm --quiet

In [2]:
import torch
print('CUDA available:', torch.cuda.is_available())
print('Device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No CUDA')


CUDA available: True
Device: Tesla T4


In [3]:
#==================================================
#Text encoder with attention-based pooling utilities.
#==================================================

import contextlib
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

device = None

if torch.backends.mps.is_available():
	device = torch.device('mps')
elif torch.cuda.is_available():
	device = torch.device('cuda')
else:
	device = torch.device('cpu')


epsilon = 1e-8

class Norm:
	# Project to unit sphere
	#--------------------------------------------------
	@staticmethod
	def to_sphere(v):
		result = v
		if v.dim() == 1:
			result = v / (v.norm() + epsilon)
		else:
			result = v / (v.norm(dim=1, keepdim=True) + epsilon)
		return result


	# Alias for to_sphere
	#--------------------------------------------------
	@staticmethod
	def s(v):
		result = Norm.to_sphere(v)
		return result


	# Project to hypercube
	#--------------------------------------------------
	@staticmethod
	def to_hypercube(v):
		if v.dim() == 1:
			m = v.abs().max()
			return v / (m + epsilon)
		m = v.abs().max(dim=1, keepdim=True).values
		return v / (m + epsilon)


	# Alias for to_hypercube
	#--------------------------------------------------
	@staticmethod
	def h(v):
		result = Norm.to_hypercube(v)
		return result


class Encoder:
	def __init__(self, model_name=None):
		default_model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
		self.model_name    = model_name or default_model_name
		self.tokenizer     = AutoTokenizer.from_pretrained(self.model_name)
		self.model         = AutoModel.from_pretrained(self.model_name, attn_implementation='eager').to(device)
		self.model.eval()


	# Attention pooling over hidden states
	# --------------------------------------------------
	def attention_pool(self, hidden_states, attentions):
		'''
		hidden_states: [S, D]
		attentions: list of [batch, heads, S, S] for each layer
		'''
		att = torch.stack(attentions)   # [L, B, H, S, S]
		att = att[:, 0]                 # take batch 0 → [L, H, S, S]
		att = att.mean(dim=1)           # average over heads → [L, S, S]
		att = att.mean(dim=0)           # average over layers → [S, S]

		weights = att[0]
		weights = F.softmax(weights, dim=0)

		ap_next = (weights.unsqueeze(1) * hidden_states).sum(dim=0)
		result = Norm.to_hypercube(ap_next)
		return result


	# Encode a single text
	# --------------------------------------------------
	def encode(self, text, ap_prev=None, karma=1):
		amp_ctx = torch.cuda.amp.autocast if device.type == 'cuda' else contextlib.nullcontext
		with torch.inference_mode(), amp_ctx():
			tokens = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=1000)
			tokens = {k: v.to(device) for k, v in tokens.items()}

			out    = self.model(**tokens, output_attentions=True)
			hidden = out.last_hidden_state.squeeze(0)
			attn   = out.attentions

			ap_next = self.attention_pool(hidden, attn)

			if ap_prev is not None:
				ap_prev = ap_prev.to(device)
				ap_next = ap_next + karma * ap_prev
				ap_next = Norm.to_hypercube(ap_next)

		return ap_next.to('cpu')

	# Encode a sequence of texts
	# --------------------------------------------------
	def encode_sequence(self, texts, ap_prev=None, karma=1):
		embeddings = []
		for text in texts:
			ap_next = self.encode(text, ap_prev=ap_prev, karma=karma)
			embeddings.append(ap_next)
		result = torch.stack(embeddings)
		return result

	def test_encode_batch(self, texts, sample_size=20, batch_size=32):
		import random
		import torch.nn.functional as F

		# choose sample
		if len(texts) > sample_size:
			sample = random.sample(texts, sample_size)
		else:
			sample = texts

		print('\n=== encode() vs encode_batch_magic() consistency test ===')
		print(f'Comparing {len(sample)} texts...\n')

		# 1) magic batch encode (no padding → identical)
		ap_batch = self.encode_batch(sample)

		sims = []

		# 2) compare one-by-one
		for i, t in enumerate(sample):
			ap_single = self.encode(t)
			ap_b = ap_batch[i]

			sim = F.cosine_similarity(
				ap_single.unsqueeze(0),
				ap_b.unsqueeze(0)
			).item()

			sims.append(sim)
			print(f'[{i:02d}] sim={sim:.6f} | text={t}')

		sims = torch.tensor(sims)

		print('\n=== Summary ===')
		print(f'Min similarity:  {sims.min().item():.6f}')
		print(f'Max similarity:  {sims.max().item():.6f}')
		print(f'Mean similarity: {sims.mean().item():.6f}')

		return sims

	@torch.inference_mode()
	def encode_batch(self, texts, ap_prev_batch=None, karma=1, batch_size=32):
		amp_ctx = torch.cuda.amp.autocast if device.type == 'cuda' else contextlib.nullcontext
		with amp_ctx():
			# Tokenize
			tokens = self.tokenizer(
				texts,
				return_tensors='pt',
				padding=True,
				truncation=True
			)
			tokens = {k: v.to(device) for k, v in tokens.items()}

			out = self.model(**tokens, output_attentions=True)

			hidden = out.last_hidden_state      # [B, S, D]
			attns  = out.attentions             # tuple(L) of [B, H, S, S]
			mask   = tokens['attention_mask']   # [B, S]

			ap_list = []
			B = hidden.size(0)

			for b in range(B):
				seq_len = mask[b].sum().item()   # ignore padding so outputs match encode()
				hidden_b = hidden[b, :seq_len]   # [S, D]

				# Build tuple of [H, S, S] tensors as encode() expects
				# Unsqueeze batch dim to match attention_pool()'s [B, H, S, S] expectation
				att_b = tuple(layer[b, :, :seq_len, :seq_len].unsqueeze(0) for layer in attns)  # tuple(L), each [1, H, S, S]

				ap_next = self.attention_pool(hidden_b, att_b)

				# Optional recurrence
				if ap_prev_batch is not None:
					ap_prev = ap_prev_batch[b].to(device)
					ap_next = ap_next + karma * ap_prev
					ap_next = Norm.to_hypercube(ap_next)

				ap_list.append(ap_next.cpu())

		return torch.stack(ap_list)



In [4]:
#==================================================
# Load 20 samples and test batch encoding
#==================================================

import random

# 1. Load 20 random sentences from dataset
path = '/content/en_ru.sentences.1M.txt'

with open(path, 'r', encoding='utf-8') as f:
	lines = [line.strip() for line in f if line.strip()]

sample = random.sample(lines, 20)

print(f'Loaded {len(sample)} samples:')
for i, s in enumerate(sample):
	print(f'[{i:02d}] {s[:80]!r}')

# 2. Initialize encoder
enc = Encoder()

# 3. Run batch consistency test
enc.test_encode_batch(sample, sample_size=20, batch_size=20)


Loaded 20 samples:
[00] 'I was just kind of like your stand-in.'
[01] 'И если они ещё продлятся, запасы двигателей на Западе закончатся к тому моменту,'
[02] "There's no place like home."
[03] 'Господи боже. обещает 60 человек.'
[04] 'Почему мне кажется, что это нужно только мне?'
[05] 'Confirm, assault team of two to the beach.'
[06] 'И поставил их жизнь под угрозу.'
[07] 'Да, на всякий случай, чтобы произвести хорошее впечатление, чтобы они подумали..'
[08] 'Он он сказал, что может подождать пока тебе не станет лучше.'
[09] 'Каре будет интересно где мы находимся.'
[10] 'Jeremiah got the top score on eight Advanced Placement tests without ever taking'
[11] 'Я не могу... Оно настоящее.'
[12] 'Может быть, нужно просто смириться и жить с этим и делать то, что нужно делать.'
[13] 'Не говоря ни слова, они могут угрожать, умолять, манить, велеть...'
[14] 'Почему вы солгали нам о покупке бара у Джаффа Китсона, хотя продал его вам Эл Дж'
[15] 'No, no, no, no, no!'
[16] 'Она защищала тебя от р

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



=== encode() vs encode_batch_magic() consistency test ===
Comparing 20 texts...



  with amp_ctx():
  with torch.inference_mode(), amp_ctx():


[00] sim=1.000000 | text=I was just kind of like your stand-in.
[01] sim=1.000000 | text=И если они ещё продлятся, запасы двигателей на Западе закончатся к тому моменту, как ваши двигатели попадут на рынок.
[02] sim=1.000000 | text=There's no place like home.
[03] sim=1.000000 | text=Господи боже. обещает 60 человек.
[04] sim=1.000000 | text=Почему мне кажется, что это нужно только мне?
[05] sim=1.000000 | text=Confirm, assault team of two to the beach.
[06] sim=1.000000 | text=И поставил их жизнь под угрозу.
[07] sim=1.000000 | text=Да, на всякий случай, чтобы произвести хорошее впечатление, чтобы они подумали...
[08] sim=1.000000 | text=Он он сказал, что может подождать пока тебе не станет лучше.
[09] sim=1.000000 | text=Каре будет интересно где мы находимся.
[10] sim=1.000000 | text=Jeremiah got the top score on eight Advanced Placement tests without ever taking the Advanced Placement courses.
[11] sim=1.000000 | text=Я не могу... Оно настоящее.
[12] sim=1.000000 | text=Может быть, 

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000])

In [None]:
from math import ceil
from tqdm import tqdm
import torch
from transformers import GPT2TokenizerFast

encoder = Encoder()
batch_size = 512  # adjust if memory permits

# GPT-2 tokenizer setup
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
if tokenizer.pad_token is None:
	tokenizer.pad_token = tokenizer.eos_token

texts = []
ap_vectors = []
input_ids = []
attention_masks = []

# --------------------------------------------------
# 1. Load all texts
# --------------------------------------------------
with open('/content/en_ru.sentences.1M.txt', 'r', encoding='utf-8') as f:
	for line in tqdm(f, desc='loading'):
		t = line.strip()
		if t:
			texts.append(t)

# --------------------------------------------------
# 2. Batched AP encoding + GPT-2 tokenization
# --------------------------------------------------
num_batches = ceil(len(texts) / batch_size)

for i in tqdm(range(num_batches), desc='encoding AP + tokenizing'):
	batch = texts[i * batch_size : (i + 1) * batch_size]

	# ---- AP encoding ----
	ap_batch = encoder.encode_batch(batch)  # CPU tensors
	ap_vectors.extend(ap_batch)

	# ---- GPT-2 tokenization ----
	tok = tokenizer(
		batch,
		return_tensors='pt',
		padding='max_length',
		truncation=True,
		max_length=64
	)

	input_ids.append(tok['input_ids'])
	attention_masks.append(tok['attention_mask'])

# --------------------------------------------------
# 3. Stack everything
# --------------------------------------------------
aps    = torch.stack(ap_vectors)          # [N, 384]
ids    = torch.cat(input_ids, dim=0)      # [N, 40]
masks  = torch.cat(attention_masks, dim=0)  # [N, 40]

# --------------------------------------------------
# 4. Save dataset
# --------------------------------------------------
torch.save(
	{
		'aps': aps,
		'input_ids': ids,
		'attention_mask': masks
	},
	'dataset_ap.pt'
)

print('Saved dataset_ap.pt')


In [6]:
#==================================================
# SentenceDecoder — AP → Text Generator (v0.3)
# Freezes GPT-2 embeddings + lower transformer blocks,
# trains upper blocks + LM head + AP projection layer.
#==================================================

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm


class SentenceDataset(Dataset):
	def __init__(self, path):
		state = torch.load(path, map_location='cpu')

		self.aps   = state['aps']              # list[tensor]       shape [384]
		self.ids   = state['input_ids']        # list[tensor]       shape [max_len]
		self.masks = state['attention_mask']   # list[tensor]       shape [max_len]

	def __len__(self):
		return len(self.aps)

	def __getitem__(self, idx):
		return (
			self.aps[idx],     # AP vector
			self.ids[idx],     # token ids
			self.masks[idx]    # mask
		)




#==================================================
# SentenceDecoder
#==================================================

class SentenceDecoder(nn.Module):
	def __init__(self, ap_dim=384, model_name='gpt2', freeze_lower_k=10):
		super().__init__()

		self.gpt       = GPT2LMHeadModel.from_pretrained(model_name).to(device)
		self.tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

		# GPT-2 has no pad token → set pad = eos
		if self.tokenizer.pad_token is None:
			self.tokenizer.pad_token = self.tokenizer.eos_token
			self.gpt.config.pad_token_id = self.tokenizer.eos_token_id


		hidden = self.gpt.config.hidden_size
		self.proj = nn.Linear(ap_dim, hidden).to(device)


		#==================================================
		# FREEZE STRATEGY
		#==================================================

		# Freeze embeddings: token + positional
		self.gpt.transformer.wte.requires_grad_(False)
		self.gpt.transformer.wpe.requires_grad_(False)

		# Freeze lower K decoder blocks
		# Example: GPT-2 small has 12 layers → freeze 10 → train last 2
		for i, block in enumerate(self.gpt.transformer.h):
			if i < freeze_lower_k:
				for p in block.parameters():
					p.requires_grad = False

		# Keep upper blocks trainable (they remain unfrozen)
		# Train LM head

		for p in self.gpt.lm_head.parameters():
			p.requires_grad = True

		# Projection layer always trainable
		for p in self.proj.parameters():
			p.requires_grad = True

		self.state = {}

	#==================================================
	# LOAD / SAVE
	#==================================================

	def load(self, path):
		try:
			state = torch.load(path, map_location=device)
			self.load_state_dict(state['model'])
			self.state = state.get('meta', {})
			return True
		except:
			return False

	def save(self, path):
		torch.save(
			{
				'model': self.state_dict(),
				'meta':  self.state
			},
			path
		)
		return path

	#==================================================
	# INFERENCE
	#==================================================

	def decode(self, ap_vector, max_len=40, temperature=0.9, top_p=0.95):
		ap = Norm.to_hypercube(ap_vector).to(device)
		prefix = self.proj(ap).unsqueeze(0).unsqueeze(1)

		output = self.gpt.generate(
			inputs_embeds   = prefix,
			max_length      = max_len,
			do_sample       = True,
			temperature     = temperature,
			top_p           = top_p,
			pad_token_id    = self.tokenizer.eos_token_id
		)

		return self.tokenizer.decode(output[0], skip_special_tokens=True)

	#==================================================
	# TRAINING LOOP
	#==================================================

	def train_model(self, path, batch_size=16, lr=3e-5, epochs=1, accum_steps=2):
		dataset = SentenceDataset(path)
		loader  = DataLoader(dataset, batch_size=batch_size, shuffle=True)

		params = filter(lambda p: p.requires_grad, self.parameters())
		optim  = torch.optim.AdamW(params, lr=lr)

		total_steps = len(loader) * epochs
		scheduler = get_linear_schedule_with_warmup(
			optim,
			num_warmup_steps = int(0.05 * total_steps),
			num_training_steps = total_steps
		)

		amp_ctx = torch.cuda.amp.autocast if device.type == 'cuda' else contextlib.nullcontext
		scaler  = torch.cuda.amp.GradScaler() if device.type == 'cuda' else None

		self.train()
		step_counter = 0

		for epoch in range(epochs):
			progress = tqdm(loader, desc=f'Epoch {epoch+1}')

			for ap, ids, mask in progress:
				ap   = ap.to(device)
				ids  = ids.to(device)
				mask = mask.to(device)

				prefix = self.proj(ap).unsqueeze(1)
				inputs_embeds = self.gpt.transformer.wte(ids)
				inputs_embeds[:, 0, :] = prefix[:, 0, :]

				with amp_ctx():
					outputs = self.gpt(
						inputs_embeds = inputs_embeds,
						attention_mask = mask,
						labels = ids
					)
					loss = outputs.loss / accum_steps

				if scaler:
					scaler.scale(loss).backward()
				else:
					loss.backward()

				if (step_counter + 1) % accum_steps == 0:
					if scaler:
						scaler.step(optim)
						scaler.update()
					else:
						optim.step()

					optim.zero_grad()
					scheduler.step()

				step_counter += 1
				progress.set_postfix({'loss': f'{loss.item()*accum_steps:.4f}'})

			# --------------------------------------------------
			# SAVE AFTER EACH EPOCH
			# --------------------------------------------------
			save_path = f'sentence_decoder_epoch{epoch+1}.pt'
			torch.save(self.state_dict(), save_path)
			print(f'Epoch {epoch+1} saved → {save_path}')



In [7]:
print(device)

cuda


In [None]:
#==================================================
# CONFIG
#==================================================

dataset_path = '/content/dataset_ap.pt'


#--------------------------------------------------
# 2. Load SentenceDecoder
#--------------------------------------------------
model = SentenceDecoder(
	ap_dim = 384,
	model_name = 'gpt2',
	freeze_lower_k = 10
)

print('SentenceDecoder initialized.')


#--------------------------------------------------
# 3. Train
#--------------------------------------------------
print('Starting training...')

model.train_model(
	path       = dataset_path,   # ← dataset_ap.pt
	batch_size = 8,
	lr         = 3e-5,
	epochs     = 3
)

print('Training complete.')


#--------------------------------------------------
# 4. Save
#--------------------------------------------------
save_path = 'sentence_decoder.pt'
torch.save(model.state_dict(), save_path)

print(f'Model saved to {save_path}')


SentenceDecoder initialized.
Starting training...


  scaler  = torch.cuda.amp.GradScaler() if device.type == 'cuda' else None
  with amp_ctx():
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Epoch 1:   0%|          | 452/125000 [00:38<2:25:22, 14.28it/s, loss=6.3348]