In [1]:
from encoder_layer import Encoder_block
from positional_encoding import Positional_Encoding
from datasets import load_from_disk
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding

torch.Size([1, 3, 512])


In [2]:
dataset = load_from_disk("kp20k_local")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata'],
        num_rows: 530809
    })
    test: Dataset({
        features: ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata'],
        num_rows: 20000
    })
})

In [4]:
print("Sample from training dataset split")
train_sample = dataset["train"][0]
print("Fields in the sample: ", [key for key in train_sample.keys()])

Sample from training dataset split
Fields in the sample:  ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata']


In [5]:
print("Tokenized Document: ", train_sample["document"])
print("Document BIO Tags: ", train_sample["doc_bio_tags"])

Tokenized Document:  ['virtually', 'enhancing', 'the', 'perception', 'of', 'user', 'actions', 'This', 'paper', 'proposes', 'using', 'virtual', 'reality', 'to', 'enhance', 'the', 'perception', 'of', 'actions', 'by', 'distant', 'users', 'on', 'a', 'shared', 'application.', 'Here,', 'distance', 'may', 'refer', 'either', 'to', 'space', '(', 'e.g.', 'in', 'a', 'remote', 'synchronous', 'collaboration)', 'or', 'time', '(', 'e.g.', 'during', 'playback', 'of', 'recorded', 'actions).', 'Our', 'approach', 'consists', 'in', 'immersing', 'the', 'application', 'in', 'a', 'virtual', 'inhabited', '3D', 'space', 'and', 'mimicking', 'user', 'actions', 'by', 'animating', 'avatars.', 'We', 'illustrate', 'this', 'approach', 'with', 'two', 'applications,', 'the', 'one', 'for', 'remote', 'collaboration', 'on', 'a', 'shared', 'application', 'and', 'the', 'other', 'to', 'playback', 'recorded', 'sequences', 'of', 'user', 'actions.', 'We', 'suggest', 'this', 'could', 'be', 'a', 'low', 'cost', 'enhancement', 'for

In [6]:
print("Extractive/present Keyphrases: ", train_sample["extractive_keyphrases"])
print("Abstractive/absent Keyphrases: ", train_sample["abstractive_keyphrases"])

Extractive/present Keyphrases:  ['telepresence', 'avatars']
Abstractive/absent Keyphrases:  ['animation', 'application sharing', 'collaborative virtual environments']


In [7]:
# unique_tags = set()

# for sample in dataset["train"]:
#     unique_tags.update(sample["doc_bio_tags"])

# print("Unique BIO Tags:", unique_tags)
# Unique BIO Tags: {'B', 'I', 'O'}

In [8]:
print("\n-----------\n")


-----------



In [9]:
# sample from the validation split
print("Sample from validation dataset split")
validation_sample = dataset["validation"][0]
print("Fields in the sample: ", [key for key in validation_sample.keys()])

Sample from validation dataset split
Fields in the sample:  ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata']


In [10]:
print("Tokenized Document: ", validation_sample["document"])
print("Document BIO Tags: ", validation_sample["doc_bio_tags"])
print("Extractive/present Keyphrases: ", validation_sample["extractive_keyphrases"])
print("Abstractive/absent Keyphrases: ", validation_sample["abstractive_keyphrases"])
print("\n-----------\n")

Tokenized Document:  ['Real-Time', 'Data', 'Aggregation', 'in', 'Contention-Based', 'Wireless', 'Sensor', 'Networks', 'We', 'investigate', 'the', 'problem', 'of', 'delay', 'constrained', 'maximal', 'information', 'collection', 'for', 'CSMA-based', 'wireless', 'sensor', 'networks.', 'We', 'study', 'how', 'to', 'allocate', 'the', 'maximal', 'allowable', 'transmission', 'delay', 'at', 'each', 'node,', 'such', 'that', 'the', 'amount', 'of', 'information', 'collected', 'at', 'the', 'sink', 'is', 'maximized', 'and', 'the', 'total', 'delay', 'for', 'the', 'data', 'aggregation', 'is', 'within', 'the', 'given', 'bound.', 'We', 'formulate', 'the', 'problem', 'by', 'using', 'dynamic', 'programming', 'and', 'propose', 'an', 'optimal', 'algorithm', 'for', 'the', 'optimal', 'assignment', 'of', 'transmission', 'attempts.', 'Based', 'on', 'the', 'analysis', 'of', 'the', 'optimal', 'solution,', 'we', 'propose', 'a', 'distributed', 'greedy', 'algorithm.', 'It', 'is', 'shown', 'to', 'have', 'a', 'similar

In [11]:
print("Train rows:", dataset["train"].num_rows)
print("Validation rows:", dataset["validation"].num_rows)
print("Test rows:", dataset["test"].num_rows)

Train rows: 530809
Validation rows: 20000
Test rows: 20000


In [12]:
print("Columns:", dataset["train"].column_names)

Columns: ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata']


In [13]:
print(dataset["train"].data.nbytes / (1024**2), "MB")

1253.6750535964966 MB


In [9]:
from transformers import AutoTokenizer

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [12]:
def tokenize_batch(batch):

    # convert list of tokens to string
    texts = [" ".join(tokens) for tokens in batch["document"]]
    
    tokenized = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=256
    )

    # add word_ids manually (for each sample)
    tokenized["word_ids"] = [
        tokenized.word_ids(i) for i in range(len(texts))
    ]

    return tokenized

In [17]:
def align_tags_with_tokens(tags, word_ids):
    new_tags = []
    previous_word = None
    
    for word_id in word_ids:
        if word_id is None:
            new_tags.append(-100)
        elif word_id >= len(tags):   # <-- IMPORTANT FIX
            new_tags.append(-100)
        else:
            tag = tags[word_id]

            # convert B → I for subwords
            if word_id == previous_word and tag == 1:
                tag = 2

            new_tags.append(tag)

        previous_word = word_id

    return new_tags

In [18]:
tokenized_dataset = dataset.map(
    tokenize_batch,
    batched=True,
    num_proc=1
)

tag2id = {"O": 0, "B": 1, "I": 2}

def align_batch(batch):

    aligned = []
    
    for tags, word_ids in zip(batch["doc_bio_tags"], batch["word_ids"]):

        # convert "O","B","I" → 0,1,2
        tags = [tag2id[t] for t in tags]

        aligned_tags = align_tags_with_tokens(tags, word_ids)
        aligned.append(aligned_tags)

    batch["labels"] = aligned
    return batch

tokenized_dataset = tokenized_dataset.map(
    align_batch,
    batched=True,
    num_proc=1
)

In [19]:
sample = tokenized_dataset["train"][0]

print("len(input_ids):", len(sample["input_ids"]))
print("len(labels):", len(sample["labels"]))
print(sample["labels"][:50])

len(input_ids): 256
len(labels): 256
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [20]:
len(tokenized_dataset)

3

In [21]:
print(tokenized_dataset["train"].column_names)

['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels']


In [22]:
# for batch in train_loader:
#     print(batch.keys())
#     print(batch['input_ids'].shape)
#     print(batch['attention_mask'].shape)
#     break

In [23]:
sample = tokenized_dataset["train"][0]  # first example

print("Columns and sample data types or values:")
for col in sample.keys():
    print(f"{col}: type = {type(sample[col])}, sample value = {sample[col]}")

Columns and sample data types or values:
id: type = <class 'NoneType'>, sample value = None
document: type = <class 'list'>, sample value = ['virtually', 'enhancing', 'the', 'perception', 'of', 'user', 'actions', 'This', 'paper', 'proposes', 'using', 'virtual', 'reality', 'to', 'enhance', 'the', 'perception', 'of', 'actions', 'by', 'distant', 'users', 'on', 'a', 'shared', 'application.', 'Here,', 'distance', 'may', 'refer', 'either', 'to', 'space', '(', 'e.g.', 'in', 'a', 'remote', 'synchronous', 'collaboration)', 'or', 'time', '(', 'e.g.', 'during', 'playback', 'of', 'recorded', 'actions).', 'Our', 'approach', 'consists', 'in', 'immersing', 'the', 'application', 'in', 'a', 'virtual', 'inhabited', '3D', 'space', 'and', 'mimicking', 'user', 'actions', 'by', 'animating', 'avatars.', 'We', 'illustrate', 'this', 'approach', 'with', 'two', 'applications,', 'the', 'one', 'for', 'remote', 'collaboration', 'on', 'a', 'shared', 'application', 'and', 'the', 'other', 'to', 'playback', 'recorded',

In [24]:
columns_to_keep = ["input_ids", "token_type_ids", "attention_mask", "labels"]

tokenized_dataset_new = tokenized_dataset.remove_columns(
    [col for col in tokenized_dataset["train"].column_names if col not in columns_to_keep]
)

In [25]:
tokenized_dataset_new

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 530809
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
})

In [26]:
len(tokenized_dataset_new["train"][0]["input_ids"])

256

In [27]:
validation_sample = dataset["validation"][0]
len(validation_sample["doc_bio_tags"])

107

In [28]:
# len(tokenized_dataset_new["train"][0]["doc_bio_tags"])

In [29]:
len(tokenized_dataset["train"][0]["doc_bio_tags"])

106

In [30]:
len(tokenized_dataset_new["train"][0]["token_type_ids"])

256

In [31]:
len(tokenized_dataset_new["train"][0]["attention_mask"])

256

In [32]:
tokenized_dataset["train"]

Dataset({
    features: ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 530809
})

In [33]:
len(tokenized_dataset["train"])

530809

In [34]:
example = tokenized_dataset["train"][0]
len(example["input_ids"])

256

In [35]:
tokenized_dataset["train"][0]["input_ids"]

[101,
 8990,
 20226,
 1996,
 10617,
 1997,
 5310,
 4506,
 2023,
 3259,
 17146,
 2478,
 7484,
 4507,
 2000,
 11598,
 1996,
 10617,
 1997,
 4506,
 2011,
 6802,
 5198,
 2006,
 1037,
 4207,
 4646,
 1012,
 2182,
 1010,
 3292,
 2089,
 6523,
 2593,
 2000,
 2686,
 1006,
 1041,
 1012,
 1043,
 1012,
 1999,
 1037,
 6556,
 26351,
 8093,
 17175,
 2271,
 5792,
 1007,
 2030,
 2051,
 1006,
 1041,
 1012,
 1043,
 1012,
 2076,
 18245,
 1997,
 2680,
 4506,
 1007,
 1012,
 2256,
 3921,
 3774,
 1999,
 10047,
 16862,
 2075,
 1996,
 4646,
 1999,
 1037,
 7484,
 9613,
 7605,
 2686,
 1998,
 23150,
 6834,
 5310,
 4506,
 2011,
 2019,
 22835,
 22128,
 2015,
 1012,
 2057,
 19141,
 2023,
 3921,
 2007,
 2048,
 5097,
 1010,
 1996,
 2028,
 2005,
 6556,
 5792,
 2006,
 1037,
 4207,
 4646,
 1998,
 1996,
 2060,
 2000,
 18245,
 2680,
 10071,
 1997,
 5310,
 4506,
 1012,
 2057,
 6592,
 2023,
 2071,
 2022,
 1037,
 2659,
 3465,
 22415,
 2005,
 10093,
 13699,
 6072,
 10127,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [36]:
def extract_keywords(tokens, words, encoder, prepare_encoder_input, top_k=2):
    """
    tokens: list of token ids (ints)
    words: list of corresponding words (strings)
    encoder: your Encoder_block object (already initialized)
    prepare_encoder_input: function jo tokens ko encoder input me convert karta hai
    top_k: kitne keywords chahiye
    
    Returns:
        keywords: list of top-k keywords extracted using attention weights
    """
    
    # Step 1: Prepare input tensor for encoder
    x = prepare_encoder_input(tokens)  # assume ye function tensor return karta hai
    
    # Step 2: Forward pass through encoder
    output, attn = encoder(x)
    
    # Step 3: Average attention weights across heads
    attn_avg = attn.mean(dim=0)   # shape: (seq_len, seq_len)
    
    # Step 4: Take attention for CLS token or first token (index 0)
    attn_avg = attn_avg[0]        # shape: (seq_len,)
    
    # Step 5: Average attention across source positions to get word importance
    word_importance = attn_avg.mean(dim=0)  # shape: (seq_len,)
    
    # Step 6: Pick top-k words with highest importance
    values, indices = torch.topk(word_importance, k=top_k)
    indices = indices.tolist()
    
    # Step 7: Convert indices to actual words
    keywords = [words[i] for i in indices]
    
    return keywords

In [37]:
d_model = 512  # main model dimension
num_heads = 8  # number of heads
d_ff = 2048    # feedforward hidden dimension
seq_len = 128  # max input length
vocab_size = 30000
embedding_layer = nn.Embedding(vocab_size, d_model)
encoder_layer = Encoder_block(d_model=512, d_ff=2048, num_heads=8)
pos_encoding = Positional_Encoding(seq_len, d_model)

In [38]:
import torch
def prepare_encoder_input(token_ids):
    token_ids = torch.tensor(token_ids).unsqueeze(0)  # (1, seq_len)

    # 1. Convert token IDs → learned embeddings
    x = embedding_layer(token_ids)                      # (1, seq_len, d_model)

    # 2. Add sinusoidal positional encoding
    x = pos_encoding(x)                                 # (1, seq_len, d_model)

    return x

In [39]:
# input_tensor = prepare_encoder_input(tokens)

In [40]:
tokens = [1542, 1542, 1542]  # "I love you"
words = ["I", "love", "you"]

encoder_layer = Encoder_block(d_model=512, d_ff=2048, num_heads=8)
# prepare_encoder_input() tumhara input prepare karne wala function hai

keywords = extract_keywords(tokens, words, encoder_layer, prepare_encoder_input, top_k=2)
print("Extracted keywords:", keywords)

Extracted keywords: ['I', 'love']


In [41]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [42]:
def custom_collate(batch):
    # 1. Let HF collator pad input_ids, attention_mask, etc.
    batch_encoded = data_collator(batch)

    # 2. Pad labels to max batch length
    max_len = batch_encoded["input_ids"].shape[1]

    padded_labels = []
    for item in batch:
        labels = item["labels"]
        # pad using -100 → ignored in loss
        padded = labels + [-100] * (max_len - len(labels))
        padded_labels.append(padded)

    batch_encoded["labels"] = torch.tensor(padded_labels)

    return batch_encoded

In [56]:
# train_loader = DataLoader(
#     tokenized_dataset_new["train"],
#     batch_size=8,
#     shuffle=True,
#     collate_fn=data_collator,
# )
train_loader = DataLoader(
    tokenized_dataset_new["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=4  # adjust based on your CPU cores
)

In [57]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x20e2dfe7980>

In [58]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Encoder_block(d_model=512, d_ff=2048, num_heads=8)
model = model.to(device)  # agar GPU hai to

In [59]:
d_model = 512
num_classes = 3  # B, I, O
# Define classifier on top of encoder output
classifier = nn.Linear(d_model, num_classes).to(device)

In [60]:
model

Encoder_block(
  (ffn): feedforward(
    (fc1): Linear(in_features=512, out_features=2048, bias=True)
    (relu): ReLU()
    (fc2): Linear(in_features=2048, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (multi_att): MultiHeadAttention(
    (Q): Linear(in_features=512, out_features=512, bias=True)
    (K): Linear(in_features=512, out_features=512, bias=True)
    (V): Linear(in_features=512, out_features=512, bias=True)
    (fc_out): Linear(in_features=512, out_features=512, bias=True)
  )
  (norm_layer1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm_layer2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [61]:
weights = torch.tensor([0.1, 1.0, 1.2]).to(device)   # O, B, I
criterion  = nn.CrossEntropyLoss(weight=weights)

In [62]:
# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# Optimizer over both encoder and classifier params
optimizer = torch.optim.AdamW(list(model.parameters()) + list(classifier.parameters()), lr=5e-5)

In [63]:
# for batch in train_loader:
#     print(batch.keys())  # Batch ke andar available keys dikha dega
    
#     if 'input_ids' in batch:
#         print("input_ids present")
#     else:
#         print("input_ids NOT present")
        
#     if 'attention_mask' in batch:
#         print("attention_mask present")
#     else:
#         print("attention_mask NOT present")
    
#     # Bas pehla batch hi check karna hai toh loop break kar do
#     break

In [64]:
# for batch in train_loader:
#     for k, v in batch.items():
#         print(k, type(v))
#     break

In [17]:
d_model = 512  # main model dimension
num_heads = 8  # number of heads
d_ff = 2048    # feedforward hidden dimension
seq_len = 256  # max input length
vocab_size = 30000
embedding_layer = nn.Embedding(vocab_size, d_model).to(device)
encoder_layer = Encoder_block(d_model=512, d_ff=2048, num_heads=8).to(device)
pos_encoding = Positional_Encoding(seq_len, d_model).to(device)

def prepare_batch_encoder_input(input_ids):
    # input_ids = (batch_size, seq_len) already tensor
    x = embedding_layer(input_ids)      # (batch, seq_len, d_model)
    x = pos_encoding(x)                 # (batch, seq_len, d_model)
    return x

In [66]:
print(tokenized_dataset["train"].column_names)
print(tokenized_dataset_new["train"].column_names)

['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels']
['input_ids', 'token_type_ids', 'attention_mask', 'labels']


In [67]:
# from tqdm import tqdm

# num_epochs = 5

# model.train()
# classifier.train()

# for epoch in range(num_epochs):
#     total_loss = 0
    
#     # tqdm wrapper for train_loader with a description
#     loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
#     for batch in loop:
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)  # (B, L)

#         optimizer.zero_grad()

#         x = prepare_batch_encoder_input(input_ids)  # (B, L, d_model)
        
#         encoder_output, attn_weights = model(x, mask=None)  # (B, L, d_model)
        
#         logits = classifier(encoder_output)  # (B, L, 3)
        
#         loss = criterion(
#             logits.view(-1, 3),  # (B*L, 3)
#             labels.view(-1)      # (B*L)
#         )
        
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#         # Update tqdm postfix to show loss dynamically
#         loop.set_postfix(loss=total_loss / (loop.n + 1))

In [68]:
import logging
import os
from tqdm import tqdm

# Create a logs directory if it doesn't exist
os.makedirs("logs", exist_ok=True)

logging.basicConfig(
    filename='logs/training.log', 
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

num_epochs = 5

model.train()
classifier.train()

checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(num_epochs):
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)  # (B, L)

        optimizer.zero_grad()

        x = prepare_batch_encoder_input(input_ids)  # (B, L, d_model)

        encoder_output, attn_weights = model(x, mask=None)  # (B, L, d_model)

        logits = classifier(encoder_output)  # (B, L, 3)

        loss = criterion(
            logits.view(-1, 3),  # (B*L, 3)
            labels.view(-1)      # (B*L)
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=total_loss / (loop.n + 1))
    
    avg_loss = total_loss / len(train_loader)
    
    # Logging average loss after each epoch
    logging.info(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
    print(f"Epoch [{epoch+1}/{num_epochs}] completed. Average Loss: {avg_loss:.4f}")

    # Save checkpoint after each epoch
    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch+1}.pt")
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'classifier_state_dict': classifier.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_loss,
    }, checkpoint_path)
    
    logging.info(f"Checkpoint saved at {checkpoint_path}")

Epoch 1/5: 100%|█████████████████████████████████████████████████████| 66352/66352 [28:30<00:00, 38.79it/s, loss=0.779]


Epoch [1/5] completed. Average Loss: 0.7794


Epoch 2/5: 100%|█████████████████████████████████████████████████████| 66352/66352 [28:52<00:00, 38.31it/s, loss=0.737]


Epoch [2/5] completed. Average Loss: 0.7370


Epoch 3/5: 100%|██████████████████████████████████████████████████████| 66352/66352 [28:22<00:00, 38.98it/s, loss=0.72]


Epoch [3/5] completed. Average Loss: 0.7202


Epoch 4/5: 100%|██████████████████████████████████████████████████████| 66352/66352 [28:23<00:00, 38.94it/s, loss=0.71]


Epoch [4/5] completed. Average Loss: 0.7098


Epoch 5/5: 100%|█████████████████████████████████████████████████████| 66352/66352 [28:16<00:00, 39.10it/s, loss=0.702]

Epoch [5/5] completed. Average Loss: 0.7019





In [None]:
print(dataset["train"][0]["document"])
print(type(dataset["train"][0]["document"]))

In [None]:
train_texts

In [23]:
print(features.shape)

torch.Size([1, 256, 512])


In [24]:
def tokens_to_words(input_ids):
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    return tokens

In [54]:
def extract_keywords_better(input_ids, attn_weights, top_k=5):
    
    # CASE 1 — attn_weights is a list of tensors (layers × heads)
    if isinstance(attn_weights, list):
        attn_stack = torch.stack(attn_weights)   # (L, H, S, S)
        cls_scores = attn_stack[:, :, 0, :].mean(dim=(0, 1))  # avg layers+heads
    
    # CASE 2 — attn_weights is a single tensor (heads × S × S)
    else:
        # attn_weights = (H, S, S)
        cls_scores = attn_weights[:, 0, :].mean(dim=0)  # avg heads

    # Remove CLS token (index 0)
    token_scores = cls_scores[1:]  # first token is CLS

    # Get top tokens
    top_indices = torch.topk(token_scores, top_k).indices

    # Get wordpiece IDs → convert to tokens
    top_token_ids = input_ids[0][1:][top_indices]   # skip CLS
    
    top_tokens = tokenizer.convert_ids_to_tokens(top_token_ids.tolist())
    return top_tokens

In [55]:
encoder_output, attn_weights, input_ids = encode_text("Transformers are amazing.")

keywords = extract_keywords(input_ids, attn_weights, top_k=4)

print("Extracted Keywords:", keywords)

Extracted Keywords: ['are', 'transformers', '.', 'amazing']


In [56]:
STOPWORDS = {"is", "are", "the", "a", "an", "of", "to", "and"}

def filter_keywords(words):
    return [w for w in words if w.lower() not in STOPWORDS]

In [57]:
keywords = extract_keywords(input_ids, attn_weights, top_k=4)
# keywords = merge_subwords(keywords)
keywords = filter_keywords(keywords)[:5]  # pick top 2 after filtering
print(keywords)

['transformers', '.', 'amazing']


In [58]:
import nltk
nltk.download("averaged_perceptron_tagger_eng")

from nltk import pos_tag

def prefer_nouns_adjectives(words):
    tags = pos_tag(words, tagset="universal")  # works in new NLTK
    ranked = sorted(tags, key=lambda x: 0 if x[1] in ("NOUN","ADJ") else 1)
    return [w for w, t in ranked]

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [59]:
keywords = extract_keywords_better(input_ids, attn_weights, top_k=10)
keywords = merge_subwords(keywords)
keywords = filter_stopwords(keywords)
keywords[:3]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [48]:
def extract_keywords_better(input_ids, attn_weights, top_k=3):
    # attn_weights = [num_layers, num_heads, seq, seq]
    attn_stack = torch.stack(attn_weights)         # combine layers
    cls_scores = attn_stack[:, :, 0, :].mean(dim=(0,1))  # avg across layers + heads
    
    token_ids = input_ids[0][1:]                # skip CLS
    cls_scores = cls_scores[1:]                 # skip CLS

    topk = torch.topk(cls_scores, k=top_k)
    keywords = [tokenizer.convert_ids_to_tokens(token_ids[i].item())
                for i in topk.indices.tolist()]
    return keywords

In [51]:
keywords = extract_keywords_better(input_ids, attn_weights, top_k=10)
keywords = merge_subwords(keywords)
keywords = filter_stopwords(keywords)
keywords = keywords[:2]

TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor