In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
from torch.nn import functional as F
from transformers import BertTokenizer, BertForMaskedLM
import torch


# Tokenizer and masked-LM head are loaded from the same pretrained checkpoint.
_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_checkpoint)
model = BertForMaskedLM.from_pretrained(_checkpoint)


In [None]:
# Mount Google Drive so the dataset and checkpoints under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the spreadsheet of training examples from the mounted drive.
df = pd.read_excel('/content/drive/MyDrive/Variable-Renaming/result.xlsx')

# Pull the "method" and "label" columns out as parallel arrays (same row order).
methods, labels = (df[column].values for column in ("method", "label"))

In [None]:
def tokenize_method(method, label, max_length=512):
  """Tokenize one example and attach its label ids for masked-LM training.

  Args:
    method: Input text, already containing the tokenizer's mask token.
    label: Target text; tokenized independently of `method`.
    max_length: Length both sequences are padded/truncated to (default 512,
      the value previously hard-coded).

  Returns:
    The tokenizer output for `method` (tensors with a leading batch dimension
    of 1) with an extra "labels" entry holding a copy of the label's input ids.
  """
  # Shared tokenizer settings so input and labels always end up the same shape.
  tokenizer_kwargs = dict(return_tensors='pt', max_length=max_length,
                          truncation=True, padding='max_length')
  method_tokens = tokenizer(method, **tokenizer_kwargs)
  label_tokens = tokenizer(label, **tokenizer_kwargs)
  # NOTE(review): labels are produced by tokenizing `label` on its own, so
  # position k of "labels" lines up with position k of the input only if
  # `label` is the full unmasked text - confirm against the spreadsheet.
  # NOTE(review): padded label positions keep the pad token id; presumably they
  # therefore count towards the loss (HF uses -100 as ignore index) - verify.
  method_tokens["labels"] = label_tokens.input_ids.detach().clone()
  return method_tokens

In [None]:
# Tokenize every example first, then concatenate once per field. Calling
# torch.cat inside the loop re-copies the whole accumulated tensor on every
# iteration, which is quadratic in the number of examples.
encoded_examples = []
for raw_method, raw_label in zip(methods, labels):
  method = raw_method.replace('[BLANK]', tokenizer.mask_token)
  encoded_examples.append(tokenize_method(method, raw_label))

# `inputs` stays None when there are no examples, as before.
inputs = None
if encoded_examples:
  # Reuse the first encoding as the container so `inputs` keeps the
  # tokenizer's output type (attribute access and .items() are used later).
  inputs = encoded_examples[0]
  for key in ("input_ids", "token_type_ids", "attention_mask", "labels"):
    # The list is built before the assignment, so the first example's
    # original row is still included.
    inputs[key] = torch.cat([example[key] for example in encoded_examples])


In [None]:
# Peek at the label ids stored for the first example.
inputs["labels"][0]

In [None]:
# Shape of the stacked inputs: (number of examples, padded sequence length).
inputs["input_ids"].shape

torch.Size([4074, 512])

In [None]:
class MeditationsDataset(torch.utils.data.Dataset):
    """Dataset view over a mapping of field name -> stacked per-example data.

    `encodings` may be the tokenizer output or a plain dict; it must have an
    "input_ids" entry, whose first dimension defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict with an independent tensor copy of every field at `idx`."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(existing_tensor) emits a copy-construct
                # UserWarning; detach().clone() is the supported way to copy.
                item[key] = row.detach().clone()
            else:
                # Non-tensor storage (e.g. nested lists) still gets converted.
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        # Key lookup works for both the tokenizer output and a plain dict,
        # whereas attribute access only works for the former.
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the stacked encodings so they can be indexed one example at a time.
dataset = MeditationsDataset(encodings=inputs)
# Serve shuffled mini-batches of 8 examples to the training loop.
loader = torch.utils.data.DataLoader(dataset=dataset, shuffle=True, batch_size=8)

In [None]:
# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Put the model's parameters on the chosen device.
model.to(device)
# Switch to training mode; as the cell's last expression the returned model is displayed.
model.train()

In [None]:
# Use PyTorch's AdamW: the implementation exported by `transformers` is
# deprecated and missing from recent releases, so importing it breaks with an
# unpinned install. eps and weight_decay are set explicitly to the values the
# transformers optimizer used by default, keeping the update rule unchanged.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [None]:
from tqdm import tqdm  # for our progress bar

epochs = 50

for epoch in range(epochs):
    # one progress bar per epoch over the shuffled batches
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # clear gradients left over from the previous step
        optim.zero_grad()
        # move the tensors needed for this step onto the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # kept under its own name so the dataset-level `labels` array read
        # from the spreadsheet is not overwritten by a batch tensor
        batch_labels = batch['labels'].to(device)
        # forward pass; passing labels makes the model return the loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=batch_labels)
        loss = outputs.loss
        # backpropagate and apply the parameter update
        loss.backward()
        optim.step()
        # show epoch number and current loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    # Checkpoint on epochs 0, 5, 10, ... - this needs `%`: `epoch / 5 == 0`
    # is only true for epoch 0, so later progress was never saved.
    if epoch % 5 == 0:
      torch.save(model, '/content/drive/MyDrive/Variable-Renaming/model.pt')

In [None]:
# Persist the whole model object (pickled module, not just weights) after training.
torch.save(model, '/content/drive/MyDrive/Variable-Renaming/model.pt')

In [None]:
# Also save only the parameter tensors (state dict) to a separate file.
torch.save(model.state_dict(), '/content/drive/MyDrive/Variable-Renaming/model_state_dict.pt')

In [None]:
# model.pt holds a fully pickled module, so it must be loaded with
# weights_only=False (newer PyTorch defaults to True and rejects it).
# Unpickling can execute arbitrary code - only do this for files we wrote.
# map_location puts the weights on the device selected earlier, so a
# checkpoint saved on GPU still loads on a CPU-only runtime.
model1 = torch.load('/content/drive/MyDrive/Variable-Renaming/model.pt',
                    map_location=device, weights_only=False)

In [None]:
# Sample method with [BLANK] placeholders where the variable name should go.
method = '''def action(user, reply, text):    [BLANK] = user.get_room_temp('rooms')    for (room_type, room_name) in [BLANK]:        loaded_room = roomloader.load_room(room_name, room_type, user)        if loaded_room.name == text or crypt(loaded_room.name) == text:            if random.random() < 0.1:                reply('Что-то пошло не так, ты увидел фезку пролетающую у тебя над головой. Ощущения будто был нарушен межпространственный континуум.')                user.open_room(reply)            else:                user.open_room(reply, room_type, room_name)            return    reply('Такого выбора тебе не давали.')'''
text = method.replace('[BLANK]', tokenizer.mask_token)
# Named `encoded` so the builtin `input` is not shadowed; truncation keeps
# over-long snippets within the 512-token limit used during training.
encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
# Send the inputs to whichever device the loaded model lives on instead of
# assuming CUDA. The name `input_cuda` is kept because later cells read it.
model_device = next(model1.parameters()).device
input_cuda = {
    key: encoded[key].to(model_device)
    for key in ('input_ids', 'token_type_ids', 'attention_mask')
}
# The checkpoint was saved from a model in training mode; switch to eval mode
# and disable gradient tracking so predictions are deterministic.
model1.eval()
with torch.no_grad():
    output = model1(**input_cuda)
logits = output.logits
# Per-position probability distribution over the vocabulary.
softmax = F.softmax(logits, dim=-1)


In [None]:
# Sequence positions (column indices) of every mask token in the input row
mask_token_indices = torch.where(input_cuda['input_ids'] == tokenizer.mask_token_id)[1]

# For each masked position, decode the 10 highest-scoring vocabulary tokens
top_predictions_per_mask = [
    tokenizer.convert_ids_to_tokens(
        torch.topk(logits[0, position, :], k=10, dim=-1).indices.tolist()
    )
    for position in mask_token_indices
]

# Print the ranked candidates, numbering the masks from 1
for mask_number, candidates in enumerate(top_predictions_per_mask, start=1):
    print(f"Mask {mask_number} top predictions: {candidates}")


Mask 1 top predictions: ['i', 'n', 'x', 'p', 's', 't', 'range', 'r', 'm', 'y']
Mask 2 top predictions: ['i', 'range', 'x', 'n', 's', 'y', 'r', 't', 'k', 'm']
Mask 3 top predictions: ['i', 'x', 'range', 'r', 'p', 't', 'n', 'y', 'data', 's']


In [None]:
# Mask positions are looked up for reference; the loop below deliberately
# inspects the leading sequence positions instead of only the masked ones.
mask_token_indices = torch.where(input_cuda['input_ids'] == tokenizer.mask_token_id)[1]

# Look at up to the first 30 positions, clamped to the actual sequence length
# so inputs shorter than 30 tokens do not raise an IndexError.
num_positions = min(30, logits.shape[1])

top_predictions_per_mask = []
for position in range(num_positions):
    # Vocabulary scores at this sequence position
    position_logits = logits[0, position, :]

    # Keep the 5 highest-scoring candidate tokens
    top_5_candidates = torch.topk(position_logits, k=5, dim=-1)

    # Convert the predicted token IDs to the respective words
    predicted_token_ids = top_5_candidates.indices.tolist()
    predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

    top_predictions_per_mask.append(predicted_tokens)

# These are plain sequence positions (0-based), not mask numbers, so label
# them accordingly.
for position, predictions in enumerate(top_predictions_per_mask):
    print(f"Position {position} top predictions: {predictions}")

In [None]:
# Shape of the probability tensor: (batch, sequence length, vocabulary size).
softmax.shape

torch.Size([1, 318, 30522])

In [None]:
# Find the top tokens predicted for the mask positions
mask_positions = (input_cuda['input_ids'] == tokenizer.mask_token_id).nonzero(as_tuple=True)

# Get the top 5 predictions for the mask positions
predicted_token_ids = softmax[mask_positions].topk(5).indices.squeeze()

In [None]:
# Depending on the number of mask tokens and resulting shape, handle accordingly
predicted_tokens = []
for idx, position in enumerate(mask_positions[1]):
    if predicted_token_ids.ndim == 1:  # Single mask token in the input
        top_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)
    else:  # More than one mask token in the input
        top_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids[:, idx])
    predicted_tokens.append(top_tokens)

# predicted_tokens now contains the top 5 predictions for each mask position
print(predicted_tokens)

[['path', 'if'], ['s', ',']]


In [None]:
import numpy as np
import pandas as pd
from torch.nn import functional as F
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Start again from the pretrained checkpoint. This rebinds `model`, so the
# weights fine-tuned by earlier cells are no longer referenced by that name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
# activate training mode
model.train()

# Use PyTorch's AdamW: the implementation exported by `transformers` is
# deprecated and missing from recent releases. eps and weight_decay are set to
# the values the transformers optimizer used by default.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

from tqdm import tqdm  # for our progress bar

epochs = 50

for epoch in range(epochs):
    # one progress bar per epoch over the shuffled batches
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # clear gradients left over from the previous step
        optim.zero_grad()
        # move the tensors needed for this step onto the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # kept under its own name so the dataset-level `labels` array read
        # from the spreadsheet is not overwritten by a batch tensor
        batch_labels = batch['labels'].to(device)
        # forward pass; passing labels makes the model return the loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=batch_labels)
        loss = outputs.loss
        # backpropagate and apply the parameter update
        loss.backward()
        optim.step()
        # show epoch number and current loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    # Checkpoint on epochs 0, 5, 10, ... - this needs `%`: `epoch / 5 == 0`
    # is only true for epoch 0, so later progress was never saved.
    if epoch % 5 == 0:
      torch.save(model, '/content/drive/MyDrive/Variable-Renaming/model.pt')