In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm.notebook import tqdm
import random
import os

In [2]:
BATCH_SIZE=1

In [3]:
def seed_all(seed):
    random.seed(seed)
    os.environ['PYTHONDASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
seed_all(42)

In [4]:
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.code = data['code'].to_numpy()
        self.problem = data['label'].to_numpy()

    def __len__(self):
        return len(self.code)

    def __getitem__(self, idx):
        code1 = self.code[idx]
        standard = 500*self.problem[idx]
        r = np.random.random()
        #good
        if r < 0.5:
            tmp = np.random.randint(standard, standard + 500)
            code2 = self.code[tmp]
            label = 1
        #bad
        else:
            tmp = np.random.randint(standard + 500, len(self.code) + standard) % len(self.code)
            code2 = self.code[tmp]
            label = 0
        encoding = self.tokenizer(
            code1,
            code2,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
            padding='max_length'
        )

        return {'input_ids': encoding['input_ids'][0],
                'attention_mask': encoding['attention_mask'][0],
                #'token_type_ids' : encoding['token_type_ids'][0],
                'labels': torch.tensor(label, dtype=torch.long)}

In [5]:
model_name = "codistai/codeBERT-small-v2"

train_data = pd.read_csv('./all_code.csv')
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_dataset = NewsDataset(train_data, tokenizer, max_len=1024)

train_loader = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          pin_memory=True,
                          num_workers=4)

model = AutoModelForSequenceClassification.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at codistai/codeBERT-small-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model.zero_grad()
model.train()
for epoch in tqdm(range(0,5,1)):
    model.train()
    train_loss = 0
    acc = 0
    tmp_acc = 0
    cnt = 0
    print(f'\nepoch : {epoch+1}')
    for i, batch in tqdm(enumerate(train_loader),leave=False,total=len(train_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        acc += torch.sum(labels==preds)
        tmp_acc += torch.sum(labels==preds)
        cnt += torch.sum(preds)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        if i%500==499:
            print(tmp_acc.item()/ (500*BATCH_SIZE))
            model.save_pretrained(f"codesage-base-warmup-2048/cpp-{epoch}", from_pt=True)
            tmp_acc = 0
        optimizer.step()
        model.zero_grad()

  0%|          | 0/5 [00:00<?, ?it/s]


epoch : 1


  0%|          | 0/250000 [00:00<?, ?it/s]

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling cublasLtMatmul with transpose_mat1 1 transpose_mat2 0 m 1024 n 1024 k 1024 mat1_ld 1024 mat2_ld 1024 result_ld 1024 abcType 0 computeType 68 scaleType 0

In [4]:
model_name = "neulab/codebert-cpp"

test_data = pd.read_csv('./test.csv')

tokenizer = AutoTokenizer.from_pretrained(model_name)
test_dataset = NewsDataset(test_data, tokenizer, max_len=512)

test_loader = DataLoader(test_dataset, batch_size=220, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
model = AutoModelForSequenceClassification.from_pretrained("model_swip/cpp-2")
model.to(device)
model.eval()
with torch.no_grad():
    test_preds = []
    for i, batch in tqdm(enumerate(test_loader),total=len(test_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        test_preds.extend(preds.cpu().numpy())
submission = pd.DataFrame({'pair_id': test_data['pair_id'], 'similar': test_preds})
submission.to_csv('./submission.csv', index=False)

  0%|          | 0/2705 [00:00<?, ?it/s]

In [6]:
sum(pd.read_csv('submission.csv')['similar'])

256828