In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm.notebook import tqdm
import random
import os

In [2]:
BATCH_SIZE = 11

In [3]:
def seed_all(seed):
    random.seed(seed)
    os.environ['PYTHONDASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
seed_all(42)

In [4]:
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.code = data['code'].to_numpy()
        self.problem = data['label'].to_numpy()

    def __len__(self):
        return len(self.code)

    def __getitem__(self, idx):
        code1 = self.code[idx]
        standard = 500*self.problem[idx]
        r = np.random.random()
        #good
        if r < 0.5:
            tmp = np.random.randint(standard, standard + 500)
            code2 = self.code[tmp]
            label = 1
        #bad
        else:
            tmp = np.random.randint(standard + 500, len(self.code) + standard) % len(self.code)
            code2 = self.code[tmp]
            label = 0
        encoding = self.tokenizer(
            code1,
            code2,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
            padding='max_length'
        )

        return {'input_ids': encoding['input_ids'][0],
                'attention_mask': encoding['attention_mask'][0],
                #'token_type_ids' : encoding['token_type_ids'][0],
                'labels': torch.tensor(label, dtype=torch.long)}

In [5]:
#model_name = "neulab/codebert-cpp"
#model_name = 'neulab/codebert-cpp'
model_name = 'microsoft/graphcodebert-base'
train_data = pd.read_csv('./all_code.csv')

tokenizer = AutoTokenizer.from_pretrained(model_name)
train_dataset = NewsDataset(train_data, tokenizer, max_len=512)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True, num_workers=4)

model = AutoModelForSequenceClassification.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
for epoch in tqdm(range(0,15,1)):
    model.train()
    train_loss = 0
    acc = 0
    tmp_acc = 0
    cnt = 0
    print(f'\nepoch : {epoch+1}')
    for i, batch in tqdm(enumerate(train_loader),leave=False,total=len(train_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        acc += torch.sum(labels==preds)
        tmp_acc += torch.sum(labels==preds)
        cnt += torch.sum(preds)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        if i%500==499:
            print(tmp_acc.item()/ (500*BATCH_SIZE))
            tmp_acc = 0
        #optimizer.step()
        optimizer.zero_grad()
    model.save_pretrained(f"model_infinite_gcbc_no_zero_grad/cpp-{epoch}", from_pt=True)
    print(f'train_loss : {train_loss}\nacc = {acc / len(train_data)}\ncount = {cnt}')

  0%|          | 0/8 [00:00<?, ?it/s]


epoch : 6


  0%|          | 0/22728 [00:00<?, ?it/s]

0.8478181818181818
0.8474545454545455
0.8414545454545455
0.8418181818181818
0.8465454545454546
0.8392727272727273
0.8456363636363636
0.8445454545454546
0.8358181818181818
0.8445454545454546
0.8483636363636363
0.8434545454545455
0.8418181818181818
0.852909090909091
0.8429090909090909
0.8381818181818181
0.8416363636363636
0.8456363636363636
0.8490909090909091
0.8445454545454546
0.848909090909091
0.8421818181818181
0.8450909090909091
0.8458181818181818
0.8438181818181818
0.8467272727272728
0.8461818181818181
0.8483636363636363
0.8467272727272728
0.8456363636363636
0.8514545454545455
0.8429090909090909
0.8394545454545455
0.8503636363636363
0.848
0.8443636363636363
0.8534545454545455
0.8458181818181818
0.8521818181818182
0.8449090909090909
0.8434545454545455
0.8505454545454545
0.8412727272727273
0.8536363636363636
0.8481818181818181
train_loss : 5932.479431619344
acc = 0.8456799983978271
count = 139817

epoch : 7


  0%|          | 0/22728 [00:00<?, ?it/s]

0.8510909090909091
0.8523636363636363
0.8441818181818181
0.846909090909091
0.8450909090909091
0.8538181818181818
0.8396363636363636
0.8532727272727273
0.8378181818181818
0.8490909090909091
0.8485454545454545
0.8443636363636363
0.8472727272727273
0.8552727272727273
0.8429090909090909
0.8481818181818181
0.8410909090909091
0.8389090909090909
0.8454545454545455
0.8518181818181818
0.854909090909091
0.8436363636363636
0.8458181818181818
0.8523636363636363
0.8567272727272728
0.8450909090909091
0.8427272727272728
0.8389090909090909
0.848909090909091
0.854909090909091
0.8472727272727273
0.8512727272727273
0.8550909090909091
0.8494545454545455
0.850909090909091
0.8550909090909091
0.8516363636363636
0.8476363636363636
0.8394545454545455
0.8483636363636363
0.8472727272727273
0.848
0.8547272727272728
0.8449090909090909
0.8490909090909091
train_loss : 5875.959559364535
acc = 0.8480839729309082
count = 139854

epoch : 8


  0%|          | 0/22728 [00:00<?, ?it/s]

0.8452727272727273
0.8501818181818181
0.8454545454545455
0.8512727272727273
0.8447272727272728
0.8505454545454545
0.8547272727272728
0.8476363636363636
0.8478181818181818
0.8570909090909091
0.8505454545454545
0.8510909090909091
0.8556363636363636
0.8596363636363636
0.8538181818181818
0.850909090909091
0.8538181818181818
0.8558181818181818
0.8476363636363636
0.8478181818181818
0.8450909090909091
0.8550909090909091
0.8485454545454545
0.8605454545454545
0.852
0.8563636363636363
0.8438181818181818
0.8621818181818182
0.8478181818181818
0.8538181818181818
0.8594545454545455
0.8521818181818182
0.8578181818181818
0.8565454545454545
0.8578181818181818
0.85
0.8478181818181818
0.8634545454545455
0.8527272727272728
0.8503636363636363
0.8474545454545455
0.8567272727272728
0.852
0.8550909090909091
0.8505454545454545
train_loss : 5774.784787448414
acc = 0.8523600101470947
count = 140315

epoch : 9


  0%|          | 0/22728 [00:00<?, ?it/s]

0.8465454545454546
0.8550909090909091
0.8592727272727273
0.8467272727272728
0.854909090909091
0.8585454545454545
0.8478181818181818
0.8634545454545455
0.854
0.8556363636363636
0.8587272727272727
0.8630909090909091
0.8554545454545455
0.852
0.848
0.8476363636363636
0.8581818181818182
0.8530909090909091
0.8545454545454545
0.8612727272727273
0.8554545454545455
0.8603636363636363
0.8550909090909091
0.8570909090909091
0.8587272727272727
0.860909090909091
0.8552727272727273
0.8574545454545455
0.8514545454545455
0.854909090909091
0.8578181818181818
0.8612727272727273
0.8612727272727273
0.8570909090909091
0.850909090909091
0.8532727272727273
0.8634545454545455
0.8621818181818182
0.8543636363636363
0.8576363636363636
0.8545454545454545
0.8550909090909091
0.864
0.8570909090909091
0.850909090909091
train_loss : 5666.577482101886
acc = 0.8560879826545715
count = 141278

epoch : 10


  0%|          | 0/22728 [00:00<?, ?it/s]

0.856
0.8521818181818182
0.8645454545454545
0.8538181818181818
0.8567272727272728
0.858909090909091
0.8576363636363636
0.8563636363636363
0.8578181818181818
0.8596363636363636
0.8627272727272727
0.8614545454545455
0.8590909090909091
0.8694545454545455
0.864909090909091
0.856909090909091
0.8610909090909091
0.848
0.8567272727272728
0.8607272727272727
0.858909090909091
0.8625454545454545
0.8536363636363636
0.8652727272727273
0.8587272727272727
0.8592727272727273
0.860909090909091
0.8687272727272727
0.858
0.8621818181818182
0.862
0.8563636363636363
0.8592727272727273
0.852
0.8585454545454545
0.862909090909091
0.8683636363636363
0.8607272727272727
0.8641818181818182
0.854909090909091
0.8587272727272727
0.8581818181818182
0.8554545454545455
0.864
0.8550909090909091
train_loss : 5559.7773018100415
acc = 0.8595319986343384
count = 141280

epoch : 11


  0%|          | 0/22728 [00:00<?, ?it/s]

0.8678181818181818
0.8694545454545455
0.8532727272727273
0.8581818181818182
0.8567272727272728
0.864909090909091
0.8614545454545455
0.87
0.8618181818181818
0.8661818181818182
0.8596363636363636
0.8647272727272727
0.8558181818181818
0.8567272727272728
0.8674545454545455
0.8607272727272727
0.8603636363636363
0.8627272727272727
0.8574545454545455
0.8643636363636363
0.8594545454545455
0.8581818181818182
0.864909090909091
0.8618181818181818
0.8603636363636363
0.8630909090909091
0.8632727272727273
0.8638181818181818
0.8683636363636363
0.8663636363636363
0.8585454545454545
0.8625454545454545
0.8625454545454545
0.866909090909091
0.8638181818181818
0.8656363636363636
0.8672727272727273
0.862909090909091
0.8638181818181818
0.8696363636363637
0.8703636363636363
0.8658181818181818
0.8532727272727273
0.862
0.8605454545454545
train_loss : 5494.561061821441
acc = 0.8628360033035278
count = 139333

epoch : 12


  0%|          | 0/22728 [00:00<?, ?it/s]

0.8630909090909091
0.864909090909091
0.8703636363636363
0.8738181818181818
0.8643636363636363
0.868
0.8678181818181818
0.8674545454545455
0.8658181818181818
0.8587272727272727
0.8685454545454545
0.8625454545454545
0.8654545454545455
0.8603636363636363
0.8738181818181818
0.8645454545454545
0.8603636363636363
0.8705454545454545
0.8678181818181818
0.8665454545454545
0.8570909090909091
0.8623636363636363
0.8696363636363637
0.8710909090909091
0.8661818181818182
0.8690909090909091
0.860909090909091
0.8670909090909091
0.866909090909091
0.8683636363636363
0.8665454545454545
0.8621818181818182
0.8647272727272727
0.8590909090909091
0.8685454545454545
0.8596363636363636
0.8630909090909091
0.8623636363636363
0.8672727272727273
0.8565454545454545
0.8570909090909091
0.8683636363636363
0.8638181818181818
0.866
0.862
train_loss : 5437.01136519108
acc = 0.8651000261306763
count = 138362

epoch : 13


  0%|          | 0/22728 [00:00<?, ?it/s]

0.8634545454545455
0.8663636363636363
0.8683636363636363
0.8654545454545455
0.8745454545454545
0.8710909090909091
0.8703636363636363
0.8712727272727273
0.8712727272727273
0.8676363636363637
0.8774545454545455
0.8694545454545455
0.8707272727272727
0.8676363636363637
0.8678181818181818
0.8583636363636363
0.8678181818181818
0.8745454545454545
0.8658181818181818
0.8667272727272727
0.8709090909090909
0.8656363636363636
0.8654545454545455
0.8747272727272727
0.8692727272727273
0.8725454545454545
0.8683636363636363
0.8696363636363637
0.8712727272727273
0.8581818181818182
0.872
0.8721818181818182
0.868
0.8718181818181818
0.8781818181818182
0.8690909090909091
0.874
0.8767272727272727
0.8741818181818182
0.8612727272727273
0.8698181818181818
0.8705454545454545
0.8725454545454545
0.8747272727272727
0.8778181818181818
train_loss : 5340.985961112718
acc = 0.8699719905853271
count = 139257


In [4]:
class testDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.code1 = data['code1'].to_numpy()
        self.code2 = data['code2'].to_numpy()

    def __len__(self):
        return len(self.code1)

    def __getitem__(self, idx):
        code1 = self.code1[idx]
        code2 = self.code2[idx]
        
        encoding = self.tokenizer(
            code1,
            code2,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
            padding='max_length'
        )
        

        return {'input_ids': encoding['input_ids'][0],
                'attention_mask': encoding['attention_mask'][0]}


In [5]:
test_data = pd.read_csv('./test.csv')
model_name = 'neulab/codebert-cpp'
tokenizer = AutoTokenizer.from_pretrained(model_name)
test_dataset = testDataset(test_data, tokenizer, max_len=512)

test_loader = DataLoader(test_dataset, batch_size=220, shuffle=False, num_workers=4)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
model = AutoModelForSequenceClassification.from_pretrained('model_infinite_cbc/cpp-0')
model.to(device)
model.eval()
with torch.no_grad():
    test_preds = []
    for i, batch in tqdm(enumerate(test_loader),total=len(test_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        test_preds.extend(preds.cpu().numpy())
submission = pd.DataFrame({'pair_id': test_data['pair_id'], 'similar': test_preds})
submission.to_csv('./submission.csv', index=False)

  0%|          | 0/2705 [00:00<?, ?it/s]

In [7]:
sum(pd.read_csv('submission.csv')['similar'])

211929