In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm.notebook import tqdm
import random
import os

In [2]:
BATCH_SIZE = 11

In [3]:
def seed_all(seed):
    random.seed(seed)
    os.environ['PYTHONDASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
seed_all(42)

In [4]:
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.code = data['code'].to_numpy()
        self.problem = data['label'].to_numpy()

    def __len__(self):
        return len(self.code)

    def __getitem__(self, idx):
        code1 = self.code[idx]
        standard = 500*self.problem[idx]
        r = np.random.random()
        #good
        if r < 0.5:
            tmp = np.random.randint(standard, standard + 500)
            code2 = self.code[tmp]
            label = 1
        #bad
        else:
            tmp = np.random.randint(standard + 500, len(self.code) + standard) % len(self.code)
            code2 = self.code[tmp]
            label = 0
        encoding = self.tokenizer(
            code1,
            code2,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
            padding='max_length'
        )

        return {'input_ids': encoding['input_ids'][0],
                'attention_mask': encoding['attention_mask'][0],
                #'token_type_ids' : encoding['token_type_ids'][0],
                'labels': torch.tensor(label, dtype=torch.long)}

In [5]:
#model_name = "neulab/codebert-cpp"
#model_name = 'neulab/codebert-cpp'
model_name = 'microsoft/graphcodebert-base'
train_data = pd.read_csv('./all_code.csv')

tokenizer = AutoTokenizer.from_pretrained(model_name)
train_dataset = NewsDataset(train_data, tokenizer, max_len=512)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True, num_workers=4)

model = AutoModelForSequenceClassification.from_pretrained('model_infinite_gcbc/cpp-12')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [None]:
for epoch in tqdm(range(0,13,1)):
    model.train()
    train_loss = 0
    acc = 0
    tmp_acc = 0
    cnt = 0
    print(f'\nepoch : {epoch+1}')
    for i, batch in tqdm(enumerate(train_loader),leave=False,total=len(train_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        acc += torch.sum(labels==preds)
        tmp_acc += torch.sum(labels==preds)
        cnt += torch.sum(preds)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        if i%500==499:
            print(tmp_acc.item()/ (500*BATCH_SIZE))
            tmp_acc = 0
        #optimizer.step()
        optimizer.zero_grad()
    model.save_pretrained(f"model_infinite_gcbc_no_zero_grad/cpp-{epoch}", from_pt=True)
    print(f'train_loss : {train_loss}\nacc = {acc / len(train_data)}\ncount = {cnt}')

  0%|          | 0/13 [00:00<?, ?it/s]


epoch : 1


  0%|          | 0/22728 [00:00<?, ?it/s]

0.864909090909091
0.8561818181818182
0.872
0.8772727272727273
0.8681818181818182
0.866
0.8698181818181818
0.8709090909090909
0.868
0.8710909090909091
0.868909090909091
0.8716363636363637
0.8710909090909091
0.8678181818181818
0.8661818181818182
0.8705454545454545
0.8592727272727273
0.864
0.8690909090909091
0.864909090909091
0.8650909090909091
0.8676363636363637
0.8643636363636363
0.8710909090909091
0.8747272727272727
0.8729090909090909
0.8625454545454545
0.8687272727272727
0.8661818181818182
0.8690909090909091
0.8714545454545455
0.8683636363636363
0.8598181818181818
0.8752727272727273
0.8583636363636363
0.8714545454545455
0.8654545454545455
0.8690909090909091
0.8687272727272727
0.862909090909091
0.8681818181818182
0.8747272727272727
0.8658181818181818
0.8645454545454545
0.8632727272727273
train_loss : 5446.361189865274
acc = 0.8676400184631348
count = 136506

epoch : 2


  0%|          | 0/22728 [00:00<?, ?it/s]

0.8718181818181818
0.8763636363636363
0.868
0.8636363636363636
0.8732727272727273
0.8727272727272727
0.8656363636363636
0.8665454545454545
0.8650909090909091
0.8710909090909091
0.8676363636363637
0.876
0.8732727272727273
0.8694545454545455
0.8607272727272727
0.8672727272727273
0.8692727272727273
0.8650909090909091
0.8585454545454545
0.874
0.8636363636363636
0.8652727272727273
0.8714545454545455
0.8690909090909091
0.866909090909091
0.8674545454545455
0.8663636363636363
0.8623636363636363
0.8687272727272727
0.8607272727272727
0.862909090909091
0.8712727272727273
0.8625454545454545
0.868
0.8678181818181818
0.8687272727272727
0.8667272727272727
0.8683636363636363
0.8741818181818182
0.8750909090909091
0.8685454545454545
0.8665454545454545
0.8696363636363637
0.8687272727272727
0.864
train_loss : 5420.852346098865
acc = 0.8680520057678223
count = 136645

epoch : 3


  0%|          | 0/22728 [00:00<?, ?it/s]

0.8705454545454545
0.866909090909091
0.8714545454545455
0.8663636363636363
0.8681818181818182
0.8667272727272727
0.8769090909090909
0.8638181818181818
0.8625454545454545
0.8641818181818182
0.8712727272727273
0.8670909090909091
0.868909090909091
0.864
0.8676363636363637
0.8745454545454545
0.8716363636363637
0.862909090909091
0.8725454545454545
0.8705454545454545
0.8645454545454545
0.8747272727272727
0.8692727272727273
0.8632727272727273
0.8665454545454545
0.8763636363636363
0.8710909090909091
0.864909090909091
0.876
0.8658181818181818
0.8674545454545455
0.8652727272727273
0.8674545454545455
0.8787272727272727
0.8736363636363637
0.8734545454545455
0.8736363636363637
0.8598181818181818
0.8636363636363636
0.8667272727272727
0.8576363636363636
0.8705454545454545
0.8632727272727273
0.8670909090909091
0.8694545454545455
train_loss : 5425.038958998048
acc = 0.868511974811554
count = 136463

epoch : 4


  0%|          | 0/22728 [00:00<?, ?it/s]

Exception in thread Thread-9 (_pin_memory_loop):
Traceback (most recent call last):
  File "/home/server4/anaconda3/envs/jy/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/home/server4/anaconda3/envs/jy/lib/python3.11/threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "/home/server4/anaconda3/envs/jy/lib/python3.11/site-packages/torch/utils/data/_utils/pin_memory.py", line 51, in _pin_memory_loop
    do_one_step()
  File "/home/server4/anaconda3/envs/jy/lib/python3.11/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in do_one_step
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/server4/anaconda3/envs/jy/lib/python3.11/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/server4/anaconda3/envs/jy/lib/python3.11/site-packages/torch/multiprocessing/r

In [4]:
class testDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.code1 = data['code1'].to_numpy()
        self.code2 = data['code2'].to_numpy()

    def __len__(self):
        return len(self.code1)

    def __getitem__(self, idx):
        code1 = self.code1[idx]
        code2 = self.code2[idx]
        
        encoding = self.tokenizer(
            code1,
            code2,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
            padding='max_length'
        )
        

        return {'input_ids': encoding['input_ids'][0],
                'attention_mask': encoding['attention_mask'][0]}


In [5]:
test_data = pd.read_csv('./test.csv')
model_name = 'microsoft/graphcodebert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
test_dataset = testDataset(test_data, tokenizer, max_len=512)

test_loader = DataLoader(test_dataset, batch_size=220, shuffle=False, num_workers=4)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
model = AutoModelForSequenceClassification.from_pretrained('model_infinite_gcbc/cpp-12')
model.to(device)
model.eval()
with torch.no_grad():
    test_preds = []
    for i, batch in tqdm(enumerate(test_loader),total=len(test_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        test_preds.extend(preds.cpu().numpy())
submission = pd.DataFrame({'pair_id': test_data['pair_id'], 'similar': test_preds})
submission.to_csv('./submission.csv', index=False)

  0%|          | 0/2705 [00:00<?, ?it/s]

In [7]:
sum(pd.read_csv('submission.csv')['similar'])

322766