In [None]:
#neulab/codebert-cpp
#microsoft/graphcodebert-base
#

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm.notebook import tqdm

In [2]:
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.code1 = data['code1'].to_numpy()
        self.code2 = data['code2'].to_numpy()
        if 'similar' in data.columns:
            self.labels = data['similar'].to_numpy()
        else:
            self.labels = None

    def __len__(self):
        return len(self.code1)

    def __getitem__(self, idx):
        code1 = self.code1[idx]
        code2 = self.code2[idx]
        
        encoding = self.tokenizer(
            code1,
            code2,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
            padding='max_length'
        )

        if self.labels is not None:
            label = self.labels[idx]
            return {'input_ids': encoding['input_ids'][0],
                    'attention_mask': encoding['attention_mask'][0],
                    'token_type_ids' : encoding['token_type_ids'][0],
                    'labels': torch.tensor(label, dtype=torch.long)}
        else:
            return {'input_ids': encoding['input_ids'][0],
                    'attention_mask': encoding['attention_mask'][0],
                    'token_type_ids' : encoding['token_type_ids'][0],
                   }


In [3]:
#model_name = "neulab/codebert-cpp"
model_name = 'OpenMatch/cocodr-base-msmarco'
train_data = pd.read_csv('./sample_train.csv')
train_data = pd.concat([train_data,pd.DataFrame({'code1_path':train_data['code2_path'],
                                     'code2_path':train_data['code1_path'],
                                     'code1':train_data['code2'],
                                     'code2':train_data['code1'],
                                     'similar':train_data['similar']})])
train_data = train_data.reset_index(drop=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
train_dataset = NewsDataset(train_data, tokenizer, max_len=512)

train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True, pin_memory=True, num_workers=4)

model = AutoModelForSequenceClassification.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

  return self.fget.__get__(instance, owner)()


In [4]:
for epoch in tqdm(range(100)):
    model.train()
    train_loss = 0
    acc = 0
    cnt = 0
    print(f'\nepoch : {epoch+1}')
    for i, batch in tqdm(enumerate(train_loader),leave=False,total=len(train_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        acc += torch.sum(labels==preds)
        cnt += torch.sum(preds)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    model.save_pretrained(f"model_msmacro/cpp-{epoch}", from_pt=True)
    print(f'train_loss : {train_loss}\nacc = {acc / len(train_data)}\ncount = {cnt}')

  0%|          | 0/100 [00:00<?, ?it/s]


epoch : 1


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 1959.6602111458778
acc = 0.6460750102996826
count = 18983

epoch : 2


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 1409.9522180370986
acc = 0.7660250067710876
count = 19203

epoch : 3


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 1218.9238634612411
acc = 0.8017249703407288
count = 19675

epoch : 4


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 1056.081231017597
acc = 0.8299749493598938
count = 19991

epoch : 5


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 919.0171136641875
acc = 0.8568750023841858
count = 20161

epoch : 6


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 785.224192088237
acc = 0.8825249671936035
count = 20191

epoch : 7


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 648.910987239331
acc = 0.9069749712944031
count = 20143

epoch : 8


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 511.5259664467012
acc = 0.9301249980926514
count = 20103

epoch : 9


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 402.00858118436736
acc = 0.9475749731063843
count = 20027

epoch : 10


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 311.4317935223371
acc = 0.9607499837875366
count = 20022

epoch : 11


  0%|          | 0/3334 [00:00<?, ?it/s]

train_loss : 243.6657302414369
acc = 0.9704749584197998
count = 20023

epoch : 12


  0%|          | 0/3334 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [3]:
model_name = 'OpenMatch/cocodr-base-msmarco'

test_data = pd.read_csv('./test.csv')

tokenizer = AutoTokenizer.from_pretrained(model_name)
test_dataset = NewsDataset(test_data, tokenizer, max_len=512)

test_loader = DataLoader(test_dataset, batch_size=252, shuffle=False, num_workers=4)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
model = AutoModelForSequenceClassification.from_pretrained("model_msmacro/cpp-10")
model.to(device)
model.eval()
with torch.no_grad():
    test_preds = []
    for i, batch in tqdm(enumerate(test_loader),total=len(test_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        test_preds.extend(preds.cpu().numpy())
submission = pd.DataFrame({'pair_id': test_data['pair_id'], 'similar': test_preds})
submission.to_csv('./submission.csv', index=False)

  0%|          | 0/2362 [00:00<?, ?it/s]

In [5]:
sum(pd.read_csv('submission.csv')['similar'])

90364