In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaForSequenceClassification,AutoConfig, get_linear_schedule_with_warmup, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm.notebook import tqdm
import random
import os

In [2]:
def seed_all(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            legacy global generator, and PyTorch (CPU and all CUDA devices).
            It is also exported as the ``PYTHONHASHSEED`` environment variable.

    Returns:
        None. The function only has side effects on global RNG state and
        on ``os.environ``.
    """
    random.seed(seed)
    # The interpreter looks for PYTHONHASHSEED; the previous key
    # ('PYTHONDASHSEED') was a typo and was never read by anything.
    # The variable is consulted at interpreter start-up, so setting it here
    # influences processes launched from this kernel, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this call is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)


seed_all(42)

In [3]:
class testDataset(Dataset):
    """Inference-time dataset that tokenizes (code1, code2) pairs on demand.

    Args:
        data: Table-like object exposing ``'code1'`` and ``'code2'`` columns
            that support ``.to_numpy()`` (a pandas DataFrame in this notebook).
        tokenizer: Callable used to encode one pair; it is invoked with the two
            code strings plus ``max_length``, ``truncation``, ``padding`` and
            ``return_tensors`` keyword arguments.
        max_len: Length every encoded pair is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        # Materialize both columns once so item lookup is plain array indexing.
        self.code1, self.code2 = (
            data[column].to_numpy() for column in ('code1', 'code2')
        )

    def __len__(self):
        """Return the number of code pairs."""
        return len(self.code1)

    def __getitem__(self, idx):
        """Encode pair ``idx`` and return its ``input_ids`` and ``attention_mask``."""
        encoded = self.tokenizer(
            self.code1[idx],
            self.code2[idx],
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        # The tokenizer returns a leading batch axis of size one; strip it so
        # the DataLoader can stack the items itself.
        return {
            field: encoded[field][0]
            for field in ('input_ids', 'attention_mask')
        }


In [4]:
# --- Inference setup: test pairs, tokenizer, dataset / loader, device ---

# Pairs to score. An alternative input, './test_only_ascii_final.csv', was left
# commented out in the original cell.
test_data = pd.read_csv('./test_final.csv')

# Tokenizer comes from the hub checkpoint named below.
model_name = 'Lazyhope/unixcoder-clone-detection'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Over-long pairs lose tokens from the left-hand side when truncated.
tokenizer.truncation_side = 'left'

test_dataset = testDataset(test_data, tokenizer, max_len=1024)

# shuffle=False keeps batches in the same order as the rows of test_data.
test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=4,
)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
# --- Score every test pair with the local checkpoint and write the submission ---
model = AutoModelForSequenceClassification.from_pretrained("./uni-cpp-4")
model.to(device)
model.eval()

test_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, total=len(test_loader)):
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        logits = outputs.logits
        # Only column 1 of the logits is kept, so slice first and move just
        # that column to host memory.
        # NOTE(review): the 'similar' column therefore holds raw class-1
        # logits, not probabilities — confirm that is what consumers of the
        # "_soft" file expect.
        preds = logits[:, 1].cpu()
        test_preds.extend(preds.numpy())

# One row per test pair, in the same order as test_data.
submission = pd.DataFrame(
    {
        'pair_id': test_data['pair_id'],
        'similar': test_preds,
    }
)
submission.to_csv('./submission_uni_cpp_4_soft.csv', index=False)

  0%|          | 0/9297 [00:00<?, ?it/s]

In [15]:
# Label convention: 0 = different, 1 = same

tensor([-4.7924,  4.7726,  4.7775, -4.7914, -4.7924, -4.7937,  4.7748, -4.7920,
         4.7747,  4.7787,  2.5560,  4.7778,  4.7780,  4.7734, -4.7930, -4.7933,
        -4.7929, -4.7935, -4.7929,  4.7725, -4.7919, -4.7933,  4.7725, -4.7928,
        -4.7911, -4.7927, -4.7932, -4.7933, -4.7936, -4.7934, -4.7933, -4.7926,
         4.7785,  2.6947,  4.7760,  4.7779,  4.7794, -4.7923,  4.7764,  4.7739,
        -4.7936,  4.7740,  4.7723,  4.7735,  4.7757, -4.7932, -4.7917,  4.7781,
         4.7752,  4.4972,  4.7720, -4.7920, -4.7915,  3.3205,  2.3059, -4.7926])

In [6]:
# Total of the 'similar' column (the number of pairs marked 1 when the column
# holds 0/1 labels). Vectorized sum; .item() unwraps to a plain Python scalar.
pd.read_csv('submission.csv')['similar'].sum().item()

293359

In [8]:
# Run tag: tmp_new
# Total of the 'similar' column (the number of pairs marked 1 when the column
# holds 0/1 labels). Vectorized sum; .item() unwraps to a plain Python scalar.
pd.read_csv('submission.csv')['similar'].sum().item()

293656

In [6]:
# Run tag: tmp_new_sc
# Total of the 'similar' column (the number of pairs marked 1 when the column
# holds 0/1 labels). Vectorized sum; .item() unwraps to a plain Python scalar.
pd.read_csv('submission.csv')['similar'].sum().item()

293658

In [6]:
# Run tag: tmp_new_only
# Total of the 'similar' column (the number of pairs marked 1 when the column
# holds 0/1 labels). Vectorized sum; .item() unwraps to a plain Python scalar.
pd.read_csv('submission.csv')['similar'].sum().item()

293658

In [6]:
# Run tag: pure
# Total of the 'similar' column (the number of pairs marked 1 when the column
# holds 0/1 labels). Vectorized sum; .item() unwraps to a plain Python scalar.
pd.read_csv('submission.csv')['similar'].sum().item()

292533

In [7]:
# Doubles 292533, which matches the total printed for the "pure" run; the
# reason for doubling is not recorded in this notebook — TODO confirm.
292533*2

585066

In [7]:
# Run tag: new_set
# Total of the 'similar' column (the number of pairs marked 1 when the column
# holds 0/1 labels). Vectorized sum; .item() unwraps to a plain Python scalar.
pd.read_csv('submission.csv')['similar'].sum().item()

293514

In [6]:
# Run tag: new_set (file: submission_uni_cpp_3.csv)
# Total of the 'similar' column (the number of pairs marked 1 when the column
# holds 0/1 labels). Vectorized sum; .item() unwraps to a plain Python scalar.
pd.read_csv('submission_uni_cpp_3.csv')['similar'].sum().item()

293369

In [6]:
# Run tag: new_set (file: submission_uni_cpp_4.csv)
# Total of the 'similar' column (the number of pairs marked 1 when the column
# holds 0/1 labels). Vectorized sum; .item() unwraps to a plain Python scalar.
pd.read_csv('submission_uni_cpp_4.csv')['similar'].sum().item()

294227