In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
from code_function import preprocess_script, make_dataset, reduction_dataset
from sklearn.model_selection import train_test_split , KFold, StratifiedKFold

In [2]:
from glob import glob
problem_folders = glob("D:/code_preprocessing/clean/*.py")

In [3]:
def make_df(problem_folders):
    preproc_scripts = []
    problem_nums = []
    for problem_folder in tqdm(problem_folders):
        problem_num = os.path.basename(problem_folder).split(".")[0]
        with open(problem_folder, "rt", encoding='utf-8') as file:
            text = file.read()
            preproc_scripts.append(text)
        problem_nums.append(problem_num)
    df = pd.DataFrame(data = {'code':preproc_scripts, 'problem_script':problem_nums})
    df['problem_num'] = df['problem_script'].apply(lambda x: x.split("_")[0])
    return df

In [4]:
df = make_df(problem_folders)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45101/45101 [04:00<00:00, 187.70it/s]


In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base-unimodal")
# df['tokens'] = df['code'].apply(tokenizer.tokenize)
# df['len'] = df['tokens'].apply(len)
# df['problem_num'] = df['problem_script'].apply(lambda x: x.split("_")[0])

In [9]:
global train_df
global valid_df

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=33)

for e, (train_index, test_index) in enumerate(skf.split(df, df['problem_num'])):
    if e==0:
        train_df, valid_df = df.iloc[train_index], df.iloc[test_index]

In [10]:
add_df = train_df[train_df['len']>512]
train_df = train_df[train_df["len"] <= 512]
add_df["code"] = add_df['code'].apply(lambda x :" ".join(x.split(" ")[-512:]))
train_df = pd.concat([train_df, add_df], ignore_index=True)
train_df = train_df.reset_index(drop=True)

add_df = valid_df[valid_df['len']>512]
valid_df = valid_df[valid_df["len"] <= 512]
add_df["code"] = add_df['code'].apply(lambda x :" ".join(x.split(" ")[-512:]))
valid = pd.concat([valid_df, add_df], ignore_index=True)
valid_df = valid_df.reset_index(drop=True)

In [11]:
train_code = []
for name, code in zip(train_df["problem_script"], train_df["code"]):
#     if os.path.exists(f"D:/open/executable/{name}.txt"):
#         temp = open(f"D:/open/executable/{name}.txt", "r").read()
    if os.path.exists(f"D:/code_preprocessing/executable/{name}.txt"):
        temp = open(f"D:/code_preprocessing/executable/{name}.txt", "r").read()
        temp = temp.replace('GOODJOBANDSUCCESS', ' ')
        temp = temp.replace("**START**", " ")
        temp = temp[:100]
        code = code +"\n" + temp
        train_code.append(code)
    
    else:
        train_code.append(code)

In [12]:
train_df["code_"] = train_code

In [13]:
train_df1 = train_df[["code", "problem_num"]]
train_df2 = train_df[["code_", "problem_num"]]
train_df2.columns = ["code", "problem_num"]
train_df = pd.concat([train_df1, train_df2], ignore_index=True)

In [14]:
train_data = make_dataset(train_df, tokenizer)
valid_data = make_dataset(valid_df, tokenizer)
train_pair_data = reduction_dataset(train_data)
valid_pair_data = reduction_dataset(valid_data)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [41:35<00:00,  8.32s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:06<00:00,  4.53it/s]


In [15]:
train_pair_data["similar"] = train_pair_data["similar"].map(float)
valid_pair_data["similar"] = valid_pair_data["similar"].map(float)
#그냥데이터
#3_txt파일 몇개 붙인거
#4augmentation
#5_1024데이터셋
#6_1024_last
#7clean_augmentation
train_pair_data.to_csv("D:/code_classification/python3_train7.csv", index=False)
valid_pair_data.to_csv("D:/code_classification/python3_valid7.csv", index=False)

In [5]:
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel, RobertaForSequenceClassification
from transformers import AutoModelForSequenceClassification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small")
# model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
model = RobertaForSequenceClassification.from_pretrained("microsoft/unixcoder-base-unimodal", num_labels=1)
# model = AutoModelForSequenceClassification.from_pretrained("microsoft/unixcoder-base")
model.to(device)

Some weights of the model checkpoint at microsoft/unixcoder-base-unimodal were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base-unimodal and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a dow

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Laye

In [6]:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
# tokenizer.truncation_side = 'right'
tokenizer.truncation_side = 'left'

In [16]:
temp = pd.read_csv("D:/code_classification/python3_train15.csv")

In [19]:
temp_list = []
for i in temp.similar:
    temp_list.append(float(i))

In [20]:
temp.similar = temp_list

In [21]:
temp2 = pd.read_csv("D:/code_classification/python3_valid15.csv")

In [22]:
temp2_list = []
for i in temp2.similar:
    temp2_list.append(float(i))

In [13]:
temp = pd.read_csv("D:/code_classification/python3_train15.csv")
temp.similar = temp.similar.map("float")
temp2 = pd.read_csv("D:/code_classification/python3_valid15.csv")
temp2.similar = temp2.similar.map("float")

TypeError: 'str' object is not callable

In [None]:
MAX_LEN = 1024
# INPUT = {"train": "D:/code_classification/reduce_train.csv", "test": "D:/code_classification/reduce_valid.csv"}
# INPUT = {"train": "D:/code_classification/python3_train7.csv", "test": "D:/code_classification/python3_valid7.csv"}
INPUT = {"train": "D:/code_classification/python3_train16.csv", "test": "D:/code_classification/python3_valid16.csv"}
# INPUT = {"train": "D:/code_classification/python3_train9.csv", "test": "D:/code_classification/python3_valid9.csv"}

from datasets import load_dataset, load_metric
dataset = load_dataset("csv", data_files=INPUT)

def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=MAX_LEN, truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs


dataset = dataset.map(example_fn, remove_columns=['code1', 'code2', 'similar'])

Using custom data configuration default-5a001bb4e6229900
Reusing dataset csv (C:\Users\Administrator\.cache\huggingface\datasets\csv\default-5a001bb4e6229900\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/225289 [00:00<?, ?ex/s]

In [8]:
# from torch.utils.checkpoint import checkpoint
# model.gradient_checkpointing_enable()
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_metric = load_metric("glue", "sst2")
args = TrainingArguments(
#     'D:/code_classification/fest_clean',
    'D:/code_classification/uni_aug',
    load_best_model_at_end = True,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    do_train=True,
    do_eval=True,
    fp16=True,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
)  


In [9]:
def metric_fn2(p):
    preds, labels = p
    output =  _metric.compute(references=labels, predictions=np.where(preds>0.5, 1, 0))
    return output
trainer = Trainer(
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=metric_fn2,
        )

trainer.train()

Using amp half precision backend
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0. If Unnamed: 0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 225289
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 70405


Epoch,Training Loss,Validation Loss


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [7]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_metric = load_metric("glue", "sst2")

def metric_fn(p):
    preds, labels = p
    output =  _metric.compute(references=labels, predictions=np.argmax(preds, axis=-1))
    return output

def metric_fn2(p):
    preds, labels = p
    output =  _metric.compute(references=labels, predictions=np.where(preds>0.5, 1, 0))
    return output

NameError: name 'load_metric' is not defined

In [21]:
from torch.utils.checkpoint import checkpoint
model.gradient_checkpointing_enable()

In [11]:
args = TrainingArguments(
#     'D:/code_classification/fest_clean',
    'D:/code_classification/fest_aug',
    load_best_model_at_end = True,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    do_train=True,
    do_eval=True,
    fp16=True,
    optim="adafactor",
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
)    

In [12]:
from transformers import EarlyStoppingCallback
class MyTrainer(Trainer):
    def __init__(self, loss_name, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_name= loss_name # 각인!
    def compute_loss(self, model, inputs, return_outputs=False):
        if self.loss_name == 'BinaryEntropy':
            custom_loss = torch.nn.BCEWithLogitsLoss()

        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None

        outputs = model(**inputs)

        if labels is not None:
            loss = custom_loss(outputs[0], labels)
        else:
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        return (loss, outputs) if return_outputs else loss

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-large", num_labels=1)
# model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-large", num_labels=1)
# model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
# model = RobertaForSequenceClassification.from_pretrained(r"D:\code_classification\fest_clean\checkpoint-16606", num_labels=1)
# model = AutoModelForSequenceClassification.from_pretrained("microsoft/unixcoder-base", num_labels=1)
model.to(device)

loading configuration file https://huggingface.co/microsoft/deberta-large/resolve/main/config.json from cache at C:\Users\Administrator/.cache\huggingface\transformers\7c686202d9db9b0aee3e649d42a50257a76d278858dc7ad32b886f02cf8303e4.5286a902fea63d3276108ffa66a65e2b4355a7df6cfab5be091bf20f7eae85f8
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_at

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=1024, out_features=3072, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (pos_q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (

In [15]:
def sigmoid(z):
    return 1/(1 + np.exp(-z))

def metric_fn2(p):
    preds, labels = p
    output =  _metric.compute(references=labels, predictions=np.where(sigmoid(preds)>0.5, 1, 0))
    return output

# trainer = MyTrainer(
#         loss_name='BinaryEntropy', 
#         model=model,
#         args=args,
#         data_collator=_collator,
#         train_dataset=dataset["train"],
#         eval_dataset=dataset["test"],
#         tokenizer=tokenizer,
#         compute_metrics=metric_fn2,
#         callbacks = [EarlyStoppingCallback(early_stopping_patience=2)])

# trainer.train()

In [107]:
TEST =  "D:/code_classification/indent_test.csv"
test_dataset = load_dataset("csv", data_files=TEST)['train']
test_dataset = test_dataset.map(example_fn, remove_columns=['code1', 'code2'])

Using custom data configuration default-e8616642f1ea6071
Reusing dataset csv (C:\Users\Administrator\.cache\huggingface\datasets\csv\default-e8616642f1ea6071\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\Administrator\.cache\huggingface\datasets\csv\default-e8616642f1ea6071\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-c80dfb3026c49245.arrow


In [108]:
test_dataset

Dataset({
    features: ['pair_id', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 179700
})

In [43]:
# predictions = trainer.predict(test_dataset)
result = sigmoid(predictions.predictions)
result = np.where(result > 0.5, 1, 0)

In [44]:
sample_submission = pd.read_csv("D:/code_classification/sample_submission.csv")
sample_submission['similar'] = result
sample_submission.to_csv('D:/code_classification/graph_codebert3_0607.csv', index=False)

In [33]:
np.save(r"D:/code_classification/fest_aug/prediction.npy", arr=sigmoid(predictions.predictions))

In [120]:
train = pd.read_csv("D:/code_classification/python3_train11.csv")
train = train.iloc[int(len(train)*0.25):]
train.shape

(168942, 3)

In [121]:
train.to_csv("D:/code_classification/python3_train12.csv", index=False)

In [9]:
MAX_LEN = 2048
# INPUT = {"train": "D:/code_classification/reduce_train.csv", "test": "D:/code_classification/reduce_valid.csv"}
# INPUT = {"train": "D:/code_classification/python3_train7.csv", "test": "D:/code_classification/python3_valid7.csv"}
INPUT = {"train": "D:/code_classification/python3_train14.csv", "test": "D:/code_classification/python3_valid11.csv"}

from datasets import load_dataset, load_metric
dataset = load_dataset("csv", data_files=INPUT)

def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=MAX_LEN, truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs


dataset = dataset.map(example_fn, remove_columns=['code1', 'code2', 'similar'])

Using custom data configuration default-24107654457307c9
Reusing dataset csv (C:\Users\Administrator\.cache\huggingface\datasets\csv\default-24107654457307c9\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/86666 [00:00<?, ?ex/s]

KeyboardInterrupt: 

In [45]:
from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-large")
# tokenizer.truncation_side = 'right'
tokenizer.truncation_side = 'left'

loading configuration file https://huggingface.co/microsoft/deberta-large/resolve/main/config.json from cache at C:\Users\Administrator/.cache\huggingface\transformers\7c686202d9db9b0aee3e649d42a50257a76d278858dc7ad32b886f02cf8303e4.5286a902fea63d3276108ffa66a65e2b4355a7df6cfab5be091bf20f7eae85f8
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.19.2",
  "type_vocab_size": 0,
  

In [10]:
_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_metric = load_metric("glue", "sst2")

In [53]:
from torch.utils.checkpoint import checkpoint
model.gradient_checkpointing_enable()
# args = TrainingArguments(
# #     'D:/code_classification/fest_clean',
#     'D:/code_classification/unix_aug',
#     load_best_model_at_end = True,
#     per_device_train_batch_size=16,
#     num_train_epochs=6,
#     do_train=True,
#     do_eval=True,
#     fp16=True,
#     optim="adafactor",
#     save_strategy="epoch",
#     logging_strategy="epoch",
#     evaluation_strategy="epoch",
# )    
args = TrainingArguments(
    'D:/code_classification/debertaV_3',
    per_device_train_batch_size=8,
    gradient_accumulation_steps=32,
    gradient_checkpointing=True,
    fp16=True,
    optim="adafactor",
    num_train_epochs=4,
    load_best_model_at_end = True,
    do_train=True,
    do_eval=True,
    logging_steps= 100,
    eval_steps = 100,
    save_strategy="steps",
    logging_strategy="steps",
    evaluation_strategy="steps",
    save_total_limit = 10,
)    

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small")
# model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
model = AutoModelForSequenceClassification.from_pretrained("microsoft/unixcoder-base", num_labels=1)
model.to(device)

loading configuration file https://huggingface.co/microsoft/unixcoder-base/resolve/main/config.json from cache at C:\Users\Administrator/.cache\huggingface\transformers\f47f36c6d415b8e978f9685f6dbf2651cc9c951dea26b74fcf8bf62e44900449.b53aa458f35a3b932d45090e5916927053a2bf0e803f4eb410b7d1f922b60a05
Model config RobertaConfig {
  "_name_or_path": "microsoft/unixcoder-base",
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 1026,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "abs

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Laye

In [55]:
trainer = MyTrainer(
        loss_name='BinaryEntropy', 
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=metric_fn2,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=2)])

trainer.train()

Using amp half precision backend
***** Running training *****
  Num examples = 224874
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 3512


Step,Training Loss,Validation Loss,Accuracy
100,0.2533,0.141568,0.560921
200,0.0861,0.095537,0.51352
300,0.0499,0.064011,0.801936
400,0.0367,0.050324,0.51352
500,0.0278,0.035859,0.51352
600,0.0248,0.02985,0.925003
700,0.0198,0.027852,0.932681
800,0.0173,0.024591,0.948036
900,0.0156,0.028592,0.956381


***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
Saving model checkpoint to D:/code_classification/debertaV\checkpoint-500
Configuration saved in D:/code_classification/debertaV\checkpoint-500\config.json
Model weights saved in D:/code_classification/debertaV\checkpoint-500\pytorch_model.bin
tokenizer config file saved in D:/code_classification/debertaV\checkpoint-500\tokenizer_config.json
Special tokens file saved in D:/code_classification/debertaV\checkpoint-500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
***** Running Evaluation *****
  Num examples = 8987
  Batch size 

KeyboardInterrupt: 

In [None]:
925 -> 932 -> 948 -> 96 ->:  96 -> 97 -> 97.5 -> 98 -> 98.5 -> 99 -> 99.5 -. 
5e-5, 2.5e-5

In [26]:
args = TrainingArguments(
    'D:/code_classification/deberta',
    learning_rate = 3e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=32,
    gradient_checkpointing=True,
    fp16=True,
    optim="adafactor",
    num_train_epochs=1,
    load_best_model_at_end = True,
    do_train=True,
    do_eval=True,
    logging_steps= 100,
    eval_steps = 100,
    save_steps = 100,
    save_strategy="steps",
    logging_strategy="steps",
    evaluation_strategy="steps",
)    

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [60]:
trainer = MyTrainer(
        loss_name='BinaryEntropy', 
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=metric_fn2,
        overwrite_output_dir=True,
#         callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
        )

trainer.train()

Using amp half precision backend
***** Running training *****
  Num examples = 225256
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 1758


Step,Training Loss,Validation Loss,Accuracy
100,0.0151,0.019866,0.944587
200,0.0128,0.024153,0.513074


***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8


KeyboardInterrupt: 

In [126]:
trainer = MyTrainer(
        loss_name='BinaryEntropy', 
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=metric_fn2,
#         callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
        )

trainer.train()

Using amp half precision backend
***** Running training *****
  Num examples = 168942
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 659


Step,Training Loss,Validation Loss,Accuracy
100,0.0122,0.018455,0.956381
200,0.0101,0.018158,0.843886


***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
Saving model checkpoint to D:/code_classification/debertaV_3\checkpoint-100
Configuration saved in D:/code_classification/debertaV_3\checkpoint-100\config.json
Model weights saved in D:/code_classification/debertaV_3\checkpoint-100\pytorch_model.bin
tokenizer config file saved in D:/code_classification/debertaV_3\checkpoint-100\tokenizer_config.json
Special tokens file saved in D:/code_classification/debertaV_3\checkpoint-100\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
Saving model checkpoint to D:/code_classification/debertaV_3\checkpoint-200
Configuration saved in D:/code_classification/debertaV_3\checkpoint-200\config.json
Model weights saved in D:/code_classification/debertaV_3\checkpoint-200\pytorch_model.bin
tokenizer config file saved in D:/code_classification/debertaV_3\checkpoint-200\tokenizer_config.json
Special tokens file saved in D:/code_classification/de

KeyboardInterrupt: 

In [174]:
trainer = MyTrainer(
        loss_name='BinaryEntropy', 
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=metric_fn2,

#         callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
        )

trainer.train()

Using amp half precision backend
***** Running training *****
  Num examples = 86666
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 338


Step,Training Loss,Validation Loss,Accuracy
100,0.0225,0.021879,0.513186
200,0.0137,0.019589,0.931679


***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
Saving model checkpoint to D:/code_classification/deberta\checkpoint-100
Configuration saved in D:/code_classification/deberta\checkpoint-100\config.json
Model weights saved in D:/code_classification/deberta\checkpoint-100\pytorch_model.bin
tokenizer config file saved in D:/code_classification/deberta\checkpoint-100\tokenizer_config.json
Special tokens file saved in D:/code_classification/deberta\checkpoint-100\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8987
  Batch size = 8
Saving model checkpoint to D:/code_classification/deberta\checkpoint-200
Configuration saved in D:/code_classification/deberta\checkpoint-200\config.json
Model weights saved in D:/code_classification/deberta\checkpoint-200\pytorch_model.bin
tokenizer config file saved in D:/code_classification/deberta\checkpoint-200\tokenizer_config.json
Special tokens file saved in D:/code_classification/deberta\checkpoint-200\specia

KeyboardInterrupt: 

In [69]:
torch.save(model.state_dict(), "D:/code_classification/debertaV_2/deberta210steps.pt")

In [70]:
model.load_state_dict(torch.load("D:/code_classification/debertaV_2/deberta210steps.pt"))

<All keys matched successfully>

Prediction Time

In [176]:
model = AutoModelForSequenceClassification.from_pretrained(r"D:\code_classification\debertaV_3\checkpoint-200")
model.to(device)

loading configuration file D:\code_classification\debertaV_3\checkpoint-200\config.json
Model config DebertaConfig {
  "_name_or_path": "D:\\code_classification\\debertaV_3\\checkpoint-200",
  "architectures": [
    "DebertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 0,
  "vocab_size

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=1024, out_features=3072, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (pos_q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (

In [17]:
#valid
MAX_LEN=512
def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=MAX_LEN, truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs
valid =  "D:/code_classification/python3_valid13.csv"
valid_dataset = load_dataset("csv", data_files=valid)['train']
valid_dataset = valid_dataset.map(example_fn, remove_columns=['code1', 'code2'])

Using custom data configuration default-498b03d6d8487ae0
Reusing dataset csv (C:\Users\Administrator\.cache\huggingface\datasets\csv\default-498b03d6d8487ae0\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8983 [00:00<?, ?ex/s]

In [16]:
trainer = MyTrainer(
        loss_name='BinaryEntropy', 
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=metric_fn2,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=2)])
valid_predictions = trainer.predict(valid_dataset)
valid_result = sigmoid(valid_predictions.predictions)


Using amp half precision backend
The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: similar. If similar are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 8983
  Batch size = 8


RuntimeError: The expanded size of the tensor (1451) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [8, 1451].  Tensor sizes: [1, 514]

In [180]:
np.save(r"D:/code_classification/test_ensemble/deberta_valid.npy", arr = valid_result)

In [166]:
valid_result = np.where(valid_result > 0.61, 1, 0)

In [2]:
#model2
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel, RobertaForSequenceClassification
model = RobertaForSequenceClassification.from_pretrained(r"D:\code_classification\fest_aug\checkpoint-27153")
model.to(device)

NameError: name 'device' is not defined

In [20]:
# trainer = MyTrainer(
#         loss_name='BinaryEntropy', 
#         model=model,
#         args=args,
#         data_collator=_collator,
#         train_dataset=dataset["train"],
#         eval_dataset=dataset["test"],
#         tokenizer=tokenizer,
#         compute_metrics=metric_fn2,
#         callbacks = [EarlyStoppingCallback(early_stopping_patience=2)])
# valid_predictions2 = trainer.predict(valid_dataset)
valid_result2 = sigmoid(valid_predictions2.predictions)

In [21]:
np.save(r"D:/code_classification/test_ensemble/graph_valid.npy", arr = valid_result2)

In [None]:
valid = pd.read_csv("D:/code_classification/python3_valid13.csv")

In [167]:
from sklearn.metrics import accuracy_score as acc
acc(valid.similar, valid_result)

0.97796817625459

In [None]:
#####################################################################

In [24]:
#TEST Prediction
from transformers import AutoModelForSequenceClassification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(r"D:\code_classification\debertaV_3\checkpoint-200")
model.to(device)


loading configuration file D:\code_classification\debertaV_3\checkpoint-200\config.json
Model config DebertaConfig {
  "_name_or_path": "D:\\code_classification\\debertaV_3\\checkpoint-200",
  "architectures": [
    "DebertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 0,
  "vocab_size

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=1024, out_features=3072, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (pos_q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (

In [25]:
MAX_LEN=2048
def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=MAX_LEN, truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs
TEST =  "D:/code_classification/indent_test.csv"
test_dataset = load_dataset("csv", data_files=TEST)['train']
test_dataset = test_dataset.map(example_fn, remove_columns=['code1', 'code2'])

Using custom data configuration default-e8616642f1ea6071
Reusing dataset csv (C:\Users\Administrator\.cache\huggingface\datasets\csv\default-e8616642f1ea6071\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/179700 [00:00<?, ?ex/s]

In [27]:
trainer = MyTrainer(
        loss_name='BinaryEntropy', 
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=metric_fn2,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=2)])

Using amp half precision backend


In [28]:
test_prediction = trainer.predict(test_dataset)

The following columns in the test set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 8


In [29]:
deberta_pred = sigmoid(test_prediction.predictions)


In [46]:
np.save(r"D:/code_classification/deberta/prediction.npy", arr=deberta_pred)

In [None]:
1_set
#0.9713
2_set
#0.9779
3_set
#0.

In [30]:
graph_test_prediction = np.load(r"D:/code_classification/fest_aug/prediction.npy")

In [3]:
test_data = pd.read_csv("D:/code_classification/indent_test.csv")
test_data['tokens1'] = test_data['code1'].apply(tokenizer.tokenize)
test_data['len1'] = test_data['tokens1'].apply(len)
test_data['tokens2'] = test_data['code2'].apply(tokenizer.tokenize)
test_data['len2'] = test_data['tokens2'].apply(len)
test_data["total_len"] = test_data["len1"] + test_data["len2"]

In [5]:
pred = np.load('D:/code_classification/uni_aug/prediction1.npy')
pred2 = np.load(r"D:/code_classification/fest_aug/prediction.npy")

In [6]:
#submission
total_result = []
for g, d, l in zip(pred2.reshape(-1), pred.reshape(-1), test_data.total_len,):
    if l <= 512:
        r = (g+d)/2
        r = np.where(r>0.5,1,0)
        total_result.append(r)
    
    else:
        d = np.where(d>0.5,1,0)
        total_result.append(d)

In [7]:
sample_submission = pd.read_csv("D:/code_classification/sample_submission.csv")
sample_submission['similar'] = total_result
sample_submission.to_csv('D:/code_classification/sub0610_3.csv', index=False)