In [None]:
# Load model directly
from transformers import (
    AutoTokenizer, AutoModel,
    AutoModelForSequenceClassification, 
    Trainer,TrainingArguments,EvalPrediction,
    RobertaForSequenceClassification,        
    RobertaModel,
    RobertaConfig,
    FlaxRobertaForSequenceClassification,
    )
import pandas  as pd
import torch

In [None]:

# Tokenizer and sequence classifier are both loaded from the same
# `../codebert-base` path; the classifier is configured for two labels.
tokenizer = AutoTokenizer.from_pretrained("../codebert-base")
model = RobertaForSequenceClassification.from_pretrained(
    "../codebert-base",
    problem_type="single_label_classification",
    num_labels=2,
)

# Disable gradients for every parameter under `model.roberta`; parameters
# that live outside that submodule keep their current `requires_grad` flag.
model.roberta.requires_grad_(False)

In [None]:

# Load `./MSR_data_cleaned.csv`; later cells read its `func_before` and `vul` columns.
df = pd.read_csv("./MSR_data_cleaned.csv")


In [None]:
from sklearn.model_selection import train_test_split
def process_data(df, max_len=None, random_state=None):
    """Build a shuffled, class-balanced sample of functions.

    Rows are kept when the character length of ``func_before`` lies strictly
    between 50 and ``max_len``. Every remaining row with ``vul == 1`` is kept
    and an equally sized random subset of the ``vul == 0`` rows is drawn.

    Args:
        df: DataFrame with a ``func_before`` text column and a ``vul`` column
            holding 0/1. The frame is not modified.
        max_len: Exclusive upper bound on the character length. When ``None``
            the notebook-level ``tokenizer.model_max_length`` is used.
        random_state: Optional seed forwarded to both ``DataFrame.sample``
            calls. ``None`` keeps the draw unseeded.

    Returns:
        DataFrame with the columns ``func_before`` and ``vul``, rows shuffled,
        and a fresh ``0..n-1`` index.

    Raises:
        ValueError: Raised by pandas when there are fewer ``vul == 0`` rows
            than ``vul == 1`` rows after the length filter.
    """
    if max_len is None:
        max_len = tokenizer.model_max_length

    # NOTE(review): the bound is compared against a character count, not a
    # token count -- confirm that this is the intended filter.
    func_len = df["func_before"].str.len()
    df_short = df.loc[(func_len > 50) & (func_len < max_len)]

    df_vul = df_short[df_short["vul"] == 1]
    df_novul = df_short[df_short["vul"] == 0].sample(len(df_vul), random_state=random_state)

    df_sample = pd.concat([df_vul, df_novul])[["func_before", "vul"]]
    df_sample = df_sample.sample(frac=1, random_state=random_state)
    # reset_index returns a new frame, so its result has to be the return value.
    return df_sample.reset_index(drop=True)

# Build the sample that the following cells tokenize and split.
df_sample = process_data(df)

In [None]:
import tqdm
def tokenize(df, tokenizer, batch_size=64):
    """Tokenize ``df.func_before`` in batches and attach the encodings.

    Args:
        df: DataFrame with a ``func_before`` column. It is not modified.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding="max_length", truncation=True,
            return_tensors="pt")`` that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` entries, one row per text.
        batch_size: Number of texts passed to ``tokenizer`` per call.

    Returns:
        A copy of ``df`` with two extra columns, ``input_ids`` and
        ``attention_mask``, holding one encoding per row.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialise the column once so it is not rebuilt for every batch.
    funcs = df.func_before.to_list()
    input_ids = []
    masks = []
    for start in tqdm.tqdm(range(0, len(funcs), batch_size)):
        # List slicing clamps at the end, so the final batch may be shorter.
        res = tokenizer(
            funcs[start:start + batch_size],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        input_ids.extend(res["input_ids"])
        masks.extend(res["attention_mask"])

    df_tokenized = df.copy()
    df_tokenized["input_ids"] = input_ids
    df_tokenized["attention_mask"] = masks
    return df_tokenized

def split_data(df, random_state=None):
    """Split ``df`` into train / validation / test frames (80 / 10 / 10).

    Both splits are stratified on the ``vul`` column of the frame being
    split, so the function works for any frame passed in rather than only
    for the notebook-level ``df_sample``.

    Args:
        df: DataFrame with a ``vul`` column.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls. ``None`` keeps the split unseeded.

    Returns:
        Tuple ``(df_train, df_val, df_test)``.
    """
    df_train, df_other = train_test_split(
        df, test_size=0.2, stratify=df["vul"], random_state=random_state
    )
    df_val, df_test = train_test_split(
        df_other, test_size=0.5, stratify=df_other["vul"], random_state=random_state
    )
    return df_train, df_val, df_test


In [None]:
# Attach token columns to the sample (rebinding `df_sample` to the tokenized
# copy), then split it into train / validation / test frames.
df_sample = tokenize(df_sample,tokenizer)
df_train,df_val,df_test = split_data(df_sample)

In [None]:
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
class DetectDataset(torch.utils.data.Dataset):
    """Map-style dataset over a tokenized DataFrame.

    Reads the ``input_ids``, ``attention_mask`` and ``vul`` columns of the
    frame it is given and serves one dict per row.
    """

    def __init__(self, df):
        """Copy the three columns out of ``df`` into plain Python lists."""
        ids_column = df.input_ids
        mask_column = df.attention_mask
        label_column = df.vul
        self.funcs = ids_column.to_list()
        self.masks = mask_column.to_list()
        self.labels = label_column.to_list()

    def __len__(self):
        """Return the number of samples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, i):
        """Return sample ``i``; the label is wrapped in a ``torch.long`` tensor."""
        label = torch.tensor(self.labels[i], dtype=torch.long)
        return dict(
            input_ids=self.funcs[i],
            labels=label,
            attention_mask=self.masks[i],
        )

    def len(self):
        """Alias for ``len(self)``."""
        return len(self)

    def getitem(self, i):
        """Alias for ``self[i]``."""
        return self[i]
        

def my_compute_metrics(pred:EvalPrediction):
    """Compute accuracy, F-score, precision and recall for one evaluation.

    The predicted class is the arg-max over the last axis of
    ``pred.predictions``; it is compared against ``pred.label_ids`` using
    binary averaging, with undefined ratios reported as 0.0.

    Returns:
        Dict with the keys ``accuracy``, ``f-score``, ``precision`` and
        ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    prf = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0.0)
    precision, recall, f1 = prf[0], prf[1], prf[2]

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f-score'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

In [None]:
# test
from torch.utils.data.dataloader import DataLoader

def test(model_test, data):
    """Score ``model_test`` on a tokenized frame and print the metrics.

    Args:
        model_test: Model that is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and whose output exposes
            ``"logits"``. It is moved to CUDA when available, otherwise CPU.
        data: DataFrame accepted by ``DetectDataset`` (columns ``input_ids``,
            ``attention_mask`` and ``vul``).

    Returns:
        None. Accuracy, F1, precision and recall (binary averaging) are
        printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loader = DataLoader(dataset=DetectDataset(df=data), batch_size=128)
    model_test = model_test.to(device)

    # Scoring must run in eval mode: a model that has just been trained may
    # still be in training mode, which keeps dropout active and makes the
    # reported numbers noisy. The previous mode is restored afterwards.
    was_training = model_test.training
    model_test.eval()

    labels = []
    preds = []
    try:
        with torch.no_grad():
            # Iterating the loader directly lets tqdm show the batch count.
            for batch in tqdm.tqdm(loader):
                labels.extend(batch.pop("labels").tolist())
                attention_mask = batch["attention_mask"].to(device)
                # NOTE(review): ids at masked positions are zeroed before the
                # forward pass; training feeds the ids unchanged -- confirm
                # whether this multiplication is still wanted.
                input_ids = torch.mul(batch["input_ids"], batch["attention_mask"]).to(device)
                outputs = model_test(input_ids=input_ids, attention_mask=attention_mask)
                preds.extend(outputs["logits"].cpu().argmax(-1).tolist())
    finally:
        model_test.train(was_training)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0.0)
    acc = accuracy_score(labels, preds)

    print(f"test reulsts: acc {acc}, f1 {f1},  precision {precision}, recall {recall} ")

In [None]:
training_args = TrainingArguments(
    # Where run artefacts and logs are written.
    output_dir="./output",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=30,
    learning_rate=1e-3,
    weight_decay=0.001,
    warmup_steps=500,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=128,
    per_device_eval_batch_size=256,
    # Log and evaluate every 10 steps.
    logging_steps=10,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=10,
    # NOTE(review): a fractional save_steps is presumably read as a share of
    # the total training steps -- confirm against the installed transformers.
    save_strategy="steps",
    save_steps=0.05,
)

# Train on the train split and evaluate on the validation split, reporting
# the metrics produced by `my_compute_metrics`.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=my_compute_metrics,
    train_dataset=DetectDataset(df_train),
    eval_dataset=DetectDataset(df_val),
)
trainer.train()

In [None]:
# Score the trained model on the test split and print the metrics.
test(model,df_test)

In [None]:
def save_data(df_train, df_val, df_test, path="./checkpoint/msr.csv"):
    """Write the three splits to one CSV tagged with a ``split`` column.

    The token columns (``input_ids`` / ``attention_mask``) are dropped when
    present, each frame is labelled ``train`` / ``val`` / ``test``, and the
    original row index is kept as an ``index`` column.

    Args:
        df_train: Training split.
        df_val: Validation split.
        df_test: Test split.
        path: Destination CSV. The default is the location the evaluation
            cells of this notebook read the split from; missing parent
            directories are created.

    Returns:
        None.
    """
    import os

    token_cols = ["input_ids", "attention_mask"]
    named_splits = (("train", df_train), ("val", df_val), ("test", df_test))
    parts = [
        frame.drop(columns=token_cols, errors="ignore").assign(split=name)
        for name, frame in named_splits
    ]

    # reset_index() without drop=True keeps the source row index as a column.
    df_save = pd.concat(parts).reset_index()

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df_save.to_csv(path, index=False)
    
# Write the three splits to CSV; `save_data` drops the token columns first.
save_data(df_train, df_val,df_test )


In [None]:
# Save the trained model and the tokenizer side by side in ./checkpoint.
trainer.save_model("./checkpoint")
tokenizer.save_pretrained("./checkpoint")
# Re-load that checkpoint as a Flax model (converting from the PyTorch
# weights via from_pt=True) and save it into the same directory.
flax_model = FlaxRobertaForSequenceClassification.from_pretrained('./checkpoint', from_pt=True)
# NOTE(review): confirm that the installed transformers version actually uses
# the `use_msgpack` keyword on the Flax save_pretrained.
flax_model.save_pretrained('./checkpoint', use_msgpack=True)

In [None]:
# Round trip: reload tokenizer and classifier from ./checkpoint, rebuild the
# test split from the saved CSV, tokenize it again and score it.
tokenizer = AutoTokenizer.from_pretrained("./checkpoint")
model = RobertaForSequenceClassification.from_pretrained("./checkpoint")

df_msr = pd.read_csv("./checkpoint/msr.csv")
is_test_row = df_msr["split"].eq("test")
df_1 = tokenize(df_msr.loc[is_test_row], tokenizer)

test(model, df_1)

In [None]:
# Character length of each test function, followed by its histogram.
df_1["len"] = df_1["func_before"].map(len)
df_1["len"].hist(bins=50)

In [None]:
# Score only the test functions shorter than 400 characters.
is_short = df_1["len"] < 400
df_2 = df_1.loc[is_short]
test(model, df_2)

In [None]:
# Load a Flax classifier from ./checkpoint1.
# NOTE(review): earlier cells save to ./checkpoint (no trailing "1") --
# confirm this is a separate directory and not a typo.
flax_model1 = FlaxRobertaForSequenceClassification.from_pretrained('./checkpoint1')

In [None]:
# Re-read the saved split file and keep only the test rows. This rebinds
# `df_1` to rows straight from the CSV, i.e. without the token columns that
# `tokenize` adds.
df_msr = pd.read_csv("./checkpoint/msr.csv")
df_1 = df_msr[df_msr["split"]=="test"]

In [None]:
# Work on a copy of the sample and record each function's character length.
df_tt = df_sample.copy()
df_tt["len"] = df_tt["func_before"].map(len)

In [None]:

# Number of rows in the copied sample.
len(df_tt)