In [23]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from torch.cuda.amp import autocast, GradScaler


In [2]:
TEST_SIZE = 0.2
DROP_OUT_P = 0.1
num_epochs = 100
checkpoint = "codesage/codesage-small"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, add_eos_token=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

Using device:  cuda


In [20]:
import torch.nn as nn

new_wte = nn.Embedding(49154, 2048)
nn.init.normal_(new_wte.weight, mean=0, std=0.02)
model.wte = new_wte

In [21]:
model

CodeSageModel(
  (wte): Embedding(49154, 2048)
  (wpe): Embedding(2048, 1024)
  (drop): Dropout(p=0.2, inplace=False)
  (h): ModuleList(
    (0-5): 6 x CodeSageBlock(
      (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (attn): CodeSageAttention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attention_dropout): Dropout(p=0.1, inplace=False)
        (residual_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (mlp): CodeSageMLP(
        (c_fc): Conv1D()
        (act): NewGELUActivation()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)

In [7]:
df = pd.read_csv("/home/naver/Individual/Extened/CodeSage/data.csv")
train_data, valid_data, train_labels, valid_labels = train_test_split(df['code'].values, df['label'].values, test_size=TEST_SIZE, random_state=42)

In [8]:
df.head(5)

Unnamed: 0,code,label
0,bool virtio_scsi_handle_cmd_req_prepare(VirtIO...,1
1,static void term_forward_char(void)\n\n{\n\n ...,0
2,static int vaapi_build_decoder_config(VAAPIDec...,0
3,void cpu_reset (CPUMIPSState *env)\n{\n mem...,1
4,static struct dirent *local_readdir(FsContext ...,0


In [25]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, add_eos_token=True)
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
config.hidden_size = 512
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

In [28]:
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
config.hidden_size = 512
model = AutoModel.from_config(config,trust_remote_code=True).to(device)

In [29]:
model = AutoModel.from_pretrained(checkpoint, config=config, trust_remote_code=True, ignore_mismatched_sizes=True).to(device)


Some weights of CodeSageModel were not initialized from the model checkpoint at codesage/codesage-small and are newly initialized because the shapes did not match:
- transformer.wte.weight: found shape torch.Size([49154, 1024]) in the checkpoint and torch.Size([49154, 512]) in the model instantiated
- transformer.wpe.weight: found shape torch.Size([2048, 1024]) in the checkpoint and torch.Size([2048, 512]) in the model instantiated
- transformer.h.0.ln_1.weight: found shape torch.Size([1024]) in the checkpoint and torch.Size([512]) in the model instantiated
- transformer.h.0.ln_1.bias: found shape torch.Size([1024]) in the checkpoint and torch.Size([512]) in the model instantiated
- transformer.h.0.attn.c_attn.weight: found shape torch.Size([1024, 3072]) in the checkpoint and torch.Size([512, 1536]) in the model instantiated
- transformer.h.0.attn.c_attn.bias: found shape torch.Size([3072]) in the checkpoint and torch.Size([1536]) in the model instantiated
- transformer.h.0.attn.c_proj

In [30]:
model

CodeSageModel(
  (wte): Embedding(49154, 512)
  (wpe): Embedding(2048, 512)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-5): 6 x CodeSageBlock(
      (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): CodeSageAttention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attention_dropout): Dropout(p=0.1, inplace=False)
        (residual_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (mlp): CodeSageMLP(
        (c_fc): Conv1D()
        (act): NewGELUActivation()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)

In [31]:
print("Training LSTM model...")

TEST_SIZE = 0.2
DROP_OUT_P = 0.1
num_epochs = 200
checkpoint = "codesage/codesage-small"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, add_eos_token=True)
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
config.hidden_size = 512  
model = AutoModel.from_pretrained(checkpoint, config=config, trust_remote_code=True, ignore_mismatched_sizes=True).to(device)

Training LSTM model...
Using device:  cuda


Some weights of CodeSageModel were not initialized from the model checkpoint at codesage/codesage-small and are newly initialized because the shapes did not match:
- transformer.wte.weight: found shape torch.Size([49154, 1024]) in the checkpoint and torch.Size([49154, 512]) in the model instantiated
- transformer.wpe.weight: found shape torch.Size([2048, 1024]) in the checkpoint and torch.Size([2048, 512]) in the model instantiated
- transformer.h.0.ln_1.weight: found shape torch.Size([1024]) in the checkpoint and torch.Size([512]) in the model instantiated
- transformer.h.0.ln_1.bias: found shape torch.Size([1024]) in the checkpoint and torch.Size([512]) in the model instantiated
- transformer.h.0.attn.c_attn.weight: found shape torch.Size([1024, 3072]) in the checkpoint and torch.Size([512, 1536]) in the model instantiated
- transformer.h.0.attn.c_attn.bias: found shape torch.Size([3072]) in the checkpoint and torch.Size([1536]) in the model instantiated
- transformer.h.0.attn.c_proj