In [1]:
! pip install -U accelerate
! pip install -U transformers
!pip install datasets
!pip install evaluate
!pip install wget
!pip install torch
!pip install tensorflow
!pip install pandas
!pip install scikit-learn
!pip install numpy




In [2]:
import pandas as pd

# Get test data
df_reveal_test = pd.read_csv("/content/drive/MyDrive/rv_vd/test_func.csv", sep="\t", names=['code','label'])
df_d2a_test = pd.read_csv("/content/drive/MyDrive/d2a_vd/test_func.csv", sep="\t", names=['code','label'])
df_cxg_test = pd.read_csv("/content/drive/MyDrive/cxg_vd/test_func.csv", sep="\t", names=['code','label'])

# Get train data
df_reveal_train = pd.read_csv("/content/drive/MyDrive/rv_vd/train_func.csv", sep="\t", names=['code','label'])
df_d2a_train = pd.read_csv("/content/drive/MyDrive/d2a_vd/train_func.csv", sep="\t", names=['code','label'])
df_cxg_train = pd.read_csv("/content/drive/MyDrive/cxg_vd/train_func.csv", sep="\t", names=['code','label'])

# Get validation data
df_d2a_valid = pd.read_csv("/content/drive/MyDrive/d2a_vd/valid_func.csv", sep="\t", names=['code','label'])
df_cxg_valid = pd.read_csv("/content/drive/MyDrive/cxg_vd/valid_func.csv", sep="\t", names=['code','label'])

In [3]:
print(df_reveal_train.shape)
print(df_d2a_train.shape)
print(df_cxg_train.shape)

# print the number of unique labels
print(df_reveal_train['label'].nunique())
print(df_reveal_test['label'].nunique())
print(df_reveal_train['label'].unique())



(15868, 2)
(4644, 2)
(21855, 2)
3
3
['label' '0' '1']


In [4]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [31]:
# Import necessary libraries
# Get REVEAL metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
import torch

# Custom PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the encoder to your labels and transform them
# Assuming '0' and '1' are the class labels
train_labels = label_encoder.fit_transform(df_reveal_train['label'])
test_labels = label_encoder.transform(df_reveal_test['label'])

# Initialize the tokenizer and model for sequence classification
model_name = "microsoft/graphcodebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Define batch size
batch_size = 32

# Create custom datasets
train_dataset = CustomDataset(df_reveal_train['code'], train_labels, tokenizer, max_length=64)
test_dataset = CustomDataset(df_reveal_test['code'], test_labels, tokenizer, max_length=64)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=batch_size,
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_total_limit=2,
    save_steps=500,
    eval_steps=500,
    save_strategy="steps",
)

# Define a Trainer to facilitate training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start training
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_graphcodebert")

# Forward pass through the fine-tuned model to get predictions on the test dataset
predictions = trainer.predict(test_dataset)

# Calculate evaluation metrics on the entire test dataset
accuracy = accuracy_score(test_labels, predictions.predictions.argmax(-1))
f1 = f1_score(test_labels, predictions.predictions.argmax(-1), average='weighted')
recall = recall_score(test_labels, predictions.predictions.argmax(-1), average='weighted')
precision = precision_score(test_labels, predictions.predictions.argmax(-1), average='weighted')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Percision: {precision}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
500,0.3077,0.277934
1000,0.2697,0.267058


Accuracy: 0.9021164021164021
F1 Score: 0.8879176812979794
Recall: 0.9021164021164021
Percision: 0.881192055498702


  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# GET D2A Metrics
# Import necessary libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
import torch

# Custom PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Assuming you have separate DataFrames for training, validation, and test datasets
train_labels = label_encoder.fit_transform(df_d2a_train['label'])
val_labels = label_encoder.transform(df_d2a_valid['label'])
test_labels = label_encoder.transform(df_d2a_test['label'])

model_name = "microsoft/graphcodebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

batch_size = 32

train_dataset = CustomDataset(df_d2a_train['code'], train_labels, tokenizer, max_length=64)
val_dataset = CustomDataset(df_d2a_valid['code'], val_labels, tokenizer, max_length=64)
test_dataset = CustomDataset(df_d2a_test['code'], test_labels, tokenizer, max_length=64)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=batch_size,
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_total_limit=2,
    save_steps=500,
    eval_steps=500,
    save_strategy="steps",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()
trainer.save_model("./fine_tuned_graphcodebert")

predictions = trainer.predict(test_dataset)

accuracy = accuracy_score(test_labels, predictions.predictions.argmax(-1))
f1 = f1_score(test_labels, predictions.predictions.argmax(-1), average='weighted')
recall = recall_score(test_labels, predictions.predictions.argmax(-1), average='weighted')
precision = precision_score(test_labels, predictions.predictions.argmax(-1), average='weighted')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Accuracy: 0.5799676898222941
F1 Score: 0.5793835429994733
Recall: 0.5799676898222941
Precision: 0.5812627006811175


  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# GET CGX_DV Metrics
# Import necessary libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
import torch

# Custom PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Assuming you have separate DataFrames for training, validation, and test datasets
train_labels = label_encoder.fit_transform(df_cxg_train['label'])
val_labels = label_encoder.transform(df_cxg_valid['label'])
test_labels = label_encoder.transform(df_cxg_test['label'])

model_name = "microsoft/graphcodebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

batch_size = 32

train_dataset = CustomDataset(df_cxg_train['code'], train_labels, tokenizer, max_length=64)
val_dataset = CustomDataset(df_cxg_valid['code'], val_labels, tokenizer, max_length=64)
test_dataset = CustomDataset(df_cxg_test['code'], test_labels, tokenizer, max_length=64)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=batch_size,
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_total_limit=2,
    save_steps=500,
    eval_steps=500,
    save_strategy="steps",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()
trainer.save_model("./fine_tuned_graphcodebert")

predictions = trainer.predict(test_dataset)

accuracy = accuracy_score(test_labels, predictions.predictions.argmax(-1))
f1 = f1_score(test_labels, predictions.predictions.argmax(-1), average='weighted')
recall = recall_score(test_labels, predictions.predictions.argmax(-1), average='weighted')
precision = precision_score(test_labels, predictions.predictions.argmax(-1), average='weighted')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
500,0.6987,0.672099
1000,0.6714,0.657139
1500,0.6458,0.646187
2000,0.615,0.649738


Accuracy: 0.5989754848152213
F1 Score: 0.5975278231388413
Recall: 0.5989754848152213
Precision: 0.5971458860592844


  _warn_prf(average, modifier, msg_start, len(result))
