## Loading Dependencies

In [1]:
%%capture
!pip install datasets transformers[torch] scikit-learn numpy pandas

# Fine-tuning a model on a code classification task

In [5]:
task = "sst2"
model_checkpoint = "neulab/codebert-python"
batch_size = 16

## Loading the dataset

In [6]:
from datasets import load_dataset, load_metric, Dataset

In [11]:
dataset = load_dataset('csv', data_files="code_samples.csv", column_names=['code','label'])
metric = load_metric('accuracy')

In [13]:
import pandas as pd
df = pd.read_csv("code_samples.csv")

In [14]:
df.isna().sum()

code     1
label    0
dtype: int64

In [15]:
df = df.dropna(how='any',axis=0)
df.isna().sum()

code     0
label    0
dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

df = df.sample(frac=1, random_state=42)

# Perform train-test split
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Save train and test data frames to separate CSV files
train_data.to_csv('train.csv', index=False)
test_data.to_csv('test.csv', index=False)

In [None]:
data_files = {"train": "train.csv", "test": "test.csv"}
dataset = load_dataset('csv', data_files=data_files)

In [19]:
dataset['train'][0]['code']

'# Requirements:\n#     pip install numpy\n#     sudo apt-get install python-openCV\n# Program:\n#     opens your webcam, and records.\n\nimport cv2\n\ncap = cv2.VideoCapture(0)\n\n# Obtain resolutions, convert resolutions from float to integer\nframes_width = int(cap.get(3))\nframes_height = int(cap.get(4))\n\n# Specify the video codec\n# FourCC is plateform dependent, however MJPG is a safe choice.\nfourcc = cv2.VideoWriter_fourcc(*"MJPG")\n\n# Create video writer object. Save file to recording.avi\nout = cv2.VideoWriter("recording.avi", fourcc, 20.0, (frames_width, frames_height))\n\nwhile True:\n    # Capture frame-by-frame\n    ret, frame = cap.read()\n\n    if ret == True:\n\n        # Write frame to recording.avi\n        out.write(frame)\n\n        # Our operations on the frame come here\n        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)\n\n        # Display the resulting frame\n        cv2.imshow("frame", gray)\n        if cv2.waitKey(1) & 0xFF == ord("q"):\n            b

In [20]:
metric = load_metric('accuracy')

## Preprocessing the data

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [22]:
tokenizer("""'''
Given a binary tree and a sum, determine if the tree has a root-to-leaf path such that adding up all the values along the path equals the given sum.

For example:
Given the below binary tree and sum = 22,
              5
             / \
            4   8
           /   / \
          11  13  4
         /  \      \
        7    2      1
return true, as there exist a root-to-leaf path 5->4->11->2 which sum is 22.
'''
# Definition for a  binary tree node
# class TreeNode:
#     def __init__(self, x):
#         self.val = x
#         self.left = None
#         self.right = None

class Solution:
    # @param root, a tree node
    # @param sum, an integer
    # @return a boolean
    def hasPathSum(self, root, sum):
        if not root:
            return False
        k = 0
        return self.hasPathSumHelper(root, sum, k)

    def hasPathSumHelper(self, root, sum, k):
        if not root:
            return False
        if not root.left and not root.right:
            if root.val + k == sum:
                return True
            else:
                return False
        return self.hasPathSumHelper(root.left, sum, k+ root.val) or self.hasPathSumHelper(root.right, sum, k+ root.val)""", truncation=True, padding=True)

{'input_ids': [0, 17809, 108, 50118, 18377, 10, 32771, 3907, 8, 10, 6797, 6, 3094, 114, 5, 3907, 34, 10, 9749, 12, 560, 12, 24999, 2718, 215, 14, 1271, 62, 70, 5, 3266, 552, 5, 2718, 27601, 5, 576, 6797, 4, 50118, 50118, 2709, 1246, 35, 50118, 18377, 5, 874, 32771, 3907, 8, 6797, 5457, 820, 6, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 195, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1589, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 204, 1437, 1437, 290, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1589, 1437, 1437, 1589, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 365, 1437, 508, 1437, 204, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1589, 1437, 44128, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 262, 1437, 1437, 1437, 132, 1437, 1437, 1437, 1437, 1437, 112, 50118, 30921, 1528, 6, 25, 89, 5152, 10, 97

In [23]:
def preprocess_function(examples):
    return tokenizer(examples['code'], truncation=True)

In [None]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

## Fine-tuning the model

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
%%capture
!pip install accelerate -U

In [27]:
metric_name = "accuracy"
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    model_name,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [28]:
import numpy as np
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if task != "stsb":
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

In [29]:
validation_key = "test"
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [30]:
%%time
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.357648,0.813278
2,No log,0.297813,0.860996
3,No log,0.279056,0.875519
4,No log,0.309761,0.879668
5,No log,0.303008,0.877593




CPU times: user 4min 32s, sys: 8.23 s, total: 4min 40s
Wall time: 2min 54s


TrainOutput(global_step=305, training_loss=0.28658617363601435, metrics={'train_runtime': 174.2394, 'train_samples_per_second': 55.326, 'train_steps_per_second': 1.75, 'total_flos': 2536390573670400.0, 'train_loss': 0.28658617363601435, 'epoch': 5.0})

In [31]:
%%time
trainer.evaluate()



CPU times: user 3.3 s, sys: 85.7 ms, total: 3.39 s
Wall time: 3.06 s


{'eval_loss': 0.3097607493400574,
 'eval_accuracy': 0.8796680497925311,
 'eval_runtime': 3.0553,
 'eval_samples_per_second': 157.759,
 'eval_steps_per_second': 5.237,
 'epoch': 5.0}

In [32]:
trainer.save_model("./model")

## Inference

In [12]:
code_file = """
import pandas as pd

f = pd.read_csv("path")
print(len(f.index))
"""

code_file2 = """
import os
import pandas as pd
import csv

def read_file(filename):
    content = ""
    try:
        with open(filename, 'r') as file:
            content = file.read()
    except UnicodeDecodeError:
                print(f"Error reading file: {filename}. Skipping...")
    
    return content

def read_directory(directory):
    file_contents = []
    for filename in os.listdir(directory):
        if filename.endswith('.py'):
            file_path = os.path.join(directory, filename)
            content = read_file(file_path)
            file_contents.append(content)
    return file_contents

def create_csv(file_contents, labels, output_file):
    # print(f'"{file_contents[0].strip()}"')
    df = pd.DataFrame({'Code': file_contents, 'Label': labels})
    df['Code'] = df['Code'].apply(lambda x: f'"{x.strip()}"')
    df.to_csv(output_file, index=False, quoting=csv.QUOTE_NONE, escapechar=" ")
    # df.to_csv(output_file, index=False)
    # # Read the CSV file
    df = pd.read_csv('code_samples.csv')

    # # Add quotation marks around the text in the "Code" column
    # df['Code'] = df['Code'].apply(lambda x: f'"{x.strip()}"')
    print(df['Code'][0])



low_quality_dir = 'LowQualityCodes'  # Directory path for low-quality code files
high_quality_dir = 'HighQualityCodes'  # Directory path for high-quality code files

low_quality_contents = read_directory(low_quality_dir)
high_quality_contents = read_directory(high_quality_dir)

low_quality_labels = ['0'] * len(low_quality_contents)
high_quality_labels = ['1'] * len(high_quality_contents)

file_contents = low_quality_contents + high_quality_contents
labels = low_quality_labels + high_quality_labels

output_file = 'code_samples.csv'  # Name of the output CSV file
csv_data = [(file_contents[i], labels[i]) for i in range(len(file_contents))]
# create_csv(file_contents, labels, output_file)
with open(output_file, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['code', 'label'])
    writer.writerows(csv_data)

print("CSV file created successfully.")

df = pd.read_csv('code_samples.csv')
print(df['code'][0])

"""

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('neulab/codebert-python', use_fast=True)
inputs = tokenizer(code_file2, return_tensors="pt", truncation=True, max_length=500)

In [14]:
from transformers import AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained("../model_6SP")
with torch.no_grad():
    logits = model(**inputs).logits

In [15]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'LABEL_0'

In [23]:
inputs = tokenizer(code_file2, return_tensors="pt")

In [24]:
with torch.no_grad():
    logits = model(**inputs).logits

In [25]:
predicted_class_id = logits.argmax().item()
# model.config.id2label[predicted_class_id]

In [26]:
predicted_class_id

0