**Goal:** Clean the datasets, particularly the Java dataset, whose comments contain comment-styling characters and URLs.

*Disclaimer: This new version of the dataset is not used during experimentation.*

In [1]:
import re

from datasets import load_dataset

### Load the data:

In [2]:
# Load every split of the `NLBSE/nlbse25-code-comment-classification` dataset
# through `datasets.load_dataset`; the result is indexed by split name below.
dataset = load_dataset("NLBSE/nlbse25-code-comment-classification")

In [3]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

In [4]:
def clean_comment_line(s):
    """Strip Java comment markers from a single line and mask URLs.

    Args:
        s: One line of a Java comment (not expected to contain newlines).

    Returns:
        The line with surrounding whitespace removed, without an opening
        ``/*`` / ``/**``, a closing ``*/``, a leading ``*`` or a leading
        ``//`` marker, and with every http(s) URL replaced by ``[URL]``.
    """
    # Remove leading and trailing whitespaces
    s = s.strip()
    # Remove opening '/*' or '/**' together with the spaces right after it
    s = re.sub(r'^/\*+ *', "", s)
    # Remove closing '*/' (or '**/') *and* the whitespace in front of it, so
    # that "/** text */" yields "text" rather than "text " with a dangling
    # space (or a stray '*' for a doubled closing marker).
    s = re.sub(r'\s*\*+/$', "", s)
    # Remove one leading '*' (continuation marker) and at most one space after it
    s = re.sub(r'^ *\* ?', "", s)
    # Remove leading '//' or '// '
    s = re.sub(r'^// ?', "", s)
    # Mask URLs: scheme, host characters (incl. %-escapes), then an optional
    # path that stops at whitespace, quotes or parentheses.
    s = re.sub(r'https?:\s*//(?:[-\w.]|(?:%[\da-fA-F]{2}))+(/[^\s"()]*)?', "[URL]", s)
    return s

def clean_comment(s):
    """Clean a (possibly multi-line) comment line by line.

    The input is stripped, split on ``"\\n"``, each piece is passed through
    `clean_comment_line`, and the results are re-joined with ``"\\n"``.

    Args:
        s: The raw comment text.

    Returns:
        The cleaned comment with the same number of lines as the stripped input.
    """
    return "\n".join(
        clean_comment_line(raw_line) for raw_line in s.strip().split("\n")
    )

def prepare_java_comment(x):
    """Rebuild the ``combo`` field of one record from its cleaned comment.

    Args:
        x: A record with string fields ``comment_sentence`` and ``class``.

    Returns:
        The same record, with ``combo`` overwritten by
        ``"<cleaned comment> | <class>"``.
    """
    cleaned_sentence = clean_comment(x["comment_sentence"])
    x["combo"] = " | ".join((cleaned_sentence, x["class"]))
    return x

In [5]:
# Recompute `combo` for both Java splits from the cleaned comment text.
ds_java_train, ds_java_test = (
    dataset[split_name].map(prepare_java_comment)
    for split_name in ("java_train", "java_test")
)

Map:   0%|          | 0/7614 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [6]:
def clean_python_comment(s):
    """Strip the leading ``#`` marker from a Python comment.

    Deliberately *not* named ``clean_comment``: that name belongs to the Java
    cleaner defined earlier in this notebook, and redefining it here would
    make re-running the Java cells after this one silently use the wrong
    cleaner.

    Args:
        s: The raw comment text.

    Returns:
        The text without surrounding whitespace and without one leading
        ``#`` (plus at most one space following it).
    """
    # Remove leading and trailing whitespaces
    s = s.strip()
    # Remove a single leading '#' and at most one space after it
    return re.sub(r'^ *# ?', '', s)


def prepare_python_comment(x):
    """Rebuild the ``combo`` field of one record from its cleaned comment.

    Args:
        x: A record with string fields ``comment_sentence`` and ``class``.

    Returns:
        The same record, with ``combo`` overwritten by
        ``"<cleaned comment> | <class>"``.
    """
    x["combo"] = clean_python_comment(x["comment_sentence"]) + " | " + x["class"]
    return x

In [None]:
# Recompute `combo` for both Python splits from the cleaned comment text.
ds_python_train, ds_python_test = (
    dataset[split_name].map(prepare_python_comment)
    for split_name in ("python_train", "python_test")
)

In [8]:
# Swap the cleaned Java and Python splits into the loaded dataset.
# The Pharo splits are not reassigned here and stay as loaded.
dataset["java_train"] = ds_java_train
dataset["java_test"] = ds_java_test

dataset["python_train"] = ds_python_train
dataset["python_test"] = ds_python_test

In [10]:
# Write all splits to disk; the path is relative to the notebook's working
# directory.
dataset.save_to_disk("../data/nlbse25_v2")

Saving the dataset (0/1 shards):   0%|          | 0/7614 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1725 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1884 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/406 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/289 [00:00<?, ? examples/s]