In [16]:
from datasets import Dataset, DatasetDict, load_dataset

langs = ['java', 'python', 'pharo']
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')
ds

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

In [17]:
# Accessing the train and test splits for each language
java_train = ds['java_train']
java_test = ds['java_test']

python_train = ds['python_train']
python_test = ds['python_test']

pharo_train = ds['pharo_train']
pharo_test = ds['pharo_test']

# Extracting features (X) and labels (y) for each split
java_X_train = java_train['comment_sentence']
java_y_train = java_train['labels']

java_X_test = java_test['comment_sentence']
java_y_test = java_test['labels']

python_X_train = python_train['comment_sentence']
python_y_train = python_train['labels']

python_X_test = python_test['comment_sentence']
python_y_test = python_test['labels']

pharo_X_train = pharo_train['comment_sentence']
pharo_y_train = pharo_train['labels']

pharo_X_test = pharo_test['comment_sentence']
pharo_y_test = pharo_test['labels']

In [13]:
print (pharo_y_test)

[[0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


# Preprocessing text data using TfidfVectorizer
vectorizer = TfidfVectorizer()

java_X_train_tfidf = vectorizer.fit_transform(java_X_train)
java_X_test_tfidf = vectorizer.transform(java_X_test)

python_X_train_tfidf = vectorizer.fit_transform(python_X_train)
python_X_test_tfidf = vectorizer.transform(python_X_test)

pharo_X_train_tfidf = vectorizer.fit_transform(pharo_X_train)
pharo_X_test_tfidf = vectorizer.transform(pharo_X_test)

In [None]:
# Using KNN Classifier
knn_java = KNeighborsClassifier(n_neighbors=3)
knn_java.fit(java_X_train_tfidf, java_y_train)
java_predictions = knn_java.predict(java_X_test_tfidf)
java_accuracy = accuracy_score(java_y_test, java_predictions)

knn_python = KNeighborsClassifier(n_neighbors=5)
knn_python.fit(python_X_train_tfidf, python_y_train)
python_predictions = knn_python.predict(python_X_test_tfidf)
python_accuracy = accuracy_score(python_y_test, python_predictions)

knn_pharo = KNeighborsClassifier(n_neighbors=5, p=1)
knn_pharo.fit(pharo_X_train_tfidf, pharo_y_train)
pharo_predictions = knn_pharo.predict(pharo_X_test_tfidf)
pharo_accuracy = accuracy_score(pharo_y_test, pharo_predictions)

# Reporting success rates
print(f"Java Accuracy: {java_accuracy:.4f}")
print(f"Python Accuracy: {python_accuracy:.4f}")
print(f"Pharo Accuracy: {pharo_accuracy:.4f}")

It's interesting to note that for the baselines provided by the event, the f1 score was worst for Pharo (python was pretty bad too), but for kNN the Pharo f1 score was the highest by far. The kNN accuracy was almost as high as the baseline (~.6)

In [18]:
# Preprocess text data using TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit on all training data across datasets
all_train_data = java_X_train + python_X_train + pharo_X_train
vectorizer.fit(all_train_data)

# Transform datasets
java_X_train_tfidf = vectorizer.transform(java_X_train)
java_X_test_tfidf = vectorizer.transform(java_X_test)
java_X_train_dense = java_X_train_tfidf.toarray()
java_X_test_dense = java_X_test_tfidf.toarray()


python_X_train_tfidf = vectorizer.transform(python_X_train)
python_X_test_tfidf = vectorizer.transform(python_X_test)
python_X_train_dense = python_X_train_tfidf.toarray()
python_X_test_dense = python_X_test_tfidf.toarray()


pharo_X_train_tfidf = vectorizer.transform(pharo_X_train)
pharo_X_test_tfidf = vectorizer.transform(pharo_X_test)
pharo_X_train_dense = pharo_X_train_tfidf.toarray()
pharo_X_test_dense = pharo_X_test_tfidf.toarray()