## Step-4 - Data Labeling

In [1]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

drive_path = "/content/drive/MyDrive/Dissertation/"

Mounted at /content/drive


In [2]:
!pip install transformers



In [3]:
import pandas as pd
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import classification_report

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print("Using GPU")
else:
    print("Using CPU")


Using GPU


In [4]:
fine_tuned_model_path = drive_path + "fine_tuned_sileod_deberta-v3-large-tasksource-nli/"

### Testing the fine-tuned model on test dataset

In [5]:
test_df = pd.read_csv(drive_path + "test.csv")


# Create the pipeline and move to GPU if available
classifier = pipeline("zero-shot-classification",
                      model= fine_tuned_model_path,
                      device=0 if device == "cuda" else -1)  # device=0 means use the first GPU


# Define the candidate labels
candidate_labels = ["Authentication", "Accessibility", "Consistency", "Chat", "Contacts", "Aggregation"]

def classify_user_stories_batch(texts):
    outputs = classifier(texts, candidate_labels, multi_label=False)
    labels = []
    scores = []

    for output in outputs:
        label_scores = output['scores']
        label_names = output['labels']

        # sort the scores in descending order and get the indices
        sorted_indices = sorted(range(len(label_scores)), key=lambda k: -label_scores[k])

        # get the labels for the highest score
        labels.append(label_names[sorted_indices[0]])
        scores.append(label_scores[sorted_indices[0]])

    return labels, scores

# Batch size adjusted based on memory availability
BATCH_SIZE = 256
labels_list = []
scores_list = []

for i in range(0, len(test_df), BATCH_SIZE):
    batch_texts = test_df['text'].iloc[i:i+BATCH_SIZE].tolist()
    batch_labels, batch_scores = classify_user_stories_batch(batch_texts)
    labels_list.extend(batch_labels)
    scores_list.extend(batch_scores)

test_df['zero_shot_label'] = labels_list
test_df['zero_shot_score'] = scores_list

report = classification_report(test_df['label'], test_df['zero_shot_label'])
print(report)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


                precision    recall  f1-score   support

 Accessibility       0.77      0.91      0.83        11
   Aggregation       0.92      1.00      0.96        11
Authentication       0.90      0.90      0.90        10
          Chat       1.00      0.83      0.91        18
   Consistency       1.00      1.00      1.00        11
      Contacts       1.00      1.00      1.00        16

      accuracy                           0.94        77
     macro avg       0.93      0.94      0.93        77
  weighted avg       0.94      0.94      0.94        77



## Applying Saved Fine-Tuned Model on Unlabelled Dataset

In [6]:
unlabeled_df = pd.read_csv(drive_path + "unlabeled.csv")


# Create the pipeline and move to GPU if available
classifier = pipeline("zero-shot-classification",
                      model= fine_tuned_model_path,
                      device=0 if device == "cuda" else -1)  # device=0 means use the first GPU


# Define the candidate labels
candidate_labels = ["Authentication", "Accessibility", "Consistency", "Chat", "Contacts", "Aggregation"]

def classify_user_stories_batch(texts):
    outputs = classifier(texts, candidate_labels, multi_label=False)
    labels = []
    scores = []

    for output in outputs:
        label_scores = output['scores']
        label_names = output['labels']

        # sort the scores in descending order and get the indices
        sorted_indices = sorted(range(len(label_scores)), key=lambda k: -label_scores[k])

        # get the labels for the highest score
        labels.append(label_names[sorted_indices[0]])
        scores.append(label_scores[sorted_indices[0]])

    return labels, scores

# Batch size can be adjusted based on memory availability
BATCH_SIZE = 256
labels_list = []
scores_list = []

for i in range(0, len(unlabeled_df), BATCH_SIZE):
    batch_texts = unlabeled_df['text'].iloc[i:i+BATCH_SIZE].tolist()
    batch_labels, batch_scores = classify_user_stories_batch(batch_texts)
    labels_list.extend(batch_labels)
    scores_list.extend(batch_scores)

unlabeled_df['label'] = labels_list
#unlabeled_df['zero_shot_score'] = scores_list

unlabeled_df.to_csv(drive_path + "sileod_labelled.csv", index=False)

unlabeled_df['label'].value_counts()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Accessibility     701
Consistency       537
Aggregation       393
Authentication    221
Contacts          133
Chat               25
Name: label, dtype: int64

## Concatenating test and the labelled dataset by the model

In [7]:
test_df = pd.read_csv(drive_path + "test.csv")

user_story_df = pd.concat([test_df, unlabeled_df], axis=0)
user_story_df

Unnamed: 0,text,label,text_length
0,As an app developer aiming for low-resource en...,Chat,103
1,"As a community member, I want to create a Pod/...",Chat,105
2,"As a existing Solid user, I would like to use ...",Authentication,174
3,"As a Solid Identity Provider, I would like it ...",Authentication,142
4,"As a Solid Identity Provider, I would like to ...",Authentication,126
...,...,...,...
2005,"As a UMD employee, I want to be able to access...",Accessibility,98
2006,"As a UMD employee, I want the system to start ...",Accessibility,118
2007,"As a UMD employee, I want a platform that can ...",Accessibility,138
2008,"As a UMD employee, I want the software to be a...",Accessibility,122


In [8]:
user_story_df.to_csv(drive_path + "labelled_user_stories.csv", index=False)

In [9]:
user_story_df['label'].value_counts()

Accessibility     712
Consistency       548
Aggregation       404
Authentication    231
Contacts          149
Chat               43
Name: label, dtype: int64