In [1]:
import pandas as pd
import numpy as np
import os
import torch
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, pipeline
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics.pairwise import cosine_similarity

IS_RUNNING_ON_COLAB = False
drive_path = "/content/drive/MyDrive/Dissertation/"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw data
user_story_df = pd.read_csv('../artifacts/data/raw.csv')
user_story_df

text     0
label    0
dtype: int64

In [4]:
if not os.path.isdir('../artifacts/data_verification_model'):
    
    X = list(user_story_df['text'])
    y = list(user_story_df['label'])

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 18)

    # Initialize the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    # Tokenize the user stories
    #encoded_input = tokenizer(user_story_df['user_story'].tolist(), truncation=True, padding=True, max_length=512)

    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=512)
    test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=512)

    # Define a custom dataset class
    class MyDataset(Dataset):
        def __init__(self, input_ids, attention_mask, labels):
            self.input_ids = input_ids
            self.attention_mask = attention_mask
            self.labels = labels

        def __len__(self):
            return len(self.input_ids)

        def __getitem__(self, idx):
            return {
                'input_ids': torch.tensor(self.input_ids[idx]),
                'attention_mask': torch.tensor(self.attention_mask[idx]),
                'labels': torch.tensor(self.labels[idx])
            }

    # Initialize the model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

    # Prepare the training & Testing dataset
    #labels = user_story_df['class'].values

    train_dataset = MyDataset(train_encodings['input_ids'], train_encodings['attention_mask'], y_train)
    test_dataset = MyDataset(test_encodings['input_ids'], test_encodings['attention_mask'], y_test)


    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    # Initialize the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset = test_dataset,
    )

    # Train the model
    trainer.train()

    # Use the model to predict the labels
    predictions = trainer.predict(test_dataset).predictions
    predictions = np.argmax(predictions, axis=1)

    # Compare the predictions with the actual labels
    print(classification_report(y_test, predictions))

    if IS_RUNNING_ON_COLAB:
        trainer.save_model(drive_path + "data_verification_model")
    else:
        trainer.save_model("../artifacts/data_verification_model")

### Validating raw user story data

In [5]:
# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('../artifacts/data_verification_model')

# Create an empty list to store the predicted labels
predicted_labels = []

# Iterate over the rows of the dataframe
for index, row in user_story_df.iterrows():
    try:
        text = row['text']

        # Prepare the input data
        inputs = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors="pt")

        # Forward pass through the model
        with torch.no_grad():
            outputs = model(**inputs)

        # Get the predicted label
        predicted_label = torch.argmax(outputs.logits, dim=1).item()
        predicted_labels.append(predicted_label)
    except:
        predicted_labels.append(0)

# Add the predicted labels to the dataframe
user_story_df['is_valid'] = predicted_labels

In [6]:
user_story_df[user_story_df['is_valid'] == 0]

Unnamed: 0,text,label,is_valid
7,As a Solid user I want to know when I am a Sol...,Authentication,0
416,which data to perform data analysis,Unknown,0
435,"As UHOPE, I want to enable patients to cancel ...",Unknown,0
444,"As UHOPE, I want to bill the patient for the m...",Unknown,0
457,"As UHOPE, I want to have at least one back-up ...",Unknown,0
840,"As a Mike, I want to be able to see some metri...",Unknown,0
978,Auditing & Reporting.,Unknown,0
991,bidirectionally with the repository.,Unknown,0
1514,to support data migration integrity testing.,Unknown,0
2048,"As IFA, I want the system to be localizable so...",Unknown,0


In [7]:
user_story_df[user_story_df['is_valid'] == 0].to_csv('../artifacts/data/invalid_data.csv',index=False)

In [8]:
validated_user_story = user_story_df[user_story_df['is_valid'] == 1]
validated_user_story    

Unnamed: 0,text,label,is_valid
0,As an app developer aiming for low-resource en...,Chat,1
1,"As a community member, I want to create a Pod/...",Chat,1
2,As a governmental agency providing Pods for ci...,Authentication,1
3,"As a existing Solid user, I would like to use ...",Authentication,1
4,"As a Solid Identity Provider, I would like it ...",Authentication,1
...,...,...,...
2147,"As a UMD employee, I want to be able to access...",Unknown,1
2148,"As a UMD employee, I want the system to start ...",Unknown,1
2149,"As a UMD employee, I want a platform that can ...",Unknown,1
2150,"As a UMD employee, I want the software to be a...",Unknown,1


In [9]:
validated_user_story.drop('is_valid', axis=1, inplace=True)
validated_user_story.drop_duplicates(inplace=True)

validated_user_story.to_csv('../artifacts/data/validated.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validated_user_story.drop('is_valid', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validated_user_story.drop_duplicates(inplace=True)


In [10]:
# # Initialize the tokenizer and the model
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# # Tokenize the user stories
# encoded_input = tokenizer(user_story_df['user_story'].tolist(), truncation=True, padding=True, max_length=512)

# # Check if the user stories are valid or not
# predictions = model(encoded_input['input_ids'])
# user_story_df['validity'] = predictions.argmax(dim=-1).tolist()

# # Initialize the sentiment analysis pipeline
# sentiment_analysis = pipeline('sentiment-analysis')

# # Analyze the sentiment of the user stories
# user_story_df['sentiments'] = [result['label'] for result in sentiment_analysis(user_story_df['user_story'].tolist())]

# # Initialize the NER pipeline
# ner = pipeline('ner')

# # Identify the entities in the user stories
# user_story_df['entities'] = [ner(story) for story in user_story_df['user_story'].tolist()]

# # Extract features from the user stories
# features = model(encoded_input['input_ids'])

# # Calculate the cosine similarity between the features
# similarity = cosine_similarity(features)

# # Since similarity is a matrix, we'll just store the average similarity for each user story
# user_story_df['average_similarity'] = similarity.mean(axis=1)

# # Save the DataFrame to a new CSV file
# user_story_df.to_csv('analyzed_user_stories.csv', index=False)
