### Use github data set to pretrain a classifier. SO that we can label the datasets

In [46]:
import numpy as np
import os
import pandas as pd

# Only to be used for solid_git dataset for 'Contacts' label
predefined_labels = ['Accessibility','Aggregation','Authentication','Consistency','Contacts','Chat']

### Source - 1

In [47]:
# Directory path containing the text files
directory_path = "../dataset/0"

# Initialize an empty list to store the extracted data
data = []

# Iterate over each file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory_path, filename)
        
        # Read the contents of the file
        with open(file_path, "r", encoding='utf-8', errors='ignore') as file:
            lines = file.readlines()
            
        # Remove leading/trailing whitespaces and append each line as a separate user story
        for line in lines:
            title = line.strip().lstrip("<ï»¿>").rstrip("<ï»¿>")
            
            # Append the data to the list
            #data.append({"user_story": title , "file_name": filename})
            data.append({"text": title})

# Create a dataframe from the extracted data
user_story_df= pd.DataFrame(data)

# Print the resulting dataframe
user_story_df

Unnamed: 0,text
0,"﻿As a Data user, I want to have the 12-19-2017..."
1,"As a UI designer, I want to redesign the Resou..."
2,"As a UI designer, I want to report to the Agen..."
3,"As a UI designer, I want to move on to round 2..."
4,"As a UI designer, I want to move on to round 2..."
...,...
2130,"As a UMD employee, I want to be able to access..."
2131,"As a UMD employee, I want the system to start ..."
2132,"As a UMD employee, I want a platform that can ..."
2133,"As a UMD employee, I want the software to be a..."


In [48]:
user_story_df.drop_duplicates(inplace=True)
user_story_df

Unnamed: 0,text
0,"﻿As a Data user, I want to have the 12-19-2017..."
1,"As a UI designer, I want to redesign the Resou..."
2,"As a UI designer, I want to report to the Agen..."
3,"As a UI designer, I want to move on to round 2..."
4,"As a UI designer, I want to move on to round 2..."
...,...
2130,"As a UMD employee, I want to be able to access..."
2131,"As a UMD employee, I want the system to start ..."
2132,"As a UMD employee, I want a platform that can ..."
2133,"As a UMD employee, I want the software to be a..."


In [49]:
user_story_df.isna().sum()

text    0
dtype: int64

In [50]:
user_story_df['label'] = 'Unknown'
user_story_df.to_csv('../artifacts/data/source_1.csv', index=False)

In [51]:
#user_story_df.drop('file_name', axis=1, inplace=True)

In [52]:
user_story_df.to_csv('../artifacts/data/raw.csv', index=False)

### Source-2

### Github Dataset

In [53]:
import pandas as pd
import os
from github import Github

def extract_issues_from_repo(repo_url):
    # Create a GitHub instance without an access token
    g = Github()

    # Get the repository from the URL
    repo = g.get_repo(repo_url)

    issues = []
    for issue in repo.get_issues():
        issues.append({
            'text': issue.title,
            'Body': issue.body,
            'State': issue.state,
            'Created At': issue.created_at,
            'Updated At': issue.updated_at,
            'Closed At': issue.closed_at,
            'Labels': [label.name for label in issue.labels],
            'Comments': issue.comments
        })

    # Create a Pandas DataFrame from the extracted issues
    df = pd.DataFrame(issues)
    return df

file_path = "../dataset/1/github_user_story.csv"

# check for GitHub user stories
if not os.path.exists(file_path):

    repo_url = 'solid/user-stories'
    git_dataset = extract_issues_from_repo(repo_url)

    git_labelled_dataset = git_dataset[git_dataset['Labels']
                                                .apply(lambda x: any(elem in x for elem in predefined_labels))]

    git_labelled_dataset['label'] = git_labelled_dataset['Labels'].apply(lambda x: next((elem for elem in x if elem in predefined_labels), None))

    git_labelled_dataset = git_labelled_dataset[['text','label']]

    git_labelled_dataset.drop_duplicates(inplace=True)

    git_labelled_dataset.to_csv(file_path, index=False)

else:
    print('GitHub labelled dataset found!')

GitHub labelled dataset found!


In [54]:
solid_git_df = pd.read_csv('../dataset/1/github_user_story.csv')
solid_git_df

Unnamed: 0,text,label
0,As an app developer aiming for low-resource en...,Chat
1,"As a community member, I want to create a Pod/...",Chat
2,As a governmental agency providing Pods for ci...,Authentication
3,"As a existing Solid user, I would like to use ...",Authentication
4,"As a Solid Identity Provider, I would like it ...",Authentication
...,...,...
74,"As a Developer, I want to ensure that our webs...",Accessibility
75,"As a Quality Assurance Engineer, I want to tes...",Accessibility
76,"As a Project Manager, I want to ensure that ac...",Accessibility
77,"As a Developer, I want to implement keyboard n...",Accessibility


In [55]:
solid_git_df['label'].value_counts()

label
Chat              18
Contacts          15
Authentication    13
Aggregation       11
Accessibility     11
Consistency       11
Name: count, dtype: int64

In [56]:
solid_git_df.to_csv('../artifacts/data/source_2.csv', index=False)

### Merging both the data sources

In [57]:
print(len(user_story_df))
print(len(solid_git_df))

merged_dataset = pd.concat([solid_git_df,user_story_df], ignore_index=True)

print(len(merged_dataset))

2073
79
2152


In [58]:
merged_dataset.drop_duplicates(inplace=True)
merged_dataset

Unnamed: 0,text,label
0,As an app developer aiming for low-resource en...,Chat
1,"As a community member, I want to create a Pod/...",Chat
2,As a governmental agency providing Pods for ci...,Authentication
3,"As a existing Solid user, I would like to use ...",Authentication
4,"As a Solid Identity Provider, I would like it ...",Authentication
...,...,...
2147,"As a UMD employee, I want to be able to access...",Unknown
2148,"As a UMD employee, I want the system to start ...",Unknown
2149,"As a UMD employee, I want a platform that can ...",Unknown
2150,"As a UMD employee, I want the software to be a...",Unknown


In [59]:
merged_dataset.to_csv('../artifacts/data/raw.csv', index=False)

### __TEXT CLASSIFICATION__

#### https://github.com/RDA-DMP-Common/user-stories

### IMP -> https://medium.com/analytics-vidhya/step-by-step-text-classification-using-different-models-and-compare-them-8a34204c34f8

#### https://link.springer.com/article/10.1007/s11704-022-8262-9

https://www.sciencedirect.com/science/article/pii/S0950584922000246
https://github.com/awkale/user-story-best-practice

In [60]:
# from transformers import BertForSequenceClassification
# from transformers import BertTokenizerFast

# # Initialize the tokenizer
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('../artifacts/data_verification_model.h5')

In [61]:
# import torch

# # Create an empty list to store the predicted labels
# predicted_labels = []

# # Iterate over the rows of the dataframe
# for index, row in git_dataset.iterrows():
#     text = row['Title']  # Assuming the column name is 'text'

#     # Prepare the input data
#     inputs = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors="pt")

#     # Forward pass through the model
#     with torch.no_grad():
#         outputs = model(**inputs)

#     # Get the predicted label
#     predicted_label = torch.argmax(outputs.logits, dim=1).item()
#     predicted_labels.append(predicted_label)

# # Add the predicted labels to the dataframe
# git_dataset['Is_Valid'] = predicted_labels

# git_dataset[git_dataset['Is_Valid'] == 0]

In [62]:
# git_dataset.iloc[28].Title

In [63]:
# git_dataset[git_dataset['Is_Valid'] == 1]['Title']

In [64]:
# git_dataset[git_dataset['Is_Valid'] == 1].to_csv('../artifacts/validated_git_data.csv', index=False)