In [3]:
experience_data = {
    "Usually testers don't check syncing with multiple accounts much" : "High Priority",
    "Usually testers don't test long time wait checks very much" : "Medium Priority",
    "Testers test obivous feature too much" : "low priority",
    "Testers often overlook accessibility testing": "High Priority",
    "Security testing is frequently underestimated by testers": "Medium Priority",
    "Exploratory testing is not given enough emphasis": "Medium Priority",
    "Cross-browser testing is sometimes neglected": "Low Priority",
    "Load testing is often skipped in the testing process": "High Priority",
    "Mobile app testing is not always thorough": "Medium Priority",
    "Integration testing is sometimes underestimated": "Low Priority",
    "Usability testing is occasionally overlooked": "Medium Priority",
    "Edge cases are not always covered in testing scenarios": "High Priority",
    "Automated testing is not implemented effectively in some projects": "High Priority"
}

In [4]:
test_cases = {
    "TID1": "Sync microsoft account",
    "TID2": "Sync Google account",
    "TID3": "Wait 30min and see if it hangs",
    "TID4": "Test pen tool. Pen tool is a obvious feature.",
    "TID5": "Check login functionality with invalid credentials",
    "TID6": "Verify password reset functionality",
    "TID7": "Test in different browsers (Chrome, Firefox, Safari)",
    "TID8": "Validate file upload functionality",
    "TID9": "Perform boundary testing on input fields",
    "TID10": "Verify the application's response to network interruptions",
    "TID11": "Check for proper handling of special characters in input",
    "TID12": "Test user authentication across multiple devices",
    "TID13": "Validate the application's behavior with low network bandwidth",
    "TID14": "Verify the functionality of the search feature",
    "TID15": "Test the application's compatibility with various screen resolutions",
}


In [5]:
import pandas as pd

experience_dataset = pd.DataFrame(list(experience_data.items()), columns=['Sentence', 'Priority'])
experience_dataset

Unnamed: 0,Sentence,Priority
0,Usually testers don't check syncing with multi...,High Priority
1,Usually testers don't test long time wait chec...,Medium Priority
2,Testers test obivous feature too much,low priority
3,Testers often overlook accessibility testing,High Priority
4,Security testing is frequently underestimated ...,Medium Priority
5,Exploratory testing is not given enough emphasis,Medium Priority
6,Cross-browser testing is sometimes neglected,Low Priority
7,Load testing is often skipped in the testing p...,High Priority
8,Mobile app testing is not always thorough,Medium Priority
9,Integration testing is sometimes underestimated,Low Priority


## Training Language Model

In [6]:
!pip install gensim nltk

Defaulting to user installation because normal site-packages is not writeable


In [7]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import pandas as pd


# Convert data to DataFrame
experience_dataset = pd.DataFrame(list(experience_data.items()), columns=['Sentence', 'Priority'])

# Tokenize sentences
tokenized_data = [word_tokenize(sentence.lower()) for sentence in experience_dataset['Sentence']]

# Create TaggedDocuments
tagged_data = [TaggedDocument(words=words, tags=[str(i)]) for i, words in enumerate(tokenized_data)]

# Train Doc2Vec model
model = Doc2Vec(vector_size=50, window=2, min_count=1, workers=4, epochs=100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Function to convert a sentence to a vector
def sentence_to_vector(sentence):
    tokenized_sentence = word_tokenize(sentence.lower())
    vector = model.infer_vector(tokenized_sentence)
    return vector

# Add vectors to the DataFrame
experience_dataset['Vector'] = experience_dataset['Sentence'].apply(sentence_to_vector)

# Display the DataFrame with vectors
print(experience_dataset)


                                             Sentence         Priority  \
0   Usually testers don't check syncing with multi...    High Priority   
1   Usually testers don't test long time wait chec...  Medium Priority   
2               Testers test obivous feature too much     low priority   
3        Testers often overlook accessibility testing    High Priority   
4   Security testing is frequently underestimated ...  Medium Priority   
5    Exploratory testing is not given enough emphasis  Medium Priority   
6        Cross-browser testing is sometimes neglected     Low Priority   
7   Load testing is often skipped in the testing p...    High Priority   
8           Mobile app testing is not always thorough  Medium Priority   
9     Integration testing is sometimes underestimated     Low Priority   
10       Usability testing is occasionally overlooked  Medium Priority   
11  Edge cases are not always covered in testing s...    High Priority   
12  Automated testing is not implement

In [8]:
model.save("test_experience_doc2vec_model")

In [10]:
doc_vectors = experience_dataset["Priority"]
labels = experience_dataset["Vector"]

## Classification

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, labels, test_size=0.2, random_state=42)

In [14]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def find_relevant_documents_knn(query_sentence, dataset, k=5):
    query_vector = sentence_to_vector(query_sentence)
    dataset_vectors = np.array(list(experience_dataset['Vector']))
    
    knn_model = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn_model.fit(dataset_vectors)
    
    _, indices = knn_model.kneighbors([query_vector])
    relevant_documents = dataset.iloc[indices[0]]['Sentence'].tolist()
    
    return relevant_documents

In [15]:
query_sentence = "Testers often overlook security testing"
relevant_documents = find_relevant_documents_knn(query_sentence, experience_dataset, k=3)

print("Query Sentence:")
print(query_sentence)
print("\nRelevant Documents:")
for i, doc in enumerate(relevant_documents):
    print(f"{i + 1}. {doc}")

Query Sentence:
Testers often overlook security testing

Relevant Documents:
1. Usually testers don't test long time wait checks very much
2. Usually testers don't check syncing with multiple accounts much
3. Load testing is often skipped in the testing process
