Transforming requirements into DataFrame

In [25]:
import arff
import pandas as pd

data = arff.load('PROMISE_exp.arff')

df = pd.DataFrame(data)

df.columns =['Project', 'Requirement', 'Class', 'Ambiguity', 'Ambiguity Type']

df.head()

Unnamed: 0,Project,Requirement,Class,Ambiguity,Ambiguity Type
0,1,The system shall refresh the display every 60 ...,PE,0,
1,1,The application shall match the color of the s...,LF,0,
2,1,If projected the data must be readable. On a ...,US,0,
3,1,The product shall be available during normal b...,A,1,VG
4,1,If projected the data must be understandable. ...,US,1,VG


Showing database balance

In [26]:
df['Ambiguity'].value_counts(normalize=True)

0    0.896447
1    0.103553
Name: Ambiguity, dtype: float64

Feature Extraction

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

In [28]:
count_vect = CountVectorizer(stop_words='english')
X_requirement_counts = stemmed_count_vect.fit_transform(df['Requirement'])
print(X_requirement_counts.shape)

(985, 1336)


TF-IDF (Term Frequency—Inverse Document Frequency)

In [29]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_requirement_tfidf = tfidf_transformer.fit_transform(X_requirement_counts)
X_requirement_tfidf.shape



(985, 1336)

Split Test and Train sets

In [30]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(X_requirement_tfidf, df['Ambiguity'], test_size=0.3, stratify=df['Ambiguity'])


Balancing traing sample

In [31]:
from imblearn.over_sampling import SMOTE

print(train_x.shape)
oversample = SMOTE(sampling_strategy=1)
train_x, train_y = oversample.fit_resample(train_x, train_y)
train_x.shape


(689, 1336)


(1236, 1336)

Run ML Models

In [32]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix

y_scores = []

clf_mnb = MultinomialNB().fit(train_x, train_y)
y_score_mnb = clf_mnb.predict(test_x)
y_scores.append(('Multinomial Naive Bayes', y_score_mnb, clf_mnb))

clf_knc = KNeighborsClassifier().fit(train_x, train_y)
y_score_knc = clf_knc.predict(test_x)
y_scores.append(('K-Nearest Neighbor', y_score_knc, clf_knc))

clf_svc = SVC().fit(train_x, train_y)
y_score_svc = clf_svc.predict(test_x)
y_scores.append(('Support Vector Machine', y_score_svc, clf_svc))

clf_lr = LogisticRegression(random_state=0).fit(train_x, train_y)
y_score_lr = clf_lr.predict(test_x)
y_scores.append(('Logistic Regression', y_score_lr, clf_lr))



Show Metrics

In [33]:
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, plot_confusion_matrix, classification_report

target_names = ["Unambiguous", "Ambiguous"]

for model, y_score, clf in y_scores: 
    print(model)
    print(classification_report(test_y, y_score, target_names=target_names))
    print()



Multinomial Naive Bayes
              precision    recall  f1-score   support

 Unambiguous       0.95      0.88      0.91       265
   Ambiguous       0.37      0.58      0.45        31

    accuracy                           0.85       296
   macro avg       0.66      0.73      0.68       296
weighted avg       0.89      0.85      0.87       296


K-Nearest Neighbor
              precision    recall  f1-score   support

 Unambiguous       1.00      0.45      0.62       265
   Ambiguous       0.18      1.00      0.30        31

    accuracy                           0.51       296
   macro avg       0.59      0.72      0.46       296
weighted avg       0.91      0.51      0.59       296


Support Vector Machine
              precision    recall  f1-score   support

 Unambiguous       0.91      1.00      0.95       265
   Ambiguous       0.83      0.16      0.27        31

    accuracy                           0.91       296
   macro avg       0.87      0.58      0.61       296
weight