In [3]:
import pandas as pd
import numpy as np
import re
import string
import statistics

from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer

from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import statistics

# Shared NLTK helpers, created once at module level and reused by the
# text-cleaning code in this notebook.
tokenizer = WordPunctTokenizer()
porter = PorterStemmer()

# Disable pandas' chained-assignment check for the whole session.
pd.set_option('mode.chained_assignment', None)

def vectorizeSeverityValue(value):
    """Encode a textual severity label as an integer class id.

    Returns:
        0 for 'Critical' or 'Blocker',
        1 for 'Minor' or 'Trivial',
        2 for 'Major',
        3 for any other value.
    """
    if value in ('Critical', 'Blocker'):
        return 0
    if value in ('Minor', 'Trivial'):
        return 1
    return 2 if value == 'Major' else 3

# Stop-word sets keyed by language; filled lazily on first use so the NLTK
# corpus is read once instead of once per word.
_STOPWORD_CACHE = {}


def _englishStopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def getTokenizedText(text):
    """Normalize a piece of text into a space-separated string of cleaned tokens.

    Steps, in order:
      1. Coerce ``text`` to ``str`` and split it with the module-level
         ``tokenizer``.
      2. Stem every token with the module-level ``porter`` stemmer.
      3. Delete every character listed in ``string.punctuation``.
      4. Drop whitespace-separated words whose lowercase form is in NLTK's
         English stop-word list.

    Args:
        text: Any value; it is converted with ``str()`` first, so non-string
            inputs (e.g. numbers) are accepted.

    Returns:
        str: The remaining words joined by single spaces (``''`` if none remain).
    """
    tokens = tokenizer.tokenize(str(text))
    stemmed = ' '.join(porter.stem(token) for token in tokens)

    # Text cleaning: strip punctuation characters in a single pass.
    without_punctuation = stemmed.translate(
        str.maketrans('', '', string.punctuation)
    )

    # NOTE(review): stop words are matched *after* stemming, so a stop word
    # whose stem differs from its listed form would not be removed here.
    # The order is kept as-is to preserve the existing features -- confirm
    # whether filtering before stemming was intended.
    stop_words = _englishStopwords()
    kept_words = [
        word for word in without_punctuation.split()
        if word.lower() not in stop_words
    ]
    return ' '.join(kept_words)

def _takeRows(data, positions):
    """Select the rows of ``data`` at the given integer positions."""
    # Cross-validation splitters yield *positional* indices. On pandas objects
    # plain ``[]`` is label-based, which only matches positions for a default
    # RangeIndex, so use ``.iloc`` whenever it is available.
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return data[positions]


def experiment(X, y, classifier, n_splits=10, random_state=None):
    """Evaluate ``classifier`` with shuffled stratified k-fold cross-validation.

    For every fold the classifier is fitted on the training part and used to
    predict the test part; the 'weighted avg' precision, recall and f1-score
    from ``classification_report`` are collected. The mean of each metric over
    all folds is printed.

    Args:
        X: Samples, indexable by integer positions (pandas Series/DataFrame or
            an array supporting integer-array indexing).
        y: Target labels aligned with ``X``, indexable the same way.
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted on every fold.
        n_splits: Number of folds (default 10, as before).
        random_state: Forwarded to ``StratifiedKFold`` to control the shuffle.
            ``None`` (the default) keeps the previous unseeded behaviour.

    Returns:
        None. Results are reported via ``print`` only.
    """
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    kfoldPrecision = []
    kfoldRecall = []
    kfoldF_Measures = []
    for train, test in kfold.split(X, y):
        X_train, X_test = _takeRows(X, train), _takeRows(X, test)
        y_train, y_test = _takeRows(y, train), _takeRows(y, test)

        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        weighted = classification_report(y_test, y_pred, output_dict=True)['weighted avg']
        kfoldPrecision.append(weighted['precision'])
        kfoldRecall.append(weighted['recall'])
        kfoldF_Measures.append(weighted['f1-score'])

    print('#### -- ####')
    print('Average Precision')
    print(statistics.mean(kfoldPrecision))
    print('Average Recall')
    print(statistics.mean(kfoldRecall))
    print('Average F1-score')
    print(statistics.mean(kfoldF_Measures))
    

In [4]:
# Seed applied to numpy's global RNG before every experiment. `experiment`
# shuffles its folds without an explicit random_state, so the shuffle draws
# from that global RNG; reseeding makes a re-run reproduce the numbers and
# lets every classifier be scored on the same folds.
SEED = 42

# Load the issue log and prepare it in one non-mutating chain:
#   1. drop rows missing any of the columns used below,
#   2. encode the priority label as an integer class,
#   3. keep only classes 0 (Critical/Blocker) and 1 (Minor/Trivial),
#   4. renumber the rows,
#   5. clean the two text columns.
logDataset = (
    pd.read_csv('data/five_projects_log.csv')
    .dropna(subset=['title', 'description', 'new_priority'])
    .assign(new_priority=lambda frame: frame['new_priority'].apply(vectorizeSeverityValue))
    .loc[lambda frame: ~frame['new_priority'].isin([2, 3])]
    .reset_index(drop=True)
    .assign(
        title=lambda frame: frame['title'].apply(getTokenizedText),
        description=lambda frame: frame['description'].apply(getTokenizedText),
    )
)

# One document per issue: cleaned title followed by cleaned description.
X = logDataset['title'] + ' ' + logDataset['description']
y = logDataset['new_priority']

SVMLinear = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(kernel='linear'))
])

svmSigmoid = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(kernel='sigmoid'))
])

logisticRegression = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

# Run the experiments in the original order, labelling each result block so
# the printed metrics can be attributed to a classifier.
for label, model in (
    ('SVM (linear kernel)', SVMLinear),
    ('SVM (sigmoid kernel)', svmSigmoid),
    ('Logistic Regression', logisticRegression),
):
    np.random.seed(SEED)
    print(label)
    experiment(X, y, model)


#### -- ####
Average Precision
0.698956172146779
Average Recall
0.7117263843648208
Average F1-score
0.6923003939577863
#### -- ####
Average Precision
0.6895827105046974
Average Recall
0.7039087947882736
Average F1-score
0.6826478735420942
#### -- ####
Average Precision
0.6979593181725446
Average Recall
0.7087947882736156
Average F1-score
0.6786119478041561
