In [1]:
import pandas as pd
import numpy as np
import re
import string
import statistics

from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer

from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


from imblearn.over_sampling import RandomOverSampler

# Shared NLTK helpers, created once at import time and reused by
# getTokenizedText for every document.
tokenizer = WordPunctTokenizer()
porter = PorterStemmer()

# Turn off pandas' chained-assignment check: the functions below assign
# columns on frames obtained by boolean filtering.
pd.set_option('mode.chained_assignment', None)

def vectorizeSeverityValue(value):
    """Translate a severity label into its integer class code.

    Args:
        value: Severity label, compared case-sensitively.

    Returns:
        int: 0 for 'Critical' or 'Blocker', 1 for 'Minor' or 'Trivial',
        2 for 'Major', and 3 for any other value.
    """
    if value in ('Critical', 'Blocker'):
        return 0
    if value in ('Minor', 'Trivial'):
        return 1
    return 2 if value == 'Major' else 3

def getTokenizedText(text):
    """Normalise free text into a space-separated string of stemmed words.

    The value is coerced with ``str()``, split by the module-level
    ``tokenizer``, and each token is stemmed with the module-level ``porter``
    stemmer. Every character listed in ``string.punctuation`` is then removed,
    and words whose lower-cased form is in NLTK's English stop-word list are
    dropped.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        str: The surviving words joined by single spaces ('' if none survive).
    """
    text = str(text)
    stemmed = " ".join(porter.stem(token) for token in tokenizer.tokenize(text))

    # Text cleaning: delete all punctuation characters in a single pass.
    withoutPunctuation = stemmed.translate(str.maketrans('', '', string.punctuation))

    # Build the stop-word set once per call. Evaluating stopwords.words()
    # inside the filter would rebuild the list for every single word and
    # then search it linearly.
    englishStopwords = set(stopwords.words('english'))
    keptWords = [
        word for word in withoutPunctuation.split()
        if word.lower() not in englishStopwords
    ]
    return ' '.join(keptWords)

def trainModel(dataset):
    """Train a linear-kernel SVM on TF-IDF features of title + description.

    Args:
        dataset: DataFrame with 'title', 'description' and 'priority'
            columns. Title and description are cast to ``str`` and joined
            with a single space to form each document.

    Returns:
        tuple: ``(tfidf_vectorizer, classifier)``, the fitted
        ``TfidfVectorizer`` and the fitted ``SVC``.
    """
    X = dataset['title'].astype(str) + ' ' + dataset['description'].astype(str)
    y = dataset['priority']

    # Only the 70% training part is used; callers evaluate the returned
    # model on their own data, so the held-out part is discarded.
    X_train, _, y_train, _ = model_selection.train_test_split(X, y, test_size=0.3)

    # Fit the encoder on the training labels only. Re-fitting it on the
    # test labels would overwrite this mapping for a value nobody reads.
    y_train = preprocessing.LabelEncoder().fit_transform(y_train)

    tfidf_vectorizer = TfidfVectorizer(analyzer='word', max_features=500000)
    X_train_TFIDF = tfidf_vectorizer.fit_transform(X_train)

    classifier = SVC(kernel='linear')
    classifier.fit(X_train_TFIDF, y_train)

    return tfidf_vectorizer, classifier

def comparePerformance(fullDataset, logDataset, projectName, oversampling=True):
    """Compare a model trained on the log data with three full-data models.

    For ``projectName`` the function:

    1. shuffles the project's rows of ``fullDataset`` and keeps at most 600
       rows per priority class for 'Tools (JBoss Tools)', 400 otherwise;
    2. derives three training sets from that sample: Major (priority 2) rows
       dropped, Major relabelled as 0, and Major relabelled as 1. One model
       is trained on each with ``trainModel``;
    3. trains a linear SVM on a 70% split of the project's ``logDataset``
       rows (TF-IDF of title + description, target 'new_priority');
    4. scores all four models on the remaining 30% of the log data.

    Args:
        fullDataset: DataFrame with 'project_name', 'title', 'description'
            and 'priority' columns.
        logDataset: DataFrame with 'project_name', 'title', 'description'
            and 'new_priority' columns.
        projectName: Value of 'project_name' to select in both frames.
        oversampling: When true, the log training split is balanced with
            ``RandomOverSampler(random_state=777)`` before fitting.

    Returns:
        dict: keys 'log', 'full', 'fullMS', 'fullMNS'; each value is a
        one-element list holding the ``classification_report`` dict of the
        corresponding model.
    """
    results = {"log": [], "full": [], "fullMS": [], "fullMNS": []}

    # --- Build the balanced, shuffled sample of the full dataset ---------
    fullDataset = fullDataset[fullDataset['project_name'] == projectName]
    fullDataset = fullDataset.sample(frac=1).reset_index(drop=True)
    rowsPerClass = 600 if projectName == 'Tools (JBoss Tools)' else 400
    fullDataset = pd.concat(
        [fullDataset.loc[fullDataset['priority'] == label].head(rowsPerClass)
         for label in (0, 1, 2)]
    )

    fullDataset['title'] = fullDataset['title'].apply(getTokenizedText)
    fullDataset['description'] = fullDataset['description'].apply(getTokenizedText)

    isMajor = fullDataset['priority'] == 2

    # Relabel ONLY the priority column. Assigning through `.loc[mask]`
    # without a column label would overwrite every column of the Major rows,
    # wiping out the title/description text the models are trained on.

    ### Full dataset with Major severity in Severe class
    fullDatasetMajorAsSevere = fullDataset.copy()
    fullDatasetMajorAsSevere.loc[isMajor, 'priority'] = 0

    ### Full dataset with Major severity in Non Severe class
    fullDatasetMajorAsNonSevere = fullDataset.copy()
    fullDatasetMajorAsNonSevere.loc[isMajor, 'priority'] = 1

    ### Full dataset without Major rows
    fullDataset = fullDataset[~isMajor]

    # Copy so the column assignments below act on an independent frame.
    logDataset = logDataset[logDataset['project_name'] == projectName].copy()

    # --- Models trained on the full dataset -----------------------------
    vectorizerFull, classifierFull = trainModel(fullDataset)
    vectorizerMajorAsSevere, classifierMajorAsSevere = trainModel(fullDatasetMajorAsSevere)
    vectorizerMajorAsNonSevere, classifierMajorAsNonSevere = trainModel(fullDatasetMajorAsNonSevere)

    # --- Model trained on the log dataset -------------------------------
    logDataset['title'] = logDataset['title'].apply(getTokenizedText)
    logDataset['description'] = logDataset['description'].apply(getTokenizedText)

    X = logDataset['title'].astype(str) + ' ' + logDataset['description'].astype(str)
    y = logDataset['new_priority']

    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3)

    # Fit the encoder once on all labels and only *transform* the splits, so
    # train and test always share one label -> code mapping. Re-fitting on
    # y_test would remap a test split that happens to contain one class.
    labelEncoder2 = preprocessing.LabelEncoder()
    labelEncoder2.fit(y)
    y_train = labelEncoder2.transform(y_train)
    y_test = labelEncoder2.transform(y_test)

    tfidf_vectorizer = TfidfVectorizer(analyzer='word', max_features=100000)
    X_train_TFIDF = tfidf_vectorizer.fit_transform(X_train)
    X_test_TFIDF = tfidf_vectorizer.transform(X_test)

    classifier = SVC(kernel='linear')
    if oversampling:
        sampler = RandomOverSampler(random_state=777)
        X_fit, y_fit = sampler.fit_resample(X_train_TFIDF, y_train)
    else:
        X_fit, y_fit = X_train_TFIDF, y_train
    classifier.fit(X_fit, y_fit)

    ## Log Model Performance ##
    y_pred = classifier.predict(X_test_TFIDF)
    results['log'].append(classification_report(y_test, y_pred, output_dict=True))

    ## Full-dataset models, each scored on the same log test split through
    ## its own vectorizer ##
    fullModels = (
        ('full', vectorizerFull, classifierFull),
        ('fullMS', vectorizerMajorAsSevere, classifierMajorAsSevere),
        ('fullMNS', vectorizerMajorAsNonSevere, classifierMajorAsNonSevere),
    )
    for key, vectorizer, model in fullModels:
        predictions = model.predict(vectorizer.transform(X_test))
        results[key].append(classification_report(y_test, predictions, output_dict=True))

    return results

In [2]:
# Full dataset: keep rows that have a title, description and priority,
# encode the priority label as an integer, then discard rows whose label
# fell into the catch-all code 3.
fullDataset = pd.read_csv('data/five_projects_full.csv')
fullDataset = fullDataset.dropna(subset=['title', 'description', 'priority'])
fullDataset['priority'] = fullDataset['priority'].apply(vectorizeSeverityValue)
fullDataset = fullDataset[fullDataset['priority'] != 3]

# Log dataset: same cleaning on 'new_priority', but additionally discard
# code 2 ('Major'), leaving only the codes 0 and 1.
logDataset = pd.read_csv('data/five_projects_log.csv')
logDataset = logDataset.dropna(subset=['title', 'description', 'new_priority'])
logDataset['new_priority'] = logDataset['new_priority'].apply(vectorizeSeverityValue)
logDataset = logDataset[~logDataset['new_priority'].isin([2, 3])]

In [3]:
def runExperiment(fullDataset, logDataset, projectName, iterations, oversampling=True):
    """Repeat comparePerformance and print the averaged scores per model.

    ``comparePerformance`` is called ``iterations`` times. For each of its
    four reports ('log', 'full', 'fullMS', 'fullMNS') the precision, recall
    and f1-score of rows '0', '1' and 'weighted avg' are collected, and their
    arithmetic means are printed under a header per model and row.

    Args:
        fullDataset: Passed through to ``comparePerformance``.
        logDataset: Passed through to ``comparePerformance``.
        projectName: Project to evaluate; also printed in the banner.
        iterations: Number of repetitions to average over.
        oversampling: Passed through to ``comparePerformance``.

    Returns:
        None. All results are written to stdout.
    """
    # (result key, ((report row, header printed before its averages), ...)).
    # Row '0' is reported as the Severe class, '1' as Non-Severe. Two of the
    # FULL-model headers end with a stray apostrophe; they are kept verbatim
    # so the printed output stays identical to earlier runs.
    sections = (
        ('log', (
            ('0', '#### Log Model >> Severe Class ####'),
            ('1', '#### Log Model >> Non-Severe Class ####'),
            ('weighted avg', '#### Log Model ####'),
        )),
        ('full', (
            ('0', "#### FULL Model >> Severe Class ####"),
            ('1', "#### FULL Model >> Non-Severe Class ####'"),
            ('weighted avg', "#### FULL Model ####'"),
        )),
        ('fullMS', (
            ('0', "#### Full Model with Major As Severe >> Severe Class ####"),
            ('1', "#### Full Model with Major As Severe >> Non-Severe Class ####"),
            ('weighted avg', "#### Full Model with Major As Severe ####"),
        )),
        ('fullMNS', (
            ('0', "#### Full Model with Major As Non-Severe >> Severe Class ####"),
            ('1', "#### Full Model with Major As Non-Severe >> Non-Severe Class ####"),
            ('weighted avg', "#### Full Model with Major As Non-Severe ####"),
        )),
    )
    # (report metric key, label printed above its average).
    metrics = (
        ('precision', 'Average Precision'),
        ('recall', 'Average Recall'),
        ('f1-score', 'Average F1-score'),
    )

    print('###############################')
    print(' ### Results >>> ', projectName)
    print('###############################')

    # One score list per (model, report row, metric) combination.
    scores = {
        (model, row, metric): []
        for model, rows in sections
        for row, _header in rows
        for metric, _label in metrics
    }

    for _ in range(iterations):
        result = comparePerformance(fullDataset, logDataset, projectName, oversampling)
        for model, rows in sections:
            report = result[model][0]
            for row, _header in rows:
                for metric, _label in metrics:
                    scores[(model, row, metric)].append(report[row][metric])

    for model, rows in sections:
        print('## ---------------------------------- ##')
        for row, header in rows:
            print(header)
            for metric, label in metrics:
                print(label)
                print(statistics.mean(scores[(model, row, metric)]))

In [4]:
# (project name, oversample the log training split?) in execution order.
# RichFaces is the only project evaluated without oversampling.
experimentPlan = (
    ('Tools (JBoss Tools)', True),
    ('RichFaces', False),
    ('HBase', True),
    ('Hadoop Common', True),
    ('Grails', True),
)

# Every project is averaged over 10 repetitions.
for plannedProject, plannedOversampling in experimentPlan:
    runExperiment(fullDataset, logDataset, plannedProject, 10, plannedOversampling)



###############################
 ### Results >>>  Tools (JBoss Tools)
###############################
## ---------------------------------- ##
#### Log Model >> Severe Class ####
Average Precision
0.7364826751351763
Average Recall
0.7997850753234176
Average F1-score
0.7666278375985012
#### Log Model >> Non-Severe Class ####
Average Precision
0.4517127614403572
Average Recall
0.36540654703018105
Average F1-score
0.4032250192608761
#### Log Model ####
Average Precision
0.648422600539635
Average Recall
0.6647058823529411
Average F1-score
0.6538497254636149
## ---------------------------------- ##
#### FULL Model >> Severe Class ####
Average Precision
0.7823056974487276
Average Recall
0.6886073803848264
Average F1-score
0.732123825994515
#### FULL Model >> Non-Severe Class ####'
Average Precision
0.45324161575680394
Average Recall
0.5739917278055493
Average F1-score
0.5058770196808355
#### FULL Model ####'
Average Precision
0.6802944773341215
Average Recall
0.6529411764705882
Average F1-sc