In [1]:
import pandas as pd
import numpy as np
import re
import string
import statistics

from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer

from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


from imblearn.over_sampling import RandomOverSampler

# Shared NLP helpers used by getTokenizedText; instantiated once when this cell runs.
tokenizer = WordPunctTokenizer()
porter = PorterStemmer()

# Turn off pandas' chained-assignment check for the whole notebook.
pd.options.mode.chained_assignment = None

def vectorizeSeverityValue(value):
    """Map a severity/priority label to an integer class code.

    Returns:
        0 for 'Critical' or 'Blocker',
        1 for 'Minor' or 'Trivial',
        2 for 'Major',
        3 for anything else (the match is exact and case-sensitive).
    """
    if value in ('Critical', 'Blocker'):
        return 0
    if value in ('Minor', 'Trivial'):
        return 1
    if value == 'Major':
        return 2
    # Unrecognised label (including missing values).
    return 3

def getTokenizedText(text):
    """Normalise free text into a space-separated string of cleaned stems.

    Steps, in order:
      1. Coerce ``text`` to ``str`` and split it with the module-level
         ``tokenizer``.
      2. Stem every token with the module-level ``porter`` stemmer.
      3. Strip every character found in ``string.punctuation``.
      4. Drop words whose lower-cased form is an NLTK English stop word.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        str: The remaining words joined by single spaces (may be empty).
    """
    text = str(text)

    # Loop-invariant: fetch the stop-word list once per call and keep it in a
    # set, instead of reloading and linearly scanning it for every word.
    english_stopwords = set(stopwords.words('english'))

    stemmed = " ".join(porter.stem(token) for token in tokenizer.tokenize(text))

    # text cleaning
    text_without_punctuation = stemmed.translate(
        str.maketrans('', '', string.punctuation)
    )

    # NOTE(review): stemming runs before stop-word filtering, so a stop word
    # whose stem differs from its listed form is not removed here. The order is
    # kept as-is so the function's output is unchanged.
    kept_words = [
        word for word in text_without_punctuation.split()
        if word.lower() not in english_stopwords
    ]
    return ' '.join(kept_words)

In [2]:
# Log dataset
# Fixed seed so the train/test split and the tree-based models give the same
# numbers on every Restart & Run All.
RANDOM_SEED = 42


def buildTextClassifier(classifier):
    """Return a two-step pipeline: TF-IDF vectorisation ('tfidf') then ``classifier`` ('clf')."""
    return Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', classifier)
    ])


def fitAndReport(label, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test-split metrics.

    Prints a '## <label> Results ##' header, the classification report (as a
    dict) and the confusion matrix, then returns the test-set predictions.
    """
    print('## ' + label + ' Results ##')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions, output_dict=True))
    print(confusion_matrix(y_test, predictions))
    return predictions


# Load the data, drop rows missing any field we use, and encode the label.
logDataset = pd.read_csv('data/five_projects_log.csv')
logDataset = logDataset.dropna(subset=['title', 'description', 'new_priority'])
logDataset['new_priority'] = logDataset['new_priority'].apply(vectorizeSeverityValue)

# Keep only classes 0 and 1: rows encoded as 2 or 3 are discarded.
logDataset = logDataset[~logDataset['new_priority'].isin([2, 3])].copy()

logDataset['title'] = logDataset['title'].apply(getTokenizedText)
logDataset['description'] = logDataset['description'].apply(getTokenizedText)

# Each sample is the cleaned title followed by the cleaned description.
X = logDataset['title'].astype(str) + ' ' + logDataset['description'].astype(str)
y = logDataset['new_priority']

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_SEED
)

# The pipelines keep their original module-level names so later cells can
# still refer to them.
logisticRegression = buildTextClassifier(LogisticRegression())
randomForest = buildTextClassifier(RandomForestClassifier(random_state=RANDOM_SEED))
decisionTree = buildTextClassifier(DecisionTreeClassifier(random_state=RANDOM_SEED))
svmRBF = buildTextClassifier(SVC(kernel='rbf'))
svmLinear = buildTextClassifier(SVC(kernel='linear'))
svmPoly = buildTextClassifier(SVC(kernel='poly'))
svmSigmoid = buildTextClassifier(SVC(kernel='sigmoid'))

labelledModels = [
    ('Logistic Regression', logisticRegression),
    ('Random Forest', randomForest),
    ('Decision Tree', decisionTree),
    ('SVM (RBF)', svmRBF),
    ('SVM (linear)', svmLinear),
    ('SVM (poly)', svmPoly),
    ('SVM (sigmoid)', svmSigmoid),
]

# As before, y_pred ends up holding the predictions of the last model evaluated.
for label, model in labelledModels:
    y_pred = fitAndReport(label, model, X_train, X_test, y_train, y_test)




## Logistic Regression Results ##
{'0': {'precision': 0.7406417112299465, 'recall': 0.8978930307941653, 'f1-score': 0.8117216117216116, 'support': 617}, '1': {'precision': 0.6358381502890174, 'recall': 0.3618421052631579, 'f1-score': 0.4612159329140461, 'support': 304}, 'accuracy': 0.7209554831704669, 'macro avg': {'precision': 0.688239930759482, 'recall': 0.6298675680286616, 'f1-score': 0.6364687723178288, 'support': 921}, 'weighted avg': {'precision': 0.7060485705936355, 'recall': 0.7209554831704669, 'f1-score': 0.6960280977612425, 'support': 921}}
[[554  63]
 [194 110]]
## Random Forest Results ##
{'0': {'precision': 0.727391874180865, 'recall': 0.899513776337115, 'f1-score': 0.8043478260869565, 'support': 617}, '1': {'precision': 0.6075949367088608, 'recall': 0.3157894736842105, 'f1-score': 0.41558441558441556, 'support': 304}, 'accuracy': 0.7068403908794788, 'macro avg': {'precision': 0.6674934054448629, 'recall': 0.6076516250106627, 'f1-score': 0.609966120835686, 'support': 921},