In [27]:
import sys
import arff
from io import StringIO
from sklearn import svm
import numpy as np
import pandas as pd
import time
import json
import csv
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import minmax_scale
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import Binarizer
from sklearn.calibration import calibration_curve
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn import preprocessing

from nltk.corpus import stopwords

def weka_tokenizer(doc):
    """Split ``doc`` into tokens on a fixed set of delimiter characters.

    The delimiters are the members of the character class below: space,
    newline, form feed, carriage return, tab and ``. , ; : ' " ( ) ? !``.
    The ``|`` separators sit inside the ``[...]`` class, so they are literal
    members as well and a pipe character also ends a token.

    Args:
        doc: the text to tokenize.

    Returns:
        list: the non-empty pieces of ``doc`` between delimiters, in order.
    """
    pieces = re.split("[ |\n|\f|\r|\t|.|,|;|:|'|\"|(|)|?|!]", doc)
    # Adjacent delimiters yield empty strings; keep only real tokens.
    return [piece for piece in pieces if piece]

def initClassifiers():
    """Build the set of classifiers to evaluate, keyed by a short name.

    LinearSVC and Perceptron are wrapped in CalibratedClassifierCV;
    execClassifiers calls ``predict_proba`` on every estimator, which is
    presumably why the wrapper is used for these two.

    Returns:
        dict: name -> new, unfitted scikit-learn estimator.
    """
    return {
        'randomForest': RandomForestClassifier(random_state=1),
        # Seeded like the random forest so that repeated runs build the same tree.
        'decisionTree': DecisionTreeClassifier(min_samples_leaf=1, random_state=1),
        'naiveBayes': GaussianNB(),
        'smo': CalibratedClassifierCV(
            LinearSVC(fit_intercept=False, tol=0.001, C=1, dual=False, max_iter=100000),
            method='sigmoid',
        ),
        'knn': KNeighborsClassifier(n_neighbors=1, metric='euclidean'),
        'logisticRegression': LogisticRegression(max_iter=1000),
        'perceptron': CalibratedClassifierCV(Perceptron()),
        'lda': LinearDiscriminantAnalysis(),
    }

def round_float(value):
    """Return ``value`` as a float rounded to three decimal places.

    The number is rendered with the ``.3f`` format and parsed back, so the
    result is always a ``float`` even when ``value`` is an int.
    """
    three_decimals = format(value, ".3f")
    return float(three_decimals)

def execClassifiers(X_train, x_test, y_train, y_test, classifiers):
    """Fit every classifier on the training split and score it on the test split.

    Features are handed to the estimators exactly as received; no scaling or
    binarisation is applied here.

    Args:
        X_train: training feature matrix.
        x_test: test feature matrix.
        y_train: training labels.
        y_test: test labels. The progress line printed for each classifier
            reads the 'Flakey' entry of the classification report, so that
            label has to occur in the report.
        classifiers: mapping of name -> unfitted estimator exposing ``fit``,
            ``score``, ``predict`` and ``predict_proba``.

    Returns:
        pandas.DataFrame: one row per classifier with the columns
        'classifier', 'f1Score', 'accuracy', 'confucionMatrix',
        'classificationReport', 'AUC' and 'MCC'. Empty when ``classifiers``
        is empty.
    """
    # Rows are collected in a list and turned into a frame once at the end;
    # growing a DataFrame row by row copies it on every iteration.
    records = []

    for key, classifier in classifiers.items():
        classifier.fit(X_train, y_train)

        predict = classifier.predict(x_test)
        # The second probability column is the score fed to the AUC computation.
        y_probs = classifier.predict_proba(x_test)[:, 1]

        # Computed once and reused for both the result row and the progress line.
        report = classification_report(y_test, predict, output_dict=True)
        mcc = matthews_corrcoef(y_test, predict)
        auc = roc_auc_score(y_test, y_probs)

        records.append({
            'classifier': key,
            'f1Score': f1_score(y_test, predict, average='weighted'),
            'accuracy': classifier.score(x_test, y_test),
            # Key spelling kept as-is: it becomes a column name callers may read.
            'confucionMatrix': confusion_matrix(y_test, predict),
            'classificationReport': report,
            'AUC': auc,
            'MCC': mcc,
        })

        print(key, report['Flakey'], mcc, auc, "\n \n")

    return pd.DataFrame(records)

In [28]:
# Parsed rows are materialised into lists.
# NOTE(review): arff.load appears to hand back a one-shot iterator when given a
# path - confirm. If so, leaving it lazy means the cell that builds the
# DataFrames would see no rows when re-run without reloading the files.
complete = list(arff.load('../../datasets/MSR4FlakinessOriginal.arff'))
# Column names assigned to the rows of the main dataset, in file order.
completeColumns = ['tokens', 'loc', 'abstract_keyword', 'assert_keyword', 'boolean_keyword', 'break_keyword', 'byte_keyword', 'case_keyword', 'catch_keyword', 'char_keyword', 'class_keyword', 'continue_keyword', 'default_keyword', 'do_keyword', 'double_keyword', 'else_keyword', 'enum_keyword', 'exports_keyword', 'extends_keyword', 'final_keyword', 'finally_keyword', 'float_keyword', 'for_keyword', 'if_keyword', 'implements_keyword', 'import_keyword', 'instanceof_keyword', 'int_keyword', 'interface_keyword', 'long_keyword', 'modules_keyword', 'native_keyword', 'new_keyword', 'package_keyword', 'private_keyword', 'protected_keyword', 'public_keyword', 'requires_keyword', 'return_keyword', 'short_keyword', 'static_keyword', 'strictfp_keyword', 'super_keyword', 'switch_keyword', 'synchronized_keyword', 'this_keyword', 'throw_keyword', 'throws_keyword', 'transient_keyword', 'try_keyword', 'void_keyword', 'volatile_keyword', 'while_keyword', 'true_keyword', 'null_keyword', 'false_keyword', 'const_keyword', 'goto_keyword', 'keywordcount', 'klass']

nocasetokens = list(arff.load('../../datasets/nocasetokens.arff'))
# Column names assigned to the rows of the nocasetokens dataset.
nocasetokensColumns = ['tokens', 'klass']

In [29]:
# Token column from the nocasetokens dataset; its 'klass' column is dropped
# because the labels are read from the main frame below.
dfNoCaseTokens = pd.DataFrame(nocasetokens, columns=nocasetokensColumns).drop(columns=['klass'])

# Main dataset with its own 'tokens' column replaced by the one above
# (the join aligns the two frames on their row index).
df = (
    pd.DataFrame(complete, columns=completeColumns)
    .drop(columns=['tokens'])
    .join(dfNoCaseTokens)
)
y = df['klass']

In [30]:
# Bag-of-words counts over the token strings, vocabulary capped via max_features.
vectorizer = CountVectorizer(analyzer='word', max_features=1500, tokenizer=weka_tokenizer)

bowToken = vectorizer.fit_transform(df['tokens'])

# get_feature_names() is deprecated in newer scikit-learn releases in favour of
# get_feature_names_out(); use the new accessor when this version provides it.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
bowData = pd.DataFrame(bowToken.toarray(), columns=get_names())

# Feature matrix: the existing columns minus the raw text and the label, plus
# one count column per vocabulary term. `df` is rebound to this matrix, so the
# cell that builds `df` has to be re-run before running this cell again.
df = df.drop('tokens', axis=1).join(bowData).drop('klass', axis=1)
x = df

# 80/20 hold-out split with a fixed seed.
X_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

In [31]:
# Create a fresh set of estimators, then fit and evaluate each one on the split.
classifiers = initClassifiers()
results = execClassifiers(
    X_train=X_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    classifiers=classifiers,
)

randomForest {'precision': 0.9757085020242915, 'recall': 0.895910780669145, 'f1-score': 0.9341085271317829, 'support': 269} 0.8809319658499521 0.9808270102357795 
 

decisionTree {'precision': 0.8651685393258427, 'recall': 0.8587360594795539, 'f1-score': 0.8619402985074627, 'support': 269} 0.7356825761600413 0.8677241941233386 
 

naiveBayes {'precision': 0.9537815126050421, 'recall': 0.8438661710037175, 'f1-score': 0.8954635108481263, 'support': 269} 0.8149254726594235 0.9041859754544992 
 

smo {'precision': 0.9317269076305221, 'recall': 0.862453531598513, 'f1-score': 0.8957528957528956, 'support': 269} 0.8086735073207422 0.9618959107806692 
 

knn {'precision': 0.8181818181818182, 'recall': 0.8698884758364313, 'f1-score': 0.8432432432432433, 'support': 269} 0.6913575419434269 0.8459031420278046 
 

logisticRegression {'precision': 0.914179104477612, 'recall': 0.9107806691449815, 'f1-score': 0.9124767225325886, 'support': 269} 0.8321406840542867 0.9637673779090492 
 

perceptron {'pr