In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


In [2]:

def main():
    """Load the dataset and run the prediction pipeline once per code smell.

    Reads ``datasets.csv`` from the current working directory, then, for each
    smell label in turn, prints a header line and delegates training and
    reporting to ``Models.predict``.
    """
    dataset = pd.read_csv('datasets.csv')
    runner = Models()

    # Each entry names one label column that is predicted independently.
    for smell_name in ['OSE', 'BCE', 'PDE', 'SV', 'OS', 'SDS', 'RS', 'TFS']:
        header = '### Predicting code smell "{}" ###'.format(smell_name)
        print(header)
        runner.predict(dataset, smell_name)

class Models():
    """Train classifiers that predict one code-smell label and report scores."""

    def predict(self, df, feature):
        """Fit the SVM grid search for one label column and print its scores.

        Args:
            df: DataFrame holding both the predictor columns and the label
                columns. It is not modified.
            feature: Name of the label column to predict.

        Returns:
            The ``(train_score, test_score)`` tuple produced by
            ``output_accuracy``.

        Raises:
            KeyError: If ``feature`` is not a column of ``df``.
        """
        # train test split
        # NOTE(review): this assumes columns 1..23 contain only predictors and
        # none of the smell label columns; otherwise the target leaks into X.
        # Confirm against the column layout of the CSV.
        X = df.iloc[:, 1:24]
        # Kept as a single-column frame; output_accuracy flattens it with
        # .values.ravel() before handing it to the estimator.
        y = df[[feature]]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)

        # Alternative classifiers, currently disabled. output_accuracy accepts
        # plain estimators as well as grid searches, so any of these can be
        # re-enabled as-is.

        # Naive Bayes
#         print('-- Naive Bayes --')
#         clf = GaussianNB()
#         self.output_accuracy(X, y, X_train, y_train, X_test, y_test, clf)

        # Random Forests
#         print('-- Random Forests --')
#         clf = RandomForestClassifier(
#             n_estimators=100, max_depth=2, random_state=0)
#         self.output_accuracy(X, y, X_train, y_train, X_test, y_test, clf)

        # C4.5 (J48)
#         print('-- C4.5 (implented as J48 in Weka) --')
#         clf = DecisionTreeClassifier(random_state=0)
#         self.output_accuracy(X, y, X_train, y_train, X_test, y_test, clf)

        # Support Vector Machine using LIBSVM implementation with SMO
        print('-- Support Vector Machine using LIBSVM implementation with SMO --')
        svc = SVC(gamma='auto')
        parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
        clf = GridSearchCV(svc, parameters, cv = 5, n_jobs=4)
        return self.output_accuracy(
            X, y, X_train, y_train, X_test, y_test, clf)

    def output_accuracy(self, X, y, X_train, y_train, X_test, y_test, clf):
        """Fit ``clf`` on the training split and print its train/test scores.

        When the fitted estimator exposes ``cv_results_`` (as a grid search
        does) the best parameters and the per-candidate cross-validation
        scores are printed first. Plain estimators skip that report instead
        of failing on the missing attributes.

        Args:
            X: Full predictor data. Unused; kept so existing callers work.
            y: Full label data. Unused; kept so existing callers work.
            X_train: Predictors of the training split.
            y_train: Labels of the training split; must expose ``.values``.
            X_test: Predictors of the held-out split.
            y_test: Labels of the held-out split; must expose ``.values``.
            clf: Estimator providing ``fit(X, y)`` and ``score(X, y)``.

        Returns:
            Tuple ``(train_score, test_score)`` with the values returned by
            ``clf.score`` on the training and the held-out split.
        """
        train_labels = y_train.values.ravel()

        # For a grid search this runs the search over the parameter space.
        clf.fit(X_train, train_labels)

        # Only search objects carry cv_results_; plain classifiers do not.
        if hasattr(clf, 'cv_results_'):
            self._print_grid_report(clf)

        # Now score the fitted classifier on train and test set respectively
        train_acc = clf.score(X_train, train_labels)
        test_acc = clf.score(X_test, y_test.values.ravel())
        print('Training accuray: {}'.format(train_acc))
        print('Testing accuray: {}'.format(test_acc))
        return train_acc, test_acc

    def _print_grid_report(self, clf):
        """Print best parameters and per-candidate CV scores of a fitted search."""
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print("Grid scores on development set:")
        print()
        results = clf.cv_results_
        candidates = zip(results['mean_test_score'],
                         results['std_test_score'],
                         results['params'])
        for mean, std, params in candidates:
            # The spread is reported as two standard deviations.
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

In [3]:
# Entry point: run the full prediction loop when this cell/script is executed
# directly rather than imported as a module.
if __name__ == "__main__":
    main()

### Predicting code smell "OSE" ###
-- Support Vector Machine using LIBSVM implementation with SMO --




Best parameters set found on development set:

{'kernel': 'rbf', 'C': 1}
Grid scores on development set:

0.694 (+/-0.226) for {'kernel': 'linear', 'C': 1}
0.776 (+/-0.055) for {'kernel': 'rbf', 'C': 1}
0.694 (+/-0.226) for {'kernel': 'linear', 'C': 10}
0.776 (+/-0.055) for {'kernel': 'rbf', 'C': 10}

Training accuray: 1.0
Testing accuray: 0.84
### Predicting code smell "BCE" ###
-- Support Vector Machine using LIBSVM implementation with SMO --




Best parameters set found on development set:

{'kernel': 'linear', 'C': 1}
Grid scores on development set:

0.755 (+/-0.265) for {'kernel': 'linear', 'C': 1}
0.551 (+/-0.010) for {'kernel': 'rbf', 'C': 1}
0.755 (+/-0.265) for {'kernel': 'linear', 'C': 10}
0.551 (+/-0.010) for {'kernel': 'rbf', 'C': 10}

Training accuray: 0.9795918367346939
Testing accuray: 0.64
### Predicting code smell "PDE" ###
-- Support Vector Machine using LIBSVM implementation with SMO --




Best parameters set found on development set:

{'kernel': 'linear', 'C': 1}
Grid scores on development set:

0.878 (+/-0.235) for {'kernel': 'linear', 'C': 1}
0.673 (+/-0.050) for {'kernel': 'rbf', 'C': 1}
0.878 (+/-0.235) for {'kernel': 'linear', 'C': 10}
0.673 (+/-0.050) for {'kernel': 'rbf', 'C': 10}

Training accuray: 1.0
Testing accuray: 0.84
### Predicting code smell "SV" ###
-- Support Vector Machine using LIBSVM implementation with SMO --
Best parameters set found on development set:

{'kernel': 'linear', 'C': 1}
Grid scores on development set:

0.878 (+/-0.149) for {'kernel': 'linear', 'C': 1}
0.878 (+/-0.065) for {'kernel': 'rbf', 'C': 1}
0.878 (+/-0.149) for {'kernel': 'linear', 'C': 10}
0.878 (+/-0.065) for {'kernel': 'rbf', 'C': 10}

Training accuray: 1.0
Testing accuray: 0.96
### Predicting code smell "OS" ###
-- Support Vector Machine using LIBSVM implementation with SMO --
Best parameters set found on development set:

{'kernel': 'linear', 'C': 1}
Grid scores on development set:



Best parameters set found on development set:

{'kernel': 'rbf', 'C': 1}
Grid scores on development set:

0.571 (+/-0.331) for {'kernel': 'linear', 'C': 1}
0.592 (+/-0.034) for {'kernel': 'rbf', 'C': 1}
0.531 (+/-0.198) for {'kernel': 'linear', 'C': 10}
0.592 (+/-0.034) for {'kernel': 'rbf', 'C': 10}

Training accuray: 1.0
Testing accuray: 0.6
### Predicting code smell "RS" ###
-- Support Vector Machine using LIBSVM implementation with SMO --
Best parameters set found on development set:

{'kernel': 'linear', 'C': 1}
Grid scores on development set:

0.857 (+/-0.195) for {'kernel': 'linear', 'C': 1}
0.857 (+/-0.070) for {'kernel': 'rbf', 'C': 1}
0.857 (+/-0.195) for {'kernel': 'linear', 'C': 10}
0.857 (+/-0.070) for {'kernel': 'rbf', 'C': 10}

Training accuray: 1.0
Testing accuray: 0.8
### Predicting code smell "TFS" ###
-- Support Vector Machine using LIBSVM implementation with SMO --
Best parameters set found on development set:

{'kernel': 'rbf', 'C': 1}
Grid scores on development set:

