In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import cross_validation
from sklearn.utils import resample
import numpy as np


In [2]:

def main():
    """Read ``datasets.csv`` and run ``Models.predict`` once per code smell.

    Each smell name below is passed to ``Models.predict`` as the target
    column; a banner line is printed before every run.
    """
    dataset = pd.read_csv('datasets.csv')
    models = Models()

    banner = '### Predicting code smell "{}" ###'
    for smell_name in ('OSE', 'BCE', 'PDE', 'SV', 'OS', 'SDS', 'RS', 'TFS'):
        print(banner.format(smell_name))
        models.predict(dataset, smell_name)

class Models():
    """Grid-search classifiers that predict one code-smell column.

    ``predict`` builds a train/test split for a single target column and
    hands a decision-tree search and an SVM search to ``output_accuracy``,
    which fits each one and prints its report.
    """

    # Settings shared by every GridSearchCV built in this class.
    CV_FOLDS = 10
    N_JOBS = 4

    def _grid_search(self, estimator, param_grid):
        """Wrap ``estimator`` in a GridSearchCV using the shared CV settings."""
        return GridSearchCV(estimator, param_grid,
                            cv=self.CV_FOLDS, n_jobs=self.N_JOBS)

    def predict(self, df, feature):
        """Fit and report the enabled classifiers for one target column.

        Parameters
        ----------
        df : pandas.DataFrame
            Full dataset. Columns at positions 1..23 are used as predictors
            and the column named ``feature`` is used as the target.
        feature : str
            Name of the target column in ``df``.

        Returns
        -------
        None
            Results are printed, not returned.
        """
        # NOTE(review): predictors are selected purely by position; presumably
        # none of the smell label columns sit inside 1..23 -- verify against
        # the CSV layout, otherwise the target leaks into X.
        X = df.iloc[:, 1:24]
        # Keep the target as a one-column frame; it is flattened right before
        # fitting. (The caller's ``df`` is deliberately not rebound here.)
        y = df[[feature]]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)

        # --- Disabled experiment: Naive Bayes --------------------------------
        # A plain estimator (no grid search) is accepted by output_accuracy,
        # which then skips the grid report.
        # print('-- Naive Bayes --')
        # clf = GaussianNB()
        # self.output_accuracy(X, y, X_train, y_train, X_test, y_test, clf)

        # --- Disabled experiment: Random Forests -----------------------------
        # NOTE(review): 'auto' for max_features may not be accepted by recent
        # scikit-learn releases -- confirm against the installed version
        # before re-enabling.
        # print('-- Random Forests --')
        # # Number of trees in random forest
        # n_estimators = [int(x) for x in np.linspace(start = 10, stop = 50, num = 10)]
        # # Number of features to consider at every split
        # max_features = ['auto', 'sqrt']
        # # Maximum number of levels in tree
        # # max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
        # # max_depth.append(None)
        # # Minimum number of samples required to split a node
        # min_samples_split = [2, 5, 10]
        # # Minimum number of samples required at each leaf node
        # min_samples_leaf = [1, 2, 4]
        # # Method of selecting samples for training each tree
        # bootstrap = [True, False]
        #
        # rf = RandomForestClassifier(random_state=0)
        # parameters ={'n_estimators': n_estimators,
        #        'max_features': max_features,
        #        'min_samples_split': min_samples_split,
        #        'min_samples_leaf': min_samples_leaf,
        #        'bootstrap': bootstrap}
        #
        # clf = GridSearchCV(rf, parameters, cv = 10, n_jobs=4)
        # self.output_accuracy(X, y, X_train, y_train, X_test, y_test, clf)

        # --- Decision tree ---------------------------------------------------
        # The header is printed so this report can be told apart from the SVM
        # report that follows it.
        # NOTE(review): the label says C4.5/J48, but the estimator is
        # scikit-learn's DecisionTreeClassifier -- confirm the label is the
        # intended description.
        print('-- C4.5 (implemented as J48 in Weka) --')
        tree_grid = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15,
                          20, 30, 40, 50, 70, 90, 120, 150],
        }
        tree_search = self._grid_search(
            DecisionTreeClassifier(random_state=0), tree_grid)
        self.output_accuracy(X, y, X_train, y_train, X_test, y_test,
                             tree_search)

        # --- Support vector machine ------------------------------------------
        print('-- Support Vector Machine using LIBSVM implementation with SMO --')
        svm_grid = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
        svm_search = self._grid_search(SVC(gamma='auto'), svm_grid)
        self.output_accuracy(X, y, X_train, y_train, X_test, y_test,
                             svm_search)

    def output_accuracy(self, X, y, X_train, y_train, X_test, y_test, clf):
        """Fit ``clf`` on the training split and print its scores.

        Parameters
        ----------
        X, y
            Unused; kept so existing call sites keep working.
        X_train, X_test
            Predictor data passed straight to ``clf.fit`` / ``clf.score``.
        y_train, y_test
            Targets exposing ``.values.ravel()`` (e.g. one-column DataFrames);
            they are flattened before being passed to ``clf``.
        clf
            Object with ``fit`` and ``score``. If it exposes ``cv_results_``
            after fitting (as a grid search does), the best parameters and
            the per-candidate scores are printed as well.

        Returns
        -------
        None
        """
        # Flatten once; the same arrays are reused for fitting and scoring.
        train_labels = y_train.values.ravel()
        test_labels = y_test.values.ravel()

        # For a grid search this runs the whole parameter sweep.
        clf.fit(X_train, train_labels)

        # Detailed grid report -- only when the fitted object actually ran a
        # search, so plain estimators do not raise AttributeError here.
        if hasattr(clf, 'cv_results_'):
            print("Best parameters set found on development set:")
            print()
            print(clf.best_params_)
            print("Grid scores on development set:")
            print()
            results = clf.cv_results_
            rows = zip(results['mean_test_score'],
                       results['std_test_score'],
                       results['params'])
            for mean, std, params in rows:
                # The spread is shown as two standard deviations.
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
            print()

        # Score the fitted model on the training and the held-out split.
        train_acc = clf.score(X_train, train_labels)
        test_acc = clf.score(X_test, test_labels)
        print('Training accuracy: {}'.format(train_acc))
        print('Testing accuracy: {}'.format(test_acc))

In [3]:
# Entry point: run the full experiment when this cell/script is executed
# directly.
if __name__ == "__main__":
    main()

### Predicting code smell "OSE" ###




Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': 4}
Grid scores on development set:

0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 4}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 5}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 6}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 7}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 8}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 9}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 10}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 11}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 12}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 15}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 20}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 30}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 40}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_depth': 50}
0.898 (+/-0.270) for {'criterion': 'gini', 'max_



Best parameters set found on development set:

{'C': 1, 'kernel': 'rbf'}
Grid scores on development set:

0.714 (+/-0.224) for {'C': 1, 'kernel': 'linear'}
0.776 (+/-0.089) for {'C': 1, 'kernel': 'rbf'}
0.714 (+/-0.224) for {'C': 10, 'kernel': 'linear'}
0.776 (+/-0.089) for {'C': 10, 'kernel': 'rbf'}

Training accuray: 1.0
Testing accuray: 0.84
### Predicting code smell "BCE" ###




Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': 4}
Grid scores on development set:

0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 4}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 5}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 6}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 7}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 8}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 9}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 10}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 11}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 12}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 15}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 20}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 30}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 40}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_depth': 50}
0.959 (+/-0.165) for {'criterion': 'gini', 'max_



Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}
Grid scores on development set:

0.796 (+/-0.386) for {'C': 1, 'kernel': 'linear'}
0.551 (+/-0.100) for {'C': 1, 'kernel': 'rbf'}
0.796 (+/-0.386) for {'C': 10, 'kernel': 'linear'}
0.551 (+/-0.100) for {'C': 10, 'kernel': 'rbf'}

Training accuray: 0.9795918367346939
Testing accuray: 0.64
### Predicting code smell "PDE" ###




Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': 4}
Grid scores on development set:

0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 4}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 5}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 6}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 7}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 8}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 9}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 10}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 11}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 12}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 15}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 20}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 30}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 40}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_depth': 50}
0.980 (+/-0.109) for {'criterion': 'gini', 'max_



Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}
Grid scores on development set:

0.878 (+/-0.255) for {'C': 1, 'kernel': 'linear'}
0.673 (+/-0.120) for {'C': 1, 'kernel': 'rbf'}
0.878 (+/-0.255) for {'C': 10, 'kernel': 'linear'}
0.673 (+/-0.120) for {'C': 10, 'kernel': 'rbf'}

Training accuray: 1.0
Testing accuray: 0.84
### Predicting code smell "SV" ###




Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 4}
Grid scores on development set:

0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 4}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 5}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 6}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 7}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 8}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 9}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 10}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 11}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 12}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 15}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 20}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 30}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 40}
0.878 (+/-0.255) for {'criterion': 'gini', 'max_depth': 50}
0.878 (+/-0.255) for {'criterion': 'gini', 'm



Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': 4}
Grid scores on development set:

0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 4}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 5}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 6}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 7}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 8}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 9}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 10}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 11}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 12}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 15}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 20}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 30}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 40}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_depth': 50}
0.980 (+/-0.121) for {'criterion': 'gini', 'max_



Best parameters set found on development set:

{'C': 1, 'kernel': 'rbf'}
Grid scores on development set:

0.571 (+/-0.383) for {'C': 1, 'kernel': 'linear'}
0.592 (+/-0.055) for {'C': 1, 'kernel': 'rbf'}
0.510 (+/-0.319) for {'C': 10, 'kernel': 'linear'}
0.592 (+/-0.055) for {'C': 10, 'kernel': 'rbf'}

Training accuray: 1.0
Testing accuray: 0.6
### Predicting code smell "RS" ###




Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 4}
Grid scores on development set:

0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 4}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 5}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 6}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 7}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 8}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 9}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 10}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 11}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 12}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 15}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 20}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 30}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 40}
0.878 (+/-0.198) for {'criterion': 'gini', 'max_depth': 50}
0.878 (+/-0.198) for {'criterion': 'gini', 'm



Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 4}
Grid scores on development set:

0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 4}
0.735 (+/-0.438) for {'criterion': 'gini', 'max_depth': 5}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 6}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 7}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 8}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 9}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 10}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 11}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 12}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 15}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 20}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 30}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 40}
0.714 (+/-0.443) for {'criterion': 'gini', 'max_depth': 50}
0.714 (+/-0.443) for {'criterion': 'gini', 'm

