##NASA - Model

##Setup

In [26]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml, time, sys, os

pd.set_option('display.max_columns', None)
pd.set_option('display.width',1000)
pd.set_option('display.max_rows', None)

sns.set_style("darkgrid")

from IPython.display import display, Markdown

DEBUG = True

In [9]:
DATASET = "NASA"

import os, sys
COLAB = 'google.colab' in sys.modules

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)
else:
  ROOT = "./"

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Imports

In [10]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

##Load Dataset

In [11]:
df = pd.read_pickle(f"{ROOT}/data/data.pkl")
print(df.shape)
df.head()

(1296, 37)


Unnamed: 0,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,DESIGN_COMPLEXITY,DESIGN_DENSITY,EDGE_COUNT,ESSENTIAL_COMPLEXITY,ESSENTIAL_DENSITY,LOC_EXECUTABLE,PARAMETER_COUNT,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,MAINTENANCE_SEVERITY,MODIFIED_CONDITION_COUNT,MULTIPLE_CONDITION_COUNT,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,defects
2,1.0,4.0,7.0,24.0,0.0,1.0,0.13,0.0,0.0,1.0,1.0,6.0,1.0,0.0,1.0,0.0,17.88,7.43,986.77,0.04,34.0,0.13,54.82,132.83,1.0,0.0,0.0,7.0,0.03,13.0,21.0,7.0,8.0,34.0,96.88,8.0,False
3,1.0,1.0,11.0,3.0,0.0,1.0,0.08,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,42.62,7.81,2598.31,0.11,77.0,0.13,144.35,332.79,1.0,0.0,0.0,3.0,0.06,29.0,48.0,13.0,7.0,17.0,93.33,12.0,False
4,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,3.0,33.44,0.63,13.06,0.01,9.0,1.6,0.73,20.9,1.0,0.0,0.0,3.0,0.33,5.0,4.0,4.0,1.0,3.0,0.0,1.0,False
6,1.0,0.0,1.0,0.0,0.0,1.0,0.5,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,10.87,3.5,133.19,0.01,11.0,0.29,7.4,38.05,1.0,0.0,0.0,2.0,0.33,4.0,7.0,4.0,7.0,3.0,50.0,2.0,False
7,3.0,1.0,1.0,0.0,6.0,2.0,1.0,2.0,3.0,1.0,0.5,5.0,1.0,0.0,1.0,1.0,11.07,4.38,211.89,0.02,14.0,0.23,11.77,48.43,0.5,2.0,3.0,5.0,0.5,5.0,9.0,4.0,7.0,4.0,50.0,2.0,False


##Dataset

In [12]:
target = 'defects'

##Preprocessing

In [38]:
X = df.drop(target,axis=1)
y = df.defects.values
X.shape, y.shape

((1296, 36), (1296,))

##Model selection

In [39]:
from sklearn.ensemble import RandomForestClassifier

##Baseline Model

In [40]:
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier()
scores = cross_val_score(model, X, y, cv=10)
scores.mean(), scores.std()

(0.97840190816935, 0.004596083828718224)

##GridSearch

In [41]:
from sklearn.model_selection import GridSearchCV

In [42]:
parameter_space = {
    "criterion": ['gini', 'entropy'],
    'max_depth': range(10,15),
    'max_features': np.linspace(0.3, 0.45, 4),
    "n_estimators": range(12,16),
}

In [44]:
grid_search = GridSearchCV(model, parameter_space, n_jobs=-1)

In [45]:
start = time.time()
grid_search.fit(X, y)
end = time.time()

In [46]:
print("Fit Time:", end - start)
print("Best param:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fit Time: 43.20111346244812
Best param: {'criterion': 'entropy', 'max_depth': 14, 'max_features': 0.3, 'n_estimators': 14}
Best score: 0.9814820314820316


##Optuna

In [48]:
import optuna

In [49]:
def objective(trial):

    # Parameter space
    parameter_space = {
        "criterion": trial.suggest_categorical('criterion', ['gini','entropy']),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "max_features": trial.suggest_float("max_features", 0.1, 0.9),
        "n_estimators": trial.suggest_int("n_estimators", 2, 10),
    }

    # Setup model using hyper-parameters values
    model = RandomForestClassifier(**parameter_space)

    # Scoring model
    score = cross_val_score(model, X, y, n_jobs=-1, cv=10)

    return score.mean()

In [50]:
study = optuna.create_study(direction="maximize")

[32m[I 2023-03-26 20:47:27,949][0m A new study created in memory with name: no-name-0254ab10-1c4e-4393-9a4c-7cb43f0d91c0[0m


In [51]:
start = time.time()
study.optimize(objective, n_trials=10)
end = time.time()

[32m[I 2023-03-26 20:47:35,865][0m Trial 0 finished with value: 0.9807155635062612 and parameters: {'criterion': 'gini', 'max_depth': 9, 'max_features': 0.3278929126306275, 'n_estimators': 4}. Best is trial 0 with value: 0.9807155635062612.[0m
[32m[I 2023-03-26 20:47:36,950][0m Trial 1 finished with value: 0.9791771019677997 and parameters: {'criterion': 'gini', 'max_depth': 14, 'max_features': 0.41137764764156803, 'n_estimators': 10}. Best is trial 0 with value: 0.9807155635062612.[0m
[32m[I 2023-03-26 20:47:37,474][0m Trial 2 finished with value: 0.9814907573047108 and parameters: {'criterion': 'entropy', 'max_depth': 11, 'max_features': 0.8976845719232794, 'n_estimators': 4}. Best is trial 2 with value: 0.9814907573047108.[0m
[32m[I 2023-03-26 20:47:37,746][0m Trial 3 finished with value: 0.9814907573047108 and parameters: {'criterion': 'gini', 'max_depth': 12, 'max_features': 0.6201859345200299, 'n_estimators': 2}. Best is trial 2 with value: 0.9814907573047108.[0m
[32

In [52]:
print("Fit Time:", end - start)
print("Best Param:", study.best_params)
print("Best score:", study.best_value)

Fit Time: 11.230395078659058
Best Param: {'criterion': 'entropy', 'max_depth': 11, 'max_features': 0.8976845719232794, 'n_estimators': 4}
Best score: 0.9814907573047108


##Hyperopt

In [None]:
!pip install hyperopt
!pip install hpsklearn

In [53]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [54]:
parmeter_space = {
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.uniform('max_features', 0.1,0.9),
    'n_estimators': hp.choice('n_estimators', range(1,10)),
}

In [55]:
def objective(params):
    model = RandomForestClassifier(**params)
    return cross_val_score(model, X, y).mean()

In [56]:
best = 0
best_param = {}

def f(params):
    global best, best_param
    acc = objective(params)
    if acc > best:
        best = acc
        best_param = params
        print( 'new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}
trials = Trials()

In [57]:
start = time.time()
hyperopt_search = fmin(f, parmeter_space, algo=tpe.suggest, max_evals=300, trials=trials)
end = time.time()

new best:
0.9791654291654291
{'criterion': 'entropy', 'max_depth': 7, 'max_features': 0.8119541004249835, 'n_estimators': 8}
new best:
0.9807098307098308
{'criterion': 'entropy', 'max_depth': 5, 'max_features': 0.15416950259686388, 'n_estimators': 9}
new best:
0.9814820314820316
{'criterion': 'gini', 'max_depth': 2, 'max_features': 0.7646456687356039, 'n_estimators': 7}
new best:
0.9822542322542324
{'criterion': 'entropy', 'max_depth': 2, 'max_features': 0.5111720519326507, 'n_estimators': 8}
new best:
0.9830264330264331
{'criterion': 'entropy', 'max_depth': 4, 'max_features': 0.8664625285069334, 'n_estimators': 5}
100%|██████████| 300/300 [01:00<00:00,  4.93trial/s, best loss: -0.9830264330264331]


In [58]:
print("Fit Time:", end - start)
print("Best Param:", best_param)
print("Best score:", best)

Fit Time: 60.884764671325684
Best Param: {'criterion': 'entropy', 'max_depth': 4, 'max_features': 0.8664625285069334, 'n_estimators': 5}
Best score: 0.9830264330264331
