In [18]:
# Importing libraries

import os, csv, random, math, time, statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, datasets
from skfeature.utility.mutual_information import su_calculation
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier
import warnings
warnings.filterwarnings("ignore")

random.seed(10)

In [16]:
# path to the data file
path = '../defects/data/Jureczko/collated_data/poi_merged.csv'

In [17]:
# reading the data
data = pd.read_csv(path)
rows, cols = data.shape
print("Rows:",rows,"\nCols:",cols)

Rows: 1378 
Cols: 21


### Forward Selection

In [4]:
#Forward feature selection
def forwardSelection(X,y):
    gScore = pd.DataFrame(columns=['Number of features','gScore'])
    rows, cols = X.shape
    prevG = 0
    for i in range(cols+1):
        X_sub = X.iloc[:,:i+1]
        #print("Selected ",(i+1)," cols.",X_sub.shape)
        X_train, X_test, y_train, y_test = train_test_split(X_sub, y, test_size=0.33, random_state=42)
        
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)
        clf.fit(X_train, y_train.values.ravel())
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        recall = cm[1][1]/(cm[1][1] + cm[1][0])
        pf = cm[0][1]/(cm[0][1] + cm[0][0])
        #computing the g-score
        g = 2/((1/recall) + (1/(1-pf)))
        delta = g - prevG
        if(delta < 0):
            break
        prevG = g
        gScore.loc[i] = [i+1,g]

    # print("Estimated g-Score: ",round(gScore.tail(1).iloc[0,1],4))
    
#     plt.plot(gScore['Number of features'],gScore['gScore'])
#     plt.xlabel('Number of features')
#     plt.ylabel('g Score')
#     plt.grid(linewidth = 0.5)
#     plt.title("Forward Selection")
#     plt.show()
    
    return i


### Backward Elimination

In [5]:
#Backward feature elimination
def backwardElimination(X,y):
    gScore = pd.DataFrame(columns=['Number of features','gScore'])
    rows, cols = X.shape
    prevG = 0
    for i in range(cols-1):
        X_sub = X.iloc[:,0:cols-i]
#         print("Removed ",i," cols.",X_sub.shape)
        X_train, X_test, y_train, y_test = train_test_split(X_sub, y, test_size=0.33, random_state=42)

        clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)
        clf.fit(X_train, y_train.values.ravel())
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        recall = cm[1][1]/(cm[1][1] + cm[1][0])
        pf = cm[0][1]/(cm[0][1] + cm[0][0])
        #computing the g-score
        g = 2/((1/recall) + (1/(1-pf)))
        delta = g - prevG
        if (delta < 0):
            break
        prevG = g
        gScore.loc[i] = [cols-i,g]
    
    # print("Estimated g-Score: ",round(gScore.tail(1).iloc[0,1],4))
    
#     plt.plot(gScore['Number of features'],gScore['gScore'])
#     plt.xlabel('Number of features')
#     plt.ylabel('g Score')
#     plt.grid(linewidth = 0.5)
#     plt.title("Backward Elimination")
#     plt.show()
    
    return i


### Information Gain as feature selector

In [6]:
def infoGain(X, y):
    model = ExtraTreesClassifier()
    model.fit(X, y)
    importance = list(model.feature_importances_)
    features = list(X.columns)
    
#     plt.bar(features, importance, align='center', alpha=0.5)
#     plt.xlabel('Features')
#     plt.xticks(rotation=90)
#     plt.ylabel('Feature Importance')
#     plt.title('Information Gain')
#     plt.show()
    
    thresh = max(importance)/2
    top_feat_idx = [importance.index(i) for i in importance if i >= thresh]
    top_feat = []
    for i in top_feat_idx:
        top_feat.append(features[i])
    return (top_feat)

### Relief-based feature selection using ReliefF

In [23]:
clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=100),
                    RandomForestClassifier(n_estimators=100))

In [26]:
from sklearn.model_selection import cross_val_score
print(np.mean(cross_val_score(clf, X, y)))



ValueError: All labels are of the same class.

### Principal Component Analysis

In [55]:
# Ref.: https://hub.packtpub.com/4-ways-implement-feature-selection-python-machine-learning/

pca = PCA(n_components=3)
fit = pca.fit(X)
print(fit.explained_variance_ratio_)

print(fit.components_)

[0.7718697  0.19943698 0.0242134 ]
[[ 2.08480764e-02  1.22482731e-04  1.01885694e-04  1.51927513e-02
   6.02377726e-02  5.24866004e-01  9.73575586e-03  6.70333080e-03
   1.44229992e-02 -1.31650260e-04  8.47333868e-01  1.87479904e-04
   1.56554236e-03 -7.15992479e-05 -2.54605261e-04  4.08105603e-04
   6.49997960e-04  4.30353971e-02  6.60243648e-03  6.11311438e-04]
 [ 1.10122414e-02 -1.17253912e-04  6.98785931e-04  2.31083680e-02
   5.40169019e-03  8.45584940e-01  2.38050432e-02  7.40010877e-04
   1.43858584e-02  2.04220015e-04 -5.19373527e-01 -1.51573681e-04
   5.81684247e-04 -3.15933471e-05  1.36802888e-04 -2.83898721e-04
  -6.69103107e-04 -1.17196487e-01 -6.42759813e-03 -9.16329570e-04]
 [-2.05398059e-02 -8.65992141e-04 -8.33795337e-04 -6.04139951e-03
  -6.55034624e-02  8.01500461e-02  2.03189717e-03 -8.29998028e-03
  -1.12605102e-02  7.58340848e-04 -9.43161993e-02 -5.69890271e-04
  -1.63958969e-03 -4.33937035e-05  1.37673776e-04 -7.80178961e-04
  -1.58765354e-03  9.89742845e-01 -1.14

### CFS

In [7]:
def merit_calculation(X, y):
    """
    This function calculates the merit of X given class labels y, where
    merits = (k * rcf)/sqrt(k+k*(k-1)*rff)
    rcf = (1/k)*sum(su(fi,y)) for all fi in X
    rff = (1/(k*(k-1)))*sum(su(fi,fj)) for all fi and fj in X
    Input
    ----------
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    Output
    ----------
    merits: {float}
        merit of a feature subset X
    """

    n_samples, n_features = X.shape
    rff = 0
    rcf = 0
    for i in range(n_features):
        fi = X.iloc[:, i]
        rcf += su_calculation(fi, y)
        for j in range(n_features):
            if j > i:
                fj = X.iloc[:, j]
                rff += su_calculation(fi, fj)
    rff *= 2
    merits = rcf / np.sqrt(n_features + rff)
    return merits


def cfs(X, y):
    """
    This function uses a correlation based heuristic to evaluate the worth of features which is called CFS
    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    Output
    ------
    F: {numpy array}
        index of selected features
    Reference
    ---------
    Zhao, Zheng et al. "Advancing Feature Selection Research - ASU Feature Selection Repository" 2010.
    """

    n_samples, n_features = X.shape
    F = []
    # M stores the merit values
    M = []
    while True:
        merit = -100000000000
        idx = -1
        for i in range(n_features):
            if i not in F:
                F.append(i)
                # calculate the merit of current selected features
                t = merit_calculation(X.iloc[:, F], y)
                if t > merit:
                    merit = t
                    idx = i
                F.pop()
        F.append(idx)
        M.append(merit)
        if len(M) > 5:
            if M[len(M)-1] <= M[len(M)-2]:
                if M[len(M)-2] <= M[len(M)-3]:
                    if M[len(M)-3] <= M[len(M)-4]:
                        if M[len(M)-4] <= M[len(M)-5]:
                            break
    return np.array(F)

def correlationBased(X,y):
    n_samples, n_features = X.shape    
    num_fea = 20
    idx = cfs(X,y)
    selected_features = X.iloc[:, idx[0:num_fea]]
    return list(selected_features.columns)

## Performing feature selection

In [12]:
def featureSelection(type, X, y):
    start = time.time()
    if (type == 'forward'):
        val = forwardSelection(X,y)
        sel_feat = list(X.columns[:val])
    elif (type == 'backward'):
        val = backwardElimination(X,y)
        sel_feat = list(X.columns[cols-val-1:])
    elif (type == 'infogain'):
        sel_feat = infoGain(X,y)
    elif (type == 'cfs'):
        sel_feat = correlationBased(X,y)
    elif (type == 'none'):
        sel_feat = X.columns
    # print(sel_feat)
    new_data = X[sel_feat]
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)
    X_train, X_test, y_train, y_test = train_test_split(new_data, y, test_size=0.33, random_state=42)

    clf.fit(X_train, y_train.values.ravel())
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    recall = cm[1][1]/(cm[1][1] + cm[1][0])
    pf = cm[0][1]/(cm[0][1] + cm[0][0])
    #computing the g-score
    g = 2/((1/recall) + (1/(1-pf)))
    
    return (g, (time.time() - start))
    

In [19]:
X = data.loc[:,data.columns!='$<bug']
y = data.loc[:,data.columns=='$<bug']  

In [20]:
algos = ['forward','backward','infogain','cfs','none']
for approach in algos:
    g = []
    rt = []
    for i in range(30):
        g_score, runtime = featureSelection(approach,X,y)
        g.append(g_score)
        rt.append(runtime)
    print("Approach:",approach)
    print("Median g_score:", statistics.median(g))
    print("Median Runtime:", statistics.median(rt))


Approach: forward
Median g_score: 0.6294981640146879
Median Runtime: 2.21827495098114
Approach: backward
Median g_score: 0.6353783490539513
Median Runtime: 5.076975226402283
Approach: infogain
Median g_score: 0.586732510288066
Median Runtime: 1.3434358835220337
Approach: cfs
Median g_score: 0.5868774182925824
Median Runtime: 12.122198343276978
Approach: none
Median g_score: 0.5910230882280059
Median Runtime: 1.4215795993804932


In [47]:
g_score, runtime = featureSelection('backward',X,y)
print("g_score:", g_score)
print("Runtime: ", runtime)

g_score: 0.3937823834196891
Runtime:  7.999540328979492


In [49]:
g_score, runtime = featureSelection('infogain',X,y)
print("g_score:", g_score)
print("Runtime: ", runtime)

  This is separate from the ipykernel package so we can avoid doing imports until


g_score: 0.46708643388666293
Runtime:  1.9665675163269043


In [50]:
g_score, runtime = featureSelection('cfs',X,y)
print("g_score:", g_score)
print("Runtime: ", runtime)

g_score: 0.5200744736716475
Runtime:  43.88851451873779


In [13]:
g_score, runtime = featureSelection('none',X,y)
print("g_score:", g_score)
print("Runtime: ", runtime)

g_score: 0.49001175088131604
Runtime:  1.4996843338012695


In [15]:
g = []
rt = []
for i in range(30):
    g_score, runtime = featureSelection('none',X,y)
    g.append(g_score)
    rt.append(runtime)
print("Approach: None")
print("Median g_score:", statistics.median(g))
print("Median Runtime:", statistics.median(rt))

Approach: None
Median g_score: 0.47993291790664405
Median Runtime: 1.5152689218521118
