## Importing packages

In [1]:
import weka.core.jvm as jvm
import weka.core.converters as converters
from weka.filters import Filter
from weka.classifiers import Classifier, Evaluation
from weka.core.dataset import Instances
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import os
import pickle

In [2]:
# Moving to project directory
os.chdir('..')

In [3]:
jvm.start(packages=True)

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['C:\\Users\\Cassio\\.conda\\envs\\pww3\\Lib\\site-packages\\javabridge\\jars\\rhino-1.7R4.jar', 'C:\\Users\\Cassio\\.conda\\envs\\pww3\\Lib\\site-packages\\javabridge\\jars\\runnablequeue.jar', 'C:\\Users\\Cassio\\.conda\\envs\\pww3\\Lib\\site-packages\\javabridge\\jars\\cpython.jar', 'c:\\Users\\Cassio\\.conda\\envs\\pww3\\lib\\site-packages\\weka\\lib\\python-weka-wrapper.jar', 'c:\\Users\\Cassio\\.conda\\envs\\pww3\\lib\\site-packages\\weka\\lib\\weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support enabled


### Testing Diabets Dataset

In [4]:
# Importing big dataset arfffile
data = converters.load_any_file("data\\processed\\big_data_set.arff")
data.class_is_last()

In [5]:
# Importing descriptor of big dataset
descriptors_df = pd.read_pickle("data\\processed\\descriptors_df_bigdata.pkl")
descriptors_df

Unnamed: 0,dsMatrixCorrelSD,dsEigenvaluePropIntercept,dsEigenvalueCumulativeIntercept,dsChiSquaredMax,attChiSquaredNormalized,attClassifierLogisticNormalized,attCorrelationNormalized,attReliefFNormalized,attSymmetricalUncertNormalized,dsLOGnInstances,dsLnNumClasses,attribute
0,0.163404,0.03588,0.143081,22728.069055,1.0,,0.494489,0.469193,0.740058,5.404286,0.693147,GenHlth
1,0.163404,0.03588,0.143081,22728.069055,0.772788,,1.0,0.155079,1.0,5.404286,0.693147,HighBP
2,0.163404,0.03588,0.143081,22728.069055,0.642423,,0.824021,0.046219,0.272842,5.404286,0.693147,BMI
3,0.163404,0.03588,0.143081,22728.069055,0.532118,,0.829723,0.013901,0.722517,5.404286,0.693147,DiffWalk
4,0.163404,0.03588,0.143081,22728.069055,0.447695,,0.761307,0.124741,0.569867,5.404286,0.693147,HighChol
5,0.163404,0.03588,0.143081,22728.069055,0.386969,,0.172938,1.0,0.22008,5.404286,0.693147,Age
6,0.163404,0.03588,0.143081,22728.069055,0.352128,,0.651083,0.031842,0.263792,5.404286,0.693147,PhysHlth
7,0.163404,0.03588,0.143081,22728.069055,0.350796,,0.673888,0.007514,0.542532,5.404286,0.693147,HeartDiseaseorAttack
8,0.163404,0.03588,0.143081,22728.069055,0.308153,,0.258837,0.327522,0.187762,5.404286,0.693147,Income
9,0.163404,0.03588,0.143081,22728.069055,0.177187,,0.254276,0.213261,0.141861,5.404286,0.693147,Education


In [6]:
 # load the model from disk
filename = 'models\\best_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model

In [7]:
# Selected Features
selected_features = loaded_model.predict(descriptors_df.iloc[:,0:11])
s = list(map(bool,selected_features))
descriptors_df['attribute'][s].values

array(['GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol', 'Age',
       'Income', 'Education', 'PhysActivity', 'CholCheck', 'Smoker',
       'HvyAlcoholConsump', 'Veggies', 'Fruits'], dtype=object)

In [8]:
print('Complete dataset: ',len(descriptors_df['attribute']))
print('Number of select features: ',sum(selected_features))

Complete dataset:  21
Number of select features:  14


In [9]:
remove_features_index = [i for i, val in enumerate(selected_features) if val==0]

In [10]:
# proposed model subset
for p,remove in enumerate(descriptors_df.attribute[remove_features_index]):
    if p==0:
        dataset1 = Instances.copy_instances(data)  # Copy instances from dataset1
        dataset1.class_is_last()   # set class attribute
    remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", remove])
    remove.inputformat(dataset1)
    dataset1 = remove.filter(dataset1)


In [11]:
num_runs= 1
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True)
score_new_aux=[]
score_complete_aux=[]       
    ######################################################

classifier = Classifier(classname="weka.classifiers.functions.SimpleLogistic")

for run in np.arange(num_runs):

    # Perform cross-validation on the datasets
    iteration=1
    for train_indices, test_indices in kf.split(data):

        # print('Iteration ',iteration)
        train_set0 = Instances.copy_instances(data)  # Copy instances from dataset1
        test_set0 = Instances.copy_instances(data)   # Copy instances from dataset1

        train_set1 = Instances.copy_instances(dataset1)  # Copy instances from dataset1
        test_set1 = Instances.copy_instances(dataset1)   # Copy instances from dataset1


        # Set instances indices for training and testing sets
        train_set0.delete()
        train_set1.delete()
        test_set0.delete()
        test_set1.delete()


        for index in train_indices:
            train_set0.add_instance(data.get_instance(index))
            train_set1.add_instance(dataset1.get_instance(index))

        for index in test_indices:
            test_set0.add_instance(data.get_instance(index))
            test_set1.add_instance(dataset1.get_instance(index))

        # Build and evaluate model complete
        classifier.build_classifier(train_set0)    
        evaluation0 = Evaluation(train_set0)
        evaluation0.test_model(classifier, test_set0)
        score_complete_aux.append(evaluation0.percent_correct)                

        # Build and evaluate subset model 
        classifier.build_classifier(train_set1)    
        evaluation1 = Evaluation(train_set1)
        evaluation1.test_model(classifier, test_set1)
        score_new_aux.append(evaluation1.percent_correct)



        iteration=iteration+1

n_train_final = len(train_indices)
n_test_final = len(test_indices) 

In [12]:
print('Score with proposed FS:',f'{np.mean(score_new_aux):.2f}','+-',f'{np.std(score_new_aux):.2f}')
print('Score without FS:',f'{np.mean(score_complete_aux):.2f}','+-',f'{np.std(score_complete_aux):.2f}')

Score with proposed FS: 86.49 +- 0.13
Score without FS: 86.54 +- 0.12


In [13]:
# Difference score in each iteration
np.array(score_new_aux) - np.array(score_complete_aux)

array([-0.01182592, -0.06307159, -0.04730369, -0.01970987, -0.04336172,
       -0.00394197, -0.07883948, -0.08278146, -0.17344686,  0.01182592])

### Testing Financial Dataset

In [14]:
# Importing big dataset arfffile
data = converters.load_any_file("data\\external\\big_data_set2.arff")
data.class_is_last()

In [15]:
# Importing descriptor of big dataset
descriptors_df2 = pd.read_pickle("data\\processed\\descriptors_df_bigdata2.pkl")
descriptors_df2

Unnamed: 0,dsMatrixCorrelSD,dsEigenvaluePropIntercept,dsEigenvalueCumulativeIntercept,dsChiSquaredMax,attChiSquaredNormalized,attClassifierLogisticNormalized,attCorrelationNormalized,attReliefFNormalized,attSymmetricalUncertNormalized,dsLOGnInstances,dsLnNumClasses,attribute
0,0.288967,0.05152,0.649535,1013.224087,1.000000,1.000000,0.374215,0.003931,1.000000,3.771587,0.693147,Attr35
1,0.288967,0.05152,0.649535,1013.224087,0.963439,-0.277778,0.637058,0.027382,0.826957,3.771587,0.693147,Attr39
2,0.288967,0.05152,0.649535,1013.224087,0.937163,-0.222222,0.354931,0.003575,0.615657,3.771587,0.693147,Attr22
3,0.288967,0.05152,0.649535,1013.224087,0.867532,0.000000,0.146202,0.006732,0.886571,3.771587,0.693147,Attr42
4,0.288967,0.05152,0.649535,1013.224087,0.844859,-0.055556,0.046529,0.000606,0.543031,3.771587,0.693147,Attr13
...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.288967,0.05152,0.649535,1013.224087,0.066328,-0.055556,0.369112,0.039090,0.065130,3.771587,0.693147,Attr20
60,0.288967,0.05152,0.649535,1013.224087,0.053758,0.000000,0.029232,0.008258,0.085213,3.771587,0.693147,Attr47
61,0.288967,0.05152,0.649535,1013.224087,0.000000,0.000000,0.020472,0.000162,0.000000,3.771587,0.693147,Attr60
62,0.288967,0.05152,0.649535,1013.224087,0.000000,0.000000,0.062513,1.000000,0.000000,3.771587,0.693147,Attr37


In [16]:
# Selected Features
selected_features = loaded_model.predict(descriptors_df2.iloc[:,0:11])
s = list(map(bool,selected_features))
descriptors_df2['attribute'][s].values

array(['Attr35', 'Attr39', 'Attr22', 'Attr42', 'Attr13', 'Attr41',
       'Attr1', 'Attr18', 'Attr7', 'Attr14', 'Attr11', 'Attr15', 'Attr19',
       'Attr16', 'Attr26', 'Attr23', 'Attr31', 'Attr46', 'Attr12',
       'Attr45', 'Attr56', 'Attr3', 'Attr58', 'Attr27', 'Attr54',
       'Attr28', 'Attr48', 'Attr37'], dtype=object)

In [17]:
print('Complete dataset: ',len(descriptors_df2['attribute']))
print('Number of select features: ',sum(selected_features))

Complete dataset:  64
Number of select features:  28


In [18]:
remove_features_index = [i for i, val in enumerate(selected_features) if val==0]

In [19]:
# proposed model subset
for p,remove in enumerate(descriptors_df2.attribute[remove_features_index]):
    if p==0:
        dataset1 = Instances.copy_instances(data)  # Copy instances from dataset1
        dataset1.class_is_last()   # set class attribute
    remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", remove])
    remove.inputformat(dataset1)
    dataset1 = remove.filter(dataset1)


In [20]:
num_runs= 1
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True)
score_new_aux=[]
score_complete_aux=[]       
    ######################################################

classifier = Classifier(classname="weka.classifiers.functions.SimpleLogistic")

for run in np.arange(num_runs):

    # Perform cross-validation on the datasets
    iteration=1
    for train_indices, test_indices in kf.split(data):

        # print('Iteration ',iteration)
        train_set0 = Instances.copy_instances(data)  # Copy instances from dataset1
        test_set0 = Instances.copy_instances(data)   # Copy instances from dataset1

        train_set1 = Instances.copy_instances(dataset1)  # Copy instances from dataset1
        test_set1 = Instances.copy_instances(dataset1)   # Copy instances from dataset1


        # Set instances indices for training and testing sets
        train_set0.delete()
        train_set1.delete()
        test_set0.delete()
        test_set1.delete()


        for index in train_indices:
            train_set0.add_instance(data.get_instance(index))
            train_set1.add_instance(dataset1.get_instance(index))

        for index in test_indices:
            test_set0.add_instance(data.get_instance(index))
            test_set1.add_instance(dataset1.get_instance(index))

        # Build and evaluate model complete
        classifier.build_classifier(train_set0)    
        evaluation0 = Evaluation(train_set0)
        evaluation0.test_model(classifier, test_set0)
        score_complete_aux.append(evaluation0.percent_correct)                

        # Build and evaluate subset model 
        classifier.build_classifier(train_set1)    
        evaluation1 = Evaluation(train_set1)
        evaluation1.test_model(classifier, test_set1)
        score_new_aux.append(evaluation1.percent_correct)



        iteration=iteration+1

n_train_final = len(train_indices)
n_test_final = len(test_indices) 

In [21]:
print('Score with proposed FS:',f'{np.mean(score_new_aux):.2f}','+-',f'{np.std(score_new_aux):.2f}')
print('Score without FS:',f'{np.mean(score_complete_aux):.2f}','+-',f'{np.std(score_complete_aux):.2f}')

Score with proposed FS: 93.27 +- 1.44
Score without FS: 93.03 +- 1.60


In [22]:
# Difference score in each iteration
np.array(score_new_aux) - np.array(score_complete_aux)

array([ 0.33840948,  0.84602369,  0.        , -0.16920474,  0.        ,
       -0.33840948,  0.        ,  0.67681895,  0.50761421,  0.50761421])