In [1]:
import csv
import itertools
import json
import math
import os
import timeit
from statistics import mean, stdev

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#sklearn
from sklearn import model_selection

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier


from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, roc_auc_score,precision_score, accuracy_score

#feature selection

from mrmr import mrmr_classif
from info_gain import info_gain

#external files
import file_operations
import pre_processing
import simple_characteristics
import arfftocsv

In [2]:
import CFS
import fc
import multisurf
import chiSquare
import testrelief
import relieff

In [3]:
def select_features(X, impFeatures, threshold):
    """Keep the columns of ``X`` whose importance score reaches ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Column ``i`` (by position) is scored by the ``i``-th
        value of ``impFeatures``.
    impFeatures : iterable
        One score per column, in column order. Every value is converted with
        ``float()`` before it is compared, so numeric strings are accepted.
    threshold : float
        Inclusive lower bound a score must reach for its column to be kept.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected columns, in their original order and with the
        row index of ``X``. When no score reaches the threshold an empty
        ``pd.DataFrame()`` is returned (shape ``(0, 0)``), which is what the
        callers in this file test for.

    Raises
    ------
    IndexError
        If ``impFeatures`` has a qualifying score at a position beyond the
        last column of ``X``.
    """
    # Collect the qualifying positions first and slice once; growing a frame
    # with pd.concat per column is quadratic and realigns the index each time.
    keep = [idx for idx, value in enumerate(impFeatures) if float(value) >= threshold]
    if not keep:
        return pd.DataFrame()
    return X.iloc[:, keep]

In [4]:
def fcbf_features(X,Y):
    """Run ``fc.fcbf`` on the features ``X`` and labels ``Y``.

    The result of ``fc.fcbf`` is handed back untouched. The callers in this
    file pass it to ``select_features`` as a sequence of per-feature scores.
    """
    fcbf_result = fc.fcbf(X, Y)
    return fcbf_result

In [5]:
def relief_feature(X, Y):
    """Fit ``testrelief.Relief`` and return the feature weights it learned.

    Parameters
    ----------
    X : array-like
        Feature matrix; it is copied into a float NumPy array before fitting.
    Y : pandas.Series
        Class labels. ``Y.values`` is what gets passed to the estimator.

    Returns
    -------
    The ``w_`` attribute of the fitted ``Relief`` object (one weight per
    feature is what the callers in this file assume).
    """
    # ``np.float`` was only an alias of the builtin ``float`` and was removed
    # in NumPy 1.24; the builtin gives the identical float64 conversion.
    features = np.array(X).astype(float)
    # Original author's note: Relief runs on all processors by default
    # (not verifiable from this file).
    relief = testrelief.Relief(n_features=(features.shape[1] - 1))
    # The transformed matrix is not needed; the call is made for its side
    # effect of populating ``relief.w_``.
    relief.fit_transform(features, Y.values)
    return relief.w_

In [6]:
def gain_ratio(X,Y):
    """Compute ``info_gain.info_gain_ratio`` for every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; each column is scored independently.
    Y : pandas.Series
        Class labels, converted once to a plain list.

    Returns
    -------
    list
        One value per column of ``X``, in column order.
    """
    # The label list does not depend on the column, so build it once.
    labels = Y.values.tolist()
    # ``DataFrame.iteritems`` was removed in pandas 2.0; ``items`` is the
    # long-standing equivalent and yields the same (name, Series) pairs.
    # NOTE(review): the column values are passed first and the labels second.
    # Confirm this matches the (examples, attribute) order expected by
    # ``info_gain.info_gain_ratio`` - the ratio's denominator is not symmetric.
    return [info_gain.info_gain_ratio(col.values, labels) for _, col in X.items()]

In [7]:
def get_variables(df):
    """Split ``df`` into a feature frame and its labels.

    The labels come from ``simple_characteristics.get_labels(df)``. The
    feature frame is ``df`` without the column found at position
    ``simple_characteristics.class_index``. ``class_index`` is read only after
    ``get_labels`` has been called.

    Returns
    -------
    tuple
        ``(features, labels)``.
    """
    labels = simple_characteristics.get_labels(df)
    class_column = df.columns[simple_characteristics.class_index]
    features = df.drop(class_column, axis=1)
    return features, labels

In [8]:
def mutual_info(X,Y, random_state=None):
    """Return the ``mutual_info_classif`` score of every feature in ``X``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    Y : array-like
        Class labels.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``mutual_info_classif``. The default ``None`` keeps the
        previous behaviour; pass an integer to make the scores repeatable
        between runs.

    Returns
    -------
    Whatever ``mutual_info_classif`` returns (one score per feature).
    """
    return mutual_info_classif(X, Y, random_state=random_state)

In [9]:
def transform_to_non_negative(X):
    """Return a copy of ``X`` in which no column has a negative minimum.

    Every column whose minimum is below zero is shifted up by the absolute
    value of that minimum, so its smallest entry becomes zero. All other
    columns are left as they are, and ``X`` itself is not modified.
    """
    shifted = X.copy()
    for position in range(len(shifted.columns)):
        column = shifted.iloc[:, position]
        lowest = column.min()
        # Written as "not (lowest < 0)" so that a NaN minimum is skipped.
        if not (lowest < 0):
            continue
        shifted.iloc[:, position] = column + abs(lowest)
    return shifted


In [10]:
def relief_F_feature(X, Y):
    """Fit a default ``relieff.ReliefF`` and return the result of ``fit``.

    ``X`` must expose ``.values`` (e.g. a pandas DataFrame); that array is
    passed to ``fit`` together with ``Y``. The callers in this file use the
    returned object as a sequence of per-feature scores.
    """
    estimator = relieff.ReliefF()
    fitted = estimator.fit(X.values, Y)
    return fitted

In [11]:
def drop_fs(fs, X, Y, X_new): #to drop column that has sequence number
    """Re-run feature selection after removing one column from ``X``.

    The column removed is the one named by ``X_new.columns[0]``. The selector
    ``fs`` is then run on the remaining columns, and the columns whose score
    is at least the mean score are returned.

    Parameters
    ----------
    fs : str
        Selector name: one of "GR", "MI", "relief", "mrmr", "chisquare",
        "fcbf", "cfs", "relieff", "multisurf".
    X : pandas.DataFrame
        Full feature matrix (not modified; ``drop`` returns a new frame).
    Y : pandas.Series
        Class labels.
    X_new : pandas.DataFrame
        A previous selection; only its first column name is used.

    Returns
    -------
    pandas.DataFrame
        The result of ``select_features`` on the reduced matrix.

    Raises
    ------
    ValueError
        If ``fs`` is not a known selector name. Previously this surfaced as
        an UnboundLocalError on ``impF`` after the column had been dropped.
    """
    known = ("GR", "MI", "relief", "mrmr", "chisquare",
             "fcbf", "cfs", "relieff", "multisurf")
    if fs not in known:
        raise ValueError("unknown feature selection algorithm: %r" % (fs,))

    X = X.drop(X_new.columns[0], axis=1)

    if fs == "mrmr":
        # With return_scores=True mrmr_classif returns a tuple; element 1
        # holds the scores that are thresholded here.
        impF = mrmr_classif(X, Y, K=X.shape[1]-1, return_scores = True)
        return select_features(X, impF[1], np.mean(impF[1]))

    # Lambdas keep the lookup lazy: only the requested selector is evaluated.
    scorers = {
        "GR": lambda: gain_ratio(X, Y),
        "MI": lambda: mutual_info(X, Y),
        "relief": lambda: relief_feature(X, Y),
        "chisquare": lambda: chiSquare.chi_square_feature_importance(X, Y),
        "fcbf": lambda: fcbf_features(X, Y),
        "cfs": lambda: CFS.cfs(X, Y),
        "relieff": lambda: relief_F_feature(X, Y),
        "multisurf": lambda: multisurf.multisurf(X, Y),
    }
    impF = scorers[fs]()
    return select_features(X, impF, np.mean(impF))

In [12]:
def _feature_scores(fs, X, Y):
    """Run the selector named ``fs`` on ``(X, Y)`` and return its per-feature scores.

    For "mrmr" the library call returns a tuple when ``return_scores=True``;
    element 1 of that tuple is returned so every selector yields a flat
    sequence of scores. Raises ValueError for an unknown selector name.
    """
    if fs == "GR":
        return gain_ratio(X, Y)
    if fs == "MI":
        return mutual_info(X, Y)
    if fs == "relief":
        return relief_feature(X, Y)
    if fs == "mrmr":
        return mrmr_classif(X, Y, K=X.shape[1]-1, return_scores = True)[1]
    if fs == "chisquare":
        return chiSquare.chi_square_feature_importance(X, Y)
    if fs == "fcbf":
        return fcbf_features(X, Y)
    if fs == "cfs":
        return CFS.cfs(X, Y)
    if fs == "relieff":
        return relief_F_feature(X, Y)
    if fs == "multisurf":
        return multisurf.multisurf(X, Y)
    raise ValueError("unknown feature selection algorithm: %r" % (fs,))


def classify(fileName, X,Y):
    """Evaluate a soft-voting ensemble on the features kept by each selector.

    For every selector name in the built-in list, the features of ``X`` are
    scored and those scoring at least the mean are kept. The data is split
    80/20 (``random_state=42``), the ensemble is fitted on the training part,
    and metrics are computed on the test part.

    Parameters
    ----------
    fileName : str
        Identifier stored in the first field of every result row.
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.Series
        Class labels.

    Returns
    -------
    list of list
        One row per selector with, in order: file name, selector name,
        accuracy, macro precision, macro recall, weighted precision,
        weighted recall, scoring time (s), number of selected features,
        fit+predict time (s), ``simple_characteristics.class_index``.
    """
    result = []

    featureAlgo = ["GR", "fcbf","relieff","chisquare","cfs","multisurf", "MI","mrmr","relief"]

    # NOTE(review): the decision tree has no random_state, unlike the logistic
    # regression; results may vary slightly between runs.
    estimators = [
        ('lr', LogisticRegression(penalty = 'l2', random_state = 0)),
        ('dt', DecisionTreeClassifier(max_depth = 3)),
        ('knn1', KNeighborsClassifier(n_neighbors = 6, metric = 'minkowski', p = 2)),
        ('nbs1', GaussianNB()),
    ]
    ensemble = VotingClassifier(estimators, voting='soft')

    for fs in featureAlgo:
        # Only the scoring is timed, for every selector alike (the mrmr
        # branch used to include the column selection in its timing).
        start = timeit.default_timer()
        scores = _feature_scores(fs, X, Y)
        stop = timeit.default_timer()
        print(fs)
        X_new = select_features(X, scores, np.mean(scores))

        print(X_new.shape,X.shape)
        if(X_new.shape[1]==1):
            # A single surviving column is dropped and selection is re-run
            # on the remaining columns.
            print("DROPPED")
            X_new = drop_fs(fs, X, Y, X_new)

        if(X_new.shape[0] == 0 or X_new.shape[1] == 0):
            # Go back to the selection made on the full matrix. ``scores`` is
            # already the flat score sequence, so this also works for mrmr
            # (the raw mrmr tuple used to be passed to np.mean here).
            print("CHANGE")
            X_new = select_features(X, scores, np.mean(scores))

        print(X_new.shape)
        X_train, X_test, y_train, y_test = train_test_split(X_new, Y, test_size=0.20, random_state=42)
        print(fs)
        start1 = timeit.default_timer()
        ensemble.fit(X_train, y_train)
        y_pred = ensemble.predict(X_test)
        stop1 = timeit.default_timer()

        # Each metric is computed exactly once with its intended averaging.
        pre_sc_macro = precision_score(y_test, y_pred, average='macro')
        recal_sc_macro = recall_score(y_test, y_pred, average='macro')
        pre_sc_weigh = precision_score(y_test, y_pred, average='weighted')
        recal_sc_weigh = recall_score(y_test, y_pred, average='weighted')
        print("Calcu")

        result.append([
            fileName,
            fs,
            ensemble.score(X_test, y_test),
            pre_sc_macro,
            recal_sc_macro,
            pre_sc_weigh,
            recal_sc_weigh,
            stop - start,
            X_new.shape[1],
            stop1 - start1,
            simple_characteristics.class_index,
        ])
    return result

       # print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [13]:
timeRequired = []
classification_perfomance=[[]]

# Maps a dataset file name to its full path; filled by the directory walk below.
listofFiles={}
abs_path = os.getcwd()+'/datasets/MFEAI/AdditionalError/Working/'
file_operations.un_zip_files(abs_path) #extracted zipped files
arfftocsv.arffTocsv(abs_path)  # presumably converts .arff files to CSV; helper not visible here
test = 'test'
train = 'train'
valid = 'valid'
# flag is set on the first train/test merge of the whole walk and never reset.
flag=0
count = 0
duplicate_files = []
for path, subdirs, files in os.walk(abs_path):

    duplicate_files = []
    test_datasets = [s for s in files if test in s]
    train_datasets = [s for s in files if train in s]
    valid_datasets = [s for s in files if valid in s]
    if train_datasets:
        # Merge every train file with the test file sharing its prefix (the
        # text before the first "_") into "<prefix>.csv" in the same folder.
        for td in train_datasets:
            pre_train = (td).split("_")[0]
            for ted in test_datasets:
                pre_test = (ted).split("_")[0]
                if(pre_train in ted) and (pre_test==pre_train):
                    table1 = pd.read_csv(path+"/"+td)
                    table2 = pd.read_csv(path+"/"+ted)
                    table1.columns = table2.columns
                    # DataFrame.append was removed in pandas 2.0; pd.concat
                    # stacks the rows the same way (original indices kept).
                    data = pd.concat([table1, table2])
                    # The index is written as well (to_csv default).
                    data.to_csv(path+"/"+pre_train+".csv")
                    if flag:
                        files.append(pre_train+".csv")
                    else:
                        flag=1
                        files=[pre_train+".csv"]
    # When the same base name exists in two of the formats paired below, only
    # "<base>.csv" is kept for this directory.
    base = [os.path.splitext(f) for f in files]
    if(base):
        for a, b in itertools.combinations(base, 2):
            if(a[0]==b[0]):
                if((a[1]==".csv" and b[1]==".xlsx") or
                   (b[1]==".csv" and a[1]==".xlsx") or
                   (a[1]==".arff" and b[1]==".xlsx") or
                   (b[1]==".arff" and a[1]==".xlsx") or 
                   (a[1]==".arff" and b[1]==".csv") or
                   (b[1]==".arff" and a[1]==".csv") or 
                   (a[1]==".data" and b[1]==".csv") or
                   (b[1]==".data" and a[1]==".csv")):
                   duplicate_files.append(a[0]+".csv")

    # Any duplicate found replaces the whole file list of this directory.
    if(duplicate_files):
        files = duplicate_files

    for name in files:
        if name.endswith((".data", ".csv", ".xlsx",".xls", ".asc",".dat",".trn")):
            listofFiles[name]=os.path.join(path, name)

# create the inputs and outputs
featureAlgo = ["GR","mrmr","fcbf","relief","relieff", "MI"]  # not read below; classify() uses its own list
classification_perfomance = []
for eachFile in listofFiles:
    print(eachFile)
    print("***************************TOTAL FILES*********************************", count)
    count+=1
    print("----------------------")
    dataset = file_operations.custom_csv(listofFiles[eachFile])
    # The return value is unused; the call is kept in case get_labels has
    # side effects (e.g. on simple_characteristics.class_index) - TODO confirm.
    all_Label = simple_characteristics.get_labels(dataset)

    dataset = pre_processing.drop_rows(dataset)
    dataset = pre_processing.drop_columns(dataset)
    dataset = pre_processing.convert_NAs(dataset)
    dataset = pre_processing.convert_str_int_categorical(dataset)
    dataset = pre_processing.convert_str_int_nominal(dataset)
    X,y = simple_characteristics.get_XY(dataset)
    classification_output = classify(eachFile, X,y)
    for item in classification_output:
        classification_perfomance.append(item)
    # The frame holds the rows of every file processed so far, so each CSV
    # written below is cumulative.
    df=pd.DataFrame(classification_perfomance)
    df.columns =['File', 'FeatureAlgo', 'Accuracy','Precision macro' ,'Recall macro', 'Precision weight', 'Recall weight', 'Time FS', 'Features','Time ensemble','class index']
    df.to_csv("df_ensemble"+ eachFile+"s.csv",index=False)

non-verbal tourist data.csv
***************************TOTAL FILES********************************* 0
----------------------
GR
(73, 9) (73, 22)
(73, 9)
GR
Calcu
fcbf
(73, 10) (73, 22)
(73, 10)
fcbf
Calcu
relieff
(73, 12) (73, 22)
(73, 12)
relieff
Calcu
chisquare
(73, 10) (73, 22)
(73, 10)
chisquare
Calcu
[0.010451220175439022, 0.11859341971115665, 0.12751753240007702, 0.12079826404608915, 0.15521993227702152, 0.1464362454194634, 0.18146538182064106, 0.044358197431603116, 0.0, 0.1741388420921757, 0.13063100207212847, 0.025462312925564428, 0.12918020937627417, 0.06746032527080427, 0.06364322523459935, 0.13868980472144832, 0.04802422987917405, 0.059228100689386574, 0.08877372471468441, 0.10716898648566205, 0.030490650231474643, 0.048341081880885414]
cfs
(73, 11) (73, 22)
(73, 11)
cfs
Calcu
0
multisurf
(73, 10) (73, 22)
(73, 10)
multisurf
Calcu
MI
(73, 11) (73, 22)
(73, 11)
MI
Calcu


100%|██████████| 21/21 [00:03<00:00,  6.57it/s]


mrmr
(73, 8) (73, 22)
(73, 8)
mrmr
Calcu
relief
(73, 5) (73, 22)
(73, 5)
relief
Calcu
