In [9]:
import numpy as np
import pandas as pd

import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

 #converting string categorical value to numeric categorical
from sklearn.preprocessing import LabelEncoder

#metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#feature selection
from sklearn import svm

from mrmr import mrmr_classif
from info_gain import info_gain
from c45 import C45

import json


import seaborn as sns
import matplotlib.pyplot as plt


#external files
import fc
import testrelief

In [10]:
#from sklearn.metrics import accuracy_score, log_loss
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [11]:
def _readCSVFile(filePath):
        with open(filePath, 'r', newline='',  encoding='utf-8') as csvfile:
            has_header = csv.Sniffer().has_header(csvfile.readline())
            csvfile.seek(0)  # Rewind.
            dialect = csv.Sniffer().sniff(csvfile.read(), delimiters=';,\t')
            csvfile.seek(0) 
            reader = csv.reader(csvfile, dialect)
            if(has_header):
                next(reader)  # Skip header row.
            dataset = pd.DataFrame(reader)
        return dataset
        #print(filePath)

In [12]:
def _readExcel(filePath):
    dataset = pd.read_excel(filePath)
    return dataset

In [13]:
def custom_csv(fname):
    if fname.endswith((".data", ".csv")):
        return _readCSVFile(fname)
    elif fname.endswith((".xlsx", ".xls")):
        return _readExcel(fname)

In [14]:
def _assumption1categorical(df):
    likely_cat = []
    for idx, var in enumerate(df.columns):
        if(1.*df[var].nunique()/df[var].count() < 0.05): #or some other threshold
            likely_cat.append(idx)
    return likely_cat


In [15]:
def _assumption2categorical(df):
    top_n = 10 
    likely_cat = []
    for idx, var in enumerate(df.columns):
        if(1.*df[var].value_counts(normalize=True).head(top_n).sum() > 0.8): #or some other threshold
            likely_cat.append(idx)
    return likely_cat

In [16]:
def convertstrtointcategory(df): 
    le = LabelEncoder()
    ass1 = _assumption1categorical(df) 
    ass2 = _assumption2categorical(df)

    #extract only columns that belong to 
    commonidx = (list(set(ass1) | set(ass2)))

    for i in commonidx:
        df.iloc[:,i] = le.fit_transform(df.iloc[:,i])

    return df

In [17]:
def getLabels(dataset):
    try:
        flag = 0
        #dataset = custom_csv(filePath)
        #dataset = pd.read_csv("/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/iris/iris.data")
        n = dataset.iloc[:, -1].nunique(dropna=False)
        perc = dataset.iloc[:, -1].value_counts(normalize=True)*100
        if(len(perc) > len(dataset.iloc[:, 0].value_counts(normalize=True)*100)):  #checking whether 1st column is label
            n=dataset.iloc[:, 0].nunique(dropna=False)
            flag = 1
        if(flag == 1):
            return dataset.iloc[:, 0]
        else:
            return dataset.iloc[:,-1]
    except:
        print("Can not read last column items for", filePath)

In [18]:
def getindependentVariables(dataset):
    try:
        flag = 0
        #dataset = custom_csv(filePath)
        n = dataset.iloc[:, -1].nunique(dropna=False)
        perc = dataset.iloc[:, -1].value_counts(normalize=True)*100
        if(len(perc) > len(dataset.iloc[:, 0].value_counts(normalize=True)*100)):  #checking whether 1st column is label
            n=dataset.iloc[:, 0].nunique(dropna=False)
            flag = 1
        if(flag == 1):
            return dataset.iloc[:, 1:]
        else:
            return dataset.iloc[:,:-1]
    except:
        print("Can not independent variabless for", filePath)    

In [19]:
def selectFeatures(X, impFeatures, threshold):
    X_new = pd.DataFrame()
    for idx, value in enumerate(impFeatures):
        if(value > threshold):
            X_new = pd.concat((X_new, X.iloc[:, idx]), axis=1)
    return X_new    

In [20]:
def mutualInfo(X,Y):
    return mutual_info_classif(X,Y)

In [21]:
def gainRatio(X,Y):
    info_gain_ratio_values = []
    for idx, col in X.iteritems():
        info_gain_ratio_values.append(info_gain.info_gain_ratio(col.values, Y.values.tolist()))
    return info_gain_ratio_values

In [22]:
def uniqueClasses(Y):
    return len(Y.unique())

In [23]:
def reliefFeature(X, Y):
    X=np.array(X)
    print(type(X))
    X = X.astype(np.float)
    print(type(Y.values))
    r = testrelief.Relief(n_features=(X.shape[1]-1) ) # Will run by default on all processors concurrently
    my_transformed_matrix = r.fit_transform(X,Y.values)
    return r.w_

In [24]:
#report should be dataset specific - each output file is for a single dataset

,0,1,2,accuracy,macro avg,weighted avg, CA, FS, %features  -> such 4*9 rows in each file


NameError: name 'accuracy' is not defined

In [2]:
def fcbf_features(X,Y):
    return fc.fcbf(X,Y)

In [42]:
def classify(clf, X,Y, fs):
    #dataset = custom_csv(fileName)
    #X = getindependentVariables(dataset)
    #Y = getLabels(dt)
    if(fs == "MI"):
      impF = mutualInfo(X,Y)
    elif(fs == "GR"):
      impF = gainRatio(X,Y)
    elif(fs == "mrmr"):
      impF = mrmr_classif(X, Y, K = 10)
    elif(fs == "fcbf"):
      impF = fcbf_features(X,Y)
    elif(fs == "relief"):
      impF =  reliefFeature(X,Y)
    X_new = selectFeatures(X, impF, 0.368)
    if(len(X_new) == 0 ):
        X_new = selectFeatures(X, impF, np.mean(impF))
    print(len(X_new), len(Y))  # add this as a feature ; one of the evaluation criteria
    X_train,X_test,y_train,y_test=train_test_split(X_new,Y,test_size=0.25,random_state=0)
    neighbors = uniqueClasses(Y)
    aName = ["a_"+str(x) for x in range(X_new.shape[1])]
    if(clf.__class__.__name__=="C45"):
      aName = ["a_"+str(x) for x in range(X_new.shape[1])]
      clf.fit(X_train,y_train, aName)
    else:
      clf.fit(X_train, y_train)
    y_pred= clf.predict(X_test)
    target_names = (list(range(uniqueClasses(Y))))
    for index, item in enumerate(target_names):
      target_names[index] = str(item)
   # TN = cnf_matrix.values.sum() - (FP + FN + TP) 
    #tp, fn, fp, tn = metrics.confusion_matrix(y_test, y_pred, labels = getLabels(filePath)).ravel()
    report = classification_report(y_test, y_pred,  target_names=target_names, output_dict=True)
    return (report)


In [47]:
classifiers = [ 
GaussianNB(), 
svm.SVC(kernel='linear', random_state=0),
C45()
]

listofFiles={}
classificationPerfomance={}
featureAlgo = ["MI","GR","mrmr","fcbf","relief"]
for path, subdirs, files in os.walk(os.getcwd()+'/datasets/test/'):
    for name in files:
        if name.endswith((".data", ".csv", ".xlsx", ".xls")):
            listofFiles[name]=os.path.join(path, name)

for eachFile in listofFiles:
    print(eachFile)
    print("----------------------")
    classificationPerfomance = {}
    dataset = custom_csv(listofFiles[eachFile])
    dataset = convertstrtointcategory(dataset)
    X = getindependentVariables(dataset)
    Y = getLabels(dataset)
    n = uniqueClasses(Y)
    classifiers.append(KNeighborsClassifier(n))
    for clf in classifiers:
        name = clf.__class__.__name__
        print(name)
        classificationPerfomance[name] = {}
        for eachFSAlgo in featureAlgo:
            print(eachFSAlgo)
            classificationPerfomance[name][eachFSAlgo] = {}
            perfMetrics = classify(clf, X,Y, eachFSAlgo)
            print("*******DONE*********")
            classificationPerfomance[name][eachFSAlgo] = (perfMetrics)
    classifiers.pop(-1)
    outfile =  os.path.splitext(eachFile)[0]+".json"

    with open(outfile,"w") as f:
    #dataAccuracy = dataAccuracy.to_json()
        json.dump(classificationPerfomance,f)

wine.data
----------------------
GaussianNB
MI
177 177
*******DONE*********
GR
177 177
*******DONE*********
mrmr
100%|██████████| 10/10 [00:02<00:00,  3.82it/s]
177 177
*******DONE*********
fcbf
177 177
*******DONE*********
relief
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
177 177
*******DONE*********
SVC
MI
177 177
*******DONE*********
GR
177 177
*******DONE*********
mrmr
100%|██████████| 10/10 [00:02<00:00,  4.13it/s]
177 177
*******DONE*********
fcbf
177 177
*******DONE*********
relief
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
177 177
*******DONE*********
C45
MI
177 177
*******DONE*********
GR
177 177
*******DONE*********
mrmr
100%|██████████| 10/10 [00:02<00:00,  4.10it/s]
177 177
*******DONE*********
fcbf
177 177
*******DONE*********
relief
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
177 177
*******DONE*********
KNeighborsClassifier
MI
177 177
*******DONE*********
GR
177 177
*******DONE*********
mrmr
100%|██████████| 10/10 [00:02<00:00,  4.25it/s]
  0%|          | 0

In [46]:
outfile =  os.path.splitext(eachFile)[0]+".json"

with open(outfile,"w") as f:
    #classificationPerfomance = classificationPerfomance.to_json()
    #pd.DataFrame(report).to_csv('sample.csv')

    json.dump,f)

In [44]:
pd.DataFrame(classificationPerfomance)
pd.DataFrame(classificationPerfomance).to_csv('sample1.csv')

In [49]:
#working column append n classification report
url ="/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/test/1. Wine/wine.data"
df = custom_csv(url)
X = getindependentVariables(df)
Y = getLabels(df)
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)

clf = svm.SVC(kernel='linear', random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred, output_dict=True)
#pd.DataFrame(report).to_csv('sample1.csv')
print(pd.DataFrame(report))
list_of_str = ['First', 'Second', 'Third', 'Fourth']
tail_len = 4

# The two steps in the description
n_rows = sum(1 for row in open('sample1.csv', 'r'))
df = pd.read_csv('sample1.csv', skiprows=range(1, n_rows - tail_len))
df_rest  = pd.read_csv('sample1.csv', skiprows=range(tail_len, n_rows))

print(df)
clsss = 'KNN'
percFeatures = 12
FSA = 'qwe'
df['class']=clsss
df['percF'] = percFeatures
df['FS'] = FSA
df
f = []
pd.concat([df_rest, df]).to_csv('sampe2.csv')

Accuracy: 0.9555555555555556
                   1      2         3  accuracy  macro avg  weighted avg
precision   1.000000   0.95  0.900000  0.955556   0.950000      0.957778
recall      0.937500   0.95  1.000000  0.955556   0.962500      0.955556
f1-score    0.967742   0.95  0.947368  0.955556   0.955037      0.955782
support    16.000000  20.00  9.000000  0.955556  45.000000     45.000000
  Unnamed: 0                                         GaussianNB  \
0         GR  {'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...   
1       mrmr  {'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...   
2       fcbf  {'0': {'precision': 1.0, 'recall': 0.928571428...   
3     relief  {'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...   

                                                 SVC  \
0  {'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...   
1  {'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...   
2  {'0': {'precision': 1.0, 'recall': 0.928571428...   
3  {'0': {'precision': 1.0, 'recall': 1.0, 'f1

TypeError: fit() missing 1 required positional argument: 'aN'

In [32]:
#dataset = readCSVFile("/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data")
#datasethead()
import json


In [31]:
for path, subdirs, files in os.walk(os.getcwd()+'/datasets/numeric datasets'):
    for name in files:
        if name.endswith((".data", ".csv", ".xlsx")):
            listofFiles[name]=os.path.join(path, name)
for eachFile in listofFiles:
    print(eachFile)

data_banknote_authentication.csv
heart_failure_clinical_records_datase.csv
wine.data
LasVegasTripAdvisorReviews-Dataset.csv
iris.data
glass.data
bank-full.csv
HCV-Egy-Data2.csv


In [32]:
perfMetrics

Unnamed: 0,0,1,2,3,accuracy,macro avg,weighted avg
precision,0.174419,0.265306,0.202532,0.285714,0.236311,0.231993,0.234188
recall,0.178571,0.135417,0.202532,0.431818,0.236311,0.237084,0.236311
f1-score,0.176471,0.17931,0.202532,0.343891,0.236311,0.225551,0.225648
support,84.0,96.0,79.0,88.0,0.236311,347.0,347.0


In [33]:
from sklearn.datasets import load_iris
from c45 import C45
d=custom_csv('/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/working/1. Wine/wine.data')
X = getindependentVariables(d)
Y = getLabels(d)

aName = ["a_"+str(x) for x in range(X.shape[1])]

iris = load_iris()
#X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.5)
#clf = C45(attrNames=iris.feature_names)

impF = gainRatio(X,Y)
X_new = selectFeatures(X, impF, 0.368)

X_train,X_test,y_train,y_test=train_test_split(X_new,Y,test_size=0.25,random_state=0)
aName = ["a_"+str(x) for x in range(X_new.shape[1])]
clf = C45(attrNames=aName)

clf.fit(X_train, y_train)
print(f'Accuracy: {clf.score(X_test, y_test)}')


TypeError: fit() missing 1 required positional argument: 'aN'

In [27]:
import json

with open("dataaccuracy.json","w") as f:
    #dataAccuracy = dataAccuracy.to_json()
    json.dump(dataAccuracy,f)

In [28]:
json.dumps(dataAccuracy, indent=4) 

'{}'

In [29]:
listofFiles={}
classificationPerfomance={}
i=1
j=1

classificationAlgo = {'knn', 'c45', 'nb', 'svm'}
featureAlgo = {'relief', 'relieff', 'ig', 'gr', 'fcbf', 'mrmr', 'qbb', 'focus', 'sc'}
for path, subdirs, files in os.walk(os.getcwd()+'/datasets/numeric datasets'):
    for name in files:
        if name.endswith((".data", ".csv", ".xlsx")):
            listofFiles[name]=os.path.join(path, name)

perfMetrics =            
for eachFile in listofFiles:
    classificationPerfomance[eachFile] = {}
    for eachClassAlgo in classificationAlgo:
        j+=1
        classificationPerfomance[eachFile][eachClassAlgo] = {}
        for eachFSAlgo in featureAlgo:
            fs = "FS"+str(i)
            classificationPerfomance[eachFile][eachClassAlgo][eachFSAlgo] = {}
            i+=1
            if(eachClassAlgo == "kNN"):
                perfMetrics = classifyKNN(listofFiles[eachFile], eachFSAlgo)
            #classificationPerfomance[eachFile]['acc'] = 1

        i=1
    j=1
   

SyntaxError: invalid syntax (<ipython-input-29-0f41813719f4>, line 13)

[0, 1, 2, 3, 4]

In [None]:
from sklearn.feature_selection import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectPercentile, chi2

dataset = pd.read_csv("/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/heart failure/heart_failure_clinical_records_dataset.csv")
Y = dataset.iloc[:, :1]
X = dataset.iloc[:, 1:]
#Y = getLabels("/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/iris/iris.data")

importantFeaures = mutual_info_classif(X, Y)
print((importantFeaures))

X_new = SelectPercentile(mutual_info_classif, percentile=70).fit_transform(X, Y)

model = LogisticRegression(solver='lbfgs')
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)

fit = model.fit(X_train, y_train)

print(X_new.shape)
print(X.shape)
y_red=fit.predict(X_test)
# import the metrics class
cnf_matrix = metrics.confusion_matrix(y_test, y_red)
cnf_matrix


In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)


In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
classifier = KNeighborsClassifier(n_neighbors=3,p=2,metric='euclidean')

In [None]:
classifier.fit(X_train,y_train)


In [None]:
y_pred = classifier.predict(X_test)


In [None]:
conf_matrix = confusion_matrix(y_test,y_pred)
print(conf_matrix)

In [None]:
print(accuracy_score(y_test,y_pred))


In [None]:
print(f1_score(y_test,y_pred, average="macro"))


In [None]:
def classifaction_report_csv(report):
    report_data = []
    lines = report.split('\n')
    for line in lines[2:-3]:
        row = {}
        row_data = line.split('      ')
        row['class'] = row_data[0]
        row['precision'] = float(row_data[1])
        row['recall'] = float(row_data[2])
        row['f1_score'] = float(row_data[3])
        row['support'] = float(row_data[4])
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    dataframe.to_csv('classification_report.csv', index = False)

In [None]:
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data'


data = pd.read_csv(url,low_memory=False)
x_1 = getindependentVariables(url)
data_dia = getLabels(url)
x_train, x_test, y_train, y_test = train_test_split(x_1, data_dia, test_size=0.3, random_state=42)
classifiers = [ 
KNeighborsClassifier(3),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GradientBoostingClassifier(),
GaussianNB(),
LinearDiscriminantAnalysis() 
]

# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(x_train, y_train)
    name = clf.__class__.__name__
    print("=" * 30)
    print(name)

    print('****Results****')
    train_predictions = clf.predict(x_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))

    train_predictions = clf.predict_proba(x_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))

    #log_entry = pd.DataFrame([[name, acc * 100, ll]], columns=log_cols)
    #log = log.append(log_entry)
    #report = classification_report(y_test, train_predictions)
    #print("log:",log)
    #print("=" * 30)



In [15]:
from mrmr import mrmr_classif


ModuleNotFoundError: No module named 'mrmr'

In [21]:
from c45 import C45
from sklearn.datasets import load_iris

iris = load_iris()
clf = C45(attrNames=iris.feature_names)

ModuleNotFoundError: No module named 'c45'

In [45]:
from sklearn.naive_bayes import GaussianNB
X = getindependentVariables(url)
Y = getLabels(url)
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)

gnb = GaussianNB()
target_names = (list(range(uniqueClasses(Y))))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
pd.DataFrame(report).to_csv('sample.csv')
print(pd.DataFrame(report))


Accuracy: 0.9777777777777777
Number of mislabeled points out of a total 45 points : 1
              0          1         2  accuracy  macro avg  weighted avg
precision   1.0   1.000000  0.888889  0.977778   0.962963      0.980247
recall      1.0   0.952381  1.000000  0.977778   0.984127      0.977778
f1-score    1.0   0.975610  0.941176  0.977778   0.972262      0.978160
support    16.0  21.000000  8.000000  0.977778  45.000000     45.000000


In [52]:
def add_column_in_csv_2(input_file, output_file, transform_row, tansform_column_names):
    """ Append a column in existing csv using csv.reader / csv.writer classes"""
    # Open the input_file in read mode and output_file in write mode
    with open(input_file, 'r') as read_obj, \
            open(output_file, 'w', newline='') as write_obj:
        # Create a DictReader object from the input file object
        dict_reader = DictReader(read_obj)
        # Get a list of column names from the csv
        field_names = dict_reader.fieldnames
        # Call the callback function to modify column name list
        tansform_column_names(field_names)
        # Create a DictWriter object from the output file object by passing column / field names
        dict_writer = DictWriter(write_obj, field_names)
        # Write the column names in output csv file
        dict_writer.writeheader()
        # Read each row of the input csv file as dictionary
        for row in dict_reader:
            # Modify the dictionary / row by passing it to the transform function (the callback)
            transform_row(row, dict_reader.line_num)
            # Write the updated dictionary or row to the output file
            dict_writer.writerow(row)

In [66]:
#working column append n classification report

X = getindependentVariables(url)
Y = getLabels(url)
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)

clf = svm.SVC(kernel='linear', random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred,  target_names=target_names, output_dict=True)
#pd.DataFrame(report).to_csv('sample1.csv')
print(pd.DataFrame(report))
list_of_str = ['First', 'Second', 'Third', 'Fourth']
tail_len = 4

# The two steps in the description
n_rows = sum(1 for row in open('sample1.csv', 'r'))
df = pd.read_csv('sample1.csv', skiprows=range(1, n_rows - tail_len))
df_rest  = pd.read_csv('sample1.csv', skiprows=range(tail_len, n_rows))

print(df)
clsss = 'KNN'
percFeatures = 12
FSA = 'qwe'
df['class']=clsss
df['percF'] = percFeatures
df['FS'] = FSA
df
f = []
pd.concat([df_rest, df]).to_csv('sampe2.csv')

Accuracy: 0.9777777777777777
              0          1         2  accuracy  macro avg  weighted avg
precision   1.0   1.000000  0.888889  0.977778   0.962963      0.980247
recall      1.0   0.952381  1.000000  0.977778   0.984127      0.977778
f1-score    1.0   0.975610  0.941176  0.977778   0.972262      0.978160
support    16.0  21.000000  8.000000  0.977778  45.000000     45.000000
  Unnamed: 0     0          1         2  accuracy  macro avg  weighted avg
0  precision   1.0   1.000000  0.888889  0.977778   0.962963      0.980247
1     recall   1.0   0.952381  1.000000  0.977778   0.984127      0.977778
2   f1-score   1.0   0.975610  0.941176  0.977778   0.972262      0.978160
3    support  16.0  21.000000  8.000000  0.977778  45.000000     45.000000


In [26]:

url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/working/1. Wine/wine.data'
d = custom_csv(url)

X = getindependentVariables(d)
Y = getLabels(d)
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)
neighbors = uniqueClasses(Y)
attrN= ([str(p) for p in range(0, neighbors)])
clf = C45(pathToNames=attrN)
clf.fit(X_train, y_train)


Accuracy: 0.9555555555555556


In [25]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from c45 import C45

iris = load_iris()
clf = C45(attrNames=iris.feature_names)
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.5)
clf.fit(X_train, y_train)
from c45 import C45


Accuracy: 0.8933333333333333


ModuleNotFoundError: No module named 'c45'

In [71]:
#working relief
import sklearn_relief as relief
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/1. Wine/wine.data'
df = pd.read_csv(url, index_col=False)
X = df.iloc[: , 1:]
Y = df.iloc[:, 0]
r = relief.Relief(
    n_features=3 # Choose the best 3 features
) # Will run by default on all processors concurrently
my_transformed_matrix = r.fit_transform(
    X.values,
    (Y.values)
)
pd.DataFrame(my_transformed_matrix)

FileNotFoundError: [Errno 2] No such file or directory: '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/1. Wine/wine.data'

In [75]:
#working relief
import sklearn_relief as relief
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/working/1. Wine/wine.data'
df = pd.read_csv(url, index_col=False)
X = df.iloc[: , 1:]
Y = df.iloc[:, 0]
r = relief.Relief(
    n_features=8 # Choose the best 3 features
) # Will run by default on all processors concurrently
my_transformed_matrix = r.fit_transform(
    X.values,
    (Y.values)
)
print(r)
pd.DataFrame(my_transformed_matrix)

Relief()


ValueError: DataFrame constructor not properly called!

[[0.33477065 6.        ]
 [0.32114068 0.        ]]


In [109]:
X.shape[1]

13

In [96]:

import pandas as pd
import numpy as np
from skrebate import MultiSURF
from skrebate import ReliefF

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


#url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/iris/iris.data'
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/working/2. Heart failure/heart_failure_clinical_records_dataset.csv'

d = custom_csv(url)

iris = datasets.load_iris()

#X = iris.data  # we only take the first two features.
#y = iris.target

print(d.head())
X = getindependentVariables(d)

y = getLabels(d)

#y  = labels.astype('category')
#y = y.cat.codes

fs = ReliefF()
fs.fit(X, y)
print(fs.top_features_)

#for feature_name, feature_score in zip(d.columns, fs.feature_importances_):
 #   print(feature_name, '\t', feature_score)


#reliefF_results = ReliefF().fit(X, y) #ReliefF as a default 'k' hyperparameter that is set to 100 by default (i.e. 100 nearest neighbors)
#print(reliefF_results)

#Present results
#header = X.columns.tolist()
#features = header[0:len(header)-1]
#names_scores = {'Names':features, 'Scores':reliefF_results.feature_importances_} 
#ns = pd.DataFrame(names_scores)
#ns = ns.sort_values(by='Scores')
#ns #Report sorted feature scores


   0  1     2  3   4  5          6    7    8  9  10 11 12
0  75  0   582  0  20  1     265000  1.9  130  1  0  4  1
1  55  0  7861  0  38  0  263358.03  1.1  136  1  0  6  1
2  65  0   146  0  20  0     162000  1.3  129  1  1  7  1
3  50  1   111  0  20  0     210000  1.9  137  1  0  7  1
4  65  1   160  1  20  0     327000  2.7  116  0  0  8  1


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [98]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from skrebate import ReliefF
from sklearn.model_selection import train_test_split

genetic_data = pd.read_csv('https://github.com/EpistasisLab/scikit-rebate/raw/master/data/'
                           'GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv.gz',
                           sep='\t', compression='gzip')

#features, labels = genetic_data.drop('class', axis=1).values, genetic_data['class'].values
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/working/2. Heart failure/heart_failure_clinical_records_dataset.csv'

d = custom_csv(url)

features = getindependentVariables(d)

labels = getLabels(d)


# Make sure to compute the feature importance scores from only your training set
X_train, X_test, y_train, y_test = train_test_split(features, labels)

fs = ReliefF()
fs.fit(X_train, y_train)

#for feature_name, feature_score in zip(genetic_data.drop('class', axis=1).columns,
 #                                      fs.feature_importances_):
  #  print(feature_name, '\t', feature_score)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
def is_float(element) -> bool:
    try:
        float(element)
        return True
    except ValueError:
        return False
        
def greatestNegative(columnValues):
    result = 0
    for number in columnValues:
        if(is_float(number)):
            if(float(number) < 0):
                if(float(number) < float(result)):
                    result = number
    return result

def findmin(df):
    result = []
    for c in df:
        result.append(greatestNegative(df[c]))
    return min(result)

def addMintoDF(df, minElement):
    for c in df:
        df[c] = pd.to_numeric(df[c]) + minElement
    return df