In [5]:
import numpy as np
import pandas as pd

import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

 #converting string categorical value to numeric categorical
from sklearn.preprocessing import LabelEncoder

#metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#feature selection
from sklearn import svm

from mrmr import mrmr_classif
from info_gain import info_gain

import seaborn as sns
import matplotlib.pyplot as plt

In [45]:
def _readCSVFile(filePath):
        with open(filePath, 'r', newline='',  encoding='utf-8') as csvfile:
            has_header = csv.Sniffer().has_header(csvfile.readline())
            csvfile.seek(0)  # Rewind.
            dialect = csv.Sniffer().sniff(csvfile.read(), delimiters=';,\t')
            csvfile.seek(0) 
            reader = csv.reader(csvfile, dialect)
            if(has_header):
                next(reader)  # Skip header row.
            dataset = pd.DataFrame(reader)
        return dataset
        #print(filePath)

In [7]:
def _readExcel(filePath):
    dataset = pd.read_excel(filePath)
    return dataset

In [8]:
def custom_csv(fname):
    if fname.endswith((".data", ".csv")):
        return _readCSVFile(fname)
    elif fname.endswith((".xlsx", ".xls")):
        return _readExcel(fname)

In [94]:
def _assumption1categorical(df):
    likely_cat = []
    for idx, var in enumerate(df.columns):
        if(1.*df[var].nunique()/df[var].count() < 0.05): #or some other threshold
            likely_cat.append(idx)
    return likely_cat


In [95]:
def _assumption2categorical(df):
    top_n = 10 
    likely_cat = []
    for idx, var in enumerate(df.columns):
        if(1.*df[var].value_counts(normalize=True).head(top_n).sum() > 0.8): #or some other threshold
            likely_cat.append(idx)
    return likely_cat

In [97]:
def convertstrtointcategory(df): 
    le = LabelEncoder()
    ass1 = _assumption1categorical(df) 
    ass2 = _assumption2categorical(df)

    #extract only columns that belong to 
    commonidx = (list(set(ass1) | set(ass2)))

    for i in commonidx:
        df.iloc[:,i] = le.fit_transform(df.iloc[:,i])

    return df

In [98]:
def getLabels(dataset):
    try:
        flag = 0
        #dataset = custom_csv(filePath)
        #dataset = pd.read_csv("/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/iris/iris.data")
        n = dataset.iloc[:, -1].nunique(dropna=False)
        perc = dataset.iloc[:, -1].value_counts(normalize=True)*100
        if(len(perc) > len(dataset.iloc[:, 0].value_counts(normalize=True)*100)):  #checking whether 1st column is label
            n=dataset.iloc[:, 0].nunique(dropna=False)
            flag = 1
        if(flag == 1):
            return dataset.iloc[:, 0]
        else:
            return dataset.iloc[:,-1]
    except:
        print("Can not read last column items for", filePath)

In [99]:
def getindependentVariables(dataset):
    try:
        flag = 0
        #dataset = custom_csv(filePath)
        n = dataset.iloc[:, -1].nunique(dropna=False)
        perc = dataset.iloc[:, -1].value_counts(normalize=True)*100
        if(len(perc) > len(dataset.iloc[:, 0].value_counts(normalize=True)*100)):  #checking whether 1st column is label
            n=dataset.iloc[:, 0].nunique(dropna=False)
            flag = 1
        if(flag == 1):
            return dataset.iloc[:, 1:]
        else:
            return dataset.iloc[:,:-1]
    except:
        print("Can not independent variabless for", filePath)    

In [100]:
def selectFeatures(X, impFeatures, threshold):
    X_new = pd.DataFrame()
    for idx, value in enumerate(impFeatures):
        if(value > threshold):
            X_new = pd.concat((X_new, X.iloc[:, idx]), axis=1)
    return X_new    

In [12]:
def mutualInfo(X,Y):
    return mutual_info_classif(X,Y)

In [13]:
def gainRatio(X,Y):
    info_gain_ratio_values = []
    for idx, col in X.iteritems():
        info_gain_ratio_values.append(info_gain.info_gain_ratio(col.values, Y.values.tolist()))
    return info_gain_ratio_values

In [14]:
def uniqueClasses(Y):
    return len(Y.unique())

In [15]:
def classifyKNN(X,Y, fs):
    if(fs == "MI"):
      impF = mutualInfo(X,Y)
    elif(fs == "GR"):
      impF = gainRatio(X,Y)
    elif(fs == "mrmr"):
      impF = mrmr_classif(X, Y, K = 10)
    X_new = selectFeatures(X, impF, 0.368)
    print(X_new.shape[1])  # add this as a feature ; one of the evaluation criteria
    X_train,X_test,y_train,y_test=train_test_split(X_new,Y,test_size=0.25,random_state=0)
    neighbors = uniqueClasses(Y)
    knn=KNeighborsClassifier(n_neighbors=neighbors)
    knn.fit(X_train, y_train)
    y_pred= knn.predict(X_test)
    target_names = (list(range(uniqueClasses(Y))))
    for index, item in enumerate(target_names):
      target_names[index] = str(item)
   # TN = cnf_matrix.values.sum() - (FP + FN + TP) 
    #tp, fn, fp, tn = metrics.confusion_matrix(y_test, y_pred, labels = getLabels(filePath)).ravel()
    report = classification_report(y_test, y_pred,  target_names=target_names, output_dict=True)
    return pd.DataFrame(report)


In [16]:
def classifyNB(X,Y, fs):
    if(fs == "MI"):
      impF = mutualInfo(X,Y)
    elif(fs == "GR"):
      impF = gainRatio(X,Y)
    elif(fs == "mrmr"):
      impF = mrmr_classif(X, Y, K = 10)
    X_new = selectFeatures(X, impF, 0.368)
    print(X_new.shape[1])  # add this as a feature ; one of the evaluation criteria
    X_train,X_test,y_train,y_test=train_test_split(X_new,Y,test_size=0.25,random_state=0)
    neighbors = uniqueClasses(Y)
    clf = MultinomialNB() 
    clf.fit(X_train, y_train)
    y_pred= clf.predict(X_test)
    target_names = (list(range(uniqueClasses(Y))))
    for index, item in enumerate(target_names):
      target_names[index] = str(item)
   # TN = cnf_matrix.values.sum() - (FP + FN + TP) 
    #tp, fn, fp, tn = metrics.confusion_matrix(y_test, y_pred, labels = getLabels(filePath)).ravel()
    report = classification_report(y_test, y_pred,  target_names=target_names, output_dict=True)
    return pd.DataFrame(report)


In [17]:
def classifySVM(X,Y, fs):
    if(fs == "MI"):
      impF = mutualInfo(X,Y)
    elif(fs == "GR"):
      impF = gainRatio(X,Y)
    elif(fs == "mrmr"):
      impF = mrmr_classif(X, Y, K = 10)
    X_new = selectFeatures(X, impF, 0.368)
    print(X_new.shape[1])  # add this as a feature ; one of the evaluation criteria
    X_train,X_test,y_train,y_test=train_test_split(X_new,Y,test_size=0.25,random_state=0)
    neighbors = uniqueClasses(Y)
    clf = svm.SVC(kernel='linear', random_state=0)
    clf.fit(X_train, y_train)
    y_pred= clf.predict(X_test)
    target_names = (list(range(uniqueClasses(Y))))
    for index, item in enumerate(target_names):
      target_names[index] = str(item)
   # TN = cnf_matrix.values.sum() - (FP + FN + TP) 
    #tp, fn, fp, tn = metrics.confusion_matrix(y_test, y_pred, labels = getLabels(filePath)).ravel()
    report = classification_report(y_test, y_pred,  target_names=target_names, output_dict=True)
    return pd.DataFrame(report)


In [18]:
#report should be dataset specific - each output file is for a single dataset

,0,1,2,accuracy,macro avg,weighted avg, CA, FS, %features  -> such 4*9 rows in each file


NameError: name 'accuracy' is not defined

In [19]:
#working FCBF
import import_ipynb
import fastC

url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/1. Wine/wine.data'
df = pd.read_csv(url, index_col=False)
X = df.iloc[: , 1:]
Y = df.iloc[:, 0]
t = 0.05
sub = []
sub = (fastC.fcbf(X,Y,t))
print(sub)

FileNotFoundError: [Errno 2] No such file or directory: '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data'

In [20]:
def is_float(element) -> bool:
    try:
        float(element)
        return True
    except ValueError:
        return False

In [21]:
def greatestNegative(columnValues):
    result = 0
    for number in columnValues:
        if(is_float(number)):
            if(float(number) < 0):
                if(float(number) < float(result)):
                    result = number
    return result

In [22]:
def findmin(df):
    result = []
    for c in df:
        result.append(greatestNegative(df[c]))
    return min(result)

In [23]:
def addMintoDF(df, minElement):
    for c in df:
        df[c] = pd.to_numeric(df[c]) + minElement
    return df

In [24]:
def classify(clf, X, Y, fs):
    print(type(X))
    if(fs == "MI"):
      impF = mutualInfo(X,Y)
    elif(fs == "GR"):
      impF = gainRatio(X,Y)
    elif(fs == "mrmr"):
      impF = mrmr_classif(X, Y, K = 10)
    elif(fs == "fcbf")
      impF =
    if(clf.__class__.__name__ == "MultinomialNB"):
        m = findmin(X)
        X =  addMintoDF(X, m)
    X_new = selectFeatures(X, impF, 0.368)
    if(len(X_new) == 0 ):
        X_new = selectFeatures(X, impF, np.mean(impF))
    print(len(X_new), len(Y))  # add this as a feature ; one of the evaluation criteria
    X_train,X_test,y_train,y_test=train_test_split(X_new,Y,test_size=0.25,random_state=0)
    neighbors = uniqueClasses(Y)
    clf.fit(X_train, y_train)
    y_pred= clf.predict(X_test)
    target_names = (list(range(uniqueClasses(Y))))
    for index, item in enumerate(target_names):
      target_names[index] = str(item)
   # TN = cnf_matrix.values.sum() - (FP + FN + TP) 
    #tp, fn, fp, tn = metrics.confusion_matrix(y_test, y_pred, labels = getLabels(filePath)).ravel()
    report = classification_report(y_test, y_pred,  target_names=target_names, output_dict=True)
    return pd.DataFrame(report)


In [104]:
from sklearn.feature_selection import *
#dataset = readCSVFile("/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data")
#datasethead()

classifiers = [ 
#DecisionTreeClassifier(),
#RandomForestClassifier(),
#AdaBoostClassifier(),
#GradientBoostingClassifier(),
#MultinomialNB(),
svm.SVC()
]

listofFiles={}
classificationPerfomance={}
i=1
j=1
classificationAlgo = ['knn', 'nb', 'svm']
featureAlgo = ["fcbf"]
for path, subdirs, files in os.walk(os.getcwd()+'/datasets/numeric datasets'):
    for name in files:
        if name.endswith((".data", ".csv", ".xlsx", ".xls")):
            listofFiles[name]=os.path.join(path, name)

for eachFile in listofFiles:
    print(eachFile)
    print("----------------------")
    classificationPerfomance = {}
    dataset = custom_csv(listofFiles[eachFile])
    dataset = convertstrtointcategory(dataset)
    X = getindependentVariables(dataset)
    Y = getLabels(dataset)
    n = uniqueClasses(Y)
    classifiers.append(KNeighborsClassifier(n))
    for clf in classifiers:
        j+=1
        name = clf.__class__.__name__
        print(name)
        classificationPerfomance[name] = {}
        for eachFSAlgo in featureAlgo:
            print(eachFSAlgo)
            classificationPerfomance[name][eachFSAlgo] = {}
            i+=1
            print(len(X), len(Y))
            perfMetrics = classify(clf, X,Y, eachFSAlgo)
            print("*******DONE*********")
            #classificationPerfomance[eachFile]['acc'] = 1
    classifiers.pop(-1)

bank-full.csv
----------------------
SVC
GR
45211 45211
<class 'pandas.core.frame.DataFrame'>
45211 45211
*******DONE*********
MI
45211 45211
<class 'pandas.core.frame.DataFrame'>
45211 45211
*******DONE*********
mrmr
45211 45211
<class 'pandas.core.frame.DataFrame'>
100%|██████████| 10/10 [00:00<00:00, 11.77it/s]
45211 45211
*******DONE*********
KNeighborsClassifier
GR
45211 45211
<class 'pandas.core.frame.DataFrame'>
45211 45211
*******DONE*********
MI
45211 45211
<class 'pandas.core.frame.DataFrame'>
45211 45211
*******DONE*********
mrmr
45211 45211
<class 'pandas.core.frame.DataFrame'>
100%|██████████| 10/10 [00:01<00:00,  5.71it/s]
45211 45211
*******DONE*********
LasVegasTripAdvisorReviews-Dataset.csv
----------------------
SVC
GR
504 504
<class 'pandas.core.frame.DataFrame'>
504 504
*******DONE*********
MI
504 504
<class 'pandas.core.frame.DataFrame'>
504 504
*******DONE*********
mrmr
504 504
<class 'pandas.core.frame.DataFrame'>
100%|██████████| 10/10 [00:01<00:00,  5.86it/s]
5

In [26]:
for path, subdirs, files in os.walk(os.getcwd()+'/datasets/numeric datasets'):
    for name in files:
        if name.endswith((".data", ".csv", ".xlsx")):
            listofFiles[name]=os.path.join(path, name)
for eachFile in listofFiles:
    print(eachFile)

HCV-Egy-Data2.csv


In [27]:
import json

with open("dataaccuracy.json","w") as f:
    #dataAccuracy = dataAccuracy.to_json()
    json.dump(dataAccuracy,f)

In [28]:
json.dumps(dataAccuracy, indent=4) 

'{}'

In [29]:
listofFiles={}
classificationPerfomance={}
i=1
j=1

classificationAlgo = {'knn', 'c45', 'nb', 'svm'}
featureAlgo = {'relief', 'relieff', 'ig', 'gr', 'fcbf', 'mrmr', 'qbb', 'focus', 'sc'}
for path, subdirs, files in os.walk(os.getcwd()+'/datasets/numeric datasets'):
    for name in files:
        if name.endswith((".data", ".csv", ".xlsx")):
            listofFiles[name]=os.path.join(path, name)

perfMetrics =            
for eachFile in listofFiles:
    classificationPerfomance[eachFile] = {}
    for eachClassAlgo in classificationAlgo:
        j+=1
        classificationPerfomance[eachFile][eachClassAlgo] = {}
        for eachFSAlgo in featureAlgo:
            fs = "FS"+str(i)
            classificationPerfomance[eachFile][eachClassAlgo][eachFSAlgo] = {}
            i+=1
            if(eachClassAlgo == "kNN"):
                perfMetrics = classifyKNN(listofFiles[eachFile], eachFSAlgo)
            #classificationPerfomance[eachFile]['acc'] = 1

        i=1
    j=1
   

SyntaxError: invalid syntax (<ipython-input-29-0f41813719f4>, line 13)

In [None]:
list(range(5))

In [None]:
from sklearn.feature_selection import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectPercentile, chi2

dataset = pd.read_csv("/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/heart failure/heart_failure_clinical_records_dataset.csv")
Y = dataset.iloc[:, :1]
X = dataset.iloc[:, 1:]
#Y = getLabels("/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/iris/iris.data")

importantFeaures = mutual_info_classif(X, Y)
print((importantFeaures))

X_new = SelectPercentile(mutual_info_classif, percentile=70).fit_transform(X, Y)

model = LogisticRegression(solver='lbfgs')
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)

fit = model.fit(X_train, y_train)

print(X_new.shape)
print(X.shape)
y_red=fit.predict(X_test)
# import the metrics class
cnf_matrix = metrics.confusion_matrix(y_test, y_red)
cnf_matrix


In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)


In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
classifier = KNeighborsClassifier(n_neighbors=3,p=2,metric='euclidean')

In [None]:
classifier.fit(X_train,y_train)


In [None]:
y_pred = classifier.predict(X_test)


In [None]:
conf_matrix = confusion_matrix(y_test,y_pred)
print(conf_matrix)

In [None]:
print(accuracy_score(y_test,y_pred))


In [None]:
print(f1_score(y_test,y_pred, average="macro"))


In [None]:
def classifaction_report_csv(report):
    report_data = []
    lines = report.split('\n')
    for line in lines[2:-3]:
        row = {}
        row_data = line.split('      ')
        row['class'] = row_data[0]
        row['precision'] = float(row_data[1])
        row['recall'] = float(row_data[2])
        row['f1_score'] = float(row_data[3])
        row['support'] = float(row_data[4])
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    dataframe.to_csv('classification_report.csv', index = False)

In [None]:
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data'


data = pd.read_csv(url,low_memory=False)
x_1 = getindependentVariables(url)
data_dia = getLabels(url)
x_train, x_test, y_train, y_test = train_test_split(x_1, data_dia, test_size=0.3, random_state=42)
classifiers = [ 
KNeighborsClassifier(3),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GradientBoostingClassifier(),
GaussianNB(),
LinearDiscriminantAnalysis() 
]

# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(x_train, y_train)
    name = clf.__class__.__name__
    print("=" * 30)
    print(name)

    print('****Results****')
    train_predictions = clf.predict(x_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))

    train_predictions = clf.predict_proba(x_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))

    #log_entry = pd.DataFrame([[name, acc * 100, ll]], columns=log_cols)
    #log = log.append(log_entry)
    #report = classification_report(y_test, train_predictions)
    #print("log:",log)
    #print("=" * 30)



In [52]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [15]:
from mrmr import mrmr_classif


ModuleNotFoundError: No module named 'mrmr'

In [42]:
from c45 import C45
from sklearn.datasets import load_iris

iris = load_iris()
clf = C45(attrNames=iris.feature_names)

TypeError: __init__() got an unexpected keyword argument 'attrNames'

In [45]:
from sklearn.naive_bayes import GaussianNB
X = getindependentVariables(url)
Y = getLabels(url)
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)

gnb = GaussianNB()
target_names = (list(range(uniqueClasses(Y))))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
pd.DataFrame(report).to_csv('sample.csv')
print(pd.DataFrame(report))


Accuracy: 0.9777777777777777
Number of mislabeled points out of a total 45 points : 1
              0          1         2  accuracy  macro avg  weighted avg
precision   1.0   1.000000  0.888889  0.977778   0.962963      0.980247
recall      1.0   0.952381  1.000000  0.977778   0.984127      0.977778
f1-score    1.0   0.975610  0.941176  0.977778   0.972262      0.978160
support    16.0  21.000000  8.000000  0.977778  45.000000     45.000000


In [52]:
def add_column_in_csv_2(input_file, output_file, transform_row, tansform_column_names):
    """ Append a column in existing csv using csv.reader / csv.writer classes"""
    # Open the input_file in read mode and output_file in write mode
    with open(input_file, 'r') as read_obj, \
            open(output_file, 'w', newline='') as write_obj:
        # Create a DictReader object from the input file object
        dict_reader = DictReader(read_obj)
        # Get a list of column names from the csv
        field_names = dict_reader.fieldnames
        # Call the callback function to modify column name list
        tansform_column_names(field_names)
        # Create a DictWriter object from the output file object by passing column / field names
        dict_writer = DictWriter(write_obj, field_names)
        # Write the column names in output csv file
        dict_writer.writeheader()
        # Read each row of the input csv file as dictionary
        for row in dict_reader:
            # Modify the dictionary / row by passing it to the transform function (the callback)
            transform_row(row, dict_reader.line_num)
            # Write the updated dictionary or row to the output file
            dict_writer.writerow(row)

In [66]:
#working column append n classification report
X = getindependentVariables(url)
Y = getLabels(url)
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)

clf = svm.SVC(kernel='linear', random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred,  target_names=target_names, output_dict=True)
#pd.DataFrame(report).to_csv('sample1.csv')
print(pd.DataFrame(report))
list_of_str = ['First', 'Second', 'Third', 'Fourth']
tail_len = 4

# The two steps in the description
n_rows = sum(1 for row in open('sample1.csv', 'r'))
df = pd.read_csv('sample1.csv', skiprows=range(1, n_rows - tail_len))
df_rest  = pd.read_csv('sample1.csv', skiprows=range(tail_len, n_rows))

print(df)
clsss = 'KNN'
percFeatures = 12
FSA = 'qwe'
df['class']=clsss
df['percF'] = percFeatures
df['FS'] = FSA
df
f = []
pd.concat([df_rest, df]).to_csv('sampe2.csv')

Accuracy: 0.9777777777777777
              0          1         2  accuracy  macro avg  weighted avg
precision   1.0   1.000000  0.888889  0.977778   0.962963      0.980247
recall      1.0   0.952381  1.000000  0.977778   0.984127      0.977778
f1-score    1.0   0.975610  0.941176  0.977778   0.972262      0.978160
support    16.0  21.000000  8.000000  0.977778  45.000000     45.000000
  Unnamed: 0     0          1         2  accuracy  macro avg  weighted avg
0  precision   1.0   1.000000  0.888889  0.977778   0.962963      0.980247
1     recall   1.0   0.952381  1.000000  0.977778   0.984127      0.977778
2   f1-score   1.0   0.975610  0.941176  0.977778   0.972262      0.978160
3    support  16.0  21.000000  8.000000  0.977778  45.000000     45.000000


In [41]:
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data'
d = custom_csv(url)

X = getindependentVariables(url)
Y = getLabels(url)
impF = info_gain.info_gain_ratio(X, Y)
X_new = selectFeatures(X, impF, 0.368)
X_train,X_test,y_train,y_test=train_test_split(X_new,Y,test_size=0.25,random_state=0)
neighbors = uniqueClasses(Y)
knn=KNeighborsClassifier(n_neighbors=neighbors)
knn.fit(X_train, y_train)
y_pred= knn.predict(X_test)
target_names = (list(range(uniqueClasses(Y))))
for index, item in enumerate(target_names):
    target_names[index] = str(item)
   # TN = cnf_matrix.values.sum() - (FP + FN + TP) 
    #tp, fn, fp, tn = metrics.confusion_matrix(y_test, y_pred, labels = getLabels(filePath)).ravel()
report = classification_report(y_test, y_pred,  target_names=target_names, output_dict=True)
print(type(report))
 


ValueError: math domain error

In [30]:
mrmr_classif(X, Y, K = 10)

NameError: name 'X' is not defined

In [28]:
#working relief
import sklearn_relief as relief
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data'
df = pd.read_csv(url, index_col=False)
X = df.iloc[: , 1:]
Y = df.iloc[:, 0]
r = relief.Relief(
    n_features=8 # Choose the best 3 features
) # Will run by default on all processors concurrently
my_transformed_matrix = r.fit_transform(
    X.values,
    (Y.values)
)
pd.DataFrame(my_transformed_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1050.0,100.0,11.2,4.38,1.78,2.76,13.20,3.40
1,1185.0,101.0,18.6,5.68,2.36,3.24,13.16,3.17
2,1480.0,113.0,16.8,7.80,1.95,3.49,14.37,3.45
3,735.0,118.0,21.0,4.32,2.59,2.69,13.24,2.93
4,1450.0,112.0,15.2,6.75,1.76,3.39,14.20,2.85
...,...,...,...,...,...,...,...,...
172,740.0,95.0,20.5,7.70,5.65,0.61,13.71,1.74
173,750.0,102.0,23.0,7.30,3.91,0.75,13.40,1.56
174,835.0,120.0,20.0,10.20,4.28,0.69,13.27,1.56
175,840.0,120.0,20.0,9.30,2.59,0.68,13.17,1.62


In [26]:
#working relief
import sklearn_relief as relief
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data'
df = pd.read_csv(url, index_col=False)
X = df.iloc[: , 1:]
Y = df.iloc[:, 0]
r = relief.Relief(
    n_features=8 # Choose the best 3 features
) # Will run by default on all processors concurrently
my_transformed_matrix = r.fit_transform(
    X.values,
    (Y.values)
)
pd.DataFrame(my_transformed_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1050.0,100.0,11.2,4.38,1.78,2.76,13.20,3.40
1,1185.0,101.0,18.6,5.68,2.36,3.24,13.16,3.17
2,1480.0,113.0,16.8,7.80,1.95,3.49,14.37,3.45
3,735.0,118.0,21.0,4.32,2.59,2.69,13.24,2.93
4,1450.0,112.0,15.2,6.75,1.76,3.39,14.20,2.85
...,...,...,...,...,...,...,...,...
172,740.0,95.0,20.5,7.70,5.65,0.61,13.71,1.74
173,750.0,102.0,23.0,7.30,3.91,0.75,13.40,1.56
174,835.0,120.0,20.0,10.20,4.28,0.69,13.27,1.56
175,840.0,120.0,20.0,9.30,2.59,0.68,13.17,1.62


[[0.33477065 6.        ]
 [0.32114068 0.        ]]


In [109]:
X.shape[1]

13

In [108]:

import pandas as pd
import numpy as np
from skrebate import MultiSURF
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


#url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/iris/iris.data'
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data'

d = custom_csv(url)

iris = datasets.load_iris()

#X = iris.data  # we only take the first two features.
#y = iris.target

X = getindependentVariables(url)

y = getLabels(url)

y  = labels.astype('category')
y = y.cat.codes

reliefF_results = ReliefF().fit(X, y) #ReliefF as a default 'k' hyperparameter that is set to 100 by default (i.e. 100 nearest neighbors)
print(reliefF_results)

#Present results
#header = X.columns.tolist()
#features = header[0:len(header)-1]
#names_scores = {'Names':features, 'Scores':reliefF_results.feature_importances_} 
#ns = pd.DataFrame(names_scores)
#ns = ns.sort_values(by='Scores')
#ns #Report sorted feature scores


KeyError: 150

In [110]:
#working relief
import sklearn_relief as relief
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data'
df = pd.read_csv(url, index_col=False)
X = df.iloc[: , 1:]
Y = df.iloc[:, 0]
r = relief.ReliefF(
    n_features=8 # Choose the best 3 features
) # Will run by default on all processors concurrently
my_transformed_matrix = r.fit_transform(
    X.values,
    (Y.values)
)
pd.DataFrame(my_transformed_matrix)



Unnamed: 0,0,1,2,3,4,5,6,7
0,1050.0,3.40,1.05,4.38,1.28,0.26,2.76,2.65
1,1185.0,3.17,1.03,5.68,2.81,0.30,3.24,2.80
2,1480.0,3.45,0.86,7.80,2.18,0.24,3.49,3.85
3,735.0,2.93,1.04,4.32,1.82,0.39,2.69,2.80
4,1450.0,2.85,1.05,6.75,1.97,0.34,3.39,3.27
...,...,...,...,...,...,...,...,...
172,740.0,1.74,0.64,7.70,1.06,0.52,0.61,1.68
173,750.0,1.56,0.70,7.30,1.41,0.43,0.75,1.80
174,835.0,1.56,0.59,10.20,1.35,0.43,0.69,1.59
175,840.0,1.62,0.60,9.30,1.46,0.53,0.68,1.65


In [87]:
from sklearn.preprocessing import OneHotEncoder

labels = getLabels(url)


oe_style = OneHotEncoder()
oe_results = oe_style.fit_transform([labels.values])


from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame
ord_enc = OrdinalEncoder()
make_code = ord_enc.fit_transform([labels])
print(make_code)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]


In [20]:
import fcbf
url = '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data'
df = pd.read_csv(url, index_col=False)
X = df.iloc[: , 1:]
Y = df.iloc[:, 0]
t = 0.05
sub = fcbf.fcbf(X,Y,t)
print(sub)

TypeError: '(slice(None, None, None), 0)' is an invalid key

In [24]:
print(sub)

[[0.33477065 6.        ]
 [0.32114068 0.        ]]


[[0.33477065 6.        ]
 [0.32114068 0.        ]]


In [84]:

# Using .fit_transform function to fit label
# encoder and return encoded label

   0   1   2   3   4   5   6   7   8   9   10  11  12
0  E1   1   3   0   1   0   1   2   2   3   3   2   7
1  E2   0  25   0   1   2   1   2   2   3   3   2   7
2  E3   0  39   0   0   0   0   2   2   3   0   2   7
3  E5   0  29   0   1   2   1   2   2   3   3   2   7
4  E6   1  23   0   1   0   1   2   2   3   0   2   7


[1, 2, 3, 4]


55

108