In [1]:
import pandas as pd

def NormalizeValues(input_file, output_file):
    """Min-max normalize every attribute column of a CSV file.

    The last column is treated as the class attribute and is written out
    unchanged. Every other column is rescaled with
    ``(value - min) / (max - min)``.

    A column whose values are all identical has ``max - min == 0``; dividing
    by that range would turn the whole column into NaN, so such a column is
    written as 0.0 instead.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read (anything ``pandas.read_csv`` accepts).
    output_file : str or path-like
        Destination CSV file; written without the index column.
    """
    df = pd.read_csv(input_file)

    # Everything but the last column is an attribute to rescale.
    for column in df.columns[:-1]:
        min_val = df[column].min()
        value_range = df[column].max() - min_val

        if value_range == 0:
            # Constant column: avoid 0/0 -> NaN.
            df[column] = 0.0
        else:
            df[column] = (df[column] - min_val) / value_range

    df.to_csv(output_file, index=False)


# Input (raw) and output (normalized) CSV locations for the two datasets.
# NOTE(review): these are absolute, machine-specific Windows paths; the cell
# only runs as-is on a machine that has this exact directory.
iris_input_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\iris.csv'
letter_input_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\letter-recognition.csv'
iris_output_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\iris_normalized.csv'
letter_output_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\letter-recognition_normalized.csv'

# Write the normalized copy of each dataset; the later cells read these
# *_normalized.csv files as their input.
NormalizeValues(iris_input_csv, iris_output_csv)
NormalizeValues(letter_input_csv, letter_output_csv)

In [14]:
from sklearn.neighbors import NearestNeighbors

def ENN(input_file, output_file, k):
    """Edit a dataset with the Edited Nearest Neighbours (ENN) rule.

    Each instance is compared with the majority class of its ``k`` nearest
    neighbours (the instance itself never votes). Instances whose class
    disagrees with that majority are removed; the remaining rows are written
    to ``output_file`` in their original order.

    The last CSV column is the class attribute, all other columns are the
    attributes used for the distance computation.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read.
    output_file : str or path-like
        Destination CSV file; written without the index column.
    k : int
        Number of neighbours that vote for each instance. If the dataset has
        fewer than ``k + 1`` rows, all other rows vote.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    df = pd.read_csv(input_file)

    attributes = df.iloc[:, :-1]
    class_attribute = df.iloc[:, -1]
    n_samples = len(df)

    # An instance has at most n_samples - 1 neighbours other than itself.
    k_eff = min(k, n_samples - 1)
    if k_eff < 1:
        # Zero or one instance: there is nobody to vote, nothing to edit.
        df.to_csv(output_file, index=False)
        return

    # Ask for one extra neighbour: the query point is part of the fitted data
    # and normally comes back as its own nearest neighbour. Requesting only k
    # and then discarding the point itself would leave just k - 1 voters.
    knn = NearestNeighbors(n_neighbors=k_eff + 1)
    knn.fit(attributes)

    # Row positions that survive editing. Collected in a list so the frame is
    # sliced once at the end instead of being copied on every removal.
    keep = []

    for i in range(n_samples):
        indices = knn.kneighbors(attributes.iloc[[i]], return_distance=False)

        # Drop the instance itself; if duplicates kept it out of the result,
        # the slice still limits the vote to exactly k_eff neighbours.
        nn_indices = [index for index in indices.flatten() if index != i][:k_eff]

        # Most common class among the neighbours (ties resolved by mode()'s
        # ordering, which returns the smallest label first).
        majorClass = class_attribute.iloc[nn_indices].mode().values[0]

        if class_attribute.iloc[i] == majorClass:
            keep.append(i)

    df.iloc[keep].to_csv(output_file, index=False)


# ENN reads the normalized datasets and writes the edited sets next to them.
# NOTE(review): absolute, machine-specific Windows paths.
iris_input_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\iris_normalized.csv'
letter_input_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\letter-recognition_normalized.csv'
iris_output_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\iris_ENN.csv'
letter_output_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\letter-recognition_ENN.csv'

# NOTE(review): the accuracy figures quoted below are the author's reported
# results; the evaluation code is not part of this section, so they cannot be
# checked from here.

# Iris: k = 25 was chosen experimentally.
# With k = 25, ENN removes the noisy instances, which improves classification accuracy.
# After ENN, k-Nearest Neighbors (k-NN) with k = 3 is used for the final classification.
# This combination is reported to reach 100% accuracy on the Iris dataset.
ENN(iris_input_csv, iris_output_csv, k=25)

# Letter Recognition: k = 2000 was chosen experimentally for this dataset.
# Before ENN, k-NN classification with k = 3 reached an accuracy of 95.6%.
# ENN with k = 2000 then edits the dataset, removing instances that disagree with their neighbourhood.
# After ENN, k-NN classification with the same k = 3 is reported to reach 98.97%,
# which suggests that ENN cleaned the dataset effectively for this classifier.
ENN(letter_input_csv, letter_output_csv, k=2000)

In [27]:
import random

def IB2(input_file, output_file, random_state=None):
    """Condense a dataset with the IB2 instance-based algorithm.

    One randomly chosen instance seeds the Condensed Set (CS). The remaining
    instances are then visited in file order; an instance is added to CS only
    when its single nearest neighbour in the current CS has a different class.
    CS is written to ``output_file`` in insertion order.

    The last CSV column is the class attribute, all other columns are the
    attributes used for the distance computation.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read.
    output_file : str or path-like
        Destination CSV file; written without the index column.
    random_state : int or None, optional
        Seed for choosing the first CS instance. ``None`` (default) draws from
        the global ``random`` module state, exactly as before, so the result
        can differ between runs unless that state was seeded by the caller.
    """
    TS = pd.read_csv(input_file)

    if TS.empty:
        # Nothing to condense; emit the (header-only) frame instead of
        # failing inside random.choice on an empty sequence.
        TS.to_csv(output_file, index=False)
        return

    attributes = TS.iloc[:, :-1]
    classes = TS.iloc[:, -1]

    # Pick the seed instance. Use a private generator only when a seed was
    # requested so the default keeps honouring the global random state.
    rng = random if random_state is None else random.Random(random_state)
    first = rng.choice(range(len(TS)))

    # CS is tracked as row positions into TS (insertion order) and
    # materialised once at the end, instead of concatenating a frame per row.
    cs_positions = [first]

    # The 1-NN model is only rebuilt after CS has grown; None marks "stale".
    knn = None

    for pos in range(len(TS)):
        if pos == first:
            continue

        if knn is None:
            knn = NearestNeighbors(n_neighbors=1)
            knn.fit(attributes.iloc[cs_positions])

        # Query with a one-row frame so the feature names match the fit data.
        nearest = knn.kneighbors(attributes.iloc[[pos]], return_distance=False)[0, 0]
        nn_class = classes.iloc[cs_positions[nearest]]

        # A differently labelled nearest neighbour means x is not yet
        # represented by CS, so store it.
        if nn_class != classes.iloc[pos]:
            cs_positions.append(pos)
            knn = None

    TS.iloc[cs_positions].to_csv(output_file, index=False)


# IB2 reads the normalized datasets and writes the condensed sets next to them.
# NOTE(review): absolute, machine-specific Windows paths.
# NOTE(review): the 'ΙΒ2' part of the two output names looks like it is typed
# with Greek capital letters (Iota, Beta) rather than Latin 'IB' — confirm
# before anything else tries to open these files by name.
iris_input_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\iris_normalized.csv'
letter_input_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\letter-recognition_normalized.csv'
iris_output_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\iris_ΙΒ2.csv'
letter_output_csv = 'C:\\Users\\user\\Desktop\\KDD_pp1\\letter-recognition_ΙΒ2.csv'


# NOTE(review): IB2 picks its first instance at random, so the condensed-set
# sizes and accuracies quoted below may differ between runs. The figures are
# the author's reported results; the evaluation code is not in this section.

# Before IB2, the Iris dataset contains 150 instances; k-NN with k=3 yields an accuracy of 95.33%.
# After IB2, the Condensed Set (CS) contains 6 instances. The accuracy of k-NN (k=3) on the condensed set
# is 66.66%. Despite the lower accuracy, the reduction from 150 to 6 instances shows how aggressively
# IB2 condenses the data. To potentially improve accuracy, the k-value of the Nearest Neighbors model
# used during condensation could be increased (e.g., n_neighbors=3).
IB2(iris_input_csv, iris_output_csv)

# Before IB2, the Letter Recognition dataset contains 20,000 instances; k-NN with k=7 yields an
# accuracy of 95.15%. After IB2, the Condensed Set (CS) contains 2,663 instances. However, the accuracy
# of k-NN (k=7) on the condensed set is 49.64%, which is not satisfactory. IB2 may not be suitable for this
# dataset: the reduced accuracy indicates that the condensed set does not preserve enough discriminatory information.
# Note: how well a condensation algorithm works varies between datasets, and results
# depend on factors such as dataset characteristics and the choice of parameters.
IB2(letter_input_csv, letter_output_csv)