In [5]:
import numpy as np

def calculate_accuracy(data, selected_features):
    num_correct_classified = 0
    data_array = np.array(data)
    num_instances = len(data_array)
    features_array = data_array[:, selected_features]
    labels_array = data_array[:, 0]

    for x in range(num_instances):
        testing_features = features_array[x]
        testing_label = labels_array[x]

        training_features = np.delete(features_array, x, axis=0)
        training_labels = np.delete(labels_array, x)

        distances = np.sqrt(np.sum(np.square(training_features - testing_features), axis=1))
        nearest_neighbor_index = np.argmin(distances)
        nearest_neighbor_label = training_labels[nearest_neighbor_index]

        if testing_label == nearest_neighbor_label:
            num_correct_classified += 1

    accuracy = num_correct_classified / num_instances
    return accuracy


def forward_selection(data, early_stopping_threshold=None):
    selected_features, best_features = [], []
    num_features = len(data[0])
    best_accuracy = 0

    print("Forward Selection:")
    print("Beginning search.")

    for level in range(1, num_features):
        feature_to_add = None
        best_so_far_accuracy = 0

        for feature in range(1, num_features):
            if feature not in selected_features:
                accuracy = calculate_accuracy(data, selected_features + [feature])
                new_features = selected_features + [feature]
                print(f"Using feature(s) {new_features}, accuracy is {accuracy:.1%}")

                if accuracy > best_so_far_accuracy:
                    best_so_far_accuracy = accuracy
                    feature_to_add = feature
        
        if level > 1 and best_so_far_accuracy < best_accuracy:
            print("\n(Warning, Accuracy has decreased! Continuing search in case of local maxima)")
            
        if feature_to_add is not None:
            selected_features.append(feature_to_add)
            print(f"Added feature {feature_to_add} to the current set at level {level}, with accuracy: {best_so_far_accuracy * 100:.1f}%")
            print(f"Selected set: {selected_features}\n")

        if best_so_far_accuracy > best_accuracy:
            best_features = selected_features[:]
            best_accuracy = best_so_far_accuracy
        elif early_stopping_threshold is not None and best_accuracy - best_so_far_accuracy > early_stopping_threshold:
            print(f"Terminating at level {level} due to limited improvement in accuracy, satisfying the threshold condition.")
            break

    print("\nFinished search!!")
    print(f"The best feature subset found using Forward Selection is {best_features}, which has an accuracy of {best_accuracy:.1%}")


def backward_elimination(data, early_stopping_threshold=None):
    num_features = len(data[0])
    current_features = list(range(1, num_features))
    best_features = current_features[:]
    best_accuracy = calculate_accuracy(data, current_features)

    print("Backward Elimination:")
    print("Beginning search.")

    for level in range(1, num_features):
        feature_to_remove = None
        best_so_far_accuracy = 0

        for feature in current_features:
            accuracy = calculate_accuracy(data, current_features[:feature-1] + current_features[feature:])
            if accuracy > best_so_far_accuracy:
                best_so_far_accuracy = accuracy
                feature_to_remove = feature

        if level > 1 and best_so_far_accuracy < best_accuracy:
            print("\n(Warning, Accuracy has decreased! Continuing search in case of local maxima)")        

        if feature_to_remove is not None:
            current_features.remove(feature_to_remove)
            print(f"\nRemoved feature {feature_to_remove} from the current set at level {level}.")
            print(f"Feature set {current_features} was best, accuracy is {best_so_far_accuracy:.1%}\n")

        if best_so_far_accuracy > best_accuracy:
            best_features = current_features[:]
            best_accuracy = best_so_far_accuracy
        elif early_stopping_threshold is not None and best_accuracy - best_so_far_accuracy > early_stopping_threshold:
            print(f"Terminating at level {level} due to limited improvement in accuracy, satisfying the threshold condition.")
            break

    print("\nFinished search!!")
    print(f"The best feature subset found using Backward Elimination is {best_features}, which has an accuracy of {best_accuracy:.1%}")


# TESTING DATASET AGAINST CODE

In [8]:
def run(dataset_choice=None, algorithm_choice=None):
    # Define the dataset filenames
    dataset_filenames = ["CS170_small_Data__27.txt", "CS170_large_Data__1.txt", "CS170_XXXlarge_Data__17.txt", "Real_world_data(Wisconsin Breast Cancer Dataset).csv"]

    # Ask for inputs if not provided
    if dataset_choice is None:
        # Choose a dataset
        print("Please select a dataset by entering a number:")
        for i, filename in enumerate(dataset_filenames, 1):
            print(f"{i}) {filename}")

        dataset_choice = int(input())

        # Validate the dataset choice
        if dataset_choice < 1 or dataset_choice > len(dataset_filenames):
            print("Invalid dataset choice. Please enter a valid number!")
            exit()

    # Load the selected dataset
    dataset_filename = dataset_filenames[dataset_choice - 1]
    if dataset_choice == 4:
        # Data cleaning (Wisconsin breast cancer dataset)
        print("(Wisconsin breast cancer dataset)")
        data = pd.read_csv('data.csv')
        X = data.iloc[:, 2:-1].to_numpy()
        # Create a new DataFrame with only the 'diagnosis' column
        df = pd.DataFrame(data, columns=['diagnosis'])

        # Replace values in the 'diagnosis' column
        df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})

        # Assign the modified 'diagnosis' column back to the original Data
        data['diagnosis'] = df['diagnosis']
        data = data.iloc[:, 1:-1].to_numpy()
        print(data)
    else:
        data = np.genfromtxt(dataset_filename, dtype=float)

    # Ask for inputs if not provided
    if algorithm_choice is None:
        # Choose an algorithm
        print("Please select an algorithm by entering:")
        print("1) Perform Forward Selection")
        print("2) Perform Backward Elimination")

        algorithm_choice = int(input())

        # Validate the algorithm choice
        if algorithm_choice < 1 or algorithm_choice > 2:
            print("Invalid algorithm choice. Please enter either 1 or 2!")
            exit()

    # Threshold value for early stopping (default is None)
    threshold = 0.01 if dataset_choice == 3 else None

    # Print algorithm details and chosen dataset
    if algorithm_choice == 1:
        algorithm_name = "Forward Selection"
    elif algorithm_choice == 2:
        algorithm_name = "Backward Elimination"

    print(f"\nYou have chosen to perform {algorithm_name} on the dataset '{dataset_filename}'.")
    print(f"This dataset contains {len(data[0]) - 1} features (excluding the class attribute) and {len(data)} instances.\n")

    # Perform the selected algorithm on the chosen dataset
    if algorithm_choice == 1:
        forward_selection(data,threshold)
    elif algorithm_choice == 2:
        backward_elimination(data,threshold)

## Usage
Call the `run()` function to perform feature selection on a dataset.

### Options
- [no arguments]: Run the function without arguments to interactively select the dataset and algorithm.
- [arg1, arg2]: Provide the `dataset_choice` and `algorithm_choice` as arguments to run the function with specific choices.

#### Dataset Choices:
1. CS170_small_Data__27.txt
2. CS170_large_Data__1.txt
3. CS170_XXXlarge_Data__17.txt
4. Real_world_data(Wisconsin Breast Cancer Dataset).csv

#### Algorithm Choices:
1. Forward Selection
2. Backward Elimination

#### Example 1: Run the function interactively
```python
run()  # Perform feature selection on a dataset interactively.
```

#### Example 2: Provide the `dataset_choice` and `algorithm_choice` as arguments

##### Select the `second dataset` and Perform `Forward Selection` algorithm
```python
dataset_choice = 2
```
```python
algorithm_choice = 1
```

Run the function with specific choices:
```python
run(dataset_choice, algorithm_choice)
```

## Example interactive run

In [10]:
run() #interactive run


Please select a dataset by entering a number:
1) CS170_small_Data__27.txt
2) CS170_large_Data__1.txt
3) CS170_XXXlarge_Data__17.txt
4) Real_world_data(Wisconsin Breast Cancer Dataset).csv
Please select an algorithm by entering:
1) Perform Forward Selection
2) Perform Backward Elimination

You have chosen to perform Backward Elimination on the dataset 'CS170_large_Data__1.txt'.
This dataset contains 20 features (excluding the class attribute) and 2000 instances.

Backward Elimination:
Beginning search.

Removed feature 12 from the current set at level 1.
Feature set [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20] was best, accuracy is 73.2%


Removed feature 1 from the current set at level 2.
Feature set [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20] was best, accuracy is 73.5%


Removed feature 9 from the current set at level 3.
Feature set [2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20] was best, accuracy is 75.4%



Removed feature 

## 1.1) Small data , forward selection

In [None]:
run(1,1)


You have chosen to perform Forward Selection on the dataset 'CS170_small_Data__27.txt'.
This dataset contains 10 features (excluding the class attribute) and 1000 instances.

Beginning search.
Using feature(s) [1], accuracy is 74.7%
Using feature(s) [2], accuracy is 70.2%
Using feature(s) [3], accuracy is 68.7%
Using feature(s) [4], accuracy is 71.7%
Using feature(s) [5], accuracy is 70.0%
Using feature(s) [6], accuracy is 73.9%
Using feature(s) [7], accuracy is 68.5%
Using feature(s) [8], accuracy is 73.5%
Using feature(s) [9], accuracy is 68.9%
Using feature(s) [10], accuracy is 84.7%
Added feature 10 to the current set at level 1, with accuracy: 84.7%
Selected set: [10]

Using feature(s) [10, 1], accuracy is 95.9%
Using feature(s) [10, 2], accuracy is 83.6%
Using feature(s) [10, 3], accuracy is 84.3%
Using feature(s) [10, 4], accuracy is 85.7%
Using feature(s) [10, 5], accuracy is 82.6%
Using feature(s) [10, 6], accuracy is 84.7%
Using feature(s) [10, 7], accuracy is 85.7%
Using fe

## 1.2) Small data , backward elimination

In [None]:
run(1,2)


You have chosen to perform Backward Elimination on the dataset 'CS170_small_Data__27.txt'.
This dataset contains 10 features (excluding the class attribute) and 1000 instances.

Beginning search.

Removed feature 8 from the current set at level 1.
Feature set [1, 2, 3, 4, 5, 6, 7, 9, 10] was best, accuracy is 79.8%


Removed feature 4 from the current set at level 2.
Feature set [1, 2, 3, 5, 6, 7, 9, 10] was best, accuracy is 81.6%


Removed feature 5 from the current set at level 3.
Feature set [1, 2, 3, 6, 7, 9, 10] was best, accuracy is 83.5%


Removed feature 2 from the current set at level 4.
Feature set [1, 3, 6, 7, 9, 10] was best, accuracy is 85.9%


Removed feature 3 from the current set at level 5.
Feature set [1, 6, 7, 9, 10] was best, accuracy is 88.9%


Removed feature 7 from the current set at level 6.
Feature set [1, 6, 9, 10] was best, accuracy is 92.0%


Removed feature 6 from the current set at level 7.
Feature set [1, 9, 10] was best, accuracy is 94.6%


Removed fea

## 2.1) large data , forward selection

In [None]:
run(2,1)


You have chosen to perform Forward Selection on the dataset 'CS170_large_Data__1.txt'.
This dataset contains 20 features (excluding the class attribute) and 2000 instances.

Beginning search.
Using feature(s) [1], accuracy is 72.8%
Using feature(s) [2], accuracy is 70.0%
Using feature(s) [3], accuracy is 71.4%
Using feature(s) [4], accuracy is 71.2%
Using feature(s) [5], accuracy is 71.7%
Using feature(s) [6], accuracy is 71.4%
Using feature(s) [7], accuracy is 71.0%
Using feature(s) [8], accuracy is 71.0%
Using feature(s) [9], accuracy is 70.0%
Using feature(s) [10], accuracy is 70.8%
Using feature(s) [11], accuracy is 85.0%
Using feature(s) [12], accuracy is 70.3%
Using feature(s) [13], accuracy is 70.1%
Using feature(s) [14], accuracy is 70.7%
Using feature(s) [15], accuracy is 70.2%
Using feature(s) [16], accuracy is 71.2%
Using feature(s) [17], accuracy is 75.8%
Using feature(s) [18], accuracy is 71.1%
Using feature(s) [19], accuracy is 70.2%
Using feature(s) [20], accuracy is 70

## 2.2) large data , backward elimination

In [None]:
run(2,2)


You have chosen to perform Backward Elimination on the dataset 'CS170_large_Data__1.txt'.
This dataset contains 20 features (excluding the class attribute) and 2000 instances.

Beginning search.

Removed feature 12 from the current set at level 1.
Feature set [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20] was best, accuracy is 73.2%


Removed feature 1 from the current set at level 2.
Feature set [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20] was best, accuracy is 73.5%


Removed feature 10 from the current set at level 3.
Feature set [2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 18, 19, 20] was best, accuracy is 75.4%


Removed feature 7 from the current set at level 4.
Feature set [2, 3, 4, 5, 6, 8, 9, 11, 13, 14, 15, 16, 17, 18, 19, 20] was best, accuracy is 75.5%


Removed feature 5 from the current set at level 5.
Feature set [2, 3, 4, 6, 8, 9, 11, 13, 14, 15, 16, 17, 18, 19, 20] was best, accuracy is 76.2%


Removed feature 3 from the cu

## 3.1) XXXL data , forward selection with threshold condition for early abandoning

In [None]:
run(3,1)


You have chosen to perform Forward Selection on the dataset 'CS170_XXXlarge_Data__17.txt'.
This dataset contains 80 features (excluding the class attribute) and 4000 instances.

Beginning search.
Using feature(s) [1], accuracy is 71.3%
Using feature(s) [2], accuracy is 71.7%
Using feature(s) [3], accuracy is 71.2%
Using feature(s) [4], accuracy is 70.0%
Using feature(s) [5], accuracy is 69.9%
Using feature(s) [6], accuracy is 69.5%
Using feature(s) [7], accuracy is 70.8%
Using feature(s) [8], accuracy is 70.2%
Using feature(s) [9], accuracy is 71.7%
Using feature(s) [10], accuracy is 69.9%
Using feature(s) [11], accuracy is 70.1%
Using feature(s) [12], accuracy is 70.9%
Using feature(s) [13], accuracy is 70.2%
Using feature(s) [14], accuracy is 70.2%
Using feature(s) [15], accuracy is 69.9%
Using feature(s) [16], accuracy is 83.8%
Using feature(s) [17], accuracy is 74.2%
Using feature(s) [18], accuracy is 70.7%
Using feature(s) [19], accuracy is 70.6%
Using feature(s) [20], accuracy i

## 3.2) XXXL data , backward elimination with threshold condition for early abandoning

In [None]:
run(3,2)


You have chosen to perform Backward Elimination on the dataset 'CS170_XXXlarge_Data__17.txt'.
This dataset contains 80 features (excluding the class attribute) and 4000 instances.

Beginning search.

Removed feature 17 from the current set at level 1.
Feature set [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80] was best, accuracy is 69.0%


Removed feature 18 from the current set at level 2.
Feature set [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80] was best, accuracy is 69.7%



## 4.1) Real world datset (Wisconsin Breast Cancer Dataset), forward selection

In [None]:
run(4,1)

(Wisconsin breast cancer dataset)
[[ 1.      17.99    10.38    ...  0.2654   0.4601   0.1189 ]
 [ 1.      20.57    17.77    ...  0.186    0.275    0.08902]
 [ 1.      19.69    21.25    ...  0.243    0.3613   0.08758]
 ...
 [ 1.      16.6     28.08    ...  0.1418   0.2218   0.0782 ]
 [ 1.      20.6     29.33    ...  0.265    0.4087   0.124  ]
 [ 0.       7.76    24.54    ...  0.       0.2871   0.07039]]

You have chosen to perform Forward Selection on the dataset 'Real_world_data(Wisconsin Breast Cancer Dataset).csv'.
This dataset contains 30 features (excluding the class attribute) and 569 instances.

Beginning search.
Using feature(s) [1], accuracy is 80.5%
Using feature(s) [2], accuracy is 61.7%
Using feature(s) [3], accuracy is 82.6%
Using feature(s) [4], accuracy is 81.7%
Using feature(s) [5], accuracy is 62.0%
Using feature(s) [6], accuracy is 73.6%
Using feature(s) [7], accuracy is 81.4%
Using feature(s) [8], accuracy is 84.0%
Using feature(s) [9], accuracy is 57.8%
Using feature

## 4.2) Real world datset (Wisconsin Breast Cancer Dataset), backword elimination

In [None]:
run(4,2)

(Wisconsin breast cancer dataset)
[[ 1.      17.99    10.38    ...  0.2654   0.4601   0.1189 ]
 [ 1.      20.57    17.77    ...  0.186    0.275    0.08902]
 [ 1.      19.69    21.25    ...  0.243    0.3613   0.08758]
 ...
 [ 1.      16.6     28.08    ...  0.1418   0.2218   0.0782 ]
 [ 1.      20.6     29.33    ...  0.265    0.4087   0.124  ]
 [ 0.       7.76    24.54    ...  0.       0.2871   0.07039]]

You have chosen to perform Backward Elimination on the dataset 'Real_world_data(Wisconsin Breast Cancer Dataset).csv'.
This dataset contains 30 features (excluding the class attribute) and 569 instances.

Beginning search.

Removed feature 1 from the current set at level 1.
Feature set [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] was best, accuracy is 91.6%


Removed feature 2 from the current set at level 2.
Feature set [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30