In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load the Excel file
excel_file = './feature_ranking_4_filters.xlsx'
rankings_df = pd.read_excel(excel_file)

In [15]:
rankings_df.head(10)

Unnamed: 0,Information Gain,Gain Ratio,Pearson Correlation,ReliefF
0,f5,f5,f29,f24
1,f3,f3,f33,f23
2,f6,f6,f34,f33
3,f30,f4,f12,f5
4,f4,f29,f39,f32
5,f29,f30,f38,f6
6,f33,f33,f25,f35
7,f34,f34,f26,f34
8,f35,f12,f4,f36
9,f38,f38,f23,f37


In [5]:
# Load the dataset from CSV file
train_dataset_file = '../../data/BinaryClassify/train_nsl_kdd_binary_encoded.csv'
dataset = pd.read_csv(train_dataset_file)
dataset = dataset.drop('level',axis=1)

dataset.columns = [f'f{i+1}' for i in range(len(dataset.columns))]

In [6]:
X  =dataset.drop('f42',axis=1)
y = dataset['f42']

In [9]:
def calculate_accuracy(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)


In [11]:
# Split your data into train and test sets before the iterations
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


In [13]:
classifiers = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier()
}

In [17]:
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Define n_val to select top 10 features
n_val = 10

# Extract top 10 features from each filter method
top_features_by_method = {}
for column in rankings_df.columns:
    method_name = column  # Assuming column names are the filter method names
    top_features_by_method[method_name] = set(rankings_df[column][:n_val])  # Extracting top 10 features

# Flatten the list of features and count the occurrence of each feature
all_features = [feature for features in top_features_by_method.values() for feature in features]
feature_counts = Counter(all_features)

# Find the features that are in at least 2 filters
features_in_2_or_more_filters = [feature for feature, count in feature_counts.items() if count >= 2]

# Print the features that are in at least 2 filters
print("Kshirsagar Method")
print("-----------------")
print("Features present in at least 2 filter methods:")
print(features_in_2_or_more_filters)

# Initialize an empty DataFrame to store the results
results_df = pd.DataFrame()

for i, (classifier_name, classifier) in enumerate(classifiers.items()):
    metrics_results = {'accuracy': {}, 'precision': {}, 'recall': {}}
    print(f'\n{classifier_name}:\n{"-" * len(classifier_name)}')

    if not features_in_2_or_more_filters:
        print('No features selected from at least 2 filters')
        continue

    X_train_selected = X_train[features_in_2_or_more_filters]
    X_test_selected = X_test[features_in_2_or_more_filters]

    classifier.fit(X_train_selected, y_train)
    y_pred = classifier.predict(X_test_selected)

    metrics_results['accuracy'][n_val] = accuracy_score(y_test, y_pred)
    metrics_results['precision'][n_val] = precision_score(y_test, y_pred)
    metrics_results['recall'][n_val] = recall_score(y_test, y_pred)

    print(f'Accuracy: {metrics_results["accuracy"][n_val]:.4f}, Precision: {metrics_results["precision"][n_val]:.4f}, Recall: {metrics_results["recall"][n_val]:.4f}')
    


Kshirsagar Method
-----------------
Features present in at least 2 filter methods:
['f34', 'f3', 'f6', 'f35', 'f30', 'f5', 'f29', 'f38', 'f33', 'f4', 'f12', 'f23']

Random Forest:
-------------
Accuracy: 0.9987, Precision: 0.9982, Recall: 0.9993

Decision Tree:
-------------
Accuracy: 0.9980, Precision: 0.9982, Recall: 0.9981

KNN:
---
Accuracy: 0.9968, Precision: 0.9966, Recall: 0.9974


In [ ]:
'''
Kshirsagar Method
-----------------
Features present in at least 2 filter methods:
['f34', 'f3', 'f6', 'f35', 'f30', 'f5', 'f29', 'f38', 'f33', 'f4', 'f12', 'f23']

Random Forest:
-------------
Accuracy: 0.9987, Precision: 0.9982, Recall: 0.9993

Decision Tree:
-------------
Accuracy: 0.9980, Precision: 0.9982, Recall: 0.9981

KNN:
---
Accuracy: 0.9968, Precision: 0.9966, Recall: 0.9974

'''