# Script to run all four feature selection algorithms and save outputs.

In [1]:
# All imports
import numpy as np
import pandas as pd
import csv 
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# ExtraTreesClassifier imports
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import make_classification

# KNeighborsClassifier imports
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

# Recursive Feature Selection imports
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# ANOVA imports
from sklearn.feature_selection import SelectKBest, f_classif

In [13]:
# Input location. pd.read_csv uses the first row as column names by default,
# so the CSV is expected to start with a header row; the read call below may
# need adjusting for inputs that do not.
dataFolder = 'Data/'
dataFile = dataFolder + 'bandsOnTopWithClassification.csv'

# Read the table, then split it into the label and the predictors.
df = pd.read_csv(dataFile)
targetColumn = df.iloc[:, -1]   # target: the last column
indColumn = df.iloc[:, :767]    # independent columns: positions 0..766

In [14]:
# Seed handed to estimators that accept a random_state, for repeatable runs.
seed_value: int = 13

# Number of features requested from the selectors that take a count; it is
# also embedded in every output file name written by this script.
n_to_select: int = 50

In [15]:
# Methods

def writeToCSV(file_name, model_output, n_features=None):
    """Write ``model_output`` as a single CSV row.

    The file is created (or overwritten) at
    ``<file_name>_<n_features>.csv``.

    Args:
        file_name: Output path without the count suffix or the extension.
        model_output: Iterable of values; each element becomes one cell.
        n_features: Number used in the file-name suffix. When omitted, the
            module-level ``n_to_select`` is used, so existing two-argument
            calls keep producing the same file names.
    """
    if n_features is None:
        n_features = n_to_select

    saveOutput = f'{file_name}_{n_features}.csv'

    # newline='' is required by the csv module: without it, the writer's own
    # line terminator is translated a second time on Windows, which leaves a
    # blank line after every row.
    with open(saveOutput, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerow(model_output)

In [16]:
"""
ExtraTreesClassifier

NOTES 
model.predict may be of use, although we get the 
predicted class probabilities with .feature_importances_

outputs will be sorted in ascending order
in the case of ETC that means the LAST
n_to_select values are important

output is the same while using seed_value
"""

# Run the model
ETC_model = ExtraTreesClassifier(random_state = seed_value)
ETC_model.fit(indColumn,targetColumn)

# Rank the output indicies
ETC_output = ETC_model.feature_importances_
ETC_sort_index = np.argsort(ETC_output)

In [17]:
# Write the raw ETC importances and their sorted index to .csv files
ETC_output_name = 'ETC_output'
ETC_sort_index_name = 'ETC_sorted_index'
writeToCSV(ETC_output_name, ETC_output)
writeToCSV(ETC_sort_index_name, ETC_sort_index)

In [18]:
"""
Recursive Feature Elimination

NOTES 
outputs will be sorted in ascending order
in the case of RFE that means the FIRST
n_to_select values are important
"""

# Run the model
estimator = SVR(kernel="linear")
RFE_model = RFE(estimator, n_features_to_select=n_to_select, step=1)
RFE_model.fit(indColumn, targetColumn)

# Rank the output indicies
RFE_output = RFE_model.ranking_
RFE_sort_index = np.argsort(RFE_output)

In [19]:
# Write the raw RFE ranking and its sorted index to .csv files
RFE_output_name = 'RFE_output'
RFE_sort_index_name = 'RFE_sorted_index'
writeToCSV(RFE_output_name, RFE_output)
writeToCSV(RFE_sort_index_name, RFE_sort_index)

In [20]:
"""
ANOVA 
using SelectKBest

NOTES
outputs will be sorted in ascending order
in the case of RFE that means the LAST
n_to_select values are important

output is the same on every run
"""

# Run the model
ANOVA_model = SelectKBest(f_classif, k=4)
ANOVA_model.fit(indColumn, targetColumn)

# Rank the output indicies
ANOVA_scores = -np.log10(ANOVA_model.pvalues_)
ANOVA_scores /= ANOVA_scores.max()
ANOVA_sort_index = np.argsort(ANOVA_scores)

In [21]:
# Write the normalised ANOVA scores and their sorted index to .csv files
ANOVA_output_name = 'ANOVA_output'
ANOVA_sort_index_name = 'ANOVA_sorted_index'
writeToCSV(ANOVA_output_name, ANOVA_scores)
writeToCSV(ANOVA_sort_index_name, ANOVA_sort_index)

In [11]:
"""
K-Neighbors Classifier
using SequentialFeatureSelector

NOTES
outputs come as boolean values, hence sorting will
place 'true' or important values in the LAST n_to_select
positions

same output every run with current params (n_neighbors=3)
and (n_to_select = 150)

**takes the longest to run
"""

# Run the model (time)
knn = KNeighborsClassifier(n_neighbors=3)
KNN_model = SequentialFeatureSelector(knn, n_features_to_select=n_to_select)
KNN_model.fit(indColumn, targetColumn)

# Rank the output indicies
KNN_output = KNN_model.support_
KNN_sort_index = np.argsort(KNN_output)

KeyboardInterrupt: 

In [None]:
# Write the KNN selection mask and its sorted index to .csv files
KNN_output_name = 'KNN_output'
KNN_sort_index_name = 'KNN_sorted_index'
writeToCSV(KNN_output_name, KNN_output)
writeToCSV(KNN_sort_index_name, KNN_sort_index)