### №1 Section for importing libraries and reading data

In [6]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from genetic_selection import GeneticSelectionCV
from sklearn import metrics

# Plot styling: line colors and marker shapes available to the plotting cells.
colors = ['b', 'g', 'r']
markers = ['s', '*', 'o']

# Input arrays are read from ./bin_data relative to the current working directory.
work_dir_path = os.path.join(os.getcwd(), 'bin_data')
spectra_path, labels_path = (
    os.path.join(work_dir_path, file_name)
    for file_name in ('data.npy', 'labels.npy')
)
spectra = np.load(spectra_path)
labels_tmp = np.load(labels_path)

# Each raw label has its first character dropped and the remainder parsed as
# an integer, then shifted down by one (presumably 1-based ids -> 0-based;
# confirm against the data producer).
labels = np.asarray([int(raw_label[1:]) - 1 for raw_label in labels_tmp])

### №2 Section for visualizing input data

In [None]:
def plot_spectra(spectra, labels):
    """Plot every spectrum as a line, colored by its class label.

    Parameters
    ----------
    spectra : numpy.ndarray
        One spectrum per row; each row is plotted against its sample index
        (0 .. len(row) - 1).
    labels : array-like
        Class label for each row of ``spectra``.

    Notes
    -----
    Line colors come from the module-level ``colors`` list. Classes are
    assigned colors in sorted-label order, cycling through the list when
    there are more classes than colors.
    """
    # Coerce so that `labels == label` below is always an element-wise mask.
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)

    fig, ax = plt.subplots(figsize=(15, 7), dpi=100)
    ax.set_ylabel('Absorption', fontsize=10)
    ax.set_xlabel('Wavenumber / frequency / wavelength', fontsize=10)
    ax.tick_params(axis='both', labelsize=8)

    for class_idx, label in enumerate(unique_labels):
        # Index modulo palette size: each class gets its own color and the
        # lookup can never run past the end of `colors`.
        color = colors[class_idx % len(colors)]
        for spectrum in spectra[labels == label]:
            ax.plot(range(len(spectrum)), spectrum, color=color, label=label)

    # Every line carries its class label, so collapse duplicates to get one
    # legend entry per class.
    legend_handles, legend_labels = ax.get_legend_handles_labels()
    handle_by_label = dict(zip(legend_labels, legend_handles))
    ax.legend(handle_by_label.values(), handle_by_label.keys())
    plt.show()


plot_spectra(spectra, labels)

In [None]:
### №3 Section for feature selection based on genetic algorithm

In [7]:
random_state = 42
# The number of components was reduced to 5000 to lower the computational cost
# of the method. Data was processed for the first class only.
# NOTE(review): the "first class only" remark is carried over from the original
# comment; nothing in this cell filters by class - confirm what it refers to.
spectra_cut = spectra[:, :5000]

# Split once, up front. The same held-out rows are used for both the
# "before" and "after" evaluation, and the selector never sees them.
X_train, X_test, y_train, y_test = train_test_split(
    spectra_cut, labels, random_state=random_state, test_size=0.3, stratify=labels)

# Random forest classifier using all cores, with class weights balanced
# against the label frequencies. Seeded so repeated fits are comparable.
clf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5,
                             random_state=random_state)

# Genetic-algorithm feature selector wrapped around the classifier.
# NOTE(review): the genetic search itself is not seeded in this cell, so the
# selected subset may differ between runs.
feat_selector = GeneticSelectionCV(clf,
                                   cv=5,
                                   verbose=1,
                                   scoring="accuracy",
                                   max_features=5,
                                   n_population=50,
                                   crossover_proba=0.5,
                                   mutation_proba=0.2,
                                   n_generations=40,
                                   crossover_independent_proba=0.5,
                                   mutation_independent_proba=0.05,
                                   tournament_size=3,
                                   n_gen_no_change=10,
                                   caching=True,
                                   n_jobs=-1)

# Fit the selector on the training split only, so the test rows cannot
# influence which features are chosen (max_features caps the subset at 5).
feat_selector.fit(X_train, y_train)

# Report which columns were kept and how many.
print(np.flatnonzero(feat_selector.support_))
print(np.sum(feat_selector.support_))

# precision/recall default to average='binary', which raises for more than
# two classes; fall back to macro averaging in the multiclass case.
metrics_average = 'binary' if np.unique(labels).size <= 2 else 'macro'


def _report_metrics(title, y_true, y_hat, average=metrics_average):
    """Print accuracy, precision and recall for one set of predictions."""
    print(title)
    print("Accuracy:", metrics.accuracy_score(y_true, y_hat),
          "Precision:", metrics.precision_score(y_true, y_hat, average=average),
          "Recall:", metrics.recall_score(y_true, y_hat, average=average))


print("Accuracy metrics for classifier")

# Baseline: all 5000 components.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
_report_metrics('Before feature selection', y_test, y_pred)

# Filter down to the selected features. `X` (the full filtered matrix) is kept
# for any later cell that reads it; the evaluation uses the same train/test
# rows as the baseline, restricted to the selected columns.
X = feat_selector.transform(spectra_cut)
X_train_selected = feat_selector.transform(X_train)
X_test_selected = feat_selector.transform(X_test)
clf.fit(X_train_selected, y_train)
y_pred = clf.predict(X_test_selected)
_report_metrics('After feature selection', y_test, y_pred)





Selecting features with genetic algorithm.
gen	nevals	avg          	std                    	min      	max    
0  	50    	[0.812 3.28 ]	[0.14914423 1.48377896]	[0.4 1. ]	[1. 5.]
1  	39    	[-2999.3576    54.04  ]	[4582.99624508  100.5516703 ]	[-1.e+04  1.e+00]	[  1. 292.]
2  	34    	[-3399.3704    83.66  ]	[4737.53960271  117.85662646]	[-10000.      0.]	[  1. 279.]
3  	27    	[-2399.2542    64.1   ]	[4271.25040437  109.56536862]	[-1.e+04  1.e+00]	[  1. 283.]
4  	22    	[-2999.3094    58.    ]	[4583.02779891  102.41230395]	[-10000.      0.]	[  1. 271.]
5  	24    	[-1599.1608    41.56  ]	[3666.42681287   91.52992079]	[-1.e+04  1.e+00]	[  1. 268.]
6  	35    	[-3199.32    79.5 ]    	[4665.22799203  114.60789676]	[-1.e+04  1.e+00]	[  1. 286.]
7  	38    	[-2599.2604    59.5   ]	[4386.78083704  101.57465235]	[-1.e+04  1.e+00]	[  1. 268.]
8  	28    	[-1799.18    38.66]    	[3842.25872991   87.21068971]	[-1.e+04  1.e+00]	[  1. 278.]
9  	31    	[-2799.28    70.58]    	[4490.43786302  111.85080956