### №1 Section for importing libraries and reading data

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from sklearn import metrics

# Plot styling shared by the visualization cells.
colors = ['b', 'g', 'r']
markers = ['s', '*', 'o']

# Input arrays are read from the "bin_data" folder under the current working directory.
work_dir_path = os.path.join(os.getcwd(), 'bin_data')
spectra_path, labels_path = (
    os.path.join(work_dir_path, file_name)
    for file_name in ('data.npy', 'labels.npy')
)
spectra = np.load(spectra_path)
labels_tmp = np.load(labels_path)

# Each raw label is converted by dropping its first character and parsing the
# remainder as an integer, then shifting it down by one.
decoded_labels = []
for raw_label in labels_tmp:
    decoded_labels.append(int(raw_label[1:]) - 1)
labels = np.asarray(decoded_labels)

### №2 Section for visualizing input data

In [None]:
def plot_spectra(spectra, labels):
    """Plot every spectrum as a line, colored by its class label.

    Parameters:
        spectra: 2-D array-like indexed as ``spectra[sample]``; each row is
            drawn against its positional index (0 .. len(row) - 1).
        labels: 1-D array-like with one class label per row of ``spectra``.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Colors are taken from the module-level ``colors`` palette and cycled when
    there are more classes than palette entries.
    """
    spectra = np.asarray(spectra)
    # Coerce to an array so the element-wise comparison below yields a boolean
    # mask even when a plain list is passed in.
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)

    plt.figure(figsize=(15, 7), dpi=100)
    plt.ylabel('Absorption', fontsize=10)
    plt.xlabel('Wavenumber / frequency / wavelength', fontsize=10)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)

    for class_index, label in enumerate(unique_labels):
        # Cycle through the palette: the previous `len(colors) % c` indexing
        # reused the first color for the third class and went out of range
        # from the fourth class onwards.
        color = colors[class_index % len(colors)]
        label_spectra = spectra[labels == label]

        for spectrum in label_spectra:
            plt.plot(range(len(spectrum)), spectrum, color=color, label=label)

    # Every curve carries its class label, so collapse the legend entries to a
    # single handle per distinct label.
    legend_handles, legend_labels = plt.gca().get_legend_handles_labels()
    handle_by_label = dict(zip(legend_labels, legend_handles))
    plt.legend(handle_by_label.values(), handle_by_label.keys())
    plt.show()

# Visualize the loaded spectra, grouped by their class labels.
plot_spectra(spectra, labels)

In [None]:
### №3 Section for Pearson correlation based feature selection

In [13]:
       
random_state = 42
# The number of components is reduced to the first 2000 to lower the
# computational cost of the pairwise correlation analysis.
spectra_cut = spectra[:, :2000]
# Correlation threshold: a component j is treated as redundant when its
# Pearson correlation with some earlier component i (i < j) is >= threshold.
threshold = 0.9

# Pearson correlation between every pair of columns, computed in one
# vectorized call instead of one scipy `pearsonr` call per pair.
# `atleast_2d` keeps the single-column case (scalar result) well-formed.
corr_matrix = np.atleast_2d(np.corrcoef(spectra_cut, rowvar=False))

# Keep only pairs with i < j (strict upper triangle). NaN correlations
# (e.g. constant columns) compare False and are therefore never flagged.
# `argwhere` walks the matrix row by row, so the (i, j) order matches the
# order of the equivalent nested loops.
correlated_pairs = np.argwhere(np.triu(corr_matrix >= threshold, k=1))
features = correlated_pairs[:, 1].tolist()

# Unique redundant component indices, in order of first appearance.
res = list(dict.fromkeys(features))

# Output components whose correlation with every earlier component is
# below the threshold; a set gives O(1) membership checks.
redundant_components = set(res)
for i in range(spectra_cut.shape[1]):
    if i not in redundant_components:
        print(i)


2000
1982
0
138
157
243
289
297
422
439
778
786
923
979
1073
1131
1146
1202
1207
1995
