In [47]:
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, VarianceThreshold

In [6]:
# Create a function for performing feature selection using the filter method with sklearn

### Star Dataset Pre-processing

In [32]:
star_file_name = 'datasets/star_assessment.csv'
star_features = np.genfromtxt(star_file_name, delimiter=',', skip_header=True, encoding="utf-8", usecols=range(0, 17))
star_labels = np.genfromtxt(star_file_name, delimiter=',', skip_header=True, encoding="utf-8", usecols=17, dtype=None)

In [33]:
# Explore the star dataset
print(f'Number of samples: {star_features.shape[0]}')
print(f'Number of features: {star_features.shape[1]-1}')
print(f'Number of classes: {np.unique(star_labels).shape[0]}')
print(f'Class names: {np.unique(star_labels)}')

Number of samples: 100000
Number of features: 16
Number of classes: 3
Class names: ['GALAXY' 'QSO' 'STAR']


In [34]:
# Count the number of missing values in each column
missing_vals = np.sum(np.isnan(star_features), axis=0)
for k, v in enumerate(missing_vals):
    print(f'Column {k+1}: {v} missing values')
print(f'Total: {sum(missing_vals)} missing values')

Column 1: 60 missing values
Column 2: 55 missing values
Column 3: 65 missing values
Column 4: 70 missing values
Column 5: 63 missing values
Column 6: 50 missing values
Column 7: 59 missing values
Column 8: 61 missing values
Column 9: 59 missing values
Column 10: 51 missing values
Column 11: 68 missing values
Column 12: 59 missing values
Column 13: 58 missing values
Column 14: 49 missing values
Column 15: 50 missing values
Column 16: 61 missing values
Column 17: 62 missing values
Total: 1000 missing values


In [36]:
# Use a KNN imputer to fill in the missing values
knn_imputer = KNNImputer()
star_features_imputed = knn_imputer.fit_transform(star_features)

In [40]:
# Encode the class labels
label_encoder = LabelEncoder()
star_labels_encoded = label_encoder.fit_transform(star_labels)

In [48]:
# First use a variance threshold to remove any features with zero variance
threshold = 0
variance_threshold = VarianceThreshold(threshold=threshold)
star_features_imputed = variance_threshold.fit_transform(star_features_imputed)

In [53]:
select_k_best = SelectKBest(k=5)
star_features_selected = select_k_best.fit_transform(star_features_imputed, star_labels_encoded)

In [54]:
star_features

array([[1.23766096e+18, 1.35689107e+02, 3.24946318e+01, ...,
        5.81200000e+03, 5.63540000e+04, 1.71000000e+02],
       [1.23766488e+18, 1.44826101e+02, 3.12741849e+01, ...,
        1.04450000e+04, 5.81580000e+04, 4.27000000e+02],
       [1.23766096e+18, 1.42188790e+02, 3.55824442e+01, ...,
        4.57600000e+03, 5.55920000e+04, 2.99000000e+02],
       ...,
       [1.23766830e+18, 2.24587407e+02, 1.57007074e+01, ...,
        2.76400000e+03, 5.45350000e+04, 7.40000000e+01],
       [1.23766115e+18, 2.12268621e+02, 4.66603653e+01, ...,
        6.75100000e+03, 5.63680000e+04, 4.70000000e+02],
       [1.23766115e+18, 1.96896053e+02, 4.94646428e+01, ...,
        7.41000000e+03, 5.71040000e+04, 8.51000000e+02]])

In [55]:
star_features_selected

array([[2.03950100e+01, 1.91657300e+01, 6.54377737e+18, 6.34793600e-01,
        5.81200000e+03],
       [2.25844400e+01, 2.11681200e+01, 1.17601420e+19, 7.79136000e-01,
        1.04450000e+04],
       [2.06097600e+01, 1.93485700e+01, 5.15220026e+18, 6.44194500e-01,
        4.57600000e+03],
       ...,
       [1.82042800e+01, 1.76903400e+01, 3.11200776e+18, 1.43365600e-01,
        2.76400000e+03],
       [1.99138600e+01, 1.90725400e+01, 7.60107957e+18, 4.55039600e-01,
        6.75100000e+03],
       [2.06011500e+01, 2.00095900e+01, 8.34315235e+18, 5.42944200e-01,
        7.41000000e+03]])

### GWP Dataset Pre-processing

In [87]:
gwp_array = read_dataset('datasets/gwp_assessment.csv')

In [89]:
# Explore the gwp dataset
print(f'Number of samples: {gwp_array.shape[0]}')
print(f'Number of features: {gwp_array.shape[1]-1}')

Number of samples: 1197
Number of features: 14


In [None]:
# Convert