### Library Needed

In [4]:
import os
import pandas as pd
from scipy.stats import pearsonr


### Preprocessing : Read file and prepare input 

In [5]:
# reading file
os.chdir('../backend')
file_path = os.getcwd() + "/metadata.csv"
df = pd.read_csv(file_path)
column_names = df.columns

# Filter columns starting with 'feature_' as x value and 'algo_' as y value
x_columns = [col for col in column_names if col.startswith('feature_')]
y_columns = [col for col in column_names if col.startswith('algo_')]


# Extract x and y from selected columns 
x = df[x_columns]
y = df[y_columns]

# Skip feature selection if <= 3 features and all features are kept
if x.shape[1] <= 3:
    print("There are 3 or less features to do selection. Skipping feature selection.")
    # TODO: return the desire output format
elif x.shape[1] <= 1:
    print("There is only 1 feature. Stopping space construction.")
    # TODO: return the desire output format

In [6]:
x.head()

Unnamed: 0,feature_Max_Normalized_Entropy_attributes,feature_Normalized_Entropy_Class_Attribute,feature_Mean_Mutual_Information_Attribute_Class,feature_ErrorRate_Decision_Node,feature_WeightedDist_StdDev,feature_Max_Feature_Efficiency_F3,feature_Collective_Feature_Efficiency_F4,feature_Training_Error_Linear_Classifier_L2,feature_Fraction_Points_Class_Boundary_N1,feature_Nonlinearity_Nearest_Neighbor_Classifier_N4
0,0.332548,0.131687,0.229928,0.386459,2.343235,0.076929,0.095204,0.260235,0.404836,0.327388
1,0.332548,0.299462,0.368704,0.21041,2.343235,0.500693,0.555474,0.035714,0.082629,0.357113
2,0.297231,0.10674,0.105274,0.141818,0.745009,0.292453,0.754717,0.198113,0.292453,0.212264
3,0.313965,0.202552,0.146191,0.219275,0.814613,0.134146,0.134146,0.154472,0.357724,0.288618
4,0.279463,0.437579,0.100076,0.240355,0.690097,0.827618,0.966261,0.031711,0.041482,0.030697


In [7]:
y.head()

Unnamed: 0,algo_NB,algo_LDA,algo_QDA,algo_CART,algo_J48,algo_KNN,algo_L_SVM,algo_poly_SVM,algo_RBF_SVM,algo_RandF
0,0.280096,0.239715,0.254012,0.245339,0.256491,0.256969,0.230584,0.277411,0.221198,0.234432
1,0.164182,0.159577,0.895472,0.172975,0.165282,0.167113,0.171518,0.178773,0.169919,0.895472
2,0.141818,0.123636,0.17,0.162727,0.132727,0.178182,0.123636,0.123636,0.132727,0.132727
3,0.253286,0.264329,0.554844,0.264329,0.264329,0.246951,0.264329,0.229573,0.231402,0.244275
4,0.034967,0.859129,0.859129,0.060208,0.038009,0.072298,0.035244,0.26588,0.151676,0.859129


### 1st Selection: Features Selection (Correlation Based)


In [8]:
# Initialize empty matrices for correlation coefficients and p-values
corr_matrix = pd.DataFrame(index=x.columns, columns=y.columns)
p_value_matrix = pd.DataFrame(index=x.columns, columns=y.columns)

# Compute correlation coefficient and p-value for each pair of variables
for x_col in x.columns:
    for y_col in y.columns:
        corr_coef, p_value = pearsonr(x[x_col], y[y_col])
        corr_matrix.loc[x_col, y_col] = corr_coef
        p_value_matrix.loc[x_col, y_col] = p_value

In [9]:
corr_matrix

Unnamed: 0,algo_NB,algo_LDA,algo_QDA,algo_CART,algo_J48,algo_KNN,algo_L_SVM,algo_poly_SVM,algo_RBF_SVM,algo_RandF
feature_Max_Normalized_Entropy_attributes,0.155569,0.184577,0.083635,0.284112,0.139638,0.139925,0.262739,0.241402,0.288322,0.206101
feature_Normalized_Entropy_Class_Attribute,-0.181881,0.419038,0.698115,0.044732,-0.00367,-0.023107,0.026912,0.067973,0.098525,0.308929
feature_Mean_Mutual_Information_Attribute_Class,-0.469024,-0.020967,0.289984,-0.38408,-0.411127,-0.446141,-0.378365,-0.380629,-0.372192,-0.149177
feature_ErrorRate_Decision_Node,0.533626,0.344297,0.261357,0.598178,0.615601,0.585527,0.586799,0.689633,0.544109,0.308655
feature_WeightedDist_StdDev,-0.24205,-0.23178,-0.203043,-0.349398,-0.349154,-0.3307,-0.298319,-0.386479,-0.342066,-0.184245
feature_Max_Feature_Efficiency_F3,-0.498631,0.023974,0.155385,-0.469903,-0.504981,-0.482735,-0.41571,-0.41463,-0.37701,-0.033425
feature_Collective_Feature_Efficiency_F4,-0.270725,0.005447,0.088412,-0.366902,-0.371975,-0.382477,-0.334577,-0.361583,-0.278572,-0.086481
feature_Training_Error_Linear_Classifier_L2,0.627241,0.225743,-0.103778,0.668658,0.705683,0.651858,0.686151,0.640647,0.643098,0.259195
feature_Fraction_Points_Class_Boundary_N1,0.732773,0.269194,0.027883,0.844128,0.860762,0.911544,0.750021,0.728602,0.764341,0.356168
feature_Nonlinearity_Nearest_Neighbor_Classifier_N4,0.44207,0.000364,-0.129661,0.397288,0.412983,0.374073,0.377263,0.25912,0.327465,0.148677


In [10]:
p_value_matrix

Unnamed: 0,algo_NB,algo_LDA,algo_QDA,algo_CART,algo_J48,algo_KNN,algo_L_SVM,algo_poly_SVM,algo_RBF_SVM,algo_RandF
feature_Max_Normalized_Entropy_attributes,0.02348,0.007044,0.225254,2.7e-05,0.042245,0.041819,0.000108,0.00039,2e-05,0.002565
feature_Normalized_Entropy_Class_Attribute,0.007937,0.0,0.0,0.517131,0.957634,0.738004,0.69683,0.324631,0.152849,5e-06
feature_Mean_Mutual_Information_Attribute_Class,0.0,0.7615,1.8e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.029903
feature_ErrorRate_Decision_Node,0.0,0.0,0.000118,0.0,0.0,0.0,0.0,0.0,0.0,5e-06
feature_WeightedDist_StdDev,0.000376,0.000671,0.002979,0.0,0.0,1e-06,1e-05,0.0,0.0,0.007148
feature_Max_Feature_Efficiency_F3,0.0,0.728551,0.023646,0.0,0.0,0.0,0.0,0.0,0.0,0.628434
feature_Collective_Feature_Efficiency_F4,6.5e-05,0.93716,0.19977,0.0,0.0,0.0,1e-06,0.0,3.9e-05,0.209812
feature_Training_Error_Linear_Classifier_L2,0.0,0.000932,0.132025,0.0,0.0,0.0,0.0,0.0,0.0,0.000135
feature_Fraction_Points_Class_Boundary_N1,0.0,7.2e-05,0.686463,0.0,0.0,0.0,0.0,0.0,0.0,0.0
feature_Nonlinearity_Nearest_Neighbor_Classifier_N4,0.0,0.995793,0.059472,0.0,0.0,0.0,0.0,0.000136,1e-06,0.030463


In [11]:
# line 42: rho(isnan(rho) | (out.p>0.05)) = 0;
# does pvalue > 0.05 indicates insignificant feature
p_value_matrix[p_value_matrix > 0.05] = 0

In [12]:
# TODO: [rho,row] = sort(abs(rho),1,'descend'); (line 43)
p_value_matrix

Unnamed: 0,algo_NB,algo_LDA,algo_QDA,algo_CART,algo_J48,algo_KNN,algo_L_SVM,algo_poly_SVM,algo_RBF_SVM,algo_RandF
feature_Max_Normalized_Entropy_attributes,0.02348,0.007044,0.0,2.7e-05,0.042245,0.041819,0.000108,0.00039,2e-05,0.002565
feature_Normalized_Entropy_Class_Attribute,0.007937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5e-06
feature_Mean_Mutual_Information_Attribute_Class,0.0,0.0,1.8e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.029903
feature_ErrorRate_Decision_Node,0.0,0.0,0.000118,0.0,0.0,0.0,0.0,0.0,0.0,5e-06
feature_WeightedDist_StdDev,0.000376,0.000671,0.002979,0.0,0.0,1e-06,1e-05,0.0,0.0,0.007148
feature_Max_Feature_Efficiency_F3,0.0,0.0,0.023646,0.0,0.0,0.0,0.0,0.0,0.0,0.0
feature_Collective_Feature_Efficiency_F4,6.5e-05,0.0,0.0,0.0,0.0,0.0,1e-06,0.0,3.9e-05,0.0
feature_Training_Error_Linear_Classifier_L2,0.0,0.000932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000135
feature_Fraction_Points_Class_Boundary_N1,0.0,7.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
feature_Nonlinearity_Nearest_Neighbor_Classifier_N4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000136,1e-06,0.030463
