### Library and data needed

In [224]:
import os
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [225]:
# Seed NumPy's global random generator first so the run is repeatable.
np.random.seed(0)

# Requested number of k-means clusters (checked against the silhouette
# scores further down).
opts_k = 10

# Correlation cut-off used by the feature-selection step: absolute
# correlations below this value are zeroed out.
opts_rho = 0.1


### Preprocessing : Read file and prepare input 

In [226]:
# Load the two input tables from the backend folder. Neither CSV has a header
# row, so pandas assigns integer column labels 0..n-1.
os.chdir('../backend')
x, y = (
    pd.read_csv(os.getcwd() + f"/{table_name}.csv", header=None)
    for table_name in ("x", "y")
)

In [227]:
# Number of feature columns in x; reused later as the upper bound of the
# cluster-count sweep.
n_feature = x.shape[1]

# Guard on the number of features before attempting selection.
# The single-feature case is tested first: it also satisfies "<= 3", so
# checking it second (as before) made that branch unreachable.
if n_feature <= 1:
    print("There is only 1 feature. Stopping space construction.")
    # TODO: return the desire output format
elif n_feature <= 3:
    print("There are 3 or less features to do selection. Skipping feature selection.")
    # TODO: return the desire output format

### Feature Selection (Correlation-Based; Lines 39-53)


In [228]:
# Build the absolute Pearson correlation between every column of x (rows of
# the result) and every column of y (columns of the result).
# Correlations that are not statistically significant are stored as 0 so the
# thresholding step that follows can filter them out.
significance_level = 0.05

# Float dtype keeps both matrices numeric instead of falling back to object.
corr_matrix = pd.DataFrame(index=x.columns, columns=y.columns, dtype=float)
p_value_matrix = pd.DataFrame(index=x.columns, columns=y.columns, dtype=float)

for x_col in x.columns:
    for y_col in y.columns:
        corr_coef, p_value = pearsonr(x[x_col], y[y_col])
        # Keep the p-value as well; this matrix was previously allocated but
        # never filled in.
        p_value_matrix.loc[x_col, y_col] = p_value
        # pearsonr yields NaN for a constant column. NaN compares False here,
        # so such pairs are treated as "not significant" (0) rather than
        # leaking NaN into the correlation matrix.
        is_significant = p_value <= significance_level
        corr_matrix.loc[x_col, y_col] = abs(corr_coef) if is_significant else 0.0

# TODO: testing: check corr_matrix (manual check pass)

In [229]:
# Display the filtered absolute correlations (rows: columns of x, columns: columns of y).
corr_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.16947,0.0,0.15147,0.0,0.0,0.0,0.0,0.0,0.0
1,0.183991,0.30404,0.603926,0.162437,0.0,0.0,0.0,0.157502,0.0,0.188585
2,0.515083,0.216131,0.0,0.328626,0.370241,0.511744,0.427149,0.36599,0.460169,0.296477
3,0.508291,0.464738,0.357001,0.564691,0.526048,0.496176,0.522701,0.682551,0.537335,0.390996
4,0.186968,0.288352,0.233814,0.379854,0.352313,0.324384,0.307727,0.427858,0.371381,0.259739
5,0.538544,0.162534,0.0,0.507152,0.54086,0.519021,0.485531,0.462534,0.468816,0.346111
6,0.327107,0.158215,0.0,0.376153,0.390311,0.408971,0.40457,0.369844,0.354679,0.280685
7,0.623523,0.444072,0.0,0.658937,0.658274,0.667203,0.745088,0.709117,0.727195,0.502527
8,0.713447,0.475969,0.141597,0.812574,0.82579,0.88434,0.795783,0.784334,0.825643,0.643079
9,0.496212,0.184281,0.0,0.354537,0.385519,0.39823,0.466231,0.307199,0.417937,0.330138


In [230]:
# Zero out every correlation weaker than opts_rho (set in the configuration
# cell; per the original note the value comes from options.json).
# NaN entries compare False and therefore also become 0.
corr_matrix = corr_matrix.astype(float)
corr_matrix = corr_matrix.where(corr_matrix >= opts_rho, 0.0)

# Keep a feature when at least ONE of its correlations survived the threshold.
# NOTE(review): an earlier comment here said "all correlation value >= opts_rho",
# but the code has always used any(), and the recorded (212, 10) output matches
# any(). Confirm which rule is intended.
features_selected = corr_matrix.index[(corr_matrix != 0).any(axis=1)].tolist()

# features_selected holds column *labels* of x (corr_matrix.index is x.columns),
# so select by label with .loc rather than by position with .iloc.
x_aux = x.loc[:, features_selected]
# TODO: test, compare xaux
x_aux.shape

(212, 10)

### K-means Clustering (Line 69-95)

- Applies the k-means clustering algorithm to Xaux
- Evaluates silhouette values for different numbers of clusters
- Output -> (1) the inspected numbers of clusters, (2) the corresponding criterion values
- Checks whether the silhouette value for the specified number of clusters is below 0.5 (the silhouette value measures how similar an object is to its own cluster (cohesion) compared to other clusters (separation); it ranges from -1 to 1)
    - we use the average silhouette value (ASV) for each number of clusters
    - if the ASV is < 0.5 ==> poor cluster quality; suggest increasing opts.K, or report a value of K that yields a silhouette value above 0.75, if available
- After checking the silhouette value, do k-means clustering

***Steps***
1. construct kmc and compute silhouette_score for the chosen k clusters
2. repeat step 1 for k ranging from 3 to n_feature
3. display the average silhouette score for each k
4. report if the k chosen in options.json has a silhouette score lower than 0.5
    - if lower than 0.5, based on step 2, find a k whose silhouette score is higher than 0.75

***k clusters***
- using k value from options.json, do kmc

In [231]:
# Display the selected features transposed (one row per selected feature, one column per sample).
x_aux.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,202,203,204,205,206,207,208,209,210,211
0,0.295956,0.295956,-0.502396,-0.128526,-0.890739,-0.03495,-2.22361,0.249945,0.187172,-0.147803,...,0.298118,0.151432,-7e-05,0.163201,-0.204848,0.21764,-0.312705,-0.34535,-0.31896,-0.319541
1,-0.057217,1.295543,-0.381586,0.657416,1.820903,-0.404396,-0.738473,-0.938083,-0.245951,0.354739,...,-0.090279,0.290992,-0.429621,-1.841058,0.714898,0.099474,-0.488907,-0.425249,-0.420382,1.562873
2,0.886483,1.398411,-0.006613,0.354321,-0.058625,-0.49874,-1.391399,-1.42999,-1.376439,-1.11043,...,-0.199497,1.962561,0.347064,-1.172688,1.322951,-0.695821,1.353713,-1.173897,-1.181963,1.475819
3,1.233727,-0.135799,-0.719773,-0.062564,0.109614,1.653189,0.259672,1.01244,1.857577,0.910404,...,0.504574,0.494313,-0.747642,-1.517337,0.838224,1.1434,-1.17581,0.089661,0.074696,0.231585
4,0.708027,0.708027,-0.701276,-0.601823,-0.784149,-0.259824,-1.049457,-1.055267,-1.139036,-1.66678,...,0.215728,-1.068249,0.485973,0.696063,-0.968163,1.720647,1.244649,-0.605052,-0.605052,-0.450352
5,-0.892574,0.854621,0.171937,-0.561026,1.55418,-1.175922,-1.413303,-1.405158,-1.350688,-0.360929,...,-0.191139,1.61217,0.613613,-0.472901,1.298495,-0.475261,-0.630595,-0.339655,-0.321637,1.392404
6,-1.254885,-0.120492,0.425788,-1.166366,1.038919,-1.350971,-1.464993,-1.448122,-1.35224,0.710869,...,-0.690227,1.122295,0.054226,-0.656004,1.094865,-0.939207,-0.911191,1.109462,1.124274,0.875254
7,1.055548,-1.163496,0.600559,0.220545,-1.221553,0.129667,1.047357,1.607579,1.237335,0.440966,...,-0.123097,-1.693319,1.355755,-0.910122,-0.709725,0.103117,-1.214254,0.898569,0.896715,-1.128042
8,1.134618,-0.795848,0.604311,0.926423,-1.160483,0.520597,1.36368,1.877606,1.489992,1.013887,...,0.354838,-1.531847,0.06694,-0.740361,-0.829494,-0.087697,-0.991483,1.215112,1.208741,-1.112357
9,1.275136,1.460828,0.460399,1.018685,-1.239134,0.179919,0.75303,-1.594327,-1.592733,-1.594327,...,-1.014619,-1.591949,-0.685372,0.865197,-1.385595,1.725304,-1.157298,0.708646,0.726846,-1.378727


In [232]:
# Sweep the number of clusters, record the mean silhouette value for each K,
# then check the configured K (opts_k) against the 0.5 quality threshold.
silhouette_scores = {}

# Range of cluster numbers to evaluate.
min_clusters = 3
# silhouette_score needs at most n_samples - 1 distinct labels, so cap the
# upper bound (n_feature) by the number of rows being clustered.
max_clusters = min(n_feature, len(x_aux) - 1)
cluster_range = range(min_clusters, max_clusters + 1)

# Evaluate silhouette scores for the different numbers of clusters.
# NOTE(review): the rows of x_aux are clustered as-is (no transpose), and the
# silhouette uses metric='correlation' while KMeans itself is Euclidean.
# Either could explain the mismatch mentioned in the TODO below - confirm.
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    cluster_labels = kmeans.fit_predict(x_aux)
    silhouette_scores[n_clusters] = silhouette_score(x_aux, cluster_labels, metric='correlation')

# Display the average silhouette value for each number of clusters.
# The loop variable must NOT be called `silhouette_score`: that would overwrite
# the imported function and make this cell fail when it is run a second time.
print('-> Average silhouette values for each number of clusters.')
for n_clusters, score in silhouette_scores.items():
    print(f"Number of clusters: {n_clusters}, Silhouette score: {score}")

# Check the silhouette value for opts.K. Use .get() so a K outside the
# evaluated range is reported instead of raising KeyError.
opts_k_score = silhouette_scores.get(opts_k)
if opts_k_score is None:
    print(f'-> K={opts_k} is outside the evaluated range '
          f'[{min_clusters}, {max_clusters}]; no silhouette value is available for it.')
elif opts_k_score < 0.5:
    print(f'-> The silhouette value for K={opts_k} is below 0.5. You should consider increasing K.')
    # First evaluated K whose silhouette value exceeds 0.75, if any.
    K_suggested = next((k for k, score in silhouette_scores.items() if score > 0.75), None)
    if K_suggested is not None:
        print(f'A suggested value of K is {K_suggested}.')
    else:
        print('No suggested value of K')
# TODO: silhouette score is way different, perhaps the x_aux is very different

-> Average silhouette values for each number of clusters.
Number of clusters: 3, Silhouette score: 0.4004134635030101
Number of clusters: 4, Silhouette score: 0.36785027764563133
Number of clusters: 5, Silhouette score: 0.33545180919898104
Number of clusters: 6, Silhouette score: 0.3208224051697146
Number of clusters: 7, Silhouette score: 0.30190541309710284
Number of clusters: 8, Silhouette score: 0.2476332521470801
Number of clusters: 9, Silhouette score: 0.2682247181262402
Number of clusters: 10, Silhouette score: 0.2715311894295123
-> The silhouette value for K=10 is below 0.5. You should consider increasing K.
No suggested value of K
