In [3]:
import numpy as np
import pandas as pd



In [4]:
# Load the reduced feature table and clean it in one pass.
data = (
    pd.read_csv("../data/vpufs_reduced_features.csv")
    .dropna(axis=1)                                       # drop columns containing any missing value
    .loc[:, lambda frame: ~frame.columns.duplicated()]    # keep the first of each repeated column name
    .apply(pd.to_numeric, errors='coerce')                # unparseable entries become NaN
    .dropna()                                             # drop rows that now contain NaN
)

# Split the table: the last column is used as the labels, the rest as features.
labels = data.iloc[:, -1].to_numpy()
features_df = data.drop(columns=data.columns[-1])

print(data.shape)
print(data.head())


(624, 1000)
    586        384    87    751    81       262   697   581   364   383  ...  \
0  52.5  29.600000  65.4  11.20  22.5  3.580000  15.3  21.8  2.95  25.1  ...   
1  63.6  42.800000  68.7  12.00  27.3  4.620000  20.2  27.7  3.27  31.9  ...   
2  47.6   1.505021  75.8  10.20  32.6  3.310000  16.2  20.5  5.22  28.8  ...   
3  59.2  40.100000  62.1  14.00  27.2  3.300000  18.9  30.9  5.48  26.5  ...   
4  40.9  37.500000  74.7   4.61  32.9  1.510252  16.9  24.3  1.56  30.0  ...   

     884   634    29   172   263    45   741   378        906    840  
0  3.050  14.3  5.92  6.76  11.3  13.6  1.99  64.1  32.300000  12.70  
1  1.660  18.3  6.83  8.31  15.7  20.9  2.31  47.6  30.400000   8.12  
2  0.625  15.6  9.65  7.63  15.8  19.2  1.86  43.5  23.800000   8.85  
3  3.580  16.1  7.15  6.30  11.3  19.6  2.15  54.8  31.100000  10.80  
4  1.170  15.9  9.72  5.24  14.4  18.5  2.15  42.1   1.510252   9.81  

[5 rows x 1000 columns]


In [5]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

# def reduce_features(X, y=None, k=1000):
#     """ Reduce gene expression matrix to top k features """
#     if y is None:
#         # Use variance threshold if labels are not present
#         from sklearn.feature_selection import VarianceThreshold
#         selector = VarianceThreshold(threshold=0.01)
#         X_new = selector.fit_transform(X)
#         if X_new.shape[1] > k:
#             X_new = X_new[:, :k]
#         return X_new
#     else:
#         selector = SelectKBest(score_func=f_classif, k=k)
#         X_new = selector.fit_transform(X, y)
#         return X_new

def normalized_euclidean_similarity(X):
    """Compute a (1 - normalized Euclidean distance) similarity matrix.

    The columns of ``X`` are standardized with ``StandardScaler``, pairwise
    Euclidean distances between rows are computed, and the distances are
    divided by the largest one so that they fall in [0, 1].

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data, one sample per row.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples)
        ``1 - d / d_max`` for every pair of rows. The diagonal is 1.
    """
    # Standardize each column so no single feature dominates the distance.
    X_scaled = StandardScaler().fit_transform(X)

    # Pairwise Euclidean distances between all rows.
    distances = cdist(X_scaled, X_scaled, metric='euclidean')

    max_dist = distances.max()
    if max_dist == 0:
        # A single sample, or samples that are all identical after scaling:
        # every pairwise distance is 0. Dividing by max_dist would give 0/0
        # (NaN everywhere), so report full similarity directly instead.
        return np.ones_like(distances)

    # Scale distances to [0, 1] and flip them into similarities.
    return 1 - distances / max_dist

def build_clusters(similarity_matrix, r=0.9):
    """Collect, for every sample, the samples whose similarity to it is at least ``r``.

    Returns a pair ``(clusters, densities)``: ``clusters[i]`` is the array of
    indices ``j`` with ``similarity_matrix[i][j] >= r`` and ``densities[i]``
    is the length of that array.
    """
    row_count = similarity_matrix.shape[0]

    # One index array per row: positions that reach the similarity threshold.
    clusters = [
        np.nonzero(similarity_matrix[row] >= r)[0]
        for row in range(row_count)
    ]
    # Density of a sample = size of its neighbourhood.
    densities = [len(members) for members in clusters]

    return clusters, densities



def run_pipeline(data: pd.DataFrame, labels=None, k=1000, r=0.9):
    """Run the similarity and neighbourhood steps on ``data``.

    ``labels`` and ``k`` are accepted but not used by this function: they
    belonged to the feature-reduction step (step 1), which is disabled here.
    ``r`` is forwarded to ``build_clusters`` as the similarity threshold.

    Returns ``(clusters, densities, similarity)`` in that order.
    """
    # Step 1 (feature reduction) is intentionally skipped; data is used as given.

    print("Step 2: Computing similarity matrix...")
    similarity_matrix = normalized_euclidean_similarity(data)

    print("Step 3: Identifying neighbors and densities...")
    neighbourhoods, neighbour_counts = build_clusters(similarity_matrix, r=r)

    return neighbourhoods, neighbour_counts, similarity_matrix


In [None]:
clusters, densities, similarity = run_pipeline(features_df, labels=labels, k=1000, r=0.862)

# One (sample index, density, neighbour indices) tuple per sample.
sample_info = [
    (idx, densities[idx], clusters[idx])
    for idx, _ in enumerate(clusters)
]

# Densest samples first; ties keep their original (index) order.
sample_info_sorted = sorted(sample_info, key=lambda entry: entry[1], reverse=True)

# Report the 50 densest samples, each followed by a separator and a blank line.
for i, density, neighbors in sample_info_sorted[:50]:
    print(
        f"Sample {i}: Density = {density}",
        f"Neighbors: {neighbors}",
        "-----------",
        "",
        sep="\n",
    )



Step 2: Computing similarity matrix...
Step 3: Identifying neighbors and densities...
Sample 465: Density = 7
Neighbors: [443 454 460 461 464 465 466]
-----------

Sample 466: Density = 7
Neighbors: [443 450 451 454 461 465 466]
-----------

Sample 536: Density = 6
Neighbors: [490 498 503 525 536 538]
-----------

Sample 426: Density = 5
Neighbors: [316 347 409 421 426]
-----------

Sample 461: Density = 5
Neighbors: [443 461 462 465 466]
-----------

Sample 2: Density = 4
Neighbors: [  2 316 322 409]
-----------

Sample 322: Density = 4
Neighbors: [  2 322 338 413]
-----------

Sample 409: Density = 4
Neighbors: [  2 409 413 426]
-----------

Sample 443: Density = 4
Neighbors: [443 461 465 466]
-----------

Sample 490: Density = 4
Neighbors: [490 525 536 538]
-----------

Sample 498: Density = 4
Neighbors: [498 505 536 552]
-----------

Sample 505: Density = 4
Neighbors: [442 498 505 538]
-----------

Sample 538: Density = 4
Neighbors: [490 505 536 538]
-----------

Sample 286: Densit

In [7]:
def merge_clusters(sample_info_sorted, threshold=0.5):
    """
    Greedily merge neighbourhoods whose Jaccard overlap exceeds `threshold`.

    Parameters
    ----------
    sample_info_sorted : iterable of (index, density, members)
        One entry per sample, in the order in which seeds should be tried.
        `members` is any iterable of hashable sample ids. `density` is not
        used here; only the order of the entries matters.
    threshold : float, default 0.5
        A candidate is absorbed when
        ``len(current & candidate) / len(current | candidate) > threshold``
        (strictly greater).

    Returns
    -------
    list of list
        One list of member ids per merged cluster, in seed order. The order
        of ids inside a cluster is unspecified.

    Notes
    -----
    `visited` tracks the entries (by their index) that have already been used
    as a seed or absorbed, not the member ids, so the same sample id can
    appear in several returned clusters.
    """
    # Build every neighbourhood set once; the original rebuilt it for each
    # candidate on every pass of the inner loop.
    candidates = [(idx, set(members)) for idx, _, members in sample_info_sorted]

    merged = []
    visited = set()

    for seed_idx, seed_members in candidates:
        if seed_idx in visited:
            continue

        # Start a new merged cluster from a copy of the seed's neighbourhood.
        visited.add(seed_idx)
        merged_cluster = set(seed_members)

        # Re-scan until a full pass absorbs nothing: growing the cluster can
        # push a previously rejected candidate over the threshold.
        changed = True
        while changed:
            changed = False
            for other_idx, other_members in candidates:
                if other_idx in visited:
                    continue

                common_size = len(merged_cluster & other_members)
                union_size = len(merged_cluster) + len(other_members) - common_size

                # Two empty sets have an empty union; treat their overlap as 0.
                similarity_ratio = common_size / union_size if union_size > 0 else 0

                if similarity_ratio > threshold:
                    merged_cluster |= other_members
                    visited.add(other_idx)
                    changed = True

        merged.append(list(merged_cluster))

    return merged

In [8]:
# Step 1: Rebuild the per-sample (index, density, neighbours) list and order it
# by density, highest first, so this cell does not rely on an earlier cell's copy.
sample_info = [
    (idx, densities[idx], clusters[idx])
    for idx, _ in enumerate(clusters)
]
sample_info_sorted = sorted(sample_info, key=lambda entry: entry[1], reverse=True)

# Step 2: Merge neighbourhoods using merge_clusters' default overlap threshold (0.5)
merged_clusters = merge_clusters(sample_info_sorted)

# Step 3: Print a one-line summary per merged cluster
print(f"Total merged clusters: {len(merged_clusters)}")
for idx, cluster in enumerate(merged_clusters):
    print(f"Merged Cluster {idx+1}: Size = {len(cluster)} Samples = {sorted([int(i) for i in cluster])}")


Total merged clusters: 607
Merged Cluster 1: Size = 9 Samples = [443, 450, 451, 454, 460, 461, 464, 465, 466]
Merged Cluster 2: Size = 6 Samples = [490, 498, 503, 525, 536, 538]
Merged Cluster 3: Size = 5 Samples = [316, 347, 409, 421, 426]
Merged Cluster 4: Size = 5 Samples = [443, 461, 462, 465, 466]
Merged Cluster 5: Size = 4 Samples = [2, 316, 322, 409]
Merged Cluster 6: Size = 4 Samples = [2, 322, 338, 413]
Merged Cluster 7: Size = 4 Samples = [2, 409, 413, 426]
Merged Cluster 8: Size = 4 Samples = [498, 505, 536, 552]
Merged Cluster 9: Size = 4 Samples = [442, 498, 505, 538]
Merged Cluster 10: Size = 4 Samples = [490, 505, 536, 538]
Merged Cluster 11: Size = 3 Samples = [286, 287, 289]
Merged Cluster 12: Size = 3 Samples = [286, 287, 293]
Merged Cluster 13: Size = 3 Samples = [2, 316, 426]
Merged Cluster 14: Size = 3 Samples = [322, 338, 376]
Merged Cluster 15: Size = 3 Samples = [322, 409, 413]
Merged Cluster 16: Size = 3 Samples = [454, 465, 466]
Merged Cluster 17: Size = 3 Sam

# Remove Outliers