# In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Render every matplotlib figure below with the Arial font family.
plt.rcParams['font.family'] = 'Arial'

# Input tables: the morphology measurements and their control counterpart.
csv_iba1_path = r"use\morphology_results.csv"
csv_contra_path = r"use\morphology_results_control.csv"

# Load each table and tag its rows with a 'lateral' label (inserted as the
# third column) before stacking both tables into a single frame.
df_iba1 = pd.read_csv(csv_iba1_path)
df_iba1.insert(2, 'lateral', 'ipsi')
df_contra = pd.read_csv(csv_contra_path)
df_contra.insert(2, 'lateral', 'contra')
df = pd.concat([df_iba1, df_contra], ignore_index=True)

# Keep only the measurement columns; 'lateral', 'mouse' and 'sample' are
# identifiers and take no part in the numeric analysis.
df_value = df.drop(columns=['lateral', 'mouse', 'sample'])
n_samples, n_features = df_value.shape

# Treat +/-inf as missing, then impute every missing entry with the mean of
# its own column.
df_value = df_value.replace([np.inf, -np.inf], np.nan)
df_value = df_value.fillna(df_value.mean())
df_value  # notebook-style display expression; has no effect in a script

# Min-max scaling (MinMaxScaler default range) applied column by column.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_value),
    columns=df_value.columns,
)

def calculate_mmi(data):
    """Compute the multimodality index (MMI) of a one-dimensional sample.

    The index is defined here as::

        (skewness**2 + 1) / (kurtosis + 3 * (n - 1)**2 / ((n - 2) * (n - 3)))

    where ``skewness`` and ``kurtosis`` come from ``scipy.stats.skew`` and
    ``scipy.stats.kurtosis`` called with their default arguments, and ``n``
    is the number of observations.

    Args:
        data: Sized sequence of numeric observations (list, array, Series).

    Returns:
        The MMI as a float, or ``np.nan`` when fewer than four observations
        are supplied.
    """
    n = len(data)

    # The correction term divides by (n - 2) * (n - 3), so the index is
    # undefined below four observations. Bail out before touching the moment
    # estimators so degenerate input never reaches them.
    if n < 4:
        return np.nan

    M3 = skew(data)      # skewness
    M4 = kurtosis(data)  # kurtosis (scipy default: Fisher / excess definition)

    sample_size_correction = 3 * ((n - 1)**2 / ((n - 2) * (n - 3)))
    return (M3**2 + 1) / (M4 + sample_size_correction)

# Evaluate the MMI column by column on the scaled feature table.
mmi_values = df_scaled.apply(calculate_mmi)
print(mmi_values, end="\n\n")

# Retain the features whose MMI reaches the 0.5 cut-off.
passes_cutoff = mmi_values >= 0.5
selected_features = mmi_values.loc[passes_cutoff].index.tolist()
print("selected features")
for feature_name in selected_features:
    print(feature_name)

# Restrict the scaled table to the retained features, coerce every column to
# numeric (unparseable entries become NaN) and discard rows that contain NaN.
df_selected = (
    df_scaled.loc[:, selected_features]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)
print(df_selected.head())


# Testing for an appropriate number of clusters.
# K-means is fitted once per candidate k and the silhouette score of each
# partition is recorded. (The earlier stand-alone k=3 fit was removed: its
# model and labels were overwritten by the loop before ever being used.)
X = df_selected
k_range = range(2, 10)  # Evaluated with cluster counts of 2 to 9

# silhouette score
best_k = 0
best_score = -1  # below any score the first candidate is expected to yield
scores = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    scores.append(score)
    if score > best_score:
        # Only a new running-best score is echoed.
        print(score)
        best_score = score
        best_k = k

# plot: silhouette score as a function of the number of clusters
plt.figure(figsize=(10, 7))
plt.plot(k_range, scores, marker='o')
plt.xlabel("Number of Clusters", fontsize=18)
plt.ylabel("Silhouette Score", fontsize=18)
plt.tick_params(axis='x', labelsize=16)
plt.tick_params(axis='y', labelsize=16)
plt.title("Optimal Cluster Selection using Silhouette Score")
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["top"].set_visible(False)
plt.show()




# Weight every feature by its MMI value.
# The lookup is by column label: indexing a label-indexed Series with an
# integer (`mmi_values[i]`) relies on a deprecated positional fallback.
mmi_weights = {column: mmi_values.loc[column] for column in df_value.columns}

# Normalize features (Min-Max scaling). This rebuilds df_scaled from the raw
# values so it holds exactly the weighted feature columns.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_value[list(mmi_weights)]),
    columns=list(mmi_weights),
)

# Classification of features: columns that add to the amoeboid score ...
ameboid_features = [
    'Cell Area (µm²)',
    'Density',
    'Convex Hull Circularity',
    'Cell Circularity',
    # NOTE(review): this key is spelled with a Greek mu (U+03BC) while the
    # other unit labels use the micro sign (U+00B5) - confirm it matches the
    # CSV header exactly.
    'Cell body Area (μm²)'
    ]

# ... and columns that add to the ramified score.
ramified_features = [
    'Fractal_dimention',
    'lacnaulity',
    'Convex Hull Area (µm²)',
    'Cell Perimeter (µm)',
    'Convex Hull Perimeter (µm)',
    'Roughness',
    'Total_Process_Length',
    'Mean_Process_Length',
    'Max_Process_Length',
    'Sholl_Max_Counts',
    'Straightness'
    ]

ameboid_weights = [mmi_weights[f] for f in ameboid_features]
ramified_weights = [mmi_weights[f] for f in ramified_features]

# Score calculation: MMI-weighted sum of the scaled features of each family.
df_scaled["Amoeboid_Score"] = df_scaled[ameboid_features].mul(ameboid_weights, axis=1).sum(axis=1)
df_scaled["Ramified_Score"] = df_scaled[ramified_features].mul(ramified_weights, axis=1).sum(axis=1)

# Maximum score calculation: the weighted sum when every scaled feature is 1.
max_value_ameboid = np.dot(np.ones(len(ameboid_features)), ameboid_weights)
max_value_ramified = np.dot(np.ones(len(ramified_features)), ramified_weights)

# Minimum score: the value used as the lower bound of each score range.
min_value_ameboid = 0
min_value_ramified = 0

# result
print(f"Max score: ram={max_value_ramified} , ame={max_value_ameboid}")
print(f"Min score: ram={min_value_ramified} , ame={min_value_ameboid}")

# Bounds of the (amoeboid - ramified) difference score.
min_value = min_value_ameboid - max_value_ramified
max_value = max_value_ameboid - min_value_ramified

print(f"type score max: { max_value_ameboid - min_value_ramified}")
print(f"type score min: { min_value_ameboid - max_value_ramified}")

# Rescale each observed score onto its own [min, max] range.
scaler_type_ameboid = MinMaxScaler(feature_range=(min_value_ameboid, max_value_ameboid))
df_scaled["Amoeboid_Score_scaled"] = scaler_type_ameboid.fit_transform(
    df_scaled["Amoeboid_Score"].values.reshape(-1, 1)
).ravel()

# Fix: the ramified score must go through the ramified scaler. Previously the
# amoeboid scaler was re-fitted here, which mapped the ramified score onto the
# amoeboid range and left scaler_type_ramified unused.
scaler_type_ramified = MinMaxScaler(feature_range=(min_value_ramified, max_value_ramified))
df_scaled["Ramified_Score_scaled"] = scaler_type_ramified.fit_transform(
    df_scaled["Ramified_Score"].values.reshape(-1, 1)
).ravel()

#  Scale min-max to 0-100
def scale_score(value, min_value, max_value):
    """Map ``value`` from the range [min_value, max_value] onto 0-100.

    Args:
        value: Number (or array-like supporting arithmetic) to rescale.
        min_value: Lower bound of the source range (maps to 0).
        max_value: Upper bound of the source range (maps to 100).

    Returns:
        The rescaled value, or ``0`` when the source range is empty
        (``max_value == min_value``).
    """
    if max_value == min_value:
        # Empty range: avoid dividing by zero.
        return 0
    fraction = (value - min_value) / (max_value - min_value)
    return fraction * 100

# Label each row with the family whose rescaled score is strictly larger;
# ties fall through to "Ramified".
amoeboid_is_higher = df_scaled["Amoeboid_Score_scaled"].gt(df_scaled["Ramified_Score_scaled"])
df_scaled["Morphology_Type"] = np.where(amoeboid_is_higher, "Amoeboid", "Ramified")

# Independent copy of the labels for later use.
Microglia_Type_list = df_scaled["Morphology_Type"].copy()

# Quick look at the first rows of the raw scores and the assigned label.
print(df_scaled.loc[:, ["Amoeboid_Score", "Ramified_Score", "Morphology_Type"]].head())

# Snapshot of the table at this point with the sample identifier appended,
# written to disk without the index column.
df_scaled_save_vesion = df_scaled.assign(sample=df["sample"])
df_scaled_save_vesion.to_csv("classified_morphology.csv", index=False)

# Difference score (amoeboid minus ramified) and its 0-100 rescaling between
# the min_value / max_value bounds.
df_scaled["Type_Score_Sum"] = df_scaled["Amoeboid_Score"].sub(df_scaled["Ramified_Score"])
df_scaled["Type_Score_Scaled"] = scale_score(df_scaled["Type_Score_Sum"], min_value, max_value)

# Attach the group label ("<mouse>-<lateral>") and the mouse identifier.
df_scaled['Group'] = df["mouse"] + "-" + df["lateral"]
df_scaled["mouse"] = df["mouse"]

# Performing hierarchical clustering-----------------------------------------------------------------------------------------------------------------
# Ward linkage over the Euclidean distances between rows of df_selected, cut
# into a fixed number of flat clusters.

# [Applying clustering]
cluster_number = 4
# Condensed pairwise Euclidean distances (computed once; the earlier
# duplicate computation of the same matrix was removed).
distance_matrix = pdist(df_selected, metric='euclidean')
linkage_matrix = sch.linkage(distance_matrix, method='ward')
optimal_clusters = sch.fcluster(linkage_matrix, cluster_number, criterion='maxclust')

df_selected["Cluster"] = optimal_clusters

# Add experimental group information ("<mouse>-<lateral>")
df_selected["Group"] = df["mouse"] + "-" + df["lateral"]
df_clustering_save = df_selected.copy()
df_clustering_save["sample"] = df["sample"]

clustering_save_dir = r"morphology_analysis\Machine Learning" # save dir
# Make sure the output directory exists so to_csv does not fail on a fresh
# checkout.
os.makedirs(clustering_save_dir, exist_ok=True)
clustering_save_path = os.path.join(clustering_save_dir, "selected_clustering.csv")
df_clustering_save.to_csv(clustering_save_path)

# Statistics by cluster: mean of every feature per (Group, Cluster) pair
summary = df_selected.groupby(["Group", "Cluster"]).mean()
print("\nCluster Statistics by Group:")
print(summary)

# [Application of PCA] Dimensionality reduction to 2 dimensions.
# Only the feature columns enter the PCA; the cluster and group labels are
# set aside once and reused below.
pca_features = df_selected.drop(columns=["Cluster", "Group"])
pca = PCA(n_components=2)
df_pca = pca.fit_transform(pca_features)

# Obtain the coefficients of the principal components (one row per feature,
# one column per component)
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f"PC{i+1}" for i in range(pca.n_components_)],
    index=pca_features.columns,
)
print(f"Loadings: {loadings}")

# Calculate the cumulative variance
explained_variance_ratio = pca.explained_variance_ratio_ * 100  # Convert 0-1 to percentage (%)
cumulative_variance = np.cumsum(explained_variance_ratio)

print("Cumulative variance")
for i, (var, cum_var) in enumerate(zip(explained_variance_ratio, cumulative_variance)):
    print(f"PC{i+1}: {var:.2f}% (cumulative {cum_var:.2f}%)")

# Save the results of PCA in a data frame.
# The frame reuses df_selected's index: the column assignments below align by
# index label, so a fresh 0..n-1 index would attach labels to the wrong rows
# whenever df_selected is missing rows relative to df.
df_pca = pd.DataFrame(df_pca, columns=["PC1", "PC2"], index=df_selected.index)
df_pca["Cluster"] = df_selected["Cluster"]
df_pca["Group"] = df_selected["Group"]
df_pca["Morphology_Type"] = Microglia_Type_list
df_pca["lateral"] = df["lateral"]
df_pca["lateral-type"] = df["lateral"] + "-" + Microglia_Type_list
df_pca["mouse"] = df["mouse"]
