In [65]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif   
import gmpy2

In [66]:
# Registry of the datasets this notebook can load.
# Each entry maps a dataset name to a 3-tuple that load_dataset unpacks as
# (CSV file, label column, columns to drop).
DATASETS = {
    'diabetes': (
        'diabetes_kmeans.csv',  # CSV file
        'Outcome',              # label column
        [],                     # columns to drop
    ),
}

In [67]:
def safe_int(x):
    """
    Return x as a Python int.

    Values that are already instances of int (bool included) are returned
    untouched; anything else is passed through int().
    """
    if isinstance(x, int):
        return x
    return int(x)

def log2_safe(p: gmpy2.mpfr):
    """
    Base-2 logarithm that never sees a non-positive argument.

    Returns gmpy2.log2(p) when p > 0 and mpfr(0) otherwise, so that terms of
    the form p * log2(p) evaluate to zero for p == 0.
    """
    if p > 0:
        return gmpy2.log2(p)
    return gmpy2.mpfr(0)

In [68]:
def load_csv_data(filename, label_col, drop_cols=None, n_parts=3):
    """
    Load a CSV file, cast the feature columns to int64 and split the columns
    into contiguous groups.

    Parameters:
    - filename: str or path-like - comma-separated file to read
    - label_col: str - column that keeps the dtype inferred by pandas;
      every other column is cast to int64
    - drop_cols: list of str, optional - columns removed before casting;
      names that are not present in the file are ignored
    - n_parts: int - number of column groups to produce (default 3)

    Returns:
    - (data_parts, df): data_parts is a list of n_parts DataFrames holding
      consecutive column groups of df (sizes as produced by np.array_split);
      df is the full, cast DataFrame
    """
    df = pd.read_csv(filename, delimiter=',')
    if drop_cols:
        df = df.drop(columns=drop_cols, errors='ignore')

    feature_dtypes = {col: np.int64 for col in df.columns if col != label_col}
    df = df.astype(feature_dtypes)

    # Split column *positions* rather than the DataFrame itself: calling
    # np.array_split directly on a DataFrame goes through the deprecated
    # DataFrame.swapaxes path and emits a FutureWarning on recent pandas.
    column_groups = np.array_split(np.arange(df.shape[1]), n_parts)
    data_parts = [df.iloc[:, positions] for positions in column_groups]
    return data_parts, df

def load_dataset(name):
    """
    Load one of the datasets registered in DATASETS.

    Parameters:
    - name: str - key into DATASETS

    Returns:
    - (data_parts, df, label_col) where data_parts and df come from
      load_csv_data and label_col is the registered label column

    Raises:
    - ValueError if name is not a key of DATASETS
    """
    if name not in DATASETS:
        raise ValueError(f"Unknown dataset: {name}")

    csv_file, label_col, cols_to_drop = DATASETS[name]
    loaded = load_csv_data(csv_file, label_col, cols_to_drop)
    return loaded[0], loaded[1], label_col


In [69]:

def compute_mutual_information(data_parts, label_col):
    """
    Compute the mutual information I(X; Y) = H(Y) - H(Y | X), in bits,
    between every feature column X and the label column Y, using empirical
    counts and gmpy2 arithmetic.

    Parameters:
    - data_parts: list of pd.DataFrame - column-wise parts of the dataset
    - label_col: str - name of the label column Y

    Returns:
    - dict mapping each feature name to its mutual information as a float;
      an empty dict when no part contains label_col
    """
    # The label lives in exactly one of the parts; take the first match.
    target_series = next(
        (part[label_col] for part in data_parts if label_col in part.columns),
        None,
    )
    if target_series is None:
        return {}

    n = len(target_series)
    total = gmpy2.mpfr(n)
    unique_Y = pd.unique(target_series)
    # One boolean membership mask per label value, reused for every feature.
    y_masks = {y: (target_series == y).to_numpy() for y in unique_Y}

    # H(Y) = -sum_y p(y) * log2 p(y); log2_safe returns 0 for p == 0.
    H_y = gmpy2.mpfr(0)
    for y in unique_Y:
        py = gmpy2.mpfr(int(y_masks[y].sum())) / total
        H_y -= py * log2_safe(py)

    mi = {}
    for part in data_parts:
        for col in part.columns:
            if col == label_col:
                continue
            X = part[col]

            # H(Y | X) = -sum_{x, y} p(x, y) * log2 p(y | x)
            H_y_given_x = gmpy2.mpfr(0)
            for xv in pd.unique(X):
                x_mask = (X == xv).to_numpy()
                # Counts come back as NumPy integers. Convert to Python int
                # before building the mpfr so the division below is carried
                # out by gmpy2, like every other count in this function.
                cx = int(x_mask.sum())
                if cx == 0:
                    continue
                cx_mp = gmpy2.mpfr(cx)
                for y in unique_Y:
                    c_xy = int((x_mask & y_masks[y]).sum())
                    if c_xy == 0:
                        continue
                    joint = gmpy2.mpfr(c_xy)
                    H_y_given_x -= (joint / total) * log2_safe(joint / cx_mp)

            mi[col] = float(H_y - H_y_given_x)
    return mi



In [70]:
def get_min_mutual_info_feature(data_parts, label_col):
    """
    Pick the feature with the lowest mutual information score among the
    features stored in the same part as the label column.

    Parameters:
    - data_parts: list of pd.DataFrame - column-wise parts of the dataset
    - label_col: str - name of the label column

    Returns:
    - name of the lowest-scoring feature in the label-holding part
      (features without a score are treated as +inf)

    Raises:
    - ValueError if no part contains label_col, or if the part that does
      contains no other column
    """
    scores = compute_mutual_information(data_parts, label_col)
    print("Plain MI Scores:", scores)

    # Only the first part that holds the label is considered.
    label_part = next((p for p in data_parts if label_col in p.columns), None)
    if label_part is None:
        raise ValueError(f"Target column '{label_col}' not found.")

    candidates = [name for name in label_part.columns if name != label_col]
    if not candidates:
        raise ValueError("No feature columns in target-containing part.")

    return min(candidates, key=lambda name: scores.get(name, float('inf')))

In [71]:
def compute_ranks(data_parts, label_col):
    """
    Rank the feature columns of every part independently.

    NOTE(review): a second compute_ranks(df, label_col) is defined further
    down in this same cell and rebinds the name, so this parts-based version
    is not the one in effect once the whole cell has run.

    Parameters:
    - data_parts: list of pd.DataFrame - column-wise parts of the dataset
    - label_col: str - column excluded from ranking

    Returns:
    - list of pd.DataFrame, one per part, holding integer ranks; a part with
      no feature columns yields an empty frame that keeps the part's index
    """
    def _rank_one(part):
        feature_cols = [name for name in part.columns if name != label_col]
        if not feature_cols:
            return pd.DataFrame(index=part.index)
        # Ties receive their average rank, which astype(int) then truncates.
        return part[feature_cols].rank(method='average').astype(int)

    return [_rank_one(part) for part in data_parts]


def compute_spearman_correlation(ranked_parts, correlation_target):
    """
    Correlate every ranked column with the ranked correlation target.

    NOTE(review): a second compute_spearman_correlation(df, correlation_target,
    label_col) is defined further down in this same cell and rebinds the name.

    Parameters:
    - ranked_parts: list of pd.DataFrame - rank frames, one per part
    - correlation_target: str - ranked column every other column is compared to

    Returns:
    - dict mapping column name to the Pearson correlation of its ranks with
      the target ranks (0.0 when either side has zero spread); an empty dict
      when no part contains correlation_target
    """
    # Ranks of the target come from the first part that holds the column.
    target_rank = next(
        (rp[correlation_target].astype(int)
         for rp in ranked_parts
         if correlation_target in rp.columns),
        None,
    )
    if target_rank is None:
        return {}

    # The target side is the same for every feature, so derive it once.
    target_dev = target_rank - target_rank.mean()
    target_norm = (target_dev ** 2).sum() ** 0.5

    res = {}
    for rp in ranked_parts:
        for col in rp.columns:
            if col == correlation_target:
                continue

            feature_rank = rp[col].astype(int)
            feature_dev = feature_rank - feature_rank.mean()
            feature_norm = (feature_dev ** 2).sum() ** 0.5

            if feature_norm > 0 and target_norm > 0:
                # rho = sum(dx * dy) / (sqrt(sum(dx^2)) * sqrt(sum(dy^2)))
                res[col] = (feature_dev * target_dev).sum() / (feature_norm * target_norm)
            else:
                # A constant column has no defined correlation; report 0.0.
                res[col] = 0.0

    print(f"res:{res}")
    return res

def compute_ranks(df, label_col):
    """
    Rank every column of df except label_col.

    Parameters:
    - df: pd.DataFrame - the full dataset
    - label_col: str - the label column to exclude from ranking

    Returns:
    - pd.DataFrame of integer ranks (pandas default tie handling, then
      truncated by astype(int)); an empty frame carrying df's index when
      there is nothing to rank
    """
    feature_cols = [name for name in df.columns if name != label_col]
    if not feature_cols:
        return pd.DataFrame(index=df.index)
    return df[feature_cols].rank().astype(int)


def compute_spearman_correlation(df, correlation_target, label_col):
    """
    Spearman-style correlation of every feature with correlation_target.

    Columns are ranked independently (average ranks truncated to int) and a
    Pearson correlation is then taken on those ranks.

    Parameters:
    - df: pd.DataFrame - the full dataset
    - correlation_target: str - the column every other feature is compared to
    - label_col: str - the label column, excluded from the computation

    Returns:
    - dict mapping feature name to its correlation with the target
      (0 when the denominator is zero); an empty dict, after printing a
      warning, when correlation_target is not among the non-label columns
    """
    feature_cols = [name for name in df.columns if name != label_col]
    if correlation_target not in feature_cols:
        print(f"Warning: {correlation_target} not found in data")
        return {}

    ranked = df[feature_cols].rank(method='average').astype(int)

    # The target side of the formula does not depend on the feature.
    target_rank = ranked[correlation_target]
    target_dev = target_rank - target_rank.mean()
    target_norm = (target_dev ** 2).sum() ** 0.5

    res = {}
    for col in ranked.columns:
        if col == correlation_target:
            continue

        feature_dev = ranked[col] - ranked[col].mean()
        numerator = (feature_dev * target_dev).sum()
        denominator = ((feature_dev ** 2).sum() ** 0.5) * target_norm

        if denominator != 0:
            res[col] = numerator / denominator
        else:
            res[col] = 0

    print(f"res:{res}")
    return res


def main(dataset_name='diabetes'):
    """
    Run the plaintext baseline on one registered dataset.

    Picks the correlation target via get_min_mutual_info_feature, then
    computes the Spearman correlations against it and the mutual information
    of every feature with the label.

    Parameters:
    - dataset_name: str - key understood by load_dataset

    Returns:
    - dict with keys 'correlation_target', 'spearman' and
      'mutual_information', so the results can be inspected instead of
      being discarded
    """
    data_parts, df, label_col = load_dataset(dataset_name)

    correlation_target = get_min_mutual_info_feature(data_parts, label_col)

    # Spearman: the DataFrame-based implementation ranks the columns itself,
    # so no separate compute_ranks call is needed here.
    plain_spearman = compute_spearman_correlation(df, correlation_target, label_col)

    # Mutual Information
    plain_mi = compute_mutual_information(data_parts, label_col)

    return {
        'correlation_target': correlation_target,
        'spearman': plain_spearman,
        'mutual_information': plain_mi,
    }


if __name__ == "__main__":
    main('diabetes')

In [72]:
def main(dataset_name='diabetes'):
    """
    Run the plaintext baseline on one registered dataset.

    Picks the correlation target via get_min_mutual_info_feature, then
    computes the Spearman correlations against it and the mutual information
    of every feature with the label.

    Parameters:
    - dataset_name: str - key understood by load_dataset

    Returns:
    - dict with keys 'correlation_target', 'spearman' and
      'mutual_information'
    """
    data_parts, df, label_col = load_dataset(dataset_name)

    correlation_target = get_min_mutual_info_feature(data_parts, label_col)

    # Spearman: the compute_spearman_correlation in effect when this cell
    # runs is the DataFrame-based one (df, correlation_target, label_col).
    # The previous parts-based call, compute_ranks(data_parts, ...) followed
    # by compute_spearman_correlation(ranked_parts, correlation_target),
    # no longer matches those signatures and fails on a fresh kernel.
    plain_spearman = compute_spearman_correlation(df, correlation_target, label_col)

    # Mutual Information
    plain_mi = compute_mutual_information(data_parts, label_col)

    return {
        'correlation_target': correlation_target,
        'spearman': plain_spearman,
        'mutual_information': plain_mi,
    }


if __name__ == "__main__":
    main('diabetes')

Plain MI Scores: {'Pregnancies': 0.04791176873782721, 'Glucose': 0.188578888220306, 'BloodPressure': 0.027567905180460817, 'SkinThickness': 0.04315827716728404, 'Insulin': 0.05990375098362566, 'BMI': 0.09043066514352194, 'DiabetesPedigreeFunction': 0.03320503661584395, 'Age': 0.08231736993762473}
res:{'Pregnancies': np.float64(-0.04603966358699283), 'Glucose': np.float64(0.10309609084897267), 'BloodPressure': np.float64(0.019086198116041148), 'SkinThickness': np.float64(0.18465164310327667), 'Insulin': np.float64(0.21469476082425581), 'BMI': np.float64(0.1404365687528146), 'Age': np.float64(0.03545689313425083)}


  return bound(*args, **kwds)
