In [181]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif   
import gmpy2

In [182]:
DATASETS = {
    'diabetes': ('diabetes_kmeans.csv', 'Outcome', [])
}

In [183]:
def safe_int(x):
    return int(x) if not isinstance(x, int) else x

def log2_safe(p: gmpy2.mpfr):
    return gmpy2.log2(p) if p > 0 else gmpy2.mpfr(0)

In [184]:
def load_csv_data(filename, target_col, drop_cols=None):
    df = pd.read_csv(filename, delimiter=',')
    if drop_cols:
        df = df.drop(columns=drop_cols, errors='ignore')
    df = df.apply(lambda s: s.astype(np.int64) if s.name != target_col else s)
    data_parts = np.array_split(df, 3, axis=1)
    original_data = pd.concat(data_parts, axis=1)
    return data_parts, original_data

def load_dataset(name):
    if name not in DATASETS:
        raise ValueError(f"Unknown dataset: {name}")
    file, target, drops = DATASETS[name]
    data_parts, original_data = load_csv_data(file, target, drops)
    return (data_parts, original_data), target

In [185]:

def compute_mutual_information(data_parts, target_col):
    # Locate target column
    target_series = None
    for p in data_parts:
        if target_col in p.columns:
            target_series = p[target_col]
            break
    if target_series is None:
        return {}
    n = len(target_series)
    unique_Y = pd.unique(target_series)
    # Precompute Y indicators & entropy H(Y)
    Y_ind = {y: (target_series == y).astype(int).to_numpy() for y in unique_Y}
    H_y = gmpy2.mpfr(0)
    for y in unique_Y:
        py = gmpy2.mpfr(int(Y_ind[y].sum())) / gmpy2.mpfr(n)
        if py > 0:
            H_y -= py * log2_safe(py)
    mi = {}
    for part in data_parts:
        for col in part.columns:
            if col == target_col:
                continue
            X = part[col]
            unique_X = pd.unique(X)
            X_ind = {xv: (X == xv).astype(int).to_numpy() for xv in unique_X}
            count_x = {xv: X_ind[xv].sum() for xv in unique_X}
            H_y_given_x = gmpy2.mpfr(0)
            for xv in unique_X:
                cx = count_x[xv]
                if cx == 0:
                    continue
                for y in unique_Y:
                    c_xy = int((X_ind[xv] & Y_ind[y]).sum())
                    if c_xy == 0:
                        continue
                    p_xy = gmpy2.mpfr(c_xy) / gmpy2.mpfr(n)
                    p_y_given_x = gmpy2.mpfr(c_xy) / cx
                    H_y_given_x -= p_xy * log2_safe(p_y_given_x)
            mi[col] = float(H_y - H_y_given_x)
    return mi



In [186]:
def get_min_mutual_info_feature(data_parts, target_col):
    plain_mi_score = compute_mutual_information(data_parts, target_col)
    print("Plain MI Scores:", plain_mi_score)
    for part in data_parts:
        if target_col in part.columns:
            feature_cols = [c for c in part.columns if c != target_col]
            if not feature_cols:
                raise ValueError("No feature columns in target-containing part.")
            min_feature = min(feature_cols, key=lambda c: plain_mi_score.get(c, float('inf')))
            return min_feature
    raise ValueError(f"Target column '{target_col}' not found.")


def compute_ranks(data_parts, target_col):
    ranks = []
    original_data = pd.concat(data_parts, axis=1)
    
    for part in data_parts:
        cols = [c for c in part.columns if c != target_col]
        if cols:
            # Use original_data for ranking, but select only columns from current part
            ranks.append(original_data[cols].rank().astype(int))
        else:
            ranks.append(pd.DataFrame(index=part.index))
    return ranks
    

def compute_spearman_correlation(ranked_parts, target_col, data_parts):
    # Find which part (if any) has target ranks; fallback: last part
    target_rank = None
    for rp in ranked_parts:
        if target_col in rp.columns:
            target_rank = rp[target_col].astype(int)
            break
    # If target ranks are not stored separately, cannot proceed
    if target_rank is None:
        # Assume target in last original part (not ranked); cannot compute -> return empty
        return {}
    n = len(target_rank)
    denom = n * (n**2 - 1)
    res = {}
    for rp in ranked_parts:
        for col in rp.columns:
            if col == target_col:
                continue
            fr = rp[col].astype(int)
            d2 = (fr - target_rank).apply(lambda d: d * d).sum()
            res[col] = 1 - (6 * d2) / denom
    

    print(f"Your calculated ranks for {target_col} (first 10):")
    print(target_rank.head(10).tolist())
    
    for rp in ranked_parts:
        for col in rp.columns:
            if col == target_col:
                continue
                
            fr = rp[col].astype(int)
            
            # DEBUG: Print sample ranks for this feature
            print(f"\nYour calculated ranks for {col} (first 10):")
            print(fr.head(10).tolist())
            
            # DEBUG: Compare with pandas direct calculation
            # Get original data and calculate ranks directly
            original_data = pd.concat(data_parts, axis=1)
            pandas_rank_target = original_data[target_col].rank(method='average').astype(int)
            pandas_rank_feature = original_data[col].rank(method='average').astype(int)
            
            print(f"Pandas ranks for {target_col} (first 10):")
            print(pandas_rank_target.head(10).tolist())
            print(f"Pandas ranks for {col} (first 10):")
            print(pandas_rank_feature.head(10).tolist())
            
            break  # Just check first feature for now
        break
    print(f"res:{res}")
    return res



In [187]:

def compute_ranks(original_data, target_col):
    """
    Compute ranks for all columns except the target column.
    
    Parameters:
    -----------
    original_data : pd.DataFrame
        The complete dataset
    target_col : str
        Name of the target column
        
    Returns:
    --------
    pd.DataFrame
        DataFrame with ranked values for all feature columns
    """
    cols = [c for c in original_data.columns if c != target_col]
    if cols:
        ranked_data = original_data[cols].rank().astype(int)
    else:
        ranked_data = pd.DataFrame(index=original_data.index)
    return ranked_data

def compute_spearman_correlation(original_data, min_feature, target_col):
    """
    Compute Spearman correlation between min_feature and all other features.
    
    Parameters:
    -----------
    original_data : pd.DataFrame
        The complete dataset
    min_feature : str
        The feature to correlate with all others
    target_col : str
        Name of the target column (to exclude)
        
    Returns:
    --------
    dict
        Dictionary mapping feature names to their Spearman correlation with min_feature
    """
    if min_feature not in original_data.columns:
        return {}
    
    # Get min_feature ranks
    min_feature_rank = original_data[min_feature].rank().astype(int)
    n = len(min_feature_rank)
    denom = n * (n**2 - 1)
    
    res = {}
    for col in original_data.columns:
        # Skip the min_feature itself and the target column
        if col == min_feature or col == target_col:
            continue
        
        feature_rank = original_data[col].rank().astype(int)
        d2 = ((feature_rank - min_feature_rank) ** 2).sum()
        res[col] = 1 - (6 * d2) / denom
    
    print(f"res:{res}")

    return res


In [188]:

def main(dataset_name='diabetes'):
    (parts_list, original_data), target_col = load_dataset(dataset_name)
    
    # Use parts_list for functions that need data_parts
    min_feature = get_min_mutual_info_feature(parts_list, target_col)
    
    # Use original_data for the new functions
    ranked_data = compute_ranks(original_data, target_col)
    
    # Compute correlation between min_feature and all other features
    plain_spearman = compute_spearman_correlation(original_data, min_feature, target_col)
    
    return plain_spearman


if __name__ == "__main__":
    main('diabetes')

Plain MI Scores: {'Pregnancies': 0.04791176873782721, 'Glucose': 0.188578888220306, 'BloodPressure': 0.027567905180460817, 'SkinThickness': 0.04315827716728404, 'Insulin': 0.05990375098362566, 'BMI': 0.09043066514352194, 'DiabetesPedigreeFunction': 0.03320503661584395, 'Age': 0.08231736993762473}
res:{'Pregnancies': np.float64(0.028008124365275666), 'Glucose': np.float64(0.15358134982867744), 'BloodPressure': np.float64(0.08952660639293486), 'SkinThickness': np.float64(0.23816241800506255), 'Insulin': np.float64(0.3088322285880679), 'BMI': np.float64(0.19381295055889647), 'Age': np.float64(0.11354750439962502)}


  return bound(*args, **kwds)
