# Import Dependencies

In [None]:
import pandas as pd
from sklearn.metrics import pair_confusion_matrix
import numpy as np

# Data Exploration

### Checking whether every Cho gene is also present in KEGG (i.e., Cho ⊆ KEGG)

In [None]:



# Load both tab-separated tables; the first column of each is used as the
# gene-identifier column.
cho_data = pd.read_csv('../dataset/Cho_Ours.txt', sep='\t')
kegg_data = pd.read_csv('../dataset/Kegg_Subcluster.txt', sep='\t')
cho_genes = cho_data.iloc[:, 0]
kegg_genes = kegg_data.iloc[:, 0]

# Boolean mask over the Cho genes: True where the gene also occurs in KEGG.
cho_genes_in_kegg = cho_genes.isin(kegg_genes)

# Cho genes with no KEGG counterpart, and how many of them there are.
cho_genes_not_in_kegg = cho_genes.loc[~cho_genes_in_kegg]
missing_genes_count = cho_genes_not_in_kegg.count()

# Report: overall subset check, number of missing genes, then the genes themselves.
print(f"Are all cho_genes present in kegg_genes? {cho_genes_in_kegg.all()}")
print(f"Number of missing genes in kegg_genes: {missing_genes_count}")
print(cho_genes_not_in_kegg)

Are all cho_genes present in kegg_genes? False
Number of missing genes in kegg_genes: 21
18     YLR273c
34     YKL042w
39     YNL225c
43     YLR313c
44     YGR041w
61     YJL173c
80     YMR179w
85     YDL227c
86     YNL173c
101    YMR198w
103    YDR356w
111    YDL197c
123    YIR017c
130    YKL049c
131    YCL014w
132    YOR188w
140    YLR014c
150    YBR138c
154    YJR092w
155    YLR353w
168    YDL048c
Name: Gene, dtype: object


# Formulation of the Modified Rand Index

In [None]:
def modified_rand_score(labels_true, labels_pred):
    """
    Computes the Modified Rand Index (MRI), a Rand-Index variant that adds a
    correction term ``n_xx`` to both the agreeing pairs and the total pairs.

    All pair counts are expressed as UNORDERED sample pairs:

    - n_11: pairs placed in the same cluster by both labelings
    - n_00: pairs placed in different clusters by both labelings
    - n_01 / n_10: pairs on which the two labelings disagree
    - n_xx: k * (k - 1) / 2, where k is the absolute difference between the
      number of distinct labels in ``labels_true`` and in ``labels_pred``

    MRI = (n_00 + n_11 + n_xx) / (n_00 + n_01 + n_10 + n_11 + n_xx)

    NOTE(review): n_xx is derived from the difference in the number of
    distinct labels, not from elements that are actually absent from one of
    the partitions (both inputs must have the same length for
    ``pair_confusion_matrix``). Confirm this heuristic matches the intended
    MRI definition.

    Parameters:
    - labels_true: array-like, shape (n_samples,)
      Ground truth labels (e.g., KEGG or Cho dataset).

    - labels_pred: array-like, shape (n_samples,)
      Predicted labels from a clustering algorithm.

    Returns:
    - MRI: float
      A similarity score between 0.0 and 1.0, where higher values indicate better alignment.
      Returns 1.0 when there are no pairs to compare at all.
    """

    # pair_confusion_matrix counts ORDERED sample pairs, so every entry is
    # exactly twice the number of unordered pairs. Halve the entries so they
    # are on the same scale as n_xx below (an unordered-pair count).
    contingency = pair_confusion_matrix(labels_true, labels_pred)
    n_00, n_01, n_10, n_11 = (int(count) // 2 for count in contingency.ravel())

    # Difference in the number of distinct labels between the two labelings.
    label_count_gap = abs(len(set(labels_true)) - len(set(labels_pred)))

    # Number of unordered pairs that can be formed from that gap (0 for gaps of 0 or 1).
    n_xx = label_count_gap * (label_count_gap - 1) // 2

    total_pairs = n_00 + n_01 + n_10 + n_11 + n_xx

    # Fewer than two samples and no correction term: nothing to compare.
    if total_pairs == 0:
        return 1.0  # Perfect match in trivial cases

    return float((n_00 + n_11 + n_xx) / total_pairs)