In [None]:
# Data Loading Functions
import pandas as pd

def load_file(file_path, dtype=int, delimiter=None):
    """
    Helper function to load data from file.
    Supports reading .xlsx and .txt files.

    Parameters:
        file_path (str or path-like): Path to the file
        dtype (type): Data type of the array returned for .txt files.
            The default (int) yields 64-bit integers.
        delimiter (str or None): Delimiter for text files; None splits on
            any whitespace.

    Returns:
        DataFrame or ndarray: A two-column DataFrame ('Num', 'Name') for
        .xlsx files, or an ndarray of the requested dtype for .txt files.

    Raises:
        ValueError: If the file extension is neither .xlsx nor .txt.
    """
    path_text = str(file_path)

    if path_text.endswith('.xlsx'):
        return pd.read_excel(path_text, header=None, names=['Num', 'Name'])

    if path_text.endswith('.txt'):
        # Keep the historical default of a fixed 64-bit integer result,
        # independent of the platform's native `int` width.
        target_dtype = np.int64 if dtype is int else dtype
        # Parse as float first and cast afterwards, so integer data written
        # in float notation (e.g. "1.0000e+00") still loads when an integer
        # dtype is requested.
        return np.loadtxt(path_text, delimiter=delimiter).astype(target_dtype)

    raise ValueError("Unsupported file format.")

def create_interaction_matrix(MDA, nd, nm):
    """
    Create the binary interaction (adjacency) matrix for microRNA-disease pairs.

    Parameters:
        MDA (array-like): Known associations, one row per pair, laid out as
            [miRNA_index, disease_index] with 1-based indices
        nd (int): Number of diseases
        nm (int): Number of miRNAs

    Returns:
        ndarray: Binary matrix of shape (nd, nm); entry [d - 1, m - 1] is 1
        for every listed pair (m, d) and 0 elsewhere.

    Raises:
        ValueError: If any index is smaller than 1 (it would otherwise wrap
            around to the end of the matrix via negative indexing).
        IndexError: If an index exceeds nd / nm.
    """
    interaction = np.zeros((nd, nm), dtype=int)
    pairs = np.asarray(MDA)

    # Nothing to mark when no associations are supplied.
    if pairs.size == 0:
        return interaction

    if (pairs[:, :2] < 1).any():
        raise ValueError("MDA indices are 1-based and must be >= 1.")

    # Single vectorised assignment: rows are diseases, columns are miRNAs.
    interaction[pairs[:, 1] - 1, pairs[:, 0] - 1] = 1
    return interaction

def load_microRNA_disease_dataset(version='v1.0'):
    """
    Load the microRNA-disease association dataset for a given version.

    Parameters:
        version (str): Dataset version to load ('v1.0', 'v2.0', or 'v3.2')

    Returns:
        tuple: (interaction_matrix, MDA_pairs, disease_similarity, miRNA_similarity, disease_names, miRNA_names)
            interaction_matrix is the binary (diseases x miRNAs) matrix built
            from MDA_pairs; MDA_pairs holds 1-based [miRNA_id, disease_id]
            rows; the similarity matrices are floating point; the name
            entries are the 'Name' columns of the loaded spreadsheets.

    Raises:
        KeyError: If `version` is not one of the supported versions.
    """
    # Imported here so the function does not rely on a module-level
    # `import os` having been executed before it is called.
    import os

    # Map dataset version to its folder name
    version_paths = {
        'v3.2': 'HMDD v3.2',
        'v2.0': 'HMDD v2.0',
        'v1.0': 'HMDD v1.0'
    }

    # Build the path from components so it resolves on any operating system.
    root0 = os.path.join('..', 'dataset', 'microRNA-disease', version_paths[version])

    def data_path(*parts):
        # Path of a file (optionally inside sub-folders) of this dataset version.
        return os.path.join(root0, *parts)

    def load_similarity(*parts):
        # Similarity scores are fractional, so they are read as floats rather
        # than through the integer-casting text loader.
        return np.loadtxt(data_path(*parts), dtype=float)

    # Version-specific data loading
    if version == 'v3.2':
        # Load association matrix and transpose it
        interaction_data = load_file(data_path('miRNA_disease_association.txt')).T

        # Convert to pairwise format: each row is [miRNA_id, disease_id].
        # argwhere yields (row, col) in row-major order; reversing the
        # columns and adding 1 gives 1-based [col, row] pairs.
        MDA = np.argwhere(interaction_data == 1)[:, ::-1] + 1

        # Load similarity matrices
        SS = load_similarity('disease_semantic_sim.txt')
        FS1 = load_similarity('miRNA_semantic_sim.txt')
        FS2 = load_similarity('miRNA_sequence_sim.txt')
        FS3 = load_similarity('miRNA_functional_sim.txt')
        FS = (FS1 + FS2 + FS3) / 3  # Average similarity matrix

        # Load disease and miRNA names
        disease = load_file(data_path('disease_name_374.xlsx'))
        miRNA = load_file(data_path('miRNA_name_788.xlsx'))

    elif version == 'v2.0':
        association_dir = '1.miRNA-disease association'

        # Load known microRNA-disease association pairs
        MDA = load_file(data_path(association_dir, 'knowndiseasemirnainteraction.txt'))

        # Load and average two disease semantic similarity matrices
        SS1 = load_similarity('2.disease semantic similarity 1',
                              'disease semantic similarity matrix 1.txt')
        SS2 = load_similarity('3.disease semantic similarity 2',
                              'disease semantic similarity matrix 2.txt')
        SS = (SS1 + SS2) / 2

        # Load miRNA functional similarity (folder name spelled as in the path
        # used by the dataset layout)
        FS = load_similarity('4.miNA functional simialrity',
                             'functional similarity matrix.txt')

        # Load disease and miRNA names
        disease = load_file(data_path(association_dir, 'disease name.xlsx'))
        miRNA = load_file(data_path(association_dir, 'miRNA name.xlsx'))

    elif version == 'v1.0':
        # Load association pairs and similarity matrices
        MDA = load_file(data_path('HMDD1.txt'))
        SS = load_similarity('DisSim.txt')
        FS = load_similarity('miRSim.txt')

        # Load disease and miRNA names
        disease = load_file(data_path('Disease_Name.xlsx'))
        miRNA = load_file(data_path('miRNA_Name.xlsx'))

    # The matrix dimensions are the largest 1-based ids seen in the pairs.
    nd = int(MDA[:, 1].max())  # Number of diseases
    nm = int(MDA[:, 0].max())  # Number of miRNAs

    # Construct interaction matrix
    interaction = create_interaction_matrix(MDA, nd, nm)

    diseaseName = disease.Name
    miRNAName = miRNA.Name

    return interaction, MDA, SS, FS, diseaseName, miRNAName


In [None]:
# AMCMDA Functions

def singular_value_thresholding(Y, b):
    """
    Singular Value Thresholding (SVT) method for matrix completion.
    Every singular value is reduced by 'b'; values below 'b' become zero.
    The matrix is then reconstructed from the shrunken spectrum.

    Parameters:
        Y (ndarray): Input 2-D matrix to be processed.
        b (float): Threshold value for singular values.

    Returns:
        Y_n (ndarray): The reconstructed matrix after singular value
        thresholding, with the same shape as Y.
    """
    # Economy-size SVD: U is (rows, k), Vt is (k, cols) with k = min(rows, cols),
    # so no zero-padded diagonal matrix has to be assembled.
    U, S, Vt = np.linalg.svd(Y, full_matrices=False)

    # Soft-threshold the singular values: max(s - b, 0)
    shrunk = np.maximum(S - b, 0)

    # Scaling the columns of U by the singular values equals U @ diag(shrunk)
    Y_n = np.dot(U * shrunk, Vt)
    return Y_n

def AMCMDA_step1(interaction_matrix, disease_similarity, miRNA_similarity, rank=1, max_iter=3):
    """
    Perform the first step of AMCMDA: stack the disease similarity matrix,
    interaction matrix, and miRNA similarity matrix into one heterogeneous
    graph matrix, then repeatedly refine it with AMCMDA_step2 using the
    leading singular vectors of the current matrix.

    Parameters:
        interaction_matrix (ndarray): The matrix representing the microRNA-disease interactions
            (diseases as rows, miRNAs as columns).
        disease_similarity (ndarray): The disease similarity matrix.
        miRNA_similarity (ndarray): The miRNA similarity matrix.
        rank (int): Number of leading singular vectors passed to step 2
            (default 1).
        max_iter (int): Number of outer refinement rounds (default 3).

    Returns:
        M_1 (ndarray): The upper-right block of the refined matrix, i.e. the
        rows of the disease block and the columns of the miRNA block.
    """
    # Stack the matrices to form the heterogeneous graph matrix:
    #   [ disease_similarity   interaction      ]
    #   [ interaction^T        miRNA_similarity ]
    upper = np.hstack((disease_similarity, interaction_matrix))
    lower = np.hstack((np.transpose(interaction_matrix), miRNA_similarity))
    combined_matrix = np.vstack((upper, lower))

    for _ in range(max_iter):
        # Singular value decomposition of the current matrix; only the
        # singular vectors are needed here.
        U, _, V = np.linalg.svd(combined_matrix)

        # Leading `rank` left / right singular vectors
        A = U[:, :rank]
        B = V[:rank, :]

        # Refine the matrix; the iteration count returned by step 2 is not used.
        combined_matrix, _ = AMCMDA_step2(combined_matrix, A, B)

    # Extract the disease-by-miRNA block (upper-right submatrix)
    n_diseases = disease_similarity.shape[0]
    M_1 = combined_matrix[:n_diseases, n_diseases:]

    return M_1

def AMCMDA_step2(t, A, B, alpha=1, beta=1, gamma=1, max_iter_2=300, tol1=2e-3, tol2=1e-5):
    """
    Perform the second step of AMCMDA, where matrices X, W, and Y are iteratively updated to recover the association matrix.

    Parameters:
        t (ndarray): The current matrix that contains interaction and similarity data.
        A (ndarray): Left singular vectors from the singular value decomposition.
        B (ndarray): Right singular vectors from the singular value decomposition.
        alpha (float): Weight of the observed (non-zero) entries of t (default 1).
        beta (float): Penalty parameter; 1 / beta is also the singular value
            threshold (default 1).
        gamma (float): Step size of the Y update (default 1).
        max_iter_2 (int): Maximum number of iterations (default 300).
        tol1 (float): Tolerance on the relative change of X (default 2e-3).
        tol2 (float): Tolerance on the change of that relative change
            between consecutive iterations (default 1e-5).

    Returns:
        W (ndarray): The recovered matrix after AMCMDA step 2, clipped to [0, 1].
        iter0 (int): Value of the iteration counter when the loop stopped.
    """
    # Indicator of the non-zero positions of t
    omega = np.zeros(t.shape)
    omega[t.nonzero()] = 1

    # Quantities that do not change between iterations
    low_rank_term = np.dot(A, B)
    observed_term = alpha * (t * omega)
    omega_weight = alpha / (alpha + beta)
    inv_beta = 1 / beta

    # Initialize matrices X, W, Y with the current matrix t.
    # They are only ever rebound below, so t itself is never modified.
    X = t
    W = X
    Y = X

    # Initialize iteration counter and stopping criteria
    iter0 = 1
    stop1 = 1
    stop2 = 1

    # Iterate until both stopping criteria are met
    while stop1 > tol1 or stop2 > tol2:
        # Update matrix W
        temp = inv_beta * (Y + observed_term + low_rank_term) + X
        W = temp - omega_weight * omega * temp

        # Ensure the values in W remain between 0 and 1
        W[W < 0] = 0
        W[W > 1] = 1

        # Update matrix X using Singular Value Thresholding
        X_1 = singular_value_thresholding(W - inv_beta * Y, inv_beta)

        # Update matrix Y
        Y = Y + gamma * beta * (X_1 - W)

        # Relative change of X (absolute change when X is the zero matrix)
        stop1_0 = stop1
        x_norm = np.linalg.norm(X)
        if x_norm != 0:
            stop1 = np.linalg.norm(X_1 - X) / x_norm
        else:
            stop1 = np.linalg.norm(X_1 - X)
        # Change of the relative change with respect to the previous iteration
        stop2 = np.abs(stop1 - stop1_0) / (max(1, np.abs(stop1_0)))

        # Update matrix X
        X = X_1

        # Stop once the iteration budget is used up
        if iter0 >= max_iter_2:
            print('Maximum iterations reached without convergence')
            break
        iter0 = iter0 + 1

    # Return the final recovered matrix W and the iteration counter
    return W, iter0

In [None]:
# Prediction and Similarity Calculation Functions
import copy

def generate_prediction_score(interaction_matrix, disease_similarity, miRNA_similarity):
    """
    Main entry point for producing the predicted association scores.
    Delegates the matrix completion to AMCMDA_step1 and returns its result
    as a numpy array.

    Parameters:
        interaction_matrix (ndarray): The matrix representing the microRNA-disease interactions.
        disease_similarity (ndarray): The disease similarity matrix.
        miRNA_similarity (ndarray): The miRNA similarity matrix.

    Returns:
        score (ndarray): The generated prediction score matrix for disease-miRNA associations.
    """
    # Run the matrix completion and make sure the result is an ndarray
    completed = AMCMDA_step1(interaction_matrix, disease_similarity, miRNA_similarity)
    return np.asarray(completed)


def calculate_gaussian_similarity(interaction_matrix, num_diseases, num_miRNAs):
    """
    Function to calculate the Gaussian similarity between diseases and miRNAs based on the provided
    interaction matrix.

    For two rows (or columns) u and v of the interaction matrix the
    similarity is exp(-gamma * ||u - v||^2), where gamma is the number of
    diseases (or miRNAs) divided by the squared Frobenius norm of the
    interaction matrix.

    Parameters:
        interaction_matrix (ndarray): The matrix representing the miRNA-disease interactions
            (diseases as rows, miRNAs as columns).
        num_diseases (int): The number of diseases.
        num_miRNAs (int): The number of miRNAs.

    Returns:
        kd (ndarray): The disease-to-disease similarity matrix, shape (num_diseases, num_diseases).
        km (ndarray): The miRNA-to-miRNA similarity matrix, shape (num_miRNAs, num_miRNAs).

    Raises:
        IndexError: If num_diseases / num_miRNAs exceed the matrix dimensions.
    """
    profiles = np.asarray(interaction_matrix)

    # Squared Frobenius norm shared by both kernel bandwidths
    squared_norm = np.linalg.norm(profiles) ** 2

    kd = _gaussian_profile_kernel(profiles, num_diseases, squared_norm)
    km = _gaussian_profile_kernel(profiles.T, num_miRNAs, squared_norm)
    return kd, km


def _gaussian_profile_kernel(profiles, count, squared_norm):
    # Gaussian kernel between the first `count` rows of `profiles`.
    if count > profiles.shape[0]:
        raise IndexError("Requested more entities than the interaction matrix contains.")

    # Gram matrix of inner products; its diagonal holds the squared row norms
    gram = np.dot(profiles, profiles.T)[:count, :count]
    self_products = np.diag(gram)

    # ||u - v||^2 = <u, u> + <v, v> - 2 <u, v>, for all pairs at once
    squared_distances = self_products[:, None] + self_products[None, :] - 2 * gram

    bandwidth = count / squared_norm
    return np.exp(-bandwidth * squared_distances)


In [None]:
# Run_Prediction_Model

from openpyxl import Workbook
import numpy as np


# Load the microRNA-disease dataset once at module level; run_prediction_model
# reads these names as globals.
# Kd / Km receive the disease and miRNA similarity matrices returned by the loader.
interaction, MDA, Kd, Km, diseaseName, miRNAName = load_microRNA_disease_dataset()
# Dataset selection:
#   - HMDD v1.0: call the function without arguments (the default).
#   - HMDD v2.0: pass 'v2.0' as the argument.
#   - HMDD v3.2: pass 'v3.2' as the argument.


# Case study for model prediction
def run_prediction_model(output_path=None):
    """
    This function runs the prediction model by calculating the miRNA-disease interaction scores
    and outputs the results in an Excel file.

    It reads the module-level dataset variables (interaction, MDA, Kd, Km,
    diseaseName, miRNAName).

    Parameters:
        output_path (str or None): Destination of the Excel file. When None,
            the file "AMCMDA_Case_Study_Results.xlsx" is written to the
            parent directory of the current working directory.

    Returns:
        int: Always 0.
    """
    # Imported here so the function does not depend on where the
    # module-level `import os` sits relative to the call.
    import os

    if output_path is None:
        # Joined from components so the parent-directory path works on any OS.
        output_path = os.path.join(os.pardir, "AMCMDA_Case_Study_Results.xlsx")

    # Get the number of diseases and miRNAs from the largest 1-based ids in MDA
    nd = int(MDA[:, 1].max())
    nm = int(MDA[:, 0].max())

    # When conducting the second type of case study,
    # the corresponding elements of the disease in the adjacency matrix need to be set to zero.
    # To achieve this, you need to execute the below-mentioned commented code.
    #
    # The second case study need the following code
    #test = 13
    #print('\t', 'The testing disease_name = ', diseaseName[test])
    #interaction[test, ] = 0

    # Compute the integrated similarity matrix for diseases and miRNAs:
    # keep the loaded similarity where it is positive, otherwise fall back to
    # the Gaussian similarity derived from the interaction matrix.
    kd, km = calculate_gaussian_similarity(interaction, nd, nm)
    sd = np.where(Kd > 0, Kd, kd)
    sm = np.where(Km > 0, Km, km)

    # Get the predicted interaction scores matrix
    predicted_scores = generate_prediction_score(interaction, sd, sm)

    # One record per (disease, miRNA) pair:
    # [predicted score, disease index, miRNA index, original interaction label]
    prediction_results = [
        [predicted_scores[i][j], i, j, interaction[i][j]]
        for i in range(nd)
        for j in range(nm)
    ]

    # Sort the prediction results in descending order of predicted scores
    prediction_results.sort(reverse=True, key=lambda record: record[0])

    # Save results to an Excel file
    wb = Workbook()  # Create a new workbook
    ws = wb.active  # Get the default active sheet

    # Write the header row
    ws.append(["Disease", "miRNA", "Score", "Label"])

    # Write each prediction result as a new row in the Excel sheet
    for score, disease_idx, mirna_idx, label in prediction_results:
        ws.append([diseaseName[disease_idx], miRNAName[mirna_idx], score, label])

    # Save the workbook to a file
    wb.save(output_path)

    return 0

In [None]:
# MiRNA-Disease Association Prediction Pipeline

import os

# Run the prediction model: score every disease-miRNA pair of the dataset
# loaded above and write the ranked results to an Excel file.
run_prediction_model()  # Returns 0; the return value is not used here