# Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import time
import os
import torch
import warnings
warnings.filterwarnings("ignore") 

from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from sklearn.impute import KNNImputer
from IPython.display import display

# Print every float inside NumPy arrays with exactly three decimals.
np.set_printoptions(formatter={'float': "{:.3f}".format})

# Input Dataset

In [None]:
class DataHandle(object):
    """Load a CSV table and expose helpers about its missing (NaN) entries.

    The file is read with ``pandas.read_csv``. The whole table is kept in
    ``raw_df``; every column except the first one is kept as a NumPy array
    in ``value``.

    Parameters
    ----------
    filepath : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``filepath``.
    data_categ : "missing" | "complete", default="missing"
        Kind of data stored in the file. The NaN-related helpers only
        compute a result for "missing"; for "complete" they print a message
        and return None.
    verbose : bool, default=False
        If True, display the loaded table and the extracted array.

    Raises
    ------
    ValueError
        If ``data_categ`` is neither "missing" nor "complete". Without this
        check the object would be created without ``raw_df``/``value`` and
        fail later with an unrelated AttributeError.
    """

    _CATEGORIES = ("missing", "complete")

    def __init__(self, filepath, filename, data_categ="missing", verbose=False):
        if data_categ not in self._CATEGORIES:
            raise ValueError(
                "Please provide with the correct category: ('missing' or 'complete'), "
                f"got {data_categ!r}"
            )
        self.data_categ = data_categ

        if data_categ == "missing":
            self.read_missing_data(filepath, filename, verbose=verbose)
        else:
            self.read_complete_data(filepath, filename, verbose=verbose)

    def is_duplicate_nan_in_row(self):
        """Return True if at least one row holds more than one NaN.

        Returns None (after printing a message) for "complete" data.
        """
        if self.data_categ != "missing":
            print("You cannot find NaN entries in complete matrix!")
            return None
        nan_per_row = np.isnan(self.value).sum(axis=1)
        return np.any(nan_per_row > 1)

    def num_nan_rows(self):
        """Return the number of NaN entries in ``value``.

        NOTE(review): despite the name, this counts NaN *entries*, not rows
        that contain a NaN -- confirm which one callers expect.
        """
        return int(np.count_nonzero(np.isnan(self.value)))

    def compute_std(self, D_gene_complete):
        """Return the standard deviation of the true values at the NaN positions.

        Parameters
        ----------
        D_gene_complete : ndarray
            Array indexed at the (row, column) positions where ``value`` is
            NaN; presumably the complete counterpart of ``value``.

        Returns None (after printing a message) for "complete" data.
        """
        if self.data_categ != "missing":
            print("You cannot get missing indices for complete matrix!")
            return None
        missing_idx = self.get_missing_idx(self.value, self.data_categ)
        y_true = D_gene_complete[missing_idx[:, 0], missing_idx[:, 1]]
        return np.std(y_true)

    def get_p_mis_rate(self):
        """Return the number of NaN entries divided by the number of rows.

        NOTE(review): the denominator is ``len(value)`` (rows), not the total
        number of entries, so this is "missing entries per row" rather than a
        fraction of the matrix -- confirm this is the intended definition.

        Returns None (after printing a message) for "complete" data.
        """
        if self.data_categ != "missing":
            print("You cannot get missing rate for complete matrix!")
            return None
        missing_idx = self.get_missing_idx(self.value, self.data_categ)
        return len(missing_idx) / (len(self.value))

    def get_F_mat(self):
        """Return the integer indicator matrix: 1 where observed, 0 where NaN.

        Returns None (after printing a message) for "complete" data.
        """
        if self.data_categ != "missing":
            print("You cannot get indicator matrix for complete matrix!")
            return None
        return (~np.isnan(self.value)).astype(int)

    @staticmethod
    def get_missing_idx(value, data_categ):
        """Return the (row, column) index pairs of the NaN entries of ``value``.

        Returns None (after printing a message) unless ``data_categ`` is
        "missing".
        """
        if data_categ != "missing":
            print("You cannot get missing indices for complete matrix!")
            return None
        return np.argwhere(np.isnan(value))

    def _read_data(self, filepath, filename, verbose=False):
        """Read the CSV into ``raw_df`` and store all but its first column in ``value``."""
        path_to_file = os.path.join(filepath, filename)
        self.raw_df = pd.read_csv(path_to_file)

        # The first column is left out of the numeric array.
        self.value = self.raw_df.iloc[:, 1:].values

        if verbose:
            display(self.raw_df)
            display(self.value)

    def read_missing_data(self, filepath, filename, verbose=False):
        """Load a file that contains missing values (see ``_read_data``)."""
        self._read_data(filepath, filename, verbose=verbose)

    def read_complete_data(self, filepath, filename, verbose=False):
        """Load a file without missing values (see ``_read_data``)."""
        self._read_data(filepath, filename, verbose=verbose)
        

## Load the Complete Dataset

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the complete expression table from Drive and show it.
D_gene_complete = DataHandle(
    "/content/drive/MyDrive/SKRIPSI/",
    "GSE135923-GPL26599_series_matrix.csv",
    data_categ="complete",
    verbose=False,
)
display(D_gene_complete.raw_df)

Unnamed: 0,ID_REF,GSM4037995,GSM4037996,GSM4037997,GSM4037998,GSM4037999,GSM4038000
0,AAMP,2611.34,2519.65,2841.68,2850.22,2758.71,2589.80
1,ABI3BP,29.81,34.94,33.78,53.94,51.88,62.82
2,ACHE,64.79,74.14,73.67,98.17,120.14,122.99
3,ACTG2,29.81,34.94,33.78,38.84,29.13,31.41
4,ACVR1,970.40,1263.76,1134.33,1165.12,1207.79,1172.35
...,...,...,...,...,...,...,...
779,ZFYVE16,1160.26,1373.84,1206.84,1787.59,1751.16,1785.52
780,ZFYVE9,996.02,1231.18,1154.21,908.36,974.79,1037.87
781,ZKSCAN5,171.78,181.98,137.99,257.84,274.87,268.98
782,ZNF143,789.58,984.04,797.54,1065.87,1006.64,1033.44


In [None]:
# Work on a copy so the loaded array itself is never modified.
D_gene = D_gene_complete.value.copy()

# One boolean per row: True when the row contains no NaN at all.
idx_not_NaN = ~np.isnan(D_gene).any(axis=1)
D_gene_not_NaN = D_gene[idx_not_NaN]

# Generate Dataset with MCAR Missing Values

In [None]:
# Function produce_NA for generating missing values ------------------------------------------------------

def produce_NA(X, p_miss, mecha="MCAR", opt=None, p_obs=None, q=None):
    """Generate missing values for a given mechanism and proportion.

    Parameters
    ----------
    X : torch.Tensor or np.ndarray, shape (n, d)
        Data for which missing values will be simulated. A NumPy array is
        cast to float32 and converted to a torch tensor, so the returned
        values carry float32 precision even though they are widened to
        float64 on return.
    p_miss : float
        Proportion of missing values to generate. For "MCAR" each entry is
        hidden independently with this probability.
    mecha : str, default="MCAR"
        Missing-data mechanism: "MCAR", "MAR" or "MNAR".
    opt : str, optional
        For mecha = "MNAR", how the mask is generated: "logistic",
        "quantile" or "selfmasked".
    p_obs : float, optional
        For mecha = "MAR", or mecha = "MNAR" with opt = "logistic" or
        "quantile": proportion of variables with *no* missing values that
        will be used for the masking model.
    q : float, optional
        For mecha = "MNAR" with opt = "quantile": quantile level at which
        the cuts should occur.

    Returns
    -------
    dict
        'X_init': the initial data (float64 torch tensor).
        'X_incomp': the data with NaN at the generated missing positions
        (float64 torch tensor).
        'mask': float64 torch tensor, 1 where a value was removed, 0 elsewhere.

    Raises
    ------
    ValueError
        If the (mecha, opt) combination is not one of the supported ones.
        Previously any unrecognised combination silently fell back to MCAR.

    Notes
    -----
    ``MAR_mask``, ``MNAR_mask_logistic``, ``MNAR_mask_quantiles`` and
    ``MNAR_self_mask_logistic`` are looked up at call time and must be
    defined elsewhere for the non-MCAR mechanisms.
    """
    if not torch.is_tensor(X):
        X = torch.from_numpy(X.astype(np.float32))

    if mecha == "MCAR":
        # torch.rand draws from [0, 1): each entry is masked with prob. p_miss.
        mask = (torch.rand(X.shape) < p_miss).double()
    elif mecha == "MAR":
        mask = MAR_mask(X, p_miss, p_obs).double()
    elif mecha == "MNAR" and opt == "logistic":
        mask = MNAR_mask_logistic(X, p_miss, p_obs).double()
    elif mecha == "MNAR" and opt == "quantile":
        mask = MNAR_mask_quantiles(X, p_miss, q, 1-p_obs).double()
    elif mecha == "MNAR" and opt == "selfmasked":
        mask = MNAR_self_mask_logistic(X, p_miss).double()
    else:
        raise ValueError(
            f"Unsupported missing-data mechanism: mecha={mecha!r}, opt={opt!r}. "
            "Use 'MCAR', 'MAR', or 'MNAR' with opt in "
            "('logistic', 'quantile', 'selfmasked')."
        )

    # Hide the selected entries on a copy so X itself stays intact.
    X_nas = X.clone()
    X_nas[mask.bool()] = np.nan

    return {'X_init': X.double(), 'X_incomp': X_nas.double(), 'mask': mask}

## Generate Missing Values for the First Dataset

### Generate 5% MV

In [None]:
# Hide roughly 5% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.05)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_5, R_mcar_5 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_5.sum().numpy() / np.prod(R_mcar_5.size()) * 100, " %")

Percentage of newly generated missing values:  5.038265306122449  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_5 = pd.DataFrame(X_missing_5.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_5.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 10% MV

In [None]:
# Hide roughly 10% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.1)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_10, R_mcar_10 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_10.sum().numpy() / np.prod(R_mcar_10.size()) * 100, " %")

Percentage of newly generated missing values:  9.800170068027212  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_10 = pd.DataFrame(X_missing_10.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_10.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 15% MV

In [None]:
# Hide roughly 15% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.15)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_15, R_mcar_15 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_15.sum().numpy() / np.prod(R_mcar_15.size()) * 100, " %")

Percentage of newly generated missing values:  14.306972789115646  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_15 = pd.DataFrame(X_missing_15.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_15.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 20% MV

In [None]:
# Hide roughly 20% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.2)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_20, R_mcar_20 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_20.sum().numpy() / np.prod(R_mcar_20.size()) * 100, " %")

Percentage of newly generated missing values:  19.196428571428573  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_20 = pd.DataFrame(X_missing_20.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_20.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 25% MV

In [None]:
# Hide roughly 25% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.25)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_25, R_mcar_25 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_25.sum().numpy() / np.prod(R_mcar_25.size()) * 100, " %")

Percentage of newly generated missing values:  25.42517006802721  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_25 = pd.DataFrame(X_missing_25.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_25.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 30% MV

In [None]:
# Hide roughly 30% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.3)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_30, R_mcar_30 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_30.sum().numpy() / np.prod(R_mcar_30.size()) * 100, " %")

Percentage of newly generated missing values:  30.272108843537417  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_30 = pd.DataFrame(X_missing_30.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_30.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 35% MV

In [None]:
# Hide roughly 35% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.35)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_35, R_mcar_35 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_35.sum().numpy() / np.prod(R_mcar_35.size()) * 100, " %")

Percentage of newly generated missing values:  30.272108843537417  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_35 = pd.DataFrame(X_missing_35.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_35.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 40% MV

In [None]:
# Hide roughly 40% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.4)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_40, R_mcar_40 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_40.sum().numpy() / np.prod(R_mcar_40.size()) * 100, " %")

Percentage of newly generated missing values:  39.98724489795919  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_40 = pd.DataFrame(X_missing_40.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_40.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 45% MV

In [None]:
# Hide roughly 45% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.45)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_45, R_mcar_45 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_45.sum().numpy() / np.prod(R_mcar_45.size()) * 100, " %")

Percentage of newly generated missing values:  39.98724489795919  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_45 = pd.DataFrame(X_missing_45.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_45.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 50% MV

In [None]:
# Hide roughly 50% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.5)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_50, R_mcar_50 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_50.sum().numpy() / np.prod(R_mcar_50.size()) * 100, " %")

Percentage of newly generated missing values:  50.595238095238095  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_50 = pd.DataFrame(X_missing_50.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_50.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

### Generate 60% MV

In [None]:
# Hide roughly 60% of the entries of D_gene completely at random (MCAR).
X_miss_mcar = produce_NA(D_gene, mecha="MCAR", p_miss=0.6)

# Incomplete matrix and the mask marking the hidden entries.
X_missing_60, R_mcar_60 = X_miss_mcar['X_incomp'], X_miss_mcar['mask']

# Share of entries that were actually hidden by this draw.
print("Percentage of newly generated missing values: ",
      R_mcar_60.sum().numpy() / np.prod(R_mcar_60.size()) * 100, " %")

Percentage of newly generated missing values:  60.26785714285714  %


In [None]:
# Rebuild a table with the original sample columns, then put the
# identifier column back in front.
data_missing_60 = pd.DataFrame(X_missing_60.numpy(), columns=D_gene_complete.raw_df.columns[1:])
data_missing_60.insert(0, D_gene_complete.raw_df.columns[0], D_gene_complete.raw_df['ID_REF'])

## Export as csv

In [None]:
# Write each incomplete table to Drive. The file name is derived from the
# same rate as the table, so the 60% table can no longer land in (and
# overwrite) 'data missing 50%.csv'.
export_dir = '/content/drive/MyDrive/SKRIPSI/data missing GSE135923-GPL26599_v5'
for rate, table in (
    (5, data_missing_5),
    (10, data_missing_10),
    (15, data_missing_15),
    (20, data_missing_20),
    (25, data_missing_25),
    (30, data_missing_30),
    (40, data_missing_40),
    (50, data_missing_50),
    (60, data_missing_60),
):
    table.to_csv(f'{export_dir}/data missing {rate}%.csv', index=False)

In [None]:
# Save the 35% and 45% tables under their own names; both used to be written
# to 'data missing 60%.csv', so neither file existed and the 60% file was
# overwritten.
data_missing_35.to_csv('/content/drive/MyDrive/SKRIPSI/data missing GSE135923-GPL26599_v5/data missing 35%.csv', index = False)
data_missing_45.to_csv('/content/drive/MyDrive/SKRIPSI/data missing GSE135923-GPL26599_v5/data missing 45%.csv', index = False)

## Load Dataset Missing

In [None]:
# File names of the incomplete datasets to load, one per missing rate (in %).
list_missing_files = [
    "data missing {:d}%.csv".format(rate)
    for rate in (5, 10, 15, 20, 25, 30, 40, 50)
]
list_missing_files

['data missing 5%.csv',
 'data missing 10%.csv',
 'data missing 15%.csv',
 'data missing 20%.csv',
 'data missing 25%.csv',
 'data missing 30%.csv',
 'data missing 40%.csv',
 'data missing 50%.csv']

In [None]:
# Load every incomplete dataset, in the order given by list_missing_files.
D_gene_mis_arr = [
    DataHandle("/content/drive/MyDrive/SKRIPSI/data missing GSE135923-GPL26599_v5", missing_file)
    for missing_file in list_missing_files
]

# Data Imputation

## compute_SSSIM


In [None]:
from skimage.metrics import structural_similarity

def sssim(a,b):
    """Return a slope-based similarity score between two sequences.

    Every consecutive difference of ``a`` (resp. ``b``) is divided by the
    first difference ``a[1]-a[0]`` (resp. ``b[1]-b[0]``). For each interior
    index the gap between the two normalised slopes is divided by twice the
    larger deviation of either slope from a local mean of neighbouring
    slopes; the score is one minus the average of these ratios.

    NOTE(review): a zero first difference or a zero deviation makes a
    denominator vanish, and a length of 3 makes index 1 read position 3 --
    confirm that inputs never hit these cases.
    """
    n = len(a)

    def slope(seq, j):
        # j-th forward difference, scaled by the first one of the same sequence
        return (seq[j+1]-seq[j])/(seq[1]-seq[0])

    def lmean(i):
        # which neighbouring slopes enter the local mean around index i
        if i == 1:
            offsets = (0, 1)
        elif i == n-2:
            offsets = (-1, 0)
        else:
            offsets = (-1, 0, 1)

        terms = []
        for shift in offsets:
            terms.append(slope(a, i+shift))
            terms.append(slope(b, i+shift))
        return np.mean(terms)

    total = 0
    for i in range(1, n-1):
        slope_a = slope(a, i)
        slope_b = slope(b, i)

        gap = np.abs(slope_a - slope_b)
        spread = 2*np.max([np.abs(lmean(i)-slope_a), np.abs(lmean(i)-slope_b)])

        total += (gap/spread)

    return 1 - (total/(n-2))

## compute_MSR

In [None]:
def compute_MSR(G_bicluster, verbose=False):
    """Return the mean squared residue (MSR) of a bicluster.

    The residue of an entry is the entry minus its row mean, minus its
    column mean, plus the overall mean; the MSR is the average of the
    squared residues over all entries.

    Parameters
    ----------
    G_bicluster : ndarray of shape (rows, columns)
    verbose : bool, default=False
        If True, print the bicluster, the means and the squared residues.
    """
    n_rows = len(G_bicluster)
    n_cols = len(G_bicluster[0])

    grand_mean = np.mean(G_bicluster)
    row_means = np.mean(G_bicluster, axis=1).reshape(-1,1)  # column vector, broadcasts over columns
    col_means = np.mean(G_bicluster, axis=0)

    residue = G_bicluster - row_means - col_means + grand_mean
    squared_residue = residue**2
    msr = np.sum(squared_residue)/(n_rows * n_cols)

    if verbose:
        print(f"G_bicluster\n{G_bicluster}")
        print(f"vert_mean_cluster\n{col_means}")
        print(f"hort_mean_cluster\n{row_means}")
        print(f"global_mean_cluster\n{grand_mean}")
        print(f"Rsq\n{squared_residue}")
    return msr

## get_setZ

In [None]:
def get_setZ(D_gene_expression, k_max=4, verbose=False, method="intersect"):
    """Get the set Z. (see Eq. (7)).

    The first row of ``D_gene_expression`` is the target gene; every other
    row is a candidate neighbour. Candidates are ranked by the MSR of the
    two-row bicluster (target, candidate), by the length-normalised
    Euclidean distance to the target, and by ``sssim`` with the target.

    Parameters
    ----------
    D_gene_expression : ndarray of shape (genes, samples)
        Expression matrix whose first row is the target gene.
    k_max : int, default=4
        Maximum number of nearest neighbours taken from each ranking.
    verbose : bool, default=False
        If True, print the intermediate distances and rankings.
    method : "intersect", "summation", "msr", "euclid", default="intersect"
        Method to get set Z.
        "intersect": candidates that are both among the ``k_max`` smallest
        MSR and the ``k_max`` largest sssim (may be empty; returned in
        increasing row order because ``np.intersect1d`` sorts its result).
        "summation": the ``k_max`` smallest values of MSR + Euclidean.
        "msr": the ``k_max`` smallest MSR.
        "euclid": the ``k_max`` smallest Euclidean distance.

    Returns
    -------
    ndarray
        The target gene followed by the selected neighbour genes.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four supported names (previously
        this surfaced as an unbound ``G_neigh`` at the end of the function).
    """
    valid_methods = ("intersect", "summation", "msr", "euclid")
    if method not in valid_methods:
        raise ValueError(f"method must be one of {valid_methods}, got {method!r}")

    G_tg = D_gene_expression[0, :]
    G_remain = D_gene_expression[1:, :]

    MSR = np.zeros(len(G_remain))
    euclidean = np.zeros_like(MSR)
    ssim_matrix = np.zeros_like(MSR)
    for i, G_near in enumerate(G_remain):
        MSR[i] = compute_MSR(np.stack((G_tg, G_near)))
        euclidean[i] = np.sqrt(np.sum((G_tg - G_near)**2))/len(G_tg)
        ssim_matrix[i] = sssim(G_tg, G_near)

    # Ascending for the distances (smaller is closer) ...
    nth_MSR = np.argsort(MSR, kind="stable")
    nth_euclidean = np.argsort(euclidean, kind="stable")
    nth_sum_MSR_euclid = np.argsort(MSR + euclidean, kind="stable")
    # ... descending for sssim (larger values are ranked first).
    nth_ssim = np.argsort(ssim_matrix, kind="stable")[::-1]
    nth_setZ = np.intersect1d(nth_MSR[:k_max], nth_ssim[:k_max])

    if method == "intersect":
        neighbour_idx = nth_setZ
    elif method == "summation":
        neighbour_idx = nth_sum_MSR_euclid[:k_max]
    elif method == "msr":
        neighbour_idx = nth_MSR[:k_max]
    else:  # "euclid"
        neighbour_idx = nth_euclidean[:k_max]
    G_neigh = G_remain[neighbour_idx].copy()

    if verbose:
        print(f"G_tg\n{G_tg}")
        print(f"G_remain\n{G_remain}")
        print(f"MSR\n{MSR}")
        print(f"euclidean\n{euclidean}")
        print(f"nth_MSR\n{nth_MSR}")
        print(f"nth_euclidean\n{nth_euclidean}")
        print(f"nth_ssim\n{nth_ssim}")
        print(f"nth_setZ\n{nth_setZ}")
        print(f"G_neigh\n{G_neigh}")

    return np.concatenate((G_tg[np.newaxis, :], G_neigh))

## column_elimination

In [None]:
def column_elimination(bicluster, target_col, show_plot=False):
    """Perform column elimination in bicluster Z

    Greedy loop: in every iteration the MSR of the bicluster is computed
    with each remaining non-target column left out in turn, and the column
    whose removal gives the lowest MSR (strictly below the current MSR) is
    dropped. The loop ends when no removal lowers the MSR, or when at most
    half of the original columns remain. The target column is never a
    removal candidate.

    Parameters
    ----------
    bicluster : ndarray of shape (genes, samples)
        Bicluster to prune; its first row is drawn as the target gene in
        the plots.

    target_col : int
        Index of the column that must be kept.

    show_plot : bool, default=False
        If True, draw three panels: the candidate MSR values per iteration,
        the rows before elimination, and the rows after elimination together
        with the value returned by `compute_alpha_mis`.

    Returns
    -------
    C_bicluster : ndarray of shape ((u+1)_genes, v_samples)
        A bicluster that consists of nearest rows and columns
        to the target column and target row
    
    target_col_in_C : int
        A column index of target column relative to the C_bicluster
    """
    
    # Lower bound on the number of columns: half of the original count.
    varphi = len(bicluster[0])/2

    bicluster_shape = bicluster.shape
    # col_range: boolean mask of the columns still kept.
    col_range = np.ones(bicluster_shape[1], dtype=bool)
    # col_without_target: same mask with the target column switched off,
    # i.e. the columns that may still be removed.
    col_without_target =  col_range.copy()
    col_without_target[target_col] = False

    if show_plot:
        fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 12), dpi=100)
    
    k = 0  # number of columns removed so far
    col_range_final = col_range.copy()  # not used below
    while True:
        #print(f"col_range  {col_range}")
        # MSR of the current bicluster; max_MSR is lowered while scanning
        # candidates, temp keeps the value from before the scan.
        max_MSR = compute_MSR(bicluster[:, col_range].copy())
        temp = max_MSR

        # MSR obtained when each candidate column is left out (NaN for
        # columns that are not candidates); delete_col = -1 means "none".
        MSR_arr = np.ones(bicluster_shape[1]) * np.nan
        delete_col = -1
        for i, p_col in enumerate(range(len(col_without_target))):
            if col_without_target[p_col]:
                col_without_p_col = col_range.copy()
                col_without_p_col[p_col] = False
                
                temp1 = compute_MSR(bicluster[:, col_without_p_col].copy())
                MSR_arr[i] = temp1
            
                # Keep the candidate with the lowest MSR seen so far.
                if temp1 < max_MSR:
                    max_MSR = temp1
                    delete_col = p_col
        
        if k == 0 and show_plot:
                axes[0].plot(np.arange(len(col_range)), MSR_arr, linewidth=0.5, marker='o', markersize=2, 
                        alpha=0.5, label="k = {:d}".format(k))
                    
        if delete_col != -1:
            # Drop the selected column and refresh the candidate mask.
            col_range[delete_col] = False
            col_without_target = col_range.copy()
            col_without_target[target_col] = False
            
            MSR_arr[delete_col] = np.nan
            if  k != 0 and show_plot:
                axes[0].plot(np.arange(len(col_range)), MSR_arr, linewidth=0.5, marker='o', markersize=2, 
                        alpha=0.5, label="k = {:d}".format(k))
            k += 1
    
        else:
            # No removal lowers the MSR: stop.
            break
            
        new_max_MSR = compute_MSR(bicluster[:, col_range].copy())
        #print(f"max_MSR  {temp}")
        #print(f"new_max_MSR  {new_max_MSR}")
        #print(f"np.arange(len(col_range))[col_range]\n  {np.arange(len(col_range))[col_range]}")
        # Stop once at most varphi columns remain, or if the MSR after the
        # removal is not below the MSR from before it.
        if sum(col_range) <= varphi or new_max_MSR >= temp:
            break
    
  
    if show_plot:
        axes[0].grid(linewidth=0.1)
        axes[0].set_xlabel("nth sample or column")
        axes[0].set_ylabel("MSR")
        axes[0].set_xticks(np.arange(bicluster_shape[1], dtype=int))
        axes[0].axvline(x=target_col, linestyle='--', color='gray', linewidth=0.5)

        axes[0].legend(bbox_to_anchor=(1.01, 1), loc='upper left', title='iteration')
        #tick_top = ax.xaxis.set_tick_params(labeltop='on')

        # Reused below so the three panels share the same x range.
        ax1_xlim_min, ax1_xlim_max = axes[0].get_xlim()


        # plot gene expression level before column elimination
        # (first row in red, every other row in blue)
        for i, gene in enumerate(bicluster):
            if i == 0:
                axes[1].plot(gene, linewidth=1.5, marker='o', color="red", 
                        markersize=4, markerfacecolor='white', zorder=99)
            else:
                axes[1].plot(gene, linewidth=0.5, marker='o', markersize=2, 
                          alpha=0.5, color='blue')
        axes[1].set_xticks(np.arange(bicluster_shape[1], dtype=int))
        axes[1].axvline(x=target_col, linestyle='--', color='gray', linewidth=0.5)
        axes[1].set_xlim(ax1_xlim_min, ax1_xlim_max)
        axes[1].set_title("before column elimination")
        
        ax2_ylim_min, ax2_ylim_max = axes[1].get_ylim()


        # plot gene expression level after column elimination
        # (x positions are the original indices of the kept columns)
        new_col_range = np.arange(bicluster_shape[1], dtype=int)
        new_col_range = new_col_range[col_range]
        for i, gene in enumerate(bicluster[:, col_range]):
            if i == 0:
                axes[2].plot(new_col_range, gene, linewidth=1.5, marker='o', color="red", 
                        markersize=4, markerfacecolor='white', zorder=99)
            else:
                axes[2].plot(new_col_range, gene, linewidth=0.5, marker='o', markersize=2, 
                          alpha=0.5, color='blue')

        # compute alpha_mis
        # (only for display: drawn as a green marker at the target column)
        new_col_range = np.arange(bicluster_shape[1], dtype=int)
        new_col_range = new_col_range[col_range]
        target_col_in_C = np.argwhere(new_col_range == target_col).item()
        alpha_mis = compute_alpha_mis(bicluster[:, col_range].copy(), target_col_in_C)
        axes[2].plot(target_col, alpha_mis, marker='o', color="green", 
                     markersize=6, markerfacecolor='white', zorder=100)
        
        axes[2].set_xticks(np.arange(bicluster_shape[1], dtype=int))
        axes[2].axvline(x=target_col, linestyle='--', color='gray', linewidth=0.5)
        axes[2].set_xlim(ax1_xlim_min, ax1_xlim_max)
        axes[2].set_ylim(ax2_ylim_min, ax2_ylim_max)
        axes[2].set_title("after column elimination")
            
        plt.tight_layout()
    
    
    # Position of the target column among the kept columns.
    new_col_range = np.arange(bicluster_shape[1], dtype=int)
    new_col_range = new_col_range[col_range]
    target_col_in_C = np.argwhere(new_col_range == target_col).item()
    
    return bicluster[:, col_range], target_col_in_C

## compute_alpha_mis

In [None]:
def _inverse_distance_weights(distances):
    """Turn an array of distances into weights favouring small distances.

    If any distance is (numerically) zero, those entries share the whole
    weight equally and every other entry gets 0. Otherwise the weights are
    the inverse distances normalised to sum to one; if that sum is not
    above 1e-14 (e.g. for an empty array) all weights are 0.
    """
    is_zero = distances < 1e-14
    if np.any(is_zero):
        weights = np.zeros_like(distances)
        weights[is_zero] = 1/np.count_nonzero(is_zero)
        return weights

    inverse = 1./distances  # further entries should have small contribution
    total = np.sum(inverse)
    return inverse/total if total > 1e-14 else np.zeros_like(inverse)


def compute_alpha_mis(C_bicluster, target_col, verbose=False):
    """Estimate the entry at (row 0, ``target_col``) of a bicluster.

    Two weighted averages are combined:

    * ``R_avg``: the other entries of the first row, each weighted by the
      inverse of the length-normalised Euclidean distance between its
      column and the target column;
    * ``C_avg``: the target-column entries of the other rows, each weighted
      by the inverse of the length-normalised Euclidean distance between
      its row and the first row.

    Parameters
    ----------
    C_bicluster : ndarray of shape (genes, samples)
        Bicluster whose first row is the target gene.
    target_col : int
        Column index of the entry to estimate.
    verbose : bool, default=False
        If True, print the intermediate arrays.

    Returns
    -------
    float
        ``(R_avg + C_avg)/2`` when both averages exist. If the bicluster has
        no other row, ``R_avg`` alone; if it has no other column, ``C_avg``
        alone (a missing average used to be counted as 0, which halved the
        estimate). If neither exists, the current entry is returned
        unchanged.
    """
    # compute W_jl
    all_cols = np.arange(C_bicluster.shape[1], dtype=int)
    col_without_target_col = all_cols[all_cols != target_col]
    C_target_col = C_bicluster[:, target_col].reshape(-1, 1)
    C_remain = C_bicluster[:, col_without_target_col]

    euclidean_arr_1 = np.sqrt(np.sum((C_remain - C_target_col)**2, axis=0))/len(C_target_col)
    W_jl = _inverse_distance_weights(euclidean_arr_1)

    if verbose:
        print(f"C_bicluster\n{C_bicluster}")
        print(f"col_without_target_col\n{col_without_target_col}")
        print(f"C_target_col\n{C_target_col}")
        print(f"C_remain\n{C_remain}")
        print(f"euclidean_arr_1\n{euclidean_arr_1}")
        print(f"W_jl\n{W_jl}")

    # compute W_i1
    C_target = C_bicluster[0]
    C_remain = C_bicluster[1:]
    euclidean_arr_2 = np.sqrt(np.sum((C_remain - C_target)**2, axis=1))/len(C_target)
    W_i1 = _inverse_distance_weights(euclidean_arr_2)

    # compute alpha
    # Dutta 2018
    R_avg = np.sum(C_target[col_without_target_col] * W_jl)
    C_avg = np.sum(C_target_col[1:].flatten() * W_i1)

    if verbose:
        print(f"C_target\n{C_target}")
        print(f"C_remain\n{C_remain}")
        print(f"euclidean_arr_2\n{euclidean_arr_2}")
        print(f"W_i1\n{W_i1}")

        print(f"C_target[col_without_target_col]\n{C_target[col_without_target_col]}")
        print(f"C_target_col[1:].flatten()\n{C_target_col[1:].flatten()}")
        print(f"R_avg  {R_avg}")
        print(f"C_avg  {C_avg}")

    # An average over nothing is 0 by construction; do not let it drag the
    # estimate towards 0.
    has_other_cols = len(col_without_target_col) > 0
    has_other_rows = len(C_remain) > 0
    if has_other_cols and has_other_rows:
        return (R_avg + C_avg)/2.
    if has_other_cols:
        return R_avg
    if has_other_rows:
        return C_avg
    # NOTE(review): a 1x1 bicluster offers no information; keep the current entry.
    return C_bicluster[0, target_col]

## SBi_MSREimpute

In [None]:
def SBi_MSRE(y_cluster_val, y_cluster_val_mis, k_max=4, verbose=False):
    """Impute missing entries by sequential biclustering with mean squared residue.

    Implementation of the SBi-MSREimpute procedure (Dutta, et al., 2019).

    Parameters
    ----------
    y_cluster_val : ndarray, 2-D
        Complete (reference) matrix. It is only reordered with the same row
        permutation as the estimate and returned; it is not used to compute
        the imputed values.
    y_cluster_val_mis : ndarray, 2-D
        Matrix whose missing entries are encoded as NaN. It is not modified.
    k_max : int, default=4
        Forwarded to `get_setZ`, which builds the initial bicluster of the
        target gene and its nearest neighbour genes.
    verbose : bool, default=False
        If True, print intermediate values; otherwise write a percentage
        progress indicator to stdout.

    Returns
    -------
    y_cluster_val_sorted : ndarray
        Rows of `y_cluster_val` ordered by ascending number of missing
        entries (stable sort).
    A : ndarray
        Imputed matrix, in the same row order.
    F_mat_sorted : ndarray of int
        Indicator matrix in the same row order, taken before imputation:
        0 marks an entry that was missing in the input, 1 an observed entry.

    Algorithm
    ---------
    (1) Replace all the missing values by row averages.

    (2) Sort all m genes in ascending order of their missing rate
        (presumably so that rows with few missing values are completed first).

    (3) For each missing position t in every target gene x_s do

        (a) consider the target gene x_s as the initial bicluster B

        (b) add set Z (formed according to Eq. (7)) containing u nearest
            neighbour genes of x_s to the bicluster B as shown in Eq. (8).

        (c) repeat steps (i) to (iv) until the number of columns in the bicluster
            B <= varphi or no column deletion decreases the MSR score of B
              (i) Calculate MSR score H(B) of the bicluster B and store it in `Temp`.
             (ii) Max := Temp
            (iii) For every column (p) in the bicluster B except the target column do:
                  (A) Calculate MSR score H(B) of B ignoring column p and store it in `Temp1`
                  (B) If (`Temp1` < Max) then
                          Max := Temp1
                          Pos := p
             (iv) Delete column Pos from the bicluster B and reduce its column count by 1

        (d) rename the modified bicluster B as C

        (e) replace the missing value alpha by (R_avg + C_avg)/2 where R_avg is the
            weighted average of the target gene (here the first row) as shown in
            Eq. (10) and C_avg is the weighted average of the target column
            as shown in Eq. (11)

        (f) place the calculated alpha at position x_{st} of the input gene expression
            matrix D and set the corresponding position of the indicator matrix F to 1.

    In this implementation steps (a)-(b) are delegated to `get_setZ`,
    steps (c)-(d) to `column_elimination` and step (e) to `compute_alpha_mis`.
    """
    missing_idx = np.argwhere(np.isnan(y_cluster_val_mis))

    # Indicator matrix F: 1 = observed, 0 = missing.
    F_mat = np.ones_like(y_cluster_val_mis, dtype=int)
    F_mat_shape = F_mat.shape
    F_mat[missing_idx[:, 0], missing_idx[:, 1]] = 0

    A = y_cluster_val_mis.copy()

    # Step (1): initial guess from the row.  The sum of the observed entries is
    # divided by the full row length (not by the number of observed entries);
    # this is kept on purpose, as in the original implementation.
    for p, q in missing_idx:
        idx_cols_without_nan = np.flatnonzero(~np.isnan(y_cluster_val_mis[p]))
        A[p, q] = np.sum(y_cluster_val_mis[p, idx_cols_without_nan])/len(y_cluster_val_mis[p])

        if verbose:
            print(f"  p, q = {p}, {q}")
            print(f"  A[p, :]\n{A[p, :]}")

    # Step (2): stable sort of the rows by their number of missing entries.
    row_mis_rate = (F_mat == 0).sum(axis=1)
    idx_sorted_row_mis_rate = np.argsort(row_mis_rate, kind="stable")
    sorted_row_mis_rate = row_mis_rate[idx_sorted_row_mis_rate]

    if verbose:
        print(f"missing_idx\n{missing_idx}")

        print(f"A[missing_idx[:, 0], missing_idx[:, 1]]\n"
              +f"{A[missing_idx[:, 0], missing_idx[:, 1]]}")
        print(f"row_mis_rate\n{row_mis_rate}")
        print(f"F_mat\n{F_mat}")
        print(f"F_mat_shape\n{F_mat_shape}")
        print(f"row_mis_rate[idx_sorted_row_mis_rate]\n{sorted_row_mis_rate}")

    F_mat = F_mat[idx_sorted_row_mis_rate, :]
    A = A[idx_sorted_row_mis_rate, :]

    y_cluster_val_sorted = y_cluster_val[idx_sorted_row_mis_rate, :].copy()
    # Snapshot taken before imputation so callers can still locate the
    # originally missing positions (F_mat itself is set to 1 as we go).
    F_mat_sorted = F_mat.copy()

    if verbose:
        print(f"F_mat (after sorted)\n{F_mat}")

    # Rows without missing entries come first after sorting; imputation starts
    # at the first row that has at least one missing entry.
    rows_with_missing = np.flatnonzero(sorted_row_mis_rate > 0)
    if rows_with_missing.size == 0:
        # Nothing to impute: return the (unchanged) matrices instead of
        # failing on an empty index lookup.
        return y_cluster_val_sorted, A, F_mat_sorted

    nth_start_imputation_init = rows_with_missing[0]
    print(f"  nth_start_imputation_init  {nth_start_imputation_init}")

    n_target_rows = F_mat_shape[0] - nth_start_imputation_init
    for i, nth_start_imputation in enumerate(range(nth_start_imputation_init, F_mat_shape[0])):

        # process indicator
        if not verbose:
            # With a single target row the usual denominator would be zero.
            progress = i*100/(n_target_rows - 1) if n_target_rows > 1 else 100.0
            sys.stdout.write("\r  {:3.0f}%".format(progress))

        # Step (3a)-(3b): move the target gene x_s to the first row and build
        # its bicluster from the current state of A.
        dummy_index = np.arange(F_mat_shape[0], dtype=int)
        dummy_index[0] = nth_start_imputation
        dummy_index[nth_start_imputation] = 0
        F_mat_target_gene = F_mat[nth_start_imputation].copy()
        bicluster = get_setZ(A[dummy_index], k_max=k_max)

        target_cols = np.argwhere(F_mat_target_gene == 0).flatten()

        if verbose:
            print(f"  nth_start_imputation  {nth_start_imputation}")
            print(f"  dummy_index\n{dummy_index}")
            print(f"  F_mat_target_gene\n{F_mat_target_gene}")
            print(f"  bicluster\n{bicluster}")
            print(f"  target_cols\n{target_cols}")
            print("  --- before imputation")
            print(f"  A[nth_start_imputation]\n{A[nth_start_imputation]}")
            print(f"  F_mat[nth_start_imputation]\n{F_mat[nth_start_imputation]}")

        for target_col in target_cols:
            # Step (3c)-(3e): every missing column of the row starts from a
            # fresh copy of the same bicluster.
            C_bicluster, target_col_in_C = column_elimination(bicluster.copy(), target_col)

            alpha_mis = compute_alpha_mis(C_bicluster, target_col_in_C)

            if verbose:
                print(f"    target_col  {target_col}")
                print(f"    C_bicluster\n{C_bicluster}")
                print(f"    C_bicluster.shape\n{C_bicluster.shape}")
                print(f"    target_col_in_C {target_col_in_C}")
                print(f"    alpha_mis {alpha_mis}")

            # Step (3f): store the estimate and mark the entry as filled.
            A[nth_start_imputation, target_col] = alpha_mis
            F_mat[nth_start_imputation, target_col] = 1

        if verbose:
            print("  -- after imputation")
            print(f"  A[nth_start_imputation]\n{A[nth_start_imputation]}")
            print(f"  F_mat[nth_start_imputation]\n{F_mat[nth_start_imputation]}")

        if not verbose:
            sys.stdout.flush()

    return y_cluster_val_sorted, A, F_mat_sorted

In [None]:
# SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)

# Performance Analysis

## compute_NRMSE

In [None]:
def compute_NRMSE(D_gene_actual_sorted, D_gene_estimate, F_mat_sorted, verbose=False):
    """Return the normalised RMSE over the entries flagged as missing.

    Parameters
    ----------
    D_gene_actual_sorted : ndarray, 2-D
        Reference values.
    D_gene_estimate : ndarray, 2-D
        Estimated values, indexed like `D_gene_actual_sorted`.
    F_mat_sorted : ndarray, 2-D
        Indicator matrix; entries equal to 0 select the positions to score.
    verbose : bool, default=False
        Print the selected positions and values.

    Returns
    -------
    float
        RMSE between reference and estimate at the selected positions,
        divided by `np.std` of the reference values at those positions.
    """
    idx_missing = np.argwhere(F_mat_sorted == 0)
    rows, cols = idx_missing[:, 0], idx_missing[:, 1]

    N = idx_missing.shape[0]
    y_true = D_gene_actual_sorted[rows, cols]
    y_estimate = D_gene_estimate[rows, cols]

    if verbose:
        print(f"F_mat_sorted\n{F_mat_sorted}")
        print(f"idx_missing\n{idx_missing}")
        print(f"N {N}")
        print(f"y_estimate\n{y_estimate}")
        print(f"y_true\n{y_true}")

    residual = y_true - y_estimate
    rmse = np.sqrt(np.sum(residual**2)/N)
    scale = 1./np.std(y_true)
    return scale * rmse

## compute_pearsoncorr

In [None]:
def compute_pearsoncorr(D_gene_actual_sorted, D_gene_estimate, F_mat_sorted, verbose=False):
    """Return the Pearson correlation over the entries flagged as missing.

    Parameters
    ----------
    D_gene_actual_sorted : ndarray, 2-D
        Reference values.
    D_gene_estimate : ndarray, 2-D
        Estimated values, indexed like `D_gene_actual_sorted`.
    F_mat_sorted : ndarray, 2-D
        Indicator matrix; entries equal to 0 select the positions to score.
    verbose : bool, default=False
        Print the selected positions and values.

    Returns
    -------
    float
        Sum of products of the mean-centred reference and estimate values,
        divided by the product of their root sums of squares.
    """
    idx_missing = np.argwhere(F_mat_sorted == 0)
    rows, cols = idx_missing[:, 0], idx_missing[:, 1]

    N = idx_missing.shape[0]
    y_true = D_gene_actual_sorted[rows, cols]
    y_estimate = D_gene_estimate[rows, cols]

    if verbose:
        print(f"F_mat_sorted\n{F_mat_sorted}")
        print(f"idx_missing\n{idx_missing}")
        print(f"N {N}")
        print(f"y_estimate\n{y_estimate}")
        print(f"y_true\n{y_true}")

    true_centered = y_true - y_true.mean()
    estimate_centered = y_estimate - y_estimate.mean()

    cross_sum = np.sum(true_centered * estimate_centered)
    true_norm = np.sqrt(np.sum(true_centered**2))
    estimate_norm = np.sqrt(np.sum(estimate_centered**2))
    return cross_sum / (true_norm * estimate_norm)

## Hasil

### Untuk k = 5%

#### k = 5% (Percobaan 1)

In [None]:
# Experiment: k = 5%, run 1.
# k_max passed to SBi_MSRE is 5% of the number of rows of the complete dataset.
k_max = round(0.05*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.5795426829115352
  comp.time   52.55 s
  Pearson_arr   0.8414752213897891
  comp.time   52.55 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.6047337439684181
  comp.time   90.53 s
  Pearson_arr   0.9583512024444023
  comp.time   90.54 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5198631871916037
  comp.time   121.06 s
  Pearson_arr   0.9209295247537066
  comp.time   121.06 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.637086244372689
  comp.time   139.86 s
  Pearson_arr   0.9610621782444323
  comp.time   139.86 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.728571656902378
  comp.time   159.75 s
  Pearson_arr   0.843256051000744
  comp.time   159.76 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.6991454793476615
  comp.time   170.01 s
  Pears

#### k = 5% (Percobaan 2)

In [None]:
# Experiment: k = 5%, run 2.
# k_max passed to SBi_MSRE is 5% of the number of rows of the complete dataset.
k_max = round(0.05*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.79%
  nth_start_imputation_init  594
  100%
  NRMSE_arr   0.6238185660265827
  comp.time   50.31 s
  Pearson_arr   0.9145909501339694
  comp.time   50.31 s
p missing rate: 6.59%
  nth_start_imputation_init  392
  100%
  NRMSE_arr   0.5595051952693265
  comp.time   94.43 s
  Pearson_arr   0.9775942671505145
  comp.time   94.43 s
p missing rate: 8.66%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.5538670600202991
  comp.time   116.51 s
  Pearson_arr   0.962991414608002
  comp.time   116.51 s
p missing rate: 12.37%
  nth_start_imputation_init  174
  100%
  NRMSE_arr   0.749166133861194
  comp.time   150.37 s
  Pearson_arr   0.8595548421642929
  comp.time   150.38 s
p missing rate: 15.04%
  nth_start_imputation_init  129
  100%
  NRMSE_arr   0.6649888174106534
  comp.time   162.58 s
  Pearson_arr   0.9145077805970294
  comp.time   162.58 s
p missing rate: 18.57%
  nth_start_imputation_init  88
  100%
  NRMSE_arr   0.7683909078613397
  comp.time   175.29 s
  Pear

#### k = 5% (Percobaan 3)

In [None]:
# Experiment: k = 5%, run 3.
# k_max passed to SBi_MSRE is 5% of the number of rows of the complete dataset.
k_max = round(0.05*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.06%
  nth_start_imputation_init  567
  100%
  NRMSE_arr   0.567941781229729
  comp.time   50.68 s
  Pearson_arr   0.8972813913155095
  comp.time   50.68 s
p missing rate: 5.96%
  nth_start_imputation_init  413
  100%
  NRMSE_arr   0.4856139751419793
  comp.time   86.98 s
  Pearson_arr   0.9719577182966739
  comp.time   86.98 s
p missing rate: 8.55%
  nth_start_imputation_init  311
  100%
  NRMSE_arr   0.6618286512214887
  comp.time   112.83 s
  Pearson_arr   0.8472688367486501
  comp.time   112.83 s
p missing rate: 11.82%
  nth_start_imputation_init  216
  100%
  NRMSE_arr   0.6849166809203848
  comp.time   135.35 s
  Pearson_arr   0.9287416516106798
  comp.time   135.35 s
p missing rate: 15.01%
  nth_start_imputation_init  143
  100%
  NRMSE_arr   0.6185793702853665
  comp.time   156.56 s
  Pearson_arr   0.918188740869031
  comp.time   156.56 s
p missing rate: 18.04%
  nth_start_imputation_init  102
  100%
  NRMSE_arr   0.7226069962401135
  comp.time   168.78 s
  Pea

#### k = 5% (Percobaan 4)

In [None]:
# Experiment: k = 5%, run 4.
# k_max passed to SBi_MSRE is 5% of the number of rows of the complete dataset.
k_max = round(0.05*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.78%
  nth_start_imputation_init  589
  100%
  NRMSE_arr   0.5495718047712128
  comp.time   56.63 s
  Pearson_arr   0.9538245841382628
  comp.time   56.63 s
p missing rate: 6.30%
  nth_start_imputation_init  407
  100%
  NRMSE_arr   0.6080157985986963
  comp.time   95.09 s
  Pearson_arr   0.8788832072647516
  comp.time   95.10 s
p missing rate: 8.76%
  nth_start_imputation_init  313
  100%
  NRMSE_arr   0.6648589324374816
  comp.time   116.31 s
  Pearson_arr   0.8757093013477729
  comp.time   116.31 s
p missing rate: 11.91%
  nth_start_imputation_init  208
  100%
  NRMSE_arr   0.5929629533508265
  comp.time   144.83 s
  Pearson_arr   0.9569660762879395
  comp.time   144.83 s
p missing rate: 14.50%
  nth_start_imputation_init  157
  100%
  NRMSE_arr   0.6923759457020936
  comp.time   158.91 s
  Pearson_arr   0.9058196296547022
  comp.time   158.91 s
p missing rate: 18.11%
  nth_start_imputation_init  80
  100%
  NRMSE_arr   0.7900916778882509
  comp.time   181.15 s
  Pe

#### k = 5% (Percobaan 5)

In [None]:
# Experiment: k = 5%, run 5.
# k_max passed to SBi_MSRE is 5% of the number of rows of the complete dataset.
k_max = round(0.05*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.02%
  nth_start_imputation_init  574
  100%
  NRMSE_arr   0.5431725064649032
  comp.time   59.17 s
  Pearson_arr   0.9760266246013843
  comp.time   59.17 s
p missing rate: 5.88%
  nth_start_imputation_init  416
  100%
  NRMSE_arr   0.584425115722519
  comp.time   104.82 s
  Pearson_arr   0.9428283079680642
  comp.time   104.82 s
p missing rate: 8.58%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6145807944770794
  comp.time   123.88 s
  Pearson_arr   0.9697120260724434
  comp.time   123.88 s
p missing rate: 11.52%
  nth_start_imputation_init  212
  100%
  NRMSE_arr   0.6596870131174765
  comp.time   150.37 s
  Pearson_arr   0.9228882885703313
  comp.time   150.37 s
p missing rate: 15.26%
  nth_start_imputation_init  135
  100%
  NRMSE_arr   0.6642483511327898
  comp.time   171.31 s
  Pearson_arr   0.8995296469681665
  comp.time   171.31 s
p missing rate: 18.16%
  nth_start_imputation_init  94
  100%
  NRMSE_arr   0.7414122860779137
  comp.time   181.87 s
  P

### Untuk k = 10%

#### k = 10% (Percobaan 1)

In [None]:
# Experiment: k = 10%, run 1.
# k_max passed to SBi_MSRE is 10% of the number of rows of the complete dataset.
k_max = round(0.1*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.5953215560767893
  comp.time   47.85 s
  Pearson_arr   0.8238273303926589
  comp.time   47.86 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.6082569978883788
  comp.time   89.28 s
  Pearson_arr   0.9584175424143094
  comp.time   89.28 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5253102982056784
  comp.time   118.39 s
  Pearson_arr   0.9222717666645684
  comp.time   118.39 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.6160399827715004
  comp.time   140.07 s
  Pearson_arr   0.9471170333619524
  comp.time   140.07 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.6975282954796668
  comp.time   157.92 s
  Pearson_arr   0.8573192960883165
  comp.time   157.93 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.6798136760605329
  comp.time   169.12 s
  Pe

#### k = 10% (Percobaan 2)

In [None]:
# Experiment: k = 10%, run 2.
# k_max passed to SBi_MSRE is 10% of the number of rows of the complete dataset.
k_max = round(0.1*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.79%
  nth_start_imputation_init  594
  100%
  NRMSE_arr   0.6233539157609664
  comp.time   46.74 s
  Pearson_arr   0.9204869746377531
  comp.time   46.74 s
p missing rate: 6.59%
  nth_start_imputation_init  392
  100%
  NRMSE_arr   0.5489162406652143
  comp.time   94.79 s
  Pearson_arr   0.9803058400941356
  comp.time   94.79 s
p missing rate: 8.66%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.5542133480572996
  comp.time   116.25 s
  Pearson_arr   0.964919082933679
  comp.time   116.25 s
p missing rate: 12.37%
  nth_start_imputation_init  174
  100%
  NRMSE_arr   0.7516627456749663
  comp.time   150.27 s
  Pearson_arr   0.8757043168139057
  comp.time   150.28 s
p missing rate: 15.04%
  nth_start_imputation_init  129
  100%
  NRMSE_arr   0.6373002497923715
  comp.time   162.34 s
  Pearson_arr   0.9104383486848007
  comp.time   162.34 s
p missing rate: 18.57%
  nth_start_imputation_init  88
  100%
  NRMSE_arr   0.7619475655842495
  comp.time   173.90 s
  Pea

#### k = 10% (Percobaan 3)

In [None]:
# Experiment: k = 10%, run 3.
# k_max passed to SBi_MSRE is 10% of the number of rows of the complete dataset.
k_max = round(0.1*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.06%
  nth_start_imputation_init  567
  100%
  NRMSE_arr   0.5640043359956097
  comp.time   50.77 s
  Pearson_arr   0.9253179236125499
  comp.time   50.78 s
p missing rate: 5.96%
  nth_start_imputation_init  413
  100%
  NRMSE_arr   0.48111133630505915
  comp.time   86.74 s
  Pearson_arr   0.975687397358122
  comp.time   86.74 s
p missing rate: 8.55%
  nth_start_imputation_init  311
  100%
  NRMSE_arr   0.638516754668874
  comp.time   113.35 s
  Pearson_arr   0.8980355121956363
  comp.time   113.35 s
p missing rate: 11.82%
  nth_start_imputation_init  216
  100%
  NRMSE_arr   0.6806969029711741
  comp.time   135.25 s
  Pearson_arr   0.922376114083542
  comp.time   135.25 s
p missing rate: 15.01%
  nth_start_imputation_init  143
  100%
  NRMSE_arr   0.5792864545038877
  comp.time   156.91 s
  Pearson_arr   0.9347654538669695
  comp.time   156.91 s
p missing rate: 18.04%
  nth_start_imputation_init  102
  100%
  NRMSE_arr   0.7189387997138631
  comp.time   168.65 s
  Pea

#### k = 10% (Percobaan 4)

In [None]:
# Experiment: k = 10%, run 4.
# k_max passed to SBi_MSRE is 10% of the number of rows of the complete dataset.
k_max = round(0.1*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.78%
  nth_start_imputation_init  589
  100%
  NRMSE_arr   0.6101100696332625
  comp.time   47.08 s
  Pearson_arr   0.9424077434213756
  comp.time   47.08 s
p missing rate: 6.30%
  nth_start_imputation_init  407
  100%
  NRMSE_arr   0.6413802343555889
  comp.time   91.78 s
  Pearson_arr   0.8722410423225108
  comp.time   91.78 s
p missing rate: 8.76%
  nth_start_imputation_init  313
  100%
  NRMSE_arr   0.6660725073541016
  comp.time   115.40 s
  Pearson_arr   0.865653397949777
  comp.time   115.40 s
p missing rate: 11.91%
  nth_start_imputation_init  208
  100%
  NRMSE_arr   0.5824356646931664
  comp.time   143.82 s
  Pearson_arr   0.9489495547438185
  comp.time   143.82 s
p missing rate: 14.50%
  nth_start_imputation_init  157
  100%
  NRMSE_arr   0.6674180939364419
  comp.time   158.59 s
  Pearson_arr   0.9176221786283476
  comp.time   158.60 s
p missing rate: 18.11%
  nth_start_imputation_init  80
  100%
  NRMSE_arr   0.7933872389315134
  comp.time   181.20 s
  Pea

#### k = 10% (Percobaan 5)

In [None]:
# Experiment: k = 10%, run 5.
# k_max passed to SBi_MSRE is 10% of the number of rows of the complete dataset.
k_max = round(0.1*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.02%
  nth_start_imputation_init  574
  100%
  NRMSE_arr   0.5686027961294619
  comp.time   52.56 s
  Pearson_arr   0.9724393179036636
  comp.time   52.57 s
p missing rate: 5.88%
  nth_start_imputation_init  416
  100%
  NRMSE_arr   0.6012467589218174
  comp.time   93.93 s
  Pearson_arr   0.9463020851678952
  comp.time   93.94 s
p missing rate: 8.58%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6253245114397733
  comp.time   121.62 s
  Pearson_arr   0.9676946254378619
  comp.time   121.63 s
p missing rate: 11.52%
  nth_start_imputation_init  212
  100%
  NRMSE_arr   0.6270650209700747
  comp.time   147.42 s
  Pearson_arr   0.9324130300925574
  comp.time   147.43 s
p missing rate: 15.26%
  nth_start_imputation_init  135
  100%
  NRMSE_arr   0.6528875726604285
  comp.time   169.61 s
  Pearson_arr   0.8969729363542249
  comp.time   169.61 s
p missing rate: 18.16%
  nth_start_imputation_init  94
  100%
  NRMSE_arr   0.7191360333869964
  comp.time   182.54 s
  Pe

### Untuk k = 15%

#### k = 15% (Percobaan 1)

In [None]:
# Experiment: k = 15%, run 1.
# k_max passed to SBi_MSRE is 15% of the number of rows of the complete dataset.
k_max = round(0.15*D_gene_complete.raw_df.shape[0])

# Reference (complete) matrix used to score the imputed values.
y_cluster_val = D_gene_complete.value.copy()

# Score buffers, one slot per entry of list_missing_files
# (presumably D_gene_mis_arr has the same length -- verify).
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros(len(list_missing_files))

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # start the timer for this dataset
    start_time = time.perf_counter()
    
    # NOTE(review): F_mat_orig is only referenced by the commented-out debug print below.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    
    # compute y_estimate
    # NOTE(review): the value is scaled by 10 but labelled "%"; confirm the unit
    # returned by get_p_mis_rate.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted \
        = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    # end the "\r" progress line written by SBi_MSRE
    print("")
    #print(f"F_mat_mis\n{F_mat_mis}")
    #print(f"F_mat_orig\n{np.argwhere(F_mat_orig == 0)}")
    
    # compute NRMSE
    NRMSE_arr[i] = compute_NRMSE(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # compute Pearson Correlation
    Pearson_arr[i] = compute_pearsoncorr(y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted)
    print("  Pearson_arr  ", Pearson_arr[i])
    # elapsed time is cumulative from start_time (imputation + both metrics)
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.5944623808950944
  comp.time   47.36 s
  Pearson_arr   0.8506385123175971
  comp.time   47.36 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.6152133139345644
  comp.time   91.21 s
  Pearson_arr   0.9580316613967682
  comp.time   91.21 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5404564462858102
  comp.time   117.35 s
  Pearson_arr   0.9237384842856473
  comp.time   117.35 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.627874696493196
  comp.time   140.00 s
  Pearson_arr   0.9460589100436291
  comp.time   140.00 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.7007945800964964
  comp.time   159.73 s
  Pearson_arr   0.8579970198012274
  comp.time   159.73 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.6815663195687923
  comp.time   172.05 s
  Pea

In [None]:
# Display the current y_cluster_val_estimate as the cell result.
# NOTE(review): presumably this is the value left by the last iteration of the loop cell above — confirm.
y_cluster_val_estimate

array([[2611.340, 2519.650, 2841.680, 2850.220, 2758.710, 2589.800],
       [64.790, 74.140, 73.670, 98.170, 120.140, 122.990],
       [29.810, 34.940, 33.780, 38.840, 29.130, 31.410],
       ...,
       [31.018, 34.940, 29.123, 33.840, 25.660, 31.410],
       [8364.420, 7096.130, 4369.719, 8141.800, 7947.560, 4724.867],
       [110.976, 106.586, 349.650, 148.880, 132.880, 110.633]])

In [None]:
# Wrap the current y_cluster_val_estimate in a DataFrame for tabular inspection.
df = pd.DataFrame(y_cluster_val_estimate)
# Cell result: at most the first 50 rows of df.
df.head(50)

Unnamed: 0,0,1,2,3,4,5
0,2611.340088,2519.649902,2841.679932,2850.219971,2758.709961,2589.800049
1,64.790001,74.139999,73.669998,98.169998,120.139999,122.989998
2,29.809999,34.939999,33.779999,38.84,29.129999,31.41
3,970.400024,1263.76001,1134.329956,1165.119995,1207.790039,1172.349976
4,63.290001,52.799999,63.150002,113.279999,113.769997,92.900002
5,69.309998,53.919998,113.43,33.84,30.040001,31.41
6,29.809999,34.939999,33.779999,33.84,25.66,31.41
7,4687.75,5106.700195,5429.589844,3573.030029,3692.540039,3558.649902
8,29.809999,34.939999,33.779999,33.84,25.66,31.41
9,29.809999,34.939999,33.779999,33.84,25.66,31.41


In [None]:
# Write df to CSV without the index column (path is presumably a mounted Google Drive folder).
# NOTE(review): the filename says "k=15% dan MV 5%", but df is built from whatever
# y_cluster_val_estimate currently holds — presumably the loop's last iteration; confirm it is the 5% run.
df.to_csv('/content/drive/MyDrive/SKRIPSI/hasil imputasi k=15% dan MV 5%.csv', index = False)

#### k = 15% (Percobaan 2)

In [None]:
# Upper bound on k for this run: 15% of the row count of the complete dataset.
k_max = round(0.15*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.79%
  nth_start_imputation_init  594
  100%
  NRMSE_arr   0.6530786563879692
  comp.time   44.65 s
  Pearson_arr   0.9138802114081934
  comp.time   44.65 s
p missing rate: 6.59%
  nth_start_imputation_init  392
  100%
  NRMSE_arr   0.5586059064499893
  comp.time   95.08 s
  Pearson_arr   0.9766768113055477
  comp.time   95.08 s
p missing rate: 8.66%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.5522526691370353
  comp.time   116.24 s
  Pearson_arr   0.9643427116370118
  comp.time   116.24 s
p missing rate: 12.37%
  nth_start_imputation_init  174
  100%
  NRMSE_arr   0.7553585679639709
  comp.time   149.86 s
  Pearson_arr   0.8886350467572691
  comp.time   149.86 s
p missing rate: 15.04%
  nth_start_imputation_init  129
  100%
  NRMSE_arr   0.6285237752821569
  comp.time   160.78 s
  Pearson_arr   0.9237406824285922
  comp.time   160.78 s
p missing rate: 18.57%
  nth_start_imputation_init  88
  100%
  NRMSE_arr   0.7612868122094645
  comp.time   174.26 s
  Pe

#### k = 15% (Percobaan 3)

In [None]:
# Upper bound on k for this run: 15% of the row count of the complete dataset.
k_max = round(0.15*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.06%
  nth_start_imputation_init  567
  100%
  NRMSE_arr   0.5607120852810978
  comp.time   50.44 s
  Pearson_arr   0.9313080975525646
  comp.time   50.44 s
p missing rate: 5.96%
  nth_start_imputation_init  413
  100%
  NRMSE_arr   0.48876163095568026
  comp.time   86.31 s
  Pearson_arr   0.9769394662968379
  comp.time   86.31 s
p missing rate: 8.55%
  nth_start_imputation_init  311
  100%
  NRMSE_arr   0.6452168516005762
  comp.time   111.99 s
  Pearson_arr   0.907425872894329
  comp.time   111.99 s
p missing rate: 11.82%
  nth_start_imputation_init  216
  100%
  NRMSE_arr   0.6719939294866446
  comp.time   135.16 s
  Pearson_arr   0.9328383516512924
  comp.time   135.17 s
p missing rate: 15.01%
  nth_start_imputation_init  143
  100%
  NRMSE_arr   0.6009294203342825
  comp.time   155.79 s
  Pearson_arr   0.9258471860996296
  comp.time   155.79 s
p missing rate: 18.04%
  nth_start_imputation_init  102
  100%
  NRMSE_arr   0.708238024013402
  comp.time   167.16 s
  Pe

#### k = 15% (Percobaan 4)

In [None]:
# Upper bound on k for this run: 15% of the row count of the complete dataset.
k_max = round(0.15*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.78%
  nth_start_imputation_init  589
  100%
  NRMSE_arr   0.6460690962563342
  comp.time   47.66 s
  Pearson_arr   0.934414337527327
  comp.time   47.66 s
p missing rate: 6.30%
  nth_start_imputation_init  407
  100%
  NRMSE_arr   0.6477485320408455
  comp.time   91.99 s
  Pearson_arr   0.8761367661562807
  comp.time   91.99 s
p missing rate: 8.76%
  nth_start_imputation_init  313
  100%
  NRMSE_arr   0.657220023573705
  comp.time   115.93 s
  Pearson_arr   0.8706751300105171
  comp.time   115.93 s
p missing rate: 11.91%
  nth_start_imputation_init  208
  100%
  NRMSE_arr   0.5890479448080534
  comp.time   144.44 s
  Pearson_arr   0.951638069039392
  comp.time   144.44 s
p missing rate: 14.50%
  nth_start_imputation_init  157
  100%
  NRMSE_arr   0.6719475514646234
  comp.time   159.03 s
  Pearson_arr   0.9178254295973729
  comp.time   159.03 s
p missing rate: 18.11%
  nth_start_imputation_init  80
  100%
  NRMSE_arr   0.7967427851026946
  comp.time   181.54 s
  Pears

#### k = 15% (Percobaan 5)

In [None]:
# Upper bound on k for this run: 15% of the row count of the complete dataset.
k_max = round(0.15*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.02%
  nth_start_imputation_init  574
  100%
  NRMSE_arr   0.5791234024999269
  comp.time   52.61 s
  Pearson_arr   0.969171604475447
  comp.time   52.61 s
p missing rate: 5.88%
  nth_start_imputation_init  416
  100%
  NRMSE_arr   0.598370518047057
  comp.time   92.46 s
  Pearson_arr   0.9491427764519696
  comp.time   92.46 s
p missing rate: 8.58%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6323838460026472
  comp.time   123.61 s
  Pearson_arr   0.966066892862873
  comp.time   123.62 s
p missing rate: 11.52%
  nth_start_imputation_init  212
  100%
  NRMSE_arr   0.5922939880300181
  comp.time   146.68 s
  Pearson_arr   0.9538046086366403
  comp.time   146.68 s
p missing rate: 15.26%
  nth_start_imputation_init  135
  100%
  NRMSE_arr   0.6416865542096886
  comp.time   170.67 s
  Pearson_arr   0.9035265464908889
  comp.time   170.67 s
p missing rate: 18.16%
  nth_start_imputation_init  94
  100%
  NRMSE_arr   0.7163936611100283
  comp.time   182.41 s
  Pears

### Untuk k = 20%

#### k = 20% (Percobaan 1)

In [None]:
# Upper bound on k for this run: 20% of the row count of the complete dataset.
k_max = round(0.2*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.6024870457971565
  comp.time   47.05 s
  Pearson_arr   0.8446066695929367
  comp.time   47.06 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.6028371603570257
  comp.time   89.69 s
  Pearson_arr   0.9508515267438411
  comp.time   89.69 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5489428537416401
  comp.time   119.13 s
  Pearson_arr   0.9206507372446048
  comp.time   119.13 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.632481630759308
  comp.time   139.90 s
  Pearson_arr   0.9462710522278109
  comp.time   139.90 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.7087111020300276
  comp.time   159.90 s
  Pearson_arr   0.8589384361739533
  comp.time   159.90 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.6867757581580795
  comp.time   169.92 s
  Pea

#### k = 20% (Percobaan 2)

In [None]:
# Upper bound on k for this run: 20% of the row count of the complete dataset.
k_max = round(0.2*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

#### k = 20% (Percobaan 3)

In [None]:
# Upper bound on k for this run: 20% of the row count of the complete dataset.
k_max = round(0.2*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.06%
  nth_start_imputation_init  567
  100%
  NRMSE_arr   0.5607705755142367
  comp.time   50.56 s
  Pearson_arr   0.9452587180914958
  comp.time   50.56 s
p missing rate: 5.96%
  nth_start_imputation_init  413
  100%
  NRMSE_arr   0.4923807998817518
  comp.time   86.91 s
  Pearson_arr   0.9775141336323092
  comp.time   86.91 s
p missing rate: 8.55%
  nth_start_imputation_init  311
  100%
  NRMSE_arr   0.6458270964731423
  comp.time   112.82 s
  Pearson_arr   0.910427529182466
  comp.time   112.83 s
p missing rate: 11.82%
  nth_start_imputation_init  216
  100%
  NRMSE_arr   0.6744131897019626
  comp.time   135.42 s
  Pearson_arr   0.933751139116309
  comp.time   135.42 s
p missing rate: 15.01%
  nth_start_imputation_init  143
  100%
  NRMSE_arr   0.600604316417947
  comp.time   156.97 s
  Pearson_arr   0.9248010680868285
  comp.time   156.97 s
p missing rate: 18.04%
  nth_start_imputation_init  102
  100%
  NRMSE_arr   0.7073713323464706
  comp.time   167.90 s
  Pear

#### k = 20% (Percobaan 4)

In [None]:
# Upper bound on k for this run: 20% of the row count of the complete dataset.
k_max = round(0.2*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.78%
  nth_start_imputation_init  589
  100%
  NRMSE_arr   0.6644636164050913
  comp.time   46.84 s
  Pearson_arr   0.9308408208718704
  comp.time   46.84 s
p missing rate: 6.30%
  nth_start_imputation_init  407
  100%
  NRMSE_arr   0.6537433472468894
  comp.time   91.98 s
  Pearson_arr   0.875340658025247
  comp.time   91.98 s
p missing rate: 8.76%
  nth_start_imputation_init  313
  100%
  NRMSE_arr   0.6488452504634419
  comp.time   116.80 s
  Pearson_arr   0.8926263683745799
  comp.time   116.80 s
p missing rate: 11.91%
  nth_start_imputation_init  208
  100%
  NRMSE_arr   0.5951134429305978
  comp.time   143.76 s
  Pearson_arr   0.9486975868187257
  comp.time   143.76 s
p missing rate: 14.50%
  nth_start_imputation_init  157
  100%
  NRMSE_arr   0.6731496670854554
  comp.time   159.30 s
  Pearson_arr   0.9178429190905554
  comp.time   159.30 s
p missing rate: 18.11%
  nth_start_imputation_init  80
  100%
  NRMSE_arr   0.7999092934114812
  comp.time   181.44 s
  Pea

#### k = 20% (Percobaan 5)

In [None]:
# Upper bound on k for this run: 20% of the row count of the complete dataset.
k_max = round(0.2*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.02%
  nth_start_imputation_init  574
  100%
  NRMSE_arr   0.5839188302781829
  comp.time   54.31 s
  Pearson_arr   0.9681396916390964
  comp.time   54.31 s
p missing rate: 5.88%
  nth_start_imputation_init  416
  100%
  NRMSE_arr   0.601694879280027
  comp.time   93.05 s
  Pearson_arr   0.95130399255419
  comp.time   93.05 s
p missing rate: 8.58%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6345107949135594
  comp.time   122.67 s
  Pearson_arr   0.9670758261872268
  comp.time   122.68 s
p missing rate: 11.52%
  nth_start_imputation_init  212
  100%
  NRMSE_arr   0.5941026949198523
  comp.time   148.97 s
  Pearson_arr   0.9555032562469454
  comp.time   148.97 s
p missing rate: 15.26%
  nth_start_imputation_init  135
  100%
  NRMSE_arr   0.6445930691980971
  comp.time   171.07 s
  Pearson_arr   0.9140442084641016
  comp.time   171.07 s
p missing rate: 18.16%
  nth_start_imputation_init  94
  100%
  NRMSE_arr   0.6964758334218146
  comp.time   181.36 s
  Pears

### Untuk k = 25%

#### k = 25% (Percobaan 1)

In [None]:
# Upper bound on k for this run: 25% of the row count of the complete dataset.
k_max = round(0.25*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.6026057610223717
  comp.time   48.71 s
  Pearson_arr   0.8487948468275867
  comp.time   48.72 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.5927991023358937
  comp.time   89.60 s
  Pearson_arr   0.9569457049841089
  comp.time   89.60 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5360201756042656
  comp.time   118.33 s
  Pearson_arr   0.9246671268122787
  comp.time   118.33 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.6377265974522178
  comp.time   141.84 s
  Pearson_arr   0.9457625411412387
  comp.time   141.84 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.7121702528164683
  comp.time   158.62 s
  Pearson_arr   0.8566809756485962
  comp.time   158.62 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.691228024649781
  comp.time   172.25 s
  Pea

#### k = 25% (Percobaan 2)

In [None]:
# Upper bound on k for this run: 25% of the row count of the complete dataset.
k_max = round(0.25*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.79%
  nth_start_imputation_init  594
  100%
  NRMSE_arr   0.6270724779083671
  comp.time   45.30 s
  Pearson_arr   0.9440558754461787
  comp.time   45.30 s
p missing rate: 6.59%
  nth_start_imputation_init  392
  100%
  NRMSE_arr   0.5661118351305235
  comp.time   94.44 s
  Pearson_arr   0.9764303647658609
  comp.time   94.44 s
p missing rate: 8.66%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6105236111098352
  comp.time   117.38 s
  Pearson_arr   0.9682299812222155
  comp.time   117.39 s
p missing rate: 12.37%
  nth_start_imputation_init  174
  100%
  NRMSE_arr   0.7586283774221242
  comp.time   148.10 s
  Pearson_arr   0.8987381753757971
  comp.time   148.10 s
p missing rate: 15.04%
  nth_start_imputation_init  129
  100%
  NRMSE_arr   0.6393782112894121
  comp.time   162.67 s
  Pearson_arr   0.9243137302775469
  comp.time   162.67 s
p missing rate: 18.57%
  nth_start_imputation_init  88
  100%
  NRMSE_arr   0.7648945948885778
  comp.time   173.33 s
  Pe

#### k = 25% (Percobaan 3)

In [None]:
# Upper bound on k for this run: 25% of the row count of the complete dataset.
k_max = round(0.25*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.06%
  nth_start_imputation_init  567
  100%
  NRMSE_arr   0.5645666623423136
  comp.time   50.69 s
  Pearson_arr   0.9455844148396247
  comp.time   50.69 s
p missing rate: 5.96%
  nth_start_imputation_init  413
  100%
  NRMSE_arr   0.49387164023231783
  comp.time   87.17 s
  Pearson_arr   0.9786262428904631
  comp.time   87.17 s
p missing rate: 8.55%
  nth_start_imputation_init  311
  100%
  NRMSE_arr   0.6492868790596691
  comp.time   111.58 s
  Pearson_arr   0.9075188146731692
  comp.time   111.58 s
p missing rate: 11.82%
  nth_start_imputation_init  216
  100%
  NRMSE_arr   0.6794885219530604
  comp.time   137.12 s
  Pearson_arr   0.9286735280592753
  comp.time   137.12 s
p missing rate: 15.01%
  nth_start_imputation_init  143
  100%
  NRMSE_arr   0.5823165041653806
  comp.time   157.11 s
  Pearson_arr   0.9306707209298691
  comp.time   157.11 s
p missing rate: 18.04%
  nth_start_imputation_init  102
  100%
  NRMSE_arr   0.7085124834096768
  comp.time   168.94 s
  

#### k = 25% (Percobaan 4)

In [None]:
# Upper bound on k for this run: 25% of the row count of the complete dataset.
k_max = round(0.25*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.78%
  nth_start_imputation_init  589
  100%
  NRMSE_arr   0.6806671565846818
  comp.time   48.07 s
  Pearson_arr   0.9246374091740553
  comp.time   48.08 s
p missing rate: 6.30%
  nth_start_imputation_init  407
  100%
  NRMSE_arr   0.6481180242902913
  comp.time   92.51 s
  Pearson_arr   0.9018741673727818
  comp.time   92.51 s
p missing rate: 8.76%
  nth_start_imputation_init  313
  100%
  NRMSE_arr   0.6535861644357827
  comp.time   117.35 s
  Pearson_arr   0.892478183922686
  comp.time   117.35 s
p missing rate: 11.91%
  nth_start_imputation_init  208
  100%
  NRMSE_arr   0.5927276458741457
  comp.time   142.93 s
  Pearson_arr   0.9513024802973021
  comp.time   142.93 s
p missing rate: 14.50%
  nth_start_imputation_init  157
  100%
  NRMSE_arr   0.6748258032212547
  comp.time   161.24 s
  Pearson_arr   0.9172029278014163
  comp.time   161.24 s
p missing rate: 18.11%
  nth_start_imputation_init  80
  100%
  NRMSE_arr   0.8016291179040997
  comp.time   180.38 s
  Pea

#### k = 25% (Percobaan 5)

In [None]:
# Upper bound on k for this run: 25% of the row count of the complete dataset.
k_max = round(0.25*D_gene_complete.raw_df.shape[0])

# Private copy of the complete data values, handed to SBi_MSRE on every iteration.
y_cluster_val = D_gene_complete.value.copy()

# Metric buffers, one slot per entry of list_missing_files; slot i is filled by iteration i.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts here and is never reset, so both "comp.time" lines are cumulative.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again in this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # Call SBi_MSRE for this entry of D_gene_mis_arr.
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # Score the estimate: NRMSE first ...
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # ... then the Pearson correlation on the same three SBi_MSRE outputs.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.02%
  nth_start_imputation_init  574
  100%
  NRMSE_arr   0.559841501805325
  comp.time   53.26 s
  Pearson_arr   0.9837985259991564
  comp.time   53.27 s
p missing rate: 5.88%
  nth_start_imputation_init  416
  100%
  NRMSE_arr   0.6061490202552389
  comp.time   93.09 s
  Pearson_arr   0.9510728171852824
  comp.time   93.09 s
p missing rate: 8.58%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6386325831414851
  comp.time   124.79 s
  Pearson_arr   0.9684692417115138
  comp.time   124.79 s
p missing rate: 11.52%
  nth_start_imputation_init  212
  100%
  NRMSE_arr   0.5967698167320916
  comp.time   147.07 s
  Pearson_arr   0.9578057354933219
  comp.time   147.07 s
p missing rate: 15.26%
  nth_start_imputation_init  135
  100%
  NRMSE_arr   0.6497455003096655
  comp.time   171.64 s
  Pearson_arr   0.913078790333673
  comp.time   171.64 s
p missing rate: 18.16%
  nth_start_imputation_init  94
  100%
  NRMSE_arr   0.6992786888718222
  comp.time   183.11 s
  Pear

### Untuk k = 30%

#### k = 30% (Percobaan 1)

In [None]:
# Value passed to SBi_MSRE as k_max: 30% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.3 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.6089481248944316
  comp.time   51.87 s
  Pearson_arr   0.847975926579995
  comp.time   51.87 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.5992439165991225
  comp.time   95.39 s
  Pearson_arr   0.9579702615627866
  comp.time   95.39 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5382066921612179
  comp.time   126.48 s
  Pearson_arr   0.9256553042180417
  comp.time   126.49 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.6433201565508812
  comp.time   146.25 s
  Pearson_arr   0.9465673433425215
  comp.time   146.26 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.6953195446608946
  comp.time   167.33 s
  Pearson_arr   0.8720020764846032
  comp.time   167.34 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.6931022612654125
  comp.time   177.75 s
  Pea

#### k = 30% (Percobaan 2)

In [None]:
# Value passed to SBi_MSRE as k_max: 30% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.3 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.79%
  nth_start_imputation_init  594
  100%
  NRMSE_arr   0.6290643175908824
  comp.time   57.12 s
  Pearson_arr   0.9449486946516401
  comp.time   57.13 s
p missing rate: 6.59%
  nth_start_imputation_init  392
  100%
  NRMSE_arr   0.5672461034869967
  comp.time   99.25 s
  Pearson_arr   0.9755826311851378
  comp.time   99.26 s
p missing rate: 8.66%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6143154202342231
  comp.time   121.01 s
  Pearson_arr   0.9679274061529034
  comp.time   121.01 s
p missing rate: 12.37%
  nth_start_imputation_init  174
  100%
  NRMSE_arr   0.7551662916237171
  comp.time   154.82 s
  Pearson_arr   0.8899830049733893
  comp.time   154.82 s
p missing rate: 15.04%
  nth_start_imputation_init  129
  100%
  NRMSE_arr   0.6540114576598929
  comp.time   169.20 s
  Pearson_arr   0.9237717028405911
  comp.time   169.21 s
p missing rate: 18.57%
  nth_start_imputation_init  88
  100%
  NRMSE_arr   0.7639216290723644
  comp.time   179.31 s
  Pe

#### k = 30% (Percobaan 3)

In [None]:
# Value passed to SBi_MSRE as k_max: 30% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.3 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.06%
  nth_start_imputation_init  567
  100%
  NRMSE_arr   0.568429220315209
  comp.time   63.39 s
  Pearson_arr   0.9651789645829626
  comp.time   63.39 s
p missing rate: 5.96%
  nth_start_imputation_init  413
  100%
  NRMSE_arr   0.49078823396705734
  comp.time   93.42 s
  Pearson_arr   0.9795873319484569
  comp.time   93.42 s
p missing rate: 8.55%
  nth_start_imputation_init  311
  100%
  NRMSE_arr   0.6441023410833542
  comp.time   118.48 s
  Pearson_arr   0.9068691509194279
  comp.time   118.48 s
p missing rate: 11.82%
  nth_start_imputation_init  216
  100%
  NRMSE_arr   0.6800323258370402
  comp.time   146.14 s
  Pearson_arr   0.9251739261570536
  comp.time   146.14 s
p missing rate: 15.01%
  nth_start_imputation_init  143
  100%
  NRMSE_arr   0.583290801736885
  comp.time   166.83 s
  Pearson_arr   0.9378878741537141
  comp.time   166.83 s
p missing rate: 18.04%
  nth_start_imputation_init  102
  100%
  NRMSE_arr   0.7087134542392645
  comp.time   178.94 s
  Pe

#### k = 30% (Percobaan 4)

In [None]:
# Value passed to SBi_MSRE as k_max: 30% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.3 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.78%
  nth_start_imputation_init  589
  100%
  NRMSE_arr   0.6898170765334871
  comp.time   60.86 s
  Pearson_arr   0.9212009842212818
  comp.time   60.86 s
p missing rate: 6.30%
  nth_start_imputation_init  407
  100%
  NRMSE_arr   0.6559004420226158
  comp.time   93.91 s
  Pearson_arr   0.8974571684857243
  comp.time   93.92 s
p missing rate: 8.76%
  nth_start_imputation_init  313
  100%
  NRMSE_arr   0.6535809552639311
  comp.time   119.57 s
  Pearson_arr   0.8974782787475658
  comp.time   119.57 s
p missing rate: 11.91%
  nth_start_imputation_init  208
  100%
  NRMSE_arr   0.6005865462092106
  comp.time   146.27 s
  Pearson_arr   0.9718277151733904
  comp.time   146.27 s
p missing rate: 14.50%
  nth_start_imputation_init  157
  100%
  NRMSE_arr   0.6805642092349036
  comp.time   162.90 s
  Pearson_arr   0.917216497415677
  comp.time   162.90 s
p missing rate: 18.11%
  nth_start_imputation_init  80
  100%
  NRMSE_arr   0.8030594796007469
  comp.time   187.67 s
  Pea

#### k = 30% (Percobaan 5)

In [None]:
# Value passed to SBi_MSRE as k_max: 30% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.3 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.02%
  nth_start_imputation_init  574
  100%
  NRMSE_arr   0.5603285306050517
  comp.time   54.92 s
  Pearson_arr   0.9836929637967696
  comp.time   54.92 s
p missing rate: 5.88%
  nth_start_imputation_init  416
  100%
  NRMSE_arr   0.6113111668377893
  comp.time   92.68 s
  Pearson_arr   0.9516825374740282
  comp.time   92.68 s
p missing rate: 8.58%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6387072427333891
  comp.time   126.31 s
  Pearson_arr   0.9627078406733559
  comp.time   126.31 s
p missing rate: 11.52%
  nth_start_imputation_init  212
  100%
  NRMSE_arr   0.5972358538009587
  comp.time   150.21 s
  Pearson_arr   0.9579827261870084
  comp.time   150.21 s
p missing rate: 15.26%
  nth_start_imputation_init  135
  100%
  NRMSE_arr   0.652708918687429
  comp.time   174.87 s
  Pearson_arr   0.9125216060649111
  comp.time   174.87 s
p missing rate: 18.16%
  nth_start_imputation_init  94
  100%
  NRMSE_arr   0.7035970120607788
  comp.time   181.78 s
  Pea

### Untuk k = 40%

#### k = 40% (Percobaan 1)

In [None]:
# Value passed to SBi_MSRE as k_max: 40% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.4 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.6175742807265314
  comp.time   49.28 s
  Pearson_arr   0.8442765116462342
  comp.time   49.28 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.6055659743212015
  comp.time   94.95 s
  Pearson_arr   0.9601037827611241
  comp.time   94.95 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5430702678881034
  comp.time   122.05 s
  Pearson_arr   0.9251506487306473
  comp.time   122.06 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.6452395539824853
  comp.time   146.23 s
  Pearson_arr   0.9456587934607881
  comp.time   146.23 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.6801390626524261
  comp.time   164.17 s
  Pearson_arr   0.8647368666298773
  comp.time   164.17 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.6763734209180272
  comp.time   183.42 s
  Pe

#### k = 40% (Percobaan 2)

In [None]:
# Value passed to SBi_MSRE as k_max: 40% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.4 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.79%
  nth_start_imputation_init  594
  100%
  NRMSE_arr   0.6321134705079324
  comp.time   46.90 s
  Pearson_arr   0.9443122828006918
  comp.time   46.90 s
p missing rate: 6.59%
  nth_start_imputation_init  392
  100%
  NRMSE_arr   0.5729904907860394
  comp.time   98.01 s
  Pearson_arr   0.9749883550574359
  comp.time   98.02 s
p missing rate: 8.66%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6139750716990807
  comp.time   122.04 s
  Pearson_arr   0.9649966988557407
  comp.time   122.04 s
p missing rate: 12.37%
  nth_start_imputation_init  174
  100%
  NRMSE_arr   0.7566095863565234
  comp.time   156.90 s
  Pearson_arr   0.8912167744970002
  comp.time   156.90 s
p missing rate: 15.04%
  nth_start_imputation_init  129
  100%
  NRMSE_arr   0.6568055139909643
  comp.time   167.78 s
  Pearson_arr   0.9250585942927373
  comp.time   167.78 s
p missing rate: 18.57%
  nth_start_imputation_init  88
  100%
  NRMSE_arr   0.7643297835587458
  comp.time   180.81 s
  Pe

#### k = 40% (Percobaan 3)

In [None]:
# Value passed to SBi_MSRE as k_max: 40% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.4 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.06%
  nth_start_imputation_init  567
  100%
  NRMSE_arr   0.5682047023146793
  comp.time   53.61 s
  Pearson_arr   0.9658078293451974
  comp.time   53.61 s
p missing rate: 5.96%
  nth_start_imputation_init  413
  100%
  NRMSE_arr   0.4935687104191821
  comp.time   92.28 s
  Pearson_arr   0.9842488415354164
  comp.time   92.28 s
p missing rate: 8.55%
  nth_start_imputation_init  311
  100%
  NRMSE_arr   0.6434324331724813
  comp.time   119.71 s
  Pearson_arr   0.9376901010241416
  comp.time   119.71 s
p missing rate: 11.82%
  nth_start_imputation_init  216
  100%
  NRMSE_arr   0.6804473098760891
  comp.time   143.71 s
  Pearson_arr   0.92351225514421
  comp.time   143.71 s
p missing rate: 15.01%
  nth_start_imputation_init  143
  100%
  NRMSE_arr   0.5901395817975794
  comp.time   165.40 s
  Pearson_arr   0.9423648787977491
  comp.time   165.40 s
p missing rate: 18.04%
  nth_start_imputation_init  102
  100%
  NRMSE_arr   0.7092046486623361
  comp.time   179.42 s
  Pea

#### k = 40% (Percobaan 4)

In [None]:
# Value passed to SBi_MSRE as k_max: 40% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.4 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.78%
  nth_start_imputation_init  589
  100%
  NRMSE_arr   0.7005256476529805
  comp.time   48.61 s
  Pearson_arr   0.9205461044479392
  comp.time   48.61 s
p missing rate: 6.30%
  nth_start_imputation_init  407
  100%
  NRMSE_arr   0.6489926060171861
  comp.time   93.77 s
  Pearson_arr   0.9057648376474732
  comp.time   93.77 s
p missing rate: 8.76%
  nth_start_imputation_init  313
  100%
  NRMSE_arr   0.6616275688270884
  comp.time   119.56 s
  Pearson_arr   0.895621914194143
  comp.time   119.57 s
p missing rate: 11.91%
  nth_start_imputation_init  208
  100%
  NRMSE_arr   0.6046265943346942
  comp.time   146.08 s
  Pearson_arr   0.97018382700164
  comp.time   146.08 s
p missing rate: 14.50%
  nth_start_imputation_init  157
  100%
  NRMSE_arr   0.6873728128931431
  comp.time   163.86 s
  Pearson_arr   0.9171782235748097
  comp.time   163.86 s
p missing rate: 18.11%
  nth_start_imputation_init  80
  100%
  NRMSE_arr   0.8027391692979942
  comp.time   182.71 s
  Pears

#### k = 40% (Percobaan 5)

In [None]:
# Value passed to SBi_MSRE as k_max: 40% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.4 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.02%
  nth_start_imputation_init  574
  100%
  NRMSE_arr   0.5644734739232052
  comp.time   52.71 s
  Pearson_arr   0.9833542590491782
  comp.time   52.72 s
p missing rate: 5.88%
  nth_start_imputation_init  416
  100%
  NRMSE_arr   0.6164727198347723
  comp.time   92.63 s
  Pearson_arr   0.9499099553894469
  comp.time   92.64 s
p missing rate: 8.58%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.633529596936976
  comp.time   123.18 s
  Pearson_arr   0.9617718558578242
  comp.time   123.18 s
p missing rate: 11.52%
  nth_start_imputation_init  212
  100%
  NRMSE_arr   0.6009728673914153
  comp.time   147.93 s
  Pearson_arr   0.9581374194502276
  comp.time   147.94 s
p missing rate: 15.26%
  nth_start_imputation_init  135
  100%
  NRMSE_arr   0.6541770096043088
  comp.time   169.21 s
  Pearson_arr   0.9137924948999381
  comp.time   169.21 s
p missing rate: 18.16%
  nth_start_imputation_init  94
  100%
  NRMSE_arr   0.7058422631183255
  comp.time   179.39 s
  Pea

### Untuk k = 50%

#### k = 50% (Percobaan 1)

In [None]:
# Value passed to SBi_MSRE as k_max: 50% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.5 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.6230992156817572
  comp.time   48.79 s
  Pearson_arr   0.8429443961477483
  comp.time   48.79 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.6071471613358346
  comp.time   93.32 s
  Pearson_arr   0.9610570279288199
  comp.time   93.33 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5515109454910413
  comp.time   123.80 s
  Pearson_arr   0.9228455697093029
  comp.time   123.81 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.6461676900663742
  comp.time   144.33 s
  Pearson_arr   0.9444308542334776
  comp.time   144.34 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.6783112414642193
  comp.time   166.05 s
  Pearson_arr   0.866725833298248
  comp.time   166.06 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.6759490274490122
  comp.time   178.96 s
  Pea

#### k = 50% (Percobaan 2)

In [None]:
# Value passed to SBi_MSRE as k_max: 50% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.5 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.79%
  nth_start_imputation_init  594
  100%
  NRMSE_arr   0.6353329526210569
  comp.time   48.85 s
  Pearson_arr   0.9516314885793608
  comp.time   48.85 s
p missing rate: 6.59%
  nth_start_imputation_init  392
  100%
  NRMSE_arr   0.5737692163112961
  comp.time   99.18 s
  Pearson_arr   0.9754215987543253
  comp.time   99.19 s
p missing rate: 8.66%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6154539295970276
  comp.time   120.16 s
  Pearson_arr   0.9650948221501066
  comp.time   120.16 s
p missing rate: 12.37%
  nth_start_imputation_init  174
  100%
  NRMSE_arr   0.7561043486242757
  comp.time   155.98 s
  Pearson_arr   0.8854220000182371
  comp.time   155.98 s
p missing rate: 15.04%
  nth_start_imputation_init  129
  100%
  NRMSE_arr   0.6549875825888324
  comp.time   166.92 s
  Pearson_arr   0.9283104747731714
  comp.time   166.92 s
p missing rate: 18.57%
  nth_start_imputation_init  88
  100%
  NRMSE_arr   0.7637047294474244
  comp.time   180.97 s
  Pe

#### k = 50% (Percobaan 3)

In [None]:
# Value passed to SBi_MSRE as k_max: 50% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.5 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 3.06%
  nth_start_imputation_init  567
  100%
  NRMSE_arr   0.4735187459541012
  comp.time   53.95 s
  Pearson_arr   0.9629587892466247
  comp.time   53.95 s
p missing rate: 5.96%
  nth_start_imputation_init  413
  100%
  NRMSE_arr   0.4952410220478155
  comp.time   93.77 s
  Pearson_arr   0.9849070457190614
  comp.time   93.77 s
p missing rate: 8.55%
  nth_start_imputation_init  311
  100%
  NRMSE_arr   0.6498247971410946
  comp.time   118.37 s
  Pearson_arr   0.9355340633034366
  comp.time   118.38 s
p missing rate: 11.82%
  nth_start_imputation_init  216
  100%
  NRMSE_arr   0.6809853033369574
  comp.time   145.65 s
  Pearson_arr   0.9205347415732968
  comp.time   145.65 s
p missing rate: 15.01%
  nth_start_imputation_init  143
  100%
  NRMSE_arr   0.5794433774483726
  comp.time   164.73 s
  Pearson_arr   0.9462467565348426
  comp.time   164.73 s
p missing rate: 18.04%
  nth_start_imputation_init  102
  100%
  NRMSE_arr   0.7118843369499294
  comp.time   179.97 s
  P

#### k = 50% (Percobaan 4)

In [None]:
# Value passed to SBi_MSRE as k_max: 50% of the first dimension (row count)
# of the complete data frame, rounded to an integer.
k_max = round(0.5 * D_gene_complete.raw_df.shape[0])

# Copy of the complete values, handed to SBi_MSRE for every missing dataset.
y_cluster_val = D_gene_complete.value.copy()

# One zero-initialised score slot per entry of list_missing_files.
# NOTE(review): the slots are filled while iterating D_gene_mis_arr --
# presumably both sequences have the same length; confirm.
NRMSE_arr = np.zeros(len(list_missing_files))
Pearson_arr = np.zeros_like(NRMSE_arr)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Both "comp.time" lines below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched but not read again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()

    # NOTE(review): the rate is multiplied by 10 and printed with a "%"
    # sign -- confirm the unit returned by get_p_mis_rate().
    print("p missing rate: {:.2f}%".format(p_mis_rate * 10))

    # Compute the estimate for this dataset.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE for this dataset, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

    # Pearson correlation for this dataset, again with the elapsed time.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    print("  comp.time   {:.2f} s".format(time.perf_counter() - start_time))

p missing rate: 2.78%
  nth_start_imputation_init  589
  100%
  NRMSE_arr   0.7065724849407381
  comp.time   48.66 s
  Pearson_arr   0.923754213189941
  comp.time   48.66 s
p missing rate: 6.30%
  nth_start_imputation_init  407
  100%
  NRMSE_arr   0.6567460267616385
  comp.time   94.13 s
  Pearson_arr   0.9014088769720372
  comp.time   94.13 s
p missing rate: 8.76%
  nth_start_imputation_init  313
  100%
  NRMSE_arr   0.6633434630578335
  comp.time   119.36 s
  Pearson_arr   0.8780834542773926
  comp.time   119.36 s
p missing rate: 11.91%
  nth_start_imputation_init  208
  100%
  NRMSE_arr   0.6009153232533142
  comp.time   145.97 s
  Pearson_arr   0.97008843465822
  comp.time   145.98 s
p missing rate: 14.50%
  nth_start_imputation_init  157
  100%
  NRMSE_arr   0.6897171060654751
  comp.time   164.78 s
  Pearson_arr   0.9157654751205555
  comp.time   164.78 s
p missing rate: 18.11%
  nth_start_imputation_init  80
  100%
  NRMSE_arr   0.8040271077666188
  comp.time   183.00 s
  Pears

#### k = 50% (Percobaan 5)

In [None]:
# k_max for this run: half the number of rows of the complete data frame, rounded.
k_max = round(0.5 * D_gene_complete.raw_df.shape[0])

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

p missing rate: 3.02%
  nth_start_imputation_init  574
  100%
  NRMSE_arr   0.5643806979554085
  comp.time   53.40 s
  Pearson_arr   0.9824920253522559
  comp.time   53.40 s
p missing rate: 5.88%
  nth_start_imputation_init  416
  100%
  NRMSE_arr   0.6214728463425316
  comp.time   92.23 s
  Pearson_arr   0.9478422157470437
  comp.time   92.23 s
p missing rate: 8.58%
  nth_start_imputation_init  303
  100%
  NRMSE_arr   0.6414198425236274
  comp.time   120.75 s
  Pearson_arr   0.9590827798655663
  comp.time   120.75 s
p missing rate: 11.52%
  nth_start_imputation_init  212
  100%
  NRMSE_arr   0.6034275187282931
  comp.time   146.53 s
  Pearson_arr   0.9586708686542034
  comp.time   146.53 s
p missing rate: 15.26%
  nth_start_imputation_init  135
   51%

#### k = 125 (Percobaan 1)

In [None]:
# Fixed k_max for this run (an earlier note in this cell listed 25, 35, 45).
k_max = 125

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.5960939111618456
  comp.time   51.14 s
  Pearson_arr   0.8507361283219557
  comp.time   51.14 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.6175173039418621
  comp.time   94.66 s
  Pearson_arr   0.9626709880825008
  comp.time   94.66 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5389321954903415
  comp.time   122.08 s
  Pearson_arr   0.9242911860982941
  comp.time   122.08 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.6296288549697572
  comp.time   146.28 s
  Pearson_arr   0.9453727356129976
  comp.time   146.28 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.7038134392959943
  comp.time   166.14 s
  Pearson_arr   0.8571980575201885
  comp.time   166.15 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.6836468033799721
  comp.time   178.07 s
  Pe

#### k = 125 (Percobaan 2)

In [None]:
# Fixed k_max for this run (an earlier note in this cell listed 25, 35, 45).
k_max = 125

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

#### k = 125 (Percobaan 3)

In [None]:
# Fixed k_max for this run (an earlier note in this cell listed 25, 35, 45).
k_max = 125

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

#### k = 125 (Percobaan 4)

In [None]:
# Fixed k_max for this run (an earlier note in this cell listed 25, 35, 45).
k_max = 125

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

#### k = 125 (Percobaan 5)

In [None]:
# Fixed k_max for this run (an earlier note in this cell listed 25, 35, 45).
k_max = 125

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

#### k = 145 (Percobaan 1)

In [None]:
# Fixed k_max for this run (an earlier note in this cell listed 25, 35, 45).
k_max = 145

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.5971177932148954
  comp.time   50.11 s
  Pearson_arr   0.8502935710270406
  comp.time   50.11 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.621518300128091
  comp.time   91.78 s
  Pearson_arr   0.9625547673265112
  comp.time   91.78 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.5397470790379765
  comp.time   124.89 s
  Pearson_arr   0.9255976174539665
  comp.time   124.89 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.6282056744206076
  comp.time   149.82 s
  Pearson_arr   0.9469871819128932
  comp.time   149.83 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.7110300649157654
  comp.time   164.60 s
  Pearson_arr   0.8573613130463994
  comp.time   164.60 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.6868678048004996
  comp.time   175.78 s
  Pea

#### k = 165 (Percobaan 1)

In [None]:
# Fixed k_max for this run (an earlier note in this cell listed 25, 35, 45).
k_max = 165

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

p missing rate: 2.88%
  nth_start_imputation_init  586
  100%
  NRMSE_arr   0.6036454680497818
  comp.time   48.79 s
  Pearson_arr   0.8445146591447305
  comp.time   48.79 s
p missing rate: 6.11%
  nth_start_imputation_init  409
  100%
  NRMSE_arr   0.5907442934373217
  comp.time   91.93 s
  Pearson_arr   0.9553014649162211
  comp.time   91.93 s
p missing rate: 8.81%
  nth_start_imputation_init  294
  100%
  NRMSE_arr   0.543442002292941
  comp.time   121.56 s
  Pearson_arr   0.9244610148279934
  comp.time   121.56 s
p missing rate: 12.09%
  nth_start_imputation_init  215
  100%
  NRMSE_arr   0.63272037000178
  comp.time   142.03 s
  Pearson_arr   0.9467866466392119
  comp.time   142.03 s
p missing rate: 14.83%
  nth_start_imputation_init  144
  100%
  NRMSE_arr   0.7091035338106435
  comp.time   161.56 s
  Pearson_arr   0.8588952603003258
  comp.time   161.57 s
p missing rate: 17.08%
  nth_start_imputation_init  99
  100%
  NRMSE_arr   0.691284351164987
  comp.time   174.90 s
  Pearso

#### k = 335

In [None]:
# Fixed k_max for this run (an earlier note in this cell listed 25, 35, 45).
k_max = 335

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

#### k = 375

In [None]:
# Fixed k_max for this run (an earlier note in this cell listed 25, 35, 45).
k_max = 375

# Work on a copy of the complete values.
y_cluster_val = D_gene_complete.value.copy()

# One NRMSE / Pearson slot per entry of list_missing_files.
_n_scores = len(list_missing_files)
NRMSE_arr = np.zeros(_n_scores)
Pearson_arr = np.zeros(_n_scores)

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # The timer starts before any per-dataset work; both "comp.time" lines
    # below report the time elapsed since this point.
    start_time = time.perf_counter()

    # F_mat_orig is fetched here but not used again inside this cell.
    F_mat_orig = y_cluster_val_mis.get_F_mat()
    p_mis_rate = y_cluster_val_mis.get_p_mis_rate()
    print("p missing rate: {:.2f}%".format(p_mis_rate*10))

    # Run SBi_MSRE on the complete values and this dataset's values.
    (
        y_cluster_val_sorted,
        y_cluster_val_estimate,
        F_mat_sorted,
    ) = SBi_MSRE(y_cluster_val, y_cluster_val_mis.value, k_max=k_max)
    print("")

    # NRMSE of the estimate, followed by the elapsed time so far.
    NRMSE_arr[i] = compute_NRMSE(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  NRMSE_arr  ", NRMSE_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

    # Pearson correlation of the estimate, followed by the elapsed time so far.
    Pearson_arr[i] = compute_pearsoncorr(
        y_cluster_val_sorted, y_cluster_val_estimate, F_mat_sorted
    )
    print("  Pearson_arr  ", Pearson_arr[i])
    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

### Comparison

In [None]:
class AvgWeightImputation(object):
    """Fill NaN entries of a 2-D array with weighted row/column averages.

    Parameters
    ----------
    avg_div : "full" | "partial", default="full"
        How the average is normalised.  With "partial" the sum over the
        observed (non-NaN) entries is divided by the number (or total weight)
        of the observed entries only.  With "full" it is divided by the
        length (or total weight) of the whole row/column, i.e. NaN entries
        behave as zeros that contribute nothing to the sum.
    method : "row" | "col" | "rowcol", default="row"
        Average along the row, along the column, or the mean of both.
    weights : "ones" | "random" | "euclid", default="ones"
        "ones" is a plain average.  "random" draws weights through
        `generate_random`.  "euclid" first fills the NaNs with the "ones"
        average (using the same `avg_div`) and then weights the other
        columns/rows by their inverse Euclidean distance to the target
        column/row of that filled matrix.
    rand_range : sequence of two numbers, default=[0, 1]
        Range ``[a, b]`` used for the random weights.

    Notes
    -----
    `row_avg`, `col_avg`, `rowcol_avg` and `generate_random` are static
    methods that take the instance explicitly as their first argument; this
    unusual calling convention is kept for backward compatibility.
    """

    def __init__(self, avg_div="full", method="row", weights="ones", rand_range=[0, 1]):
        self.avg_div = avg_div
        self.method = method
        self.weights = weights
        # Store a private copy so that neither the shared default list nor a
        # caller-owned list is aliased by the instance.
        self.rand_range = list(rand_range)

    def fit_transform(self, D_gene_mis):
        """Return a copy of `D_gene_mis` with its NaN entries imputed.

        Dispatches on ``self.method``.  Returns None (after printing a
        message) when ``self.method`` or ``self.avg_div`` is not recognised.
        """
        avg_div = self.avg_div
        weights = self.weights
        method = self.method

        if method == "row":
            return self.row_avg(self, D_gene_mis, avg_div, weights)
        if method == "col":
            return self.col_avg(self, D_gene_mis, avg_div, weights)
        if method == "rowcol":
            return self.rowcol_avg(self, D_gene_mis, avg_div, weights)

        # Unknown method: report it and return None instead of failing on an
        # unbound local variable.
        print('Set the correct imputation: "row", "col", or "rowcol"')
        return None

    @staticmethod
    def row_avg(self, D_gene_mis, avg_div, weights):
        """Impute every NaN entry from the other entries of its row.

        Returns a new array; `D_gene_mis` is not modified.  Returns None when
        `avg_div` is invalid.  When `weights` is invalid a message is printed
        at the first missing entry and the NaNs are left in place.
        """
        # Validate first so the message is printed once, even for "euclid",
        # which calls this method recursively.
        if avg_div not in ("full", "partial"):
            print('Set the correct option for avg_div: "full" or "partial"')
            return None

        a, b = self.rand_range
        A = D_gene_mis.copy()
        missing_idx = np.argwhere(np.isnan(D_gene_mis))

        if weights == "euclid":
            # Distances need a matrix without NaNs, so start from the plain
            # row-average fill (same avg_div) instead of zeros.
            B = self.row_avg(self, D_gene_mis, avg_div, "ones")
            all_cols = np.arange(D_gene_mis.shape[1], dtype=int)
            # The weights depend only on the target column: compute them once
            # per column instead of once per missing entry.
            cache = {}

        for p, q in missing_idx:
            observed = np.flatnonzero(~np.isnan(D_gene_mis[p]))

            if weights == "ones":
                if avg_div == "partial":
                    divisor = len(observed)
                else:
                    divisor = len(D_gene_mis[p])
                A[p, q] = np.sum(D_gene_mis[p, observed])/divisor

            elif weights == "random":
                if avg_div == "partial":
                    rand_weights = self.generate_random(self, a, b, len(observed))
                    numerator = np.sum(D_gene_mis[p, observed] * rand_weights)
                else:
                    # One weight per column; only observed columns enter the
                    # numerator, all of them enter the denominator.
                    rand_weights = self.generate_random(self, a, b, len(D_gene_mis[p]))
                    numerator = np.sum(D_gene_mis[p, observed] * rand_weights[observed])
                A[p, q] = numerator/np.sum(rand_weights)

            elif weights == "euclid":
                key = int(q)
                if key not in cache:
                    euclid_weights = self.weight_for_row_avg(B, q)
                    cache[key] = (all_cols[all_cols != q],
                                  euclid_weights,
                                  np.sum(euclid_weights))
                other_cols, euclid_weights, weight_sum = cache[key]
                A[p, q] = np.sum(B[p, other_cols] * euclid_weights)/weight_sum

            else:
                print('Set the correct weights: "ones", "random", or "euclid"')
                break

        return A

    @staticmethod
    def col_avg(self, D_gene_mis, avg_div, weights):
        """Impute every NaN entry from the other entries of its column.

        Returns a new array; `D_gene_mis` is not modified.  Returns None when
        `avg_div` is invalid.  When `weights` is invalid a message is printed
        once and the NaNs are left in place.
        """
        if avg_div not in ("full", "partial"):
            print('Set the correct option for avg_div: "full" or "partial"')
            return None

        a, b = self.rand_range
        A = D_gene_mis.copy()
        n_rows = len(D_gene_mis)

        if weights == "euclid":
            # Distances need a matrix without NaNs, so start from the plain
            # column-average fill (same avg_div) instead of zeros.
            B = self.col_avg(self, D_gene_mis, avg_div, "ones")
            all_rows = np.arange(D_gene_mis.shape[0], dtype=int)
            # The weights depend only on the target row: compute them once
            # per row even if that row has several missing columns.
            cache = {}

        for col in range(len(D_gene_mis[0])):
            nan_mask = np.isnan(D_gene_mis[:, col])
            missing_rows = np.flatnonzero(nan_mask)
            observed = np.flatnonzero(~nan_mask)

            for row in missing_rows:
                if weights == "ones":
                    divisor = len(observed) if avg_div == "partial" else n_rows
                    A[row, col] = np.sum(D_gene_mis[observed, col])/divisor

                elif weights == "random":
                    if avg_div == "partial":
                        rand_weights = self.generate_random(self, a, b, len(observed))
                        numerator = np.sum(D_gene_mis[observed, col] * rand_weights)
                    else:
                        rand_weights = self.generate_random(self, a, b, n_rows)
                        numerator = np.sum(D_gene_mis[observed, col] * rand_weights[observed])
                    A[row, col] = numerator/np.sum(rand_weights)

                elif weights == "euclid":
                    key = int(row)
                    if key not in cache:
                        euclid_weights = self.weight_for_col_avg(B, row)
                        cache[key] = (all_rows[all_rows != row],
                                      euclid_weights,
                                      np.sum(euclid_weights))
                    other_rows, euclid_weights, weight_sum = cache[key]
                    A[row, col] = np.sum(B[other_rows, col] * euclid_weights)/weight_sum

                else:
                    # Nothing has been imputed at this point; report once and
                    # hand back the untouched copy.
                    print('Set the correct weights: "ones", "random", or "euclid"')
                    return A

        return A

    @staticmethod
    def rowcol_avg(self, D_gene_mis, avg_div, weights):
        """Impute every NaN entry with the mean of its row and column estimates.

        Returns a new array, or None when `avg_div` is invalid.
        """
        Arow = self.row_avg(self, D_gene_mis, avg_div, weights)
        if Arow is None:
            # row_avg has already reported the invalid avg_div.
            return None
        Acol = self.col_avg(self, D_gene_mis, avg_div, weights)
        if Acol is None:
            return None

        A = D_gene_mis.copy()
        nan_mask = np.isnan(D_gene_mis)
        A[nan_mask] = (Arow[nan_mask] + Acol[nan_mask])/2.
        return A

    @staticmethod
    def generate_random(self, a, b, N):
        """Return `N` random weights scaled to ``self.rand_range``.

        NOTE(review): the `a` and `b` arguments are overridden by
        ``self.rand_range`` (the internal callers pass exactly those values),
        and `rng_generator` is a module-level name defined outside this
        class -- presumably a NumPy random Generator; confirm.
        """
        a, b = self.rand_range

        return (b - a)*rng_generator.random(N) + a

    @staticmethod
    def _inverse_distance_weights(euclidean_arr):
        """Turn an array of distances into weights.

        If any distance is (numerically) zero, the weight is shared equally
        among those zero-distance entries and every other entry gets zero.
        Otherwise the weights are the inverse distances normalised to sum to
        one (all zeros if that sum is not above 1e-14).
        """
        zero_dist = euclidean_arr < 1e-14
        if np.any(zero_dist):
            # Only the closest (identical) entries count; discard the others.
            weights = np.zeros_like(euclidean_arr)
            weights[zero_dist] = 1/np.count_nonzero(zero_dist)
            return weights

        # Farther rows/columns get a smaller contribution.
        inv_euclid_arr = 1./euclidean_arr
        total = np.sum(inv_euclid_arr)
        if total > 1e-14:
            return inv_euclid_arr/total
        return np.zeros_like(inv_euclid_arr)

    @staticmethod
    def weight_for_row_avg(C_bicluster, target_col):
        """Weights of all columns except `target_col`, by distance to it.

        The distance is the Euclidean distance between columns of
        `C_bicluster`; the result has one entry per remaining column, in
        column order.
        """
        cols = np.arange(C_bicluster.shape[1], dtype=int)
        other_cols = cols[cols != target_col]
        C_target_col = C_bicluster[:, target_col].reshape(-1, 1)
        C_remain = C_bicluster[:, other_cols]

        euclidean_arr = np.sqrt(np.sum((C_remain - C_target_col)**2, axis=0))
        return AvgWeightImputation._inverse_distance_weights(euclidean_arr)

    @staticmethod
    def weight_for_col_avg(C_bicluster, target_row):
        """Weights of all rows except `target_row`, by distance to it.

        The distance is the Euclidean distance between rows of
        `C_bicluster`; the result has one entry per remaining row, in row
        order.
        """
        rows = np.arange(C_bicluster.shape[0], dtype=int)
        other_rows = rows[rows != target_row]
        C_target_row = C_bicluster[target_row]
        C_remain = C_bicluster[other_rows]

        euclidean_arr = np.sqrt(np.sum((C_remain - C_target_row)**2, axis=1))
        return AvgWeightImputation._inverse_distance_weights(euclidean_arr)

In [None]:
# Range for random weights; its lower bound should be >= 0.
# (It is not passed to the imputers below, which all use weights="euclid".)
rand_range = [0, 2]

# Result arrays shaped like NRMSE_arr, one per averaging method.
NRMSE_row_avg_arr, NRMSE_col_avg_arr, NRMSE_rowcol_avg_arr = (
    np.zeros_like(NRMSE_arr) for _ in range(3)
)

row_imputer = AvgWeightImputation(weights="euclid", method="row")
col_imputer = AvgWeightImputation(weights="euclid", method="col")
rowcol_imputer = AvgWeightImputation(weights="euclid", method="rowcol")

for i, y_cluster_val_mis in enumerate(D_gene_mis_arr):
    # Time all three imputations plus their scoring for this dataset.
    start_time = time.perf_counter()

    # Impute the same values with each of the three averaging methods.
    y_cluster_val_estimate_row = row_imputer.fit_transform(
        y_cluster_val_mis.value
    )
    y_cluster_val_estimate_col = col_imputer.fit_transform(
        y_cluster_val_mis.value
    )
    y_cluster_val_estimate_rowcol = rowcol_imputer.fit_transform(
        y_cluster_val_mis.value
    )

    F_mat_orig = y_cluster_val_mis.get_F_mat()

    # Score each estimate against y_cluster_val using F_mat_orig.
    NRMSE_row_avg_arr[i] = compute_NRMSE(
        y_cluster_val, y_cluster_val_estimate_row, F_mat_orig
    )
    NRMSE_col_avg_arr[i] = compute_NRMSE(
        y_cluster_val, y_cluster_val_estimate_col, F_mat_orig
    )
    NRMSE_rowcol_avg_arr[i] = compute_NRMSE(
        y_cluster_val, y_cluster_val_estimate_rowcol, F_mat_orig
    )

    _elapsed = time.perf_counter() - start_time
    print("  comp.time   {:.2f} s".format(_elapsed))

In [None]:
# Compare the NRMSE of SBiMSRE with the row+column weighted average,
# plotted against the missing rate of each dataset.
gold = "#D4AF37"

fig, ax = plt.subplots()

# x-axis: the missing rate reported by each missing-data object.
p_mis_rate_arr = [D_gene_mis.get_p_mis_rate() for D_gene_mis in D_gene_mis_arr]

ax.plot(p_mis_rate_arr, NRMSE_arr, "ro-", label="SBiMSRE",
        markeredgecolor='red', markerfacecolor="white")
ax.plot(p_mis_rate_arr, NRMSE_rowcol_avg_arr, "o-.", color=gold, label="RowColAvgWeight",
        markeredgecolor=gold, markerfacecolor="white")

ax.set_xlabel("Rate of missing entries")
ax.set_ylabel("NRMSE")

# Place the legend outside the axes, to the right of the plot.
ax.legend(loc="upper left", bbox_to_anchor=[1.03, 1], handlelength=4, borderaxespad=0)

# pyplot.show() takes no figure argument (its only parameter is the
# keyword `block`); it displays all open figures.
plt.show()

In [None]:
# Compare the NRMSE of SBiMSRE with the row, column and row+column weighted
# averages, plotted against the missing rate of each dataset.
gold = "#D4AF37"

fig, ax = plt.subplots()

# x-axis: the missing rate reported by each missing-data object.
p_mis_rate_arr = [D_gene_mis.get_p_mis_rate() for D_gene_mis in D_gene_mis_arr]

ax.plot(p_mis_rate_arr, NRMSE_arr, "ro-", label="SBiMSRE",
        markeredgecolor='red', markerfacecolor="white")
ax.plot(p_mis_rate_arr, NRMSE_row_avg_arr, "bo--", label="RowAvgWeight",
        markeredgecolor='blue', markerfacecolor="white")
ax.plot(p_mis_rate_arr, NRMSE_col_avg_arr, "go:", label="ColAvgWeight",
        markeredgecolor='green', markerfacecolor="white")
ax.plot(p_mis_rate_arr, NRMSE_rowcol_avg_arr, "o-.", color=gold, label="RowColAvgWeight",
        markeredgecolor=gold, markerfacecolor="white")

ax.set_xlabel("Rate of missing entries")
ax.set_ylabel("NRMSE")

# Place the legend outside the axes, to the right of the plot.
ax.legend(loc="upper left", bbox_to_anchor=[1.03, 1], handlelength=4, borderaxespad=0)

# pyplot.show() takes no figure argument (its only parameter is the
# keyword `block`); it displays all open figures.
plt.show()