In this notebook we explain why `similar_rate_over90`, `similar_rate_over95`, and `similar_rate_over99` for IMR90 and NHEK in `GSE63525_summary_similar_rate_percentage_table` are nearly zero.

Note that we found the `similar_rate` between the Juicer-created PC1 and the Sklearn-calculated PC1 (using the Juicer-created Pearson matrix as input) is only about 0.78 for IMR90 chrY, and only about 0.85 for NHEK chrY. That is the reason why "similar_rate_over90", "similar_rate_over95", and "similar_rate_over99" are quite low: the table uses the Juicer-created PC1 as ground truth, whereas the Sklearn-calculated PC1 is actually highly similar to most of the columns in the covariance matrix.

In [1]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd
from sklearn.decomposition import PCA
from hicpep import peptools

def flip_tracks(track1_np: np.ndarray, track2_np: np.ndarray):
    """Negate ``track2_np`` when it points in the opposite direction to ``track1_np``.

    The orientation is judged only on the positions where *both* tracks hold a
    non-NaN value, so NaNs located at different positions in the two tracks can
    neither misalign the comparison nor make it fail.

    Args:
        track1_np: Reference track (may contain NaN). Returned unchanged.
        track2_np: Track to orient (may contain NaN). Must have the same length
            as ``track1_np``.

    Returns:
        The tuple ``(track1_np, track2_np)``, where ``track2_np`` has been
        negated if its dot product with ``track1_np`` over the jointly valid
        positions is negative.

    Raises:
        ValueError: If the two tracks have different lengths.
    """
    if len(track1_np) != len(track2_np):
        raise ValueError(
            "The length of track1_np is different with track2_np "
            f"(length of track1_np: {len(track1_np)}, "
            f"length of track2_np: {len(track2_np)})"
        )

    # Positions that are valid in both tracks at once.
    both_valid = ~(np.isnan(track1_np) | np.isnan(track2_np))

    # The cosine similarity has the same sign as the dot product (the norms are
    # non-negative), so the division is unnecessary; skipping it also avoids a
    # 0/0 when there is no jointly valid position.
    if np.dot(track1_np[both_valid], track2_np[both_valid]) < 0:
        track2_np = -track2_np

    return track1_np, track2_np

In [2]:
# Load the Pearson matrix of IMR90 chrY from the Juicer outputs (whitespace-separated text, no header).
pearson = "/media/jordan990301/Samsung_T5/HiC_Datasets/data_for_hicpep/data_store/data/rao_2014/juicer_outputs/imr90/100000/pearsons/pearson_chrY.txt"
# Raw string: "\s" is not a valid Python string escape, so a plain literal triggers a warning.
pearson_df = pd.read_table(pearson, header=None, sep=r"\s+")
pearson_np = pearson_df.values  # Turn into numpy.ndarray
pearson_np = pearson_np.astype("float64")

# Keep only the rows/columns whose diagonal entry is not NaN
# (presumably those bins are NaN across the whole row/column -- TODO confirm).
diag = np.diag(pearson_np)
diag_valid = ~np.isnan(diag)
ixgrid = np.ix_(diag_valid, diag_valid)  # Record the position of the valid sub-matrix.
pearson_np = pearson_np[ixgrid]

# PCA on the valid sub-matrix; the first component is the Sklearn-calculated PC1.
pca = PCA(n_components=3)
pca.fit(pearson_np)
print("Explained variance ratio for the top 3 PC in IMR90 chrY")
print(pca.explained_variance_ratio_, '\n')
sklearn_pc1_np = pca.components_[0]

# Fill the PC1 values back into a full-length vector, leaving NaN at the
# positions that were dropped above.
sklearn_pc1_full_np = np.full(len(diag_valid), np.nan)
sklearn_pc1_full_np[diag_valid] = sklearn_pc1_np
sklearn_pc1_np = sklearn_pc1_full_np

# Load the PC1 track of IMR90 chrY from the Juicer outputs as a 1D vector.
pc1 = "/media/jordan990301/Samsung_T5/HiC_Datasets/data_for_hicpep/data_store/data/rao_2014/juicer_outputs/imr90/100000/eigenvector/pc1_chrY.txt"
pc1_df = pd.read_table(pc1, header=None)
pc1_np = pc1_df.values  # Turn into numpy format
pc1_np = pc1_np.flatten()  # Turn into 1D vector

# Give both tracks the same orientation before comparing them.
sklearn_pc1_np, pc1_np = flip_tracks(sklearn_pc1_np, pc1_np)

print("The similarity between Sklean calculated PC1 verses the Juicer created PC1 is only about 0.78")
print(peptools.calc_similarity(sklearn_pc1_np, pc1_np))

Explained variance ratio for the top 3 PC in IMR90 chrY
[0.97955514 0.00423985 0.00384376] 

The similarity between Sklean calculated PC1 verses the Juicer created PC1 is only about 0.78
{'total_entry_num': 594, 'valid_entry_num': 82, 'similar_num': 64, 'similar_rate': 0.7804878048780488}


In [3]:
# Covariance matrix of the valid Pearson sub-matrix (bias=True normalises by N).
cov_np = np.cov(pearson_np, bias=True)
n_tracks = len(cov_np[0])

# Compare every row of the covariance matrix with the Sklearn PC1 and
# collect the resulting similar rates.
similar_rates = []
for i in range(n_tracks):
    # Expand the row to full length, leaving NaN at the invalid positions.
    est_np = np.full(len(diag_valid), np.nan)
    est_np[diag_valid] = cov_np[i]

    sklearn_pc1_np, est_np = flip_tracks(sklearn_pc1_np, est_np)
    similar_info = peptools.calc_similarity(track1_np=sklearn_pc1_np, track2_np=est_np)
    similar_rates.append(similar_info["similar_rate"])

# Number of rows that reach each similarity threshold.
similar_rates_over90_count = sum(1 for rate in similar_rates if rate >= 0.9)
similar_rates_over95_count = sum(1 for rate in similar_rates if rate >= 0.95)
similar_rates_over99_count = sum(1 for rate in similar_rates if rate >= 0.99)

# Fraction of rows that reach each similarity threshold.
similar_rates_over90 = similar_rates_over90_count / n_tracks
similar_rates_over95 = similar_rates_over95_count / n_tracks
similar_rates_over99 = similar_rates_over99_count / n_tracks

print("For IMR90 ChrY using Sklearn-created PC1_np as ground Truth")
print(f"similar_rates_over90 = {similar_rates_over90}")
print(f"similar_rates_over95 = {similar_rates_over95}")
print(f"similar_rates_over99 = {similar_rates_over99}")


For IMR90 ChrY using Sklearn-created PC1_np as ground Truth
similar_rates_over90 = 1.0
similar_rates_over95 = 1.0
similar_rates_over99 = 0.9634146341463414


In [4]:
# Load the Pearson matrix of NHEK chrY from the Juicer outputs (whitespace-separated text, no header).
pearson = "/media/jordan990301/Samsung_T5/HiC_Datasets/data_for_hicpep/data_store/data/rao_2014/juicer_outputs/nhek/100000/pearsons/pearson_chrY.txt"
# Raw string: "\s" is not a valid Python string escape, so a plain literal triggers a warning.
pearson_df = pd.read_table(pearson, header=None, sep=r"\s+")
pearson_np = pearson_df.values  # Turn into numpy.ndarray
pearson_np = pearson_np.astype("float64")

# Keep only the rows/columns whose diagonal entry is not NaN
# (presumably those bins are NaN across the whole row/column -- TODO confirm).
diag = np.diag(pearson_np)
diag_valid = ~np.isnan(diag)
ixgrid = np.ix_(diag_valid, diag_valid)  # Record the position of the valid sub-matrix.
pearson_np = pearson_np[ixgrid]

# PCA on the valid sub-matrix; the first component is the Sklearn-calculated PC1.
pca = PCA(n_components=3)
pca.fit(pearson_np)
print("Explained variance ratio for the top 3 PC in NHEK chrY")
print(pca.explained_variance_ratio_, '\n')
sklearn_pc1_np = pca.components_[0]

# Fill the PC1 values back into a full-length vector, leaving NaN at the
# positions that were dropped above.
sklearn_pc1_full_np = np.full(len(diag_valid), np.nan)
sklearn_pc1_full_np[diag_valid] = sklearn_pc1_np
sklearn_pc1_np = sklearn_pc1_full_np

# Load the PC1 track of NHEK chrY from the Juicer outputs as a 1D vector.
pc1 = "/media/jordan990301/Samsung_T5/HiC_Datasets/data_for_hicpep/data_store/data/rao_2014/juicer_outputs/nhek/100000/eigenvector/pc1_chrY.txt"
pc1_df = pd.read_table(pc1, header=None)
pc1_np = pc1_df.values  # Turn into numpy format
pc1_np = pc1_np.flatten()  # Turn into 1D vector

# Give both tracks the same orientation before comparing them.
sklearn_pc1_np, pc1_np = flip_tracks(sklearn_pc1_np, pc1_np)

print("The similarity between Sklean calculated PC1 verses the Juicer created PC1 is only about 0.85")
print(peptools.calc_similarity(sklearn_pc1_np, pc1_np))

Explained variance ratio for the top 3 PC in NHEK chrY
[0.98469243 0.00588189 0.00207806] 

The similarity between Sklean calculated PC1 verses the Juicer created PC1 is only about 0.85
{'total_entry_num': 594, 'valid_entry_num': 63, 'similar_num': 54, 'similar_rate': 0.8571428571428571}


In [5]:
# Covariance matrix of the valid Pearson sub-matrix (bias=True normalises by N).
cov_np = np.cov(pearson_np, bias=True)
n_tracks = len(cov_np[0])

# Compare every row of the covariance matrix with the Sklearn PC1 and
# collect the resulting similar rates.
similar_rates = []
for i in range(n_tracks):
    # Expand the row to full length, leaving NaN at the invalid positions.
    est_np = np.full(len(diag_valid), np.nan)
    est_np[diag_valid] = cov_np[i]

    sklearn_pc1_np, est_np = flip_tracks(sklearn_pc1_np, est_np)
    similar_info = peptools.calc_similarity(track1_np=sklearn_pc1_np, track2_np=est_np)
    similar_rates.append(similar_info["similar_rate"])

# Number of rows that reach each similarity threshold.
similar_rates_over90_count = sum(1 for rate in similar_rates if rate >= 0.9)
similar_rates_over95_count = sum(1 for rate in similar_rates if rate >= 0.95)
similar_rates_over99_count = sum(1 for rate in similar_rates if rate >= 0.99)

# Fraction of rows that reach each similarity threshold.
similar_rates_over90 = similar_rates_over90_count / n_tracks
similar_rates_over95 = similar_rates_over95_count / n_tracks
similar_rates_over99 = similar_rates_over99_count / n_tracks

print("For NHEK ChrY using Sklearn-created PC1_np as ground Truth")
print(f"similar_rates_over90 = {similar_rates_over90}")
print(f"similar_rates_over95 = {similar_rates_over95}")
print(f"similar_rates_over99 = {similar_rates_over99}")


For NHEK ChrY using Sklearn-created PC1_np as ground Truth
similar_rates_over90 = 1.0
similar_rates_over95 = 0.9682539682539683
similar_rates_over99 = 0.9047619047619048
