In [16]:
import numpy as np
import pandas as pd
import hicstraw
from matplotlib import pyplot as plt
# Print floating-point values in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# np.set_printoptions(precision=2)

In [17]:
# .hic file that is opened with hicstraw.HiCFile to list the chromosome names.
hic_path = "/media/jordan990301/Samsung_T5/HiC_Datasets/Rao_2014/GM12878/MAPQGE30/GSE63525_GM12878_insitu_primary_replicate_combined_30.hic"
# Directory holding the per-chromosome input text files that are read later:
# origin_pearson_chrom{chrom}.txt (Pearson matrix) and origin_pc1_chrom{chrom}.txt (Eigenvector 1).
input_path = "/home/jordan990301/PCA_Experiments/outputs/GM12878/100Kb/origin"

In [18]:
# Open the .hic file and keep the name of every chromosome entry except the
# "All" and "MT" entries.
hic = hicstraw.HiCFile(hic_path)
chrom_list = [
    entry.name
    for entry in hic.getChromosomes()
    if entry.name not in ("All", "MT")
]

print(chrom_list)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']


In [19]:
# For every chromosome, compare the sign pattern of Eigenvector 1 (EV1) with the
# sign pattern of the covariance-matrix row that has the largest absolute sum.
# Per chromosome this writes two log files (element-wise comparison and a
# True/False tally) and one scatter plot.
log_root = "/home/jordan990301/PCA_Experiments/outputs/Logs/EV1-CorrD_largest_absSum/GM12878/100Kb"
output_plot_path = "/home/jordan990301/PCA_Experiments/outputs/plots/EV1-CorrD_largest_absSum/GM12878/100Kb"

# Figure defaults must be in place BEFORE the first figure is created; setting
# them after the figure exists has no effect on that figure, so on a fresh
# kernel the plots would otherwise be saved at matplotlib's default size.
plt.rcParams["figure.figsize"] = [20, 5]
plt.rcParams["figure.autolayout"] = True

for chrom in chrom_list:
    # Read in the Pearson correlation matrix. The last column is discarded
    # (presumably an empty column produced by a trailing separator -- TODO confirm),
    # then rows and columns that are entirely NaN are removed.
    pearson_df = (
        pd.read_table(f"{input_path}/origin_pearson_chrom{chrom}.txt", header=None, sep=" ")
        .iloc[:, :-1]
        .dropna(axis=0, how="all")
        .reset_index(drop=True)
        .dropna(axis=1, how="all")
    )

    # Turn pearson_df into numpy format
    pearson_np = pearson_df.values

    # Read in the Eigenvector 1, dropping rows that are entirely NaN
    EV1_df = pd.read_table(f"{input_path}/origin_pc1_chrom{chrom}.txt", header=None, sep=" ")
    EV1_df = EV1_df.dropna(axis=0, how="all").reset_index(drop=True)
    EV1_np = EV1_df.values.flatten()

    n = len(pearson_np)
    if n == 0 or n != len(EV1_np):
        # The element-wise comparison below needs one EV1 entry per matrix row.
        # Report the skip instead of silently producing no output.
        print(f"chr{chrom}: skipped (Pearson matrix has {n} rows, EV1 has {len(EV1_np)} entries)")
        continue

    # According to the steps in SVD, set x as pearson_np and y as x' / sqrt(n);
    # y' y is then the covariance matrix of pearson_np.
    y = pearson_np.T / np.sqrt(n)
    cov_pearson_np = y.T @ y

    # Index of the row with the largest sum of absolute values
    # (np.argmax returns the first such row when there are ties).
    cov_pearson_np_absSum = np.abs(cov_pearson_np).sum(axis=1)
    largest_absSum_row = int(np.argmax(cov_pearson_np_absSum))

    # The sign of the row with the largest absSum should correspond with the patterns of EV1.
    EV1_pred = cov_pearson_np[largest_absSum_row] > 0
    EV1_Pos = EV1_np > 0
    EV1_Compare_EV1_pred = EV1_pred == EV1_Pos

    # One "True"/"False" line per entry.
    with open(f"{log_root}/patterns_compare/GM12878_chr{chrom}.txt", "w") as f:
        f.writelines(str(is_same) + "\n" for is_same in EV1_Compare_EV1_pred)

    compare_true = int(np.count_nonzero(EV1_Compare_EV1_pred))
    compare_false = n - compare_true
    with open(f"{log_root}/difference_count/GM12878_chr{chrom}.txt", "w") as f:
        f.write(
            f"Total Entry count:{n}\n"
            f"Compare True:{compare_true}\n"
            f"Compare False:{compare_false}\n"
        )

    # Visualization: y is the predicted sign (+1 / -1), the colour is the EV1 sign.
    plot_x_axis = np.arange(1, n + 1)
    EV1_Colors = ["b" if is_pos else "r" for is_pos in EV1_Pos]
    EV1_pred_Viz = np.where(EV1_pred, 1, -1)

    fig, ax = plt.subplots()
    ax.set_xticks(np.arange(0, n, 50))
    ax.scatter(plot_x_axis, EV1_pred_Viz, c=EV1_Colors)
    ax.set(
        title=f"GM12878 chr{chrom}: predicted sign vs. EV1 sign",
        xlabel="entry index (1-based)",
        ylabel="predicted sign (blue: EV1 > 0, red: otherwise)",
    )
    fig.savefig(f"{output_plot_path}/GM12878_Chr{chrom}_100Kb.png")
    # Close the figure so that figures do not accumulate across chromosomes.
    plt.close(fig)

<Figure size 2000x500 with 0 Axes>