In [68]:
%%bash
# Reset the output tree for one cell line / resolution: delete the previous
# log and plot directories, then recreate their (empty) sub-directories.
# CELL_LINE and RESOLUTION are set by hand here and are not shared with the
# Python cells, so keep them in sync with `cell_line` / `resolution` there.
CELL_LINE="K562"
RESOLUTION="100Kb"

OUTPUT_ROOT="/home/jordan990301/PCA_Experiments/outputs"
EXPERIMENT="EV1-CorrD_largest_absSum"
LOG_DIR="$OUTPUT_ROOT/Logs/$EXPERIMENT/$CELL_LINE/$RESOLUTION"
PLOT_DIR="$OUTPUT_ROOT/plots/$EXPERIMENT/$CELL_LINE/$RESOLUTION"

# Text logs: per-entry sign agreement and the agreement counts.
rm -rf "$LOG_DIR"
mkdir -p "$LOG_DIR/patterns_compare"
mkdir -p "$LOG_DIR/difference_count"

# Figures: one scatter plot and one line plot per chromosome.
rm -rf "$PLOT_DIR"
mkdir -p "$PLOT_DIR/Scatter"
mkdir -p "$PLOT_DIR/Line"

In [69]:
import numpy as np
import pandas as pd
import hicstraw
from matplotlib import pyplot as plt
# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Optional: uncomment to also limit printed floats to two decimals.
# np.set_printoptions(precision=2)

In [70]:
# Run configuration. Change `cell_line` / `resolution` here; every other value
# in this cell is derived from them.
cell_line = 'K562'
resolution = "100Kb"
figsize = 50  # figure width in inches; the plotting cell pairs it with a fixed height of 5

# .hic file name for each supported cell line. Looking the name up by
# `cell_line` keeps the file and the directory components of the paths below
# in sync, and an unsupported cell line fails immediately with a KeyError
# instead of silently loading another cell line's file.
hic_file_names = {
    'GM12878': 'GSE63525_GM12878_insitu_primary_replicate_combined_30.hic',
    'K562': 'GSE63525_K562_combined_30.hic',
}
file_name = hic_file_names[cell_line]

# Location of the raw .hic file and of the per-chromosome Pearson / PC1 text
# files read by the analysis cell.
hic_path = f"/media/jordan990301/Samsung_T5/HiC_Datasets/Rao_2014/{cell_line}/MAPQGE30/{file_name}"
input_path = f"/home/jordan990301/PCA_Experiments/data/juicer_outputs/{cell_line}/{resolution}/origin"

In [71]:
# Open the .hic file and collect the names of the chromosomes to analyse,
# leaving out the "All" and "MT" entries.
hic = hicstraw.HiCFile(hic_path)
excluded_names = ("All", "MT")
chrom_list = [
    chrom_entry.name
    for chrom_entry in hic.getChromosomes()
    if chrom_entry.name not in excluded_names
]

print(chrom_list)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']


In [72]:
# For every chromosome: take the row of (X @ X.T) / n (X = Pearson matrix) with
# the largest absolute row sum, align its sign with EV1 on the first entry, and
# record how often its sign pattern agrees with the sign pattern of EV1.
# Outputs per chromosome: two text logs and two PNG plots.
log_root = f"/home/jordan990301/PCA_Experiments/outputs/Logs/EV1-CorrD_largest_absSum/{cell_line}/{resolution}"
plot_root = f"/home/jordan990301/PCA_Experiments/outputs/plots/EV1-CorrD_largest_absSum/{cell_line}/{resolution}"

# Set the figure defaults BEFORE any figure is created. rcParams only apply to
# figures created afterwards, so setting them after the first pyplot call would
# leave the figures at matplotlib's default size on a fresh kernel.
plt.rcParams["figure.figsize"] = [figsize, 5]
plt.rcParams["figure.autolayout"] = True

for chrom in chrom_list:
    # Read in the Pearson correlation matrix. The last column is dropped
    # unconditionally, then rows/columns that are entirely NaN are removed.
    pearson_df = pd.read_table(f"{input_path}/origin_pearson_chrom{chrom}.txt", header=None, sep=" ")
    pearson_df.pop(pearson_df.columns[-1])
    pearson_df = pearson_df.dropna(axis=0, how="all").reset_index(drop=True)
    pearson_df = pearson_df.dropna(axis=1, how="all")
    pearson_np = pearson_df.values

    # Read in eigenvector 1 (EV1) as a flat array with all-NaN rows removed.
    EV1_df = pd.read_table(f"{input_path}/origin_pc1_chrom{chrom}.txt", header=None, sep=" ")
    EV1_df = EV1_df.dropna(axis=0, how="all").reset_index(drop=True)
    EV1_np = EV1_df.values.flatten()

    # The comparison is entry-by-entry, so both inputs need the same, non-zero
    # length. Report skipped chromosomes instead of dropping them silently.
    n = len(pearson_np)
    if n == 0 or n != len(EV1_np):
        print(f"Skipping chr{chrom}: Pearson matrix has {n} rows, EV1 has {len(EV1_np)} entries.")
        continue

    # Following the SVD steps: with x = Pearson matrix and y = x' / sqrt(n),
    # y' @ y equals (x @ x') / n. No mean-centering is applied here.
    y = np.transpose(pearson_np) / np.sqrt(n)
    cov_pearson_np = np.matmul(np.transpose(y), y)

    # Pick the row with the largest sum of absolute values; its sign pattern is
    # the prediction for the sign pattern of EV1. argmax returns the first
    # index among ties, like the stable descending sort it replaces.
    abs_row_sums = np.abs(cov_pearson_np).sum(axis=1)
    selected_row = int(np.argmax(abs_row_sums))
    cov_pearson_np_Selected = cov_pearson_np[selected_row]

    # An eigenvector is only defined up to sign: flip the selected row when its
    # first entry and the first entry of EV1 disagree in sign.
    if (cov_pearson_np_Selected[0] > 0) != (EV1_np[0] > 0):
        cov_pearson_np_Selected = -cov_pearson_np_Selected

    EV1_Pos = EV1_np > 0
    cov_pearson_np_Selected_Pos = cov_pearson_np_Selected > 0
    EV1_Pos_vs_EV1_pred_Pos = cov_pearson_np_Selected_Pos == EV1_Pos

    # Log 1: one "True"/"False" line per entry (True = signs agree).
    with open(f"{log_root}/patterns_compare/{cell_line}_chr{chrom}.txt", "w") as f:
        f.writelines(str(is_same) + '\n' for is_same in EV1_Pos_vs_EV1_pred_Pos)

    # Log 2: totals of agreeing / disagreeing entries.
    compare_true = int(np.count_nonzero(EV1_Pos_vs_EV1_pred_Pos))
    compare_false = len(EV1_Pos_vs_EV1_pred_Pos) - compare_true
    with open(f"{log_root}/difference_count/{cell_line}_chr{chrom}.txt", "w") as f:
        f.write(f"Total Entry count:{n}\n")
        f.write(f"Compare True:{compare_true}\n")
        f.write(f"Compare False:{compare_false}\n")

    # Visualization
    plot_x_axis = np.arange(1, n + 1)
    tick_positions = np.arange(0, n, 50)

    # Scatter plot: y is the predicted sign (+1 / -1), the colour is the sign
    # of EV1 (blue where EV1 > 0, red otherwise).
    EV1_Colors = ['b' if is_pos else 'r' for is_pos in EV1_Pos]
    EV1_pred_Viz = np.where(cov_pearson_np_Selected_Pos, 1, -1)

    fig, ax = plt.subplots()
    ax.set_xticks(tick_positions)
    ax.scatter(plot_x_axis, EV1_pred_Viz, c=EV1_Colors)
    ax.set_title(f"{cell_line} chr{chrom}: predicted sign (y) coloured by EV1 sign (blue: EV1 > 0, red: otherwise)")
    fig.savefig(f"{plot_root}/Scatter/{cell_line}_Chr{chrom}.png")
    plt.close(fig)  # release the figure; one is created per plot per chromosome

    # Line plot: z-scored selected row against EV1 scaled by 100 so that both
    # curves are visible on the same axis.
    cov_pearson_np_Selected_Norm = (cov_pearson_np_Selected - np.mean(cov_pearson_np_Selected)) / np.std(cov_pearson_np_Selected)
    EV1_np_Times100 = EV1_np * 100

    fig, ax = plt.subplots()
    ax.set_xticks(tick_positions)
    ax.plot(EV1_np_Times100, c='r', label="EV1 x 100")
    ax.plot(cov_pearson_np_Selected_Norm, c='b', label="selected row (z-scored)")
    ax.set_title(f"{cell_line} chr{chrom}: EV1 vs. row with the largest absolute sum")
    ax.legend()
    fig.savefig(f"{plot_root}/Line/{cell_line}_Chr{chrom}.png")
    plt.close(fig)

<Figure size 5000x500 with 0 Axes>