In [7]:
# Shell escape: print the current working directory of the notebook kernel.
!pwd

/home/jordan990301/Projects/HiC-PC1_Approximation/notebooks/Lieberman_2009


In [8]:
%%bash
# Create the plot output folders (line + scatter) for both the absSumMax and
# absSumMin variants of the chosen cell line and resolution.
CELL_LINE="GM06690"
RESOLUTION="1Mb"
OUTPUT_PATH="/home/jordan990301/Projects/HiC-PC1_Approximation/outputs"

# Uncomment to wipe previously generated plots before re-running:
# rm -rf "$OUTPUT_PATH/plots/EV1-covD_absSumMax/$CELL_LINE"
# rm -rf "$OUTPUT_PATH/plots/EV1-covD_absSumMin/$CELL_LINE"

for variant in Max Min; do
    for plot_kind in line scatter; do
        mkdir -p "$OUTPUT_PATH/plots/EV1-covD_absSum${variant}/$CELL_LINE/$RESOLUTION/$plot_kind"
    done
done

In [9]:
import numpy as np
import pandas as pd
from copy import deepcopy
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
# Print floating point numbers in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [10]:
# Analysis parameters used by the cells below.
cell_line, resolution = 'GM06690', "1Mb"  # dataset label and resolution label used in output paths/titles
resolution_val = 1_000_000                # resolution value embedded in the input file names
figsize = 20                              # figure width passed to matplotlib

In [11]:
# Chromosome identifiers '1' .. '22', as strings, in ascending order.
chrom_list = [str(chrom_number) for chrom_number in range(1, 23)]

In [12]:
# For every chromosome in `chrom_list`:
#   1. load the Pearson correlation matrix and the reference eigenvector 1 (EV1),
#   2. build the covariance matrix of the row-centered Pearson matrix,
#   3. select the covariance rows with the largest ("Max") and smallest ("Min")
#      sum of absolute values,
#   4. compare the sign pattern of each selected row with EV1, print the number
#      and rate of matching entries, and save a scatter plot and a line plot.
data_path = "/home/jordan990301/Projects/HiC-PC1_Approximation/data/Lieberman_Aiden-datasets"
output_path = "/home/jordan990301/Projects/HiC-PC1_Approximation/outputs"
scatter_labels = ["Juicer's PC1 < 0", "Juicer's PC1 > 0"]
EV1_colors = ListedColormap(['r', 'b'])

# These settings must be in place BEFORE a figure is created; an existing
# figure does not pick them up retroactively.
plt.rcParams["figure.figsize"] = [figsize, 5]
plt.rcParams["figure.autolayout"] = True

for chrom in chrom_list:
    # Read in the Pearson correlation matrix
    pearson_df = pd.read_table(f"{data_path}/heatmaps/HIC_gm06690_chr{chrom}_chr{chrom}_{resolution_val}_pearson.txt", index_col=0, header=1, sep="\t")
    # Discard the last parsed column (presumably an empty column produced by a
    # trailing tab in the file -- TODO confirm) and switch to numpy.
    pearson_np = pearson_df.iloc[:, :-1].to_numpy()
    # Remove all-zero rows and all-zero columns
    # https://stackoverflow.com/questions/11188364/remove-zero-lines-2-d-numpy-array
    pearson_np = pearson_np[~np.all(pearson_np == 0, axis=1)]
    pearson_np = pearson_np[:, ~np.all(pearson_np == 0, axis=0)]

    # Zero mean: subtract each row's mean from the Pearson correlation matrix
    pearson_np = pearson_np - pearson_np.mean(axis=1, keepdims=True)

    # Read in the Eigenvector 1 (third column of the table) as a 1D vector
    EV1_df = pd.read_table(f"{data_path}/eigenvectors/GM-combined.ctg{chrom}.ctg{chrom}.{resolution_val}bp.hm.eigenvector.tab", header=None, sep="\t")
    EV1_np = EV1_df.iloc[:, 2].to_numpy()

    # Remove 0/NaN. `!= 0` alone keeps NaN (NaN != 0 is True), so NaN entries
    # are filtered explicitly as well.
    EV1_np = EV1_np[(EV1_np != 0) & pd.notna(EV1_np)]

    print(f"\nchr{chrom}")
    print(len(pearson_np))
    print(len(EV1_np))

    del pearson_df, EV1_df

    # The comparison needs a non-empty square matrix whose size equals the
    # EV1 length; report and skip the chromosome otherwise.
    n = len(pearson_np)
    if n == 0 or pearson_np.shape[1] != n or len(EV1_np) != n:
        print(f"Skipped: Pearson matrix shape {pearson_np.shape} does not match EV1 length {len(EV1_np)}")
        continue

    # According to the steps in SVD, set x as the Pearson matrix and y as x' / sqrt(n);
    # y' y is then the covariance matrix of the Pearson matrix.
    y_np = np.transpose(pearson_np) / np.sqrt(n)
    cov_np = np.matmul(np.transpose(y_np), y_np)

    # Main idea: rank the covariance rows by their sum of absolute values.
    cov_absSum = np.abs(cov_np).sum(axis=1)
    # Row indices in descending absSum order (stable sort, so ties keep row order).
    ranked_rows = sorted(range(n), key=lambda row_index: cov_absSum[row_index], reverse=True)

    EV1_pos_np = EV1_np > 0
    EV1_colors_values = EV1_pos_np.astype(int)  # 0 -> 'r' (PC1 < 0), 1 -> 'b' (PC1 > 0)
    EV1_np_Norm = (EV1_np - np.mean(EV1_np)) / np.std(EV1_np)
    plot_x_axis = np.arange(1, n + 1)
    x_ticks = np.arange(0, n, 50)

    # sorted_index 0 -> row with the largest absSum, -1 -> row with the smallest.
    for sorted_index, cov_selected_type in ((0, "Max"), (-1, "Min")):
        cov_selected_np = cov_np[ranked_rows[sorted_index]]

        # Flip the sign if the correlation between the selected row and EV1 is
        # negative, so that both share the same orientation.
        if np.corrcoef(cov_selected_np, EV1_np)[0, 1] < 0:
            cov_selected_np = -cov_selected_np

        cov_selected_pos_np = cov_selected_np > 0

        plot_root = f"{output_path}/plots/EV1-covD_absSum{cov_selected_type}/{cell_line}/{resolution}"
        linePlot_path = f"{plot_root}/line"
        scatterPlot_path = f"{plot_root}/scatter"

        # Number / fraction of entries whose sign agrees with EV1.
        correctNum = int(np.count_nonzero(EV1_pos_np == cov_selected_pos_np))
        correctRate = correctNum / n

        print(f"{cov_selected_type}:")
        print(correctNum)
        print(correctRate)

        # Visualization 1: sign of the selected row (+1 / -1) colored by the sign of EV1.
        cov_selected_Dots = np.where(cov_selected_pos_np, 1, -1)

        fig, ax = plt.subplots()  # new figure, sized by the rcParams set above
        ax.set_xticks(x_ticks)
        # vmin/vmax pin the color scale so 0 is always 'r' and 1 is always 'b',
        # even if EV1 contains only one sign.
        scatter = ax.scatter(plot_x_axis, cov_selected_Dots, c=EV1_colors_values, cmap=EV1_colors, vmin=0, vmax=1)
        # legend_elements() yields one handle per distinct color value, so only
        # the labels of the classes that are actually present are used.
        present_labels = [scatter_labels[value] for value in np.unique(EV1_colors_values)]
        ax.legend(handles=scatter.legend_elements()[0], labels=present_labels, fontsize=20, loc="center left")
        ax.set_title(f"chromosome: {cell_line}_chromosome{chrom}, resolution: {resolution}, entryNum: {len(pearson_np)}, correctNum = {correctNum}, correctRate={np.round(correctRate, 2)}", fontsize=20, loc="left")
        fig.savefig(f'{scatterPlot_path}/{cell_line}_chr{chrom}.png')
        plt.close(fig)  # release the figure; avoids accumulating open figures

        # Visualization 2: z-score normalized EV1 vs. the z-score normalized selected row.
        cov_pearson_np_Selected_Norm = (cov_selected_np - np.mean(cov_selected_np)) / np.std(cov_selected_np)

        fig, ax = plt.subplots()
        ax.set_xticks(x_ticks)
        ax.plot(EV1_np_Norm, c='r')
        ax.plot(cov_pearson_np_Selected_Norm, c='b')
        ax.legend(["Juicer's PC1", "approximated PC1-pattern"], fontsize=20, loc="upper left")
        ax.set_title(f"chromosome: {cell_line}_chromosome{chrom}, resolution: {resolution}, entryNum: {len(pearson_np)}", fontsize=20, loc="left")
        fig.savefig(f'{linePlot_path}/{cell_line}_chr{chrom}.png')
        plt.close(fig)


chr1
229
229
Max:
226
0.9868995633187773


Min:
169
0.7379912663755459

chr2
241
241
Max:
241
1.0
Min:
224
0.9294605809128631

chr3
197
197
Max:
196
0.9949238578680203
Min:
96
0.4873096446700508

chr4
190
190
Max:
189
0.9947368421052631
Min:
174
0.9157894736842105

chr5
179
179
Max:
173
0.9664804469273743
Min:
166
0.9273743016759777

chr6
169
169
Max:
167
0.9881656804733728
Min:
160
0.9467455621301775

chr7
157
157
Max:
157
1.0
Min:
111
0.7070063694267515

chr8
145
145
Max:
139
0.9586206896551724
Min:
96
0.6620689655172414

chr9
124
124
Max:
124
1.0
Min:
81
0.6532258064516129

chr10
135
135
Max:
134
0.9925925925925926
Min:
78
0.5777777777777777

chr11
133
133
Max:
133
1.0
Min:
102
0.7669172932330827

chr12
132
132
Max:
132
1.0
Min:
91
0.6893939393939394

chr13
98
98
Max:
98
1.0
Min:
54
0.5510204081632653

chr14
89
89
Max:
89
1.0
Min:
55
0.6179775280898876

chr15
83
83
Max:
83
1.0
Min:
69
0.8313253012048193

chr16
81
81
Max:
81
1.0
Min:
40
0.49382716049382713

chr17
79
79
Max:
79
1.0
Min:
69
0.8734177215189873

chr18
77
77
Max:


<Figure size 2000x500 with 0 Axes>