### GM12878 1Mb

In [1]:
import numpy as np
import pandas as pd
import hicstraw
from sklearn.decomposition import PCA
from numpy import dot
from numpy.linalg import norm

# Run configuration for this section: the cell line and bin size being
# analysed, the .hic file to open, and the directories read from / written to.
cell_line, resolution = "GM12878", "1Mb"
hic_path = "/home/jordan990301/PCA_Experiments/data/Rao_2014/GM12878/MAPQGE30/GSE63525_GM12878_insitu_primary_replicate_combined_30.hic"
output_path = f"/home/jordan990301/PCA_Experiments/outputs/{cell_line}/{resolution}"
input_path = f"/home/jordan990301/PCA_Experiments/outputs/{cell_line}/{resolution}/origin"

# Column layout of the per-chromosome summary table. Each column gets its own
# empty list, so the resulting DataFrame has the headers but no rows yet.
chrom_info = {
    column: []
    for column in (
        "Cell_Line",
        "Resolution",
        "Chromosome",
        "Explained_Variance_PC1",
        "Explained_Variance_PC2",
        "Explained_Variance_PC3",
        "Sum_PC1_PC2",
        "Entries_PC1",
        "Difference_Count_PC1",
        "Cosine_Similarity_PC1",
    )
}
output_df = pd.DataFrame(data=chrom_info)

In [2]:
# For every chromosome: run scikit-learn PCA on the Pearson matrix read from
# disk, compare its first component with the PC1 file stored next to it, and
# collect one summary row per chromosome into output_df.
hic = hicstraw.HiCFile(hic_path)

# Every chromosome name reported by the .hic file except "All" and "MT".
chrom_list = [
    chrom.name
    for chrom in hic.getChromosomes()
    if chrom.name not in ("All", "MT")
]

# Rows are gathered in a list and converted to a DataFrame once after the
# loop. Rebuilding output_df here (instead of appending to the frame created
# in the setup cell) keeps this cell idempotent: running it twice no longer
# duplicates every chromosome in the table.
summary_rows = []

for chrom in chrom_list:
    ### Calculated from juicer_tools
    pearson_df = pd.read_table(f"{input_path}/origin_pearson_chrom{chrom}.txt", header=None, sep=" ")
    # The last parsed column is discarded unconditionally.
    # NOTE(review): presumably an empty column caused by a trailing separator on each line; confirm.
    pearson_df.pop(pearson_df.columns[-1])

    # Remove rows, then columns, that contain nothing but NaN.
    pearson_df = pearson_df.dropna(axis=0, how="all").reset_index(drop=True)
    pearson_df = pearson_df.dropna(axis=1, how="all")

    # Round-trip through a bare array so the column labels restart at 0.
    numpy_pearson_df = pearson_df.values
    pearson_df = pd.DataFrame(numpy_pearson_df)

    #### Calc PCA
    # Request as many components as the matrix has rows.
    pca = PCA(n_components=len(pearson_df[0]))
    pca.fit(pearson_df)

    My_PC1 = pca.components_[0]
    np.savetxt(f'{output_path}/My_PC1/My_PC1_chrom{chrom}.txt', My_PC1, delimiter='\n', fmt='%1.4f')

    # Reference PC1: first column of the stored file, with all-NaN rows removed.
    Juicer_PC1 = pd.read_table(f"{input_path}/origin_pc1_chrom{chrom}.txt", header=None, sep=" ")
    Juicer_PC1 = Juicer_PC1.dropna(axis=0, how="all").reset_index(drop=True)
    Juicer_PC1 = Juicer_PC1[0]
    Juicer_PC1 = Juicer_PC1.to_numpy()
    np.savetxt(f'{output_path}/origin_dropna/origin_dropna_PC1_chrom{chrom}.txt', Juicer_PC1, delimiter='\n', fmt='%1.4f')

    # Both vectors must have the same length for the comparisons below; report
    # a mismatch explicitly instead of relying on np.dot's alignment error.
    if My_PC1.shape != Juicer_PC1.shape:
        raise ValueError(
            f"PC1 length mismatch for chromosome {chrom}: "
            f"computed {My_PC1.shape[0]} entries, reference has {Juicer_PC1.shape[0]}"
        )

    cos_sim = dot(My_PC1, Juicer_PC1) / (norm(My_PC1) * norm(Juicer_PC1))

    My_PC1_Pos = My_PC1 > 0
    Juicer_PC1_Pos = Juicer_PC1 > 0

    # Number of positions where one vector is positive and the other is not.
    Difference_Count_PC1 = int(np.count_nonzero(My_PC1_Pos != Juicer_PC1_Pos))

    # A negative cosine similarity means the two vectors point in opposite
    # directions, so the count is complemented (entries - count) to measure
    # disagreement as if one of them had been negated.
    if cos_sim < 0:
        Difference_Count_PC1 = My_PC1_Pos.shape[0] - Difference_Count_PC1

    summary_rows.append([
        cell_line,
        resolution,
        chrom,
        pca.explained_variance_ratio_[0],
        pca.explained_variance_ratio_[1],
        pca.explained_variance_ratio_[2],
        pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1],
        My_PC1_Pos.shape[0],
        Difference_Count_PC1,
        cos_sim
    ])

# Column names come from the schema dict defined in the setup cell.
output_df = pd.DataFrame(summary_rows, columns=list(chrom_info))

display(output_df)

Unnamed: 0,Cell_Line,Resolution,Chromosome,Explained_Variance_PC1,Explained_Variance_PC2,Explained_Variance_PC3,Sum_PC1_PC2,Entries_PC1,Difference_Count_PC1,Cosine_Similarity_PC1
0,GM12878,1Mb,1,0.84493,0.069885,0.040365,0.914815,230,7,-0.984993
1,GM12878,1Mb,2,0.865367,0.078456,0.039431,0.943822,242,3,0.989763
2,GM12878,1Mb,3,0.869681,0.058441,0.042336,0.928122,196,6,-0.969955
3,GM12878,1Mb,4,0.621698,0.355446,0.010419,0.977145,190,8,0.972116
4,GM12878,1Mb,5,0.671706,0.29095,0.020601,0.962656,179,2,0.994042
5,GM12878,1Mb,6,0.913586,0.045915,0.026105,0.959502,170,7,0.976556
6,GM12878,1Mb,7,0.714069,0.174393,0.062453,0.888461,158,7,-0.985678
7,GM12878,1Mb,8,0.703275,0.243101,0.032962,0.946376,145,6,0.985752
8,GM12878,1Mb,9,0.689983,0.26311,0.029925,0.953092,125,1,0.996129
9,GM12878,1Mb,10,0.768365,0.113979,0.046576,0.882344,134,3,-0.9945


In [3]:
# Create the GM12878 workbook in write mode and store the current summary
# table on its '1Mb' sheet.
with pd.ExcelWriter(path='/home/jordan990301/PCA_Experiments/outputs/xlsx/GM12878.xlsx', mode='w') as workbook:
    output_df.to_excel(excel_writer=workbook, sheet_name='1Mb')

### GM12878 100Kb

In [4]:
import numpy as np
import pandas as pd
import hicstraw
from sklearn.decomposition import PCA
from numpy import dot
from numpy.linalg import norm

# Run configuration for this section: the cell line and bin size being
# analysed, the .hic file to open, and the directories read from / written to.
cell_line, resolution = "GM12878", "100Kb"
hic_path = "/home/jordan990301/PCA_Experiments/data/Rao_2014/GM12878/MAPQGE30/GSE63525_GM12878_insitu_primary_replicate_combined_30.hic"
output_path = f"/home/jordan990301/PCA_Experiments/outputs/{cell_line}/{resolution}"
input_path = f"/home/jordan990301/PCA_Experiments/outputs/{cell_line}/{resolution}/origin"

# Column layout of the per-chromosome summary table. Each column gets its own
# empty list, so the resulting DataFrame has the headers but no rows yet.
chrom_info = {
    column: []
    for column in (
        "Cell_Line",
        "Resolution",
        "Chromosome",
        "Explained_Variance_PC1",
        "Explained_Variance_PC2",
        "Explained_Variance_PC3",
        "Sum_PC1_PC2",
        "Entries_PC1",
        "Difference_Count_PC1",
        "Cosine_Similarity_PC1",
    )
}
output_df = pd.DataFrame(data=chrom_info)

In [5]:
# For every chromosome: run scikit-learn PCA on the Pearson matrix read from
# disk, compare its first component with the PC1 file stored next to it, and
# collect one summary row per chromosome into output_df.
hic = hicstraw.HiCFile(hic_path)

# Every chromosome name reported by the .hic file except "All" and "MT".
chrom_list = [
    chrom.name
    for chrom in hic.getChromosomes()
    if chrom.name not in ("All", "MT")
]

# Rows are gathered in a list and converted to a DataFrame once after the
# loop. Rebuilding output_df here (instead of appending to the frame created
# in the setup cell) keeps this cell idempotent: running it twice no longer
# duplicates every chromosome in the table.
summary_rows = []

for chrom in chrom_list:
    ### Calculated from juicer_tools
    pearson_df = pd.read_table(f"{input_path}/origin_pearson_chrom{chrom}.txt", header=None, sep=" ")
    # The last parsed column is discarded unconditionally.
    # NOTE(review): presumably an empty column caused by a trailing separator on each line; confirm.
    pearson_df.pop(pearson_df.columns[-1])

    # Remove rows, then columns, that contain nothing but NaN.
    pearson_df = pearson_df.dropna(axis=0, how="all").reset_index(drop=True)
    pearson_df = pearson_df.dropna(axis=1, how="all")

    # Round-trip through a bare array so the column labels restart at 0.
    numpy_pearson_df = pearson_df.values
    pearson_df = pd.DataFrame(numpy_pearson_df)

    #### Calc PCA
    # Request as many components as the matrix has rows.
    pca = PCA(n_components=len(pearson_df[0]))
    pca.fit(pearson_df)

    My_PC1 = pca.components_[0]
    np.savetxt(f'{output_path}/My_PC1/My_PC1_chrom{chrom}.txt', My_PC1, delimiter='\n', fmt='%1.4f')

    # Reference PC1: first column of the stored file, with all-NaN rows removed.
    Juicer_PC1 = pd.read_table(f"{input_path}/origin_pc1_chrom{chrom}.txt", header=None, sep=" ")
    Juicer_PC1 = Juicer_PC1.dropna(axis=0, how="all").reset_index(drop=True)
    Juicer_PC1 = Juicer_PC1[0]
    Juicer_PC1 = Juicer_PC1.to_numpy()
    np.savetxt(f'{output_path}/origin_dropna/origin_dropna_PC1_chrom{chrom}.txt', Juicer_PC1, delimiter='\n', fmt='%1.4f')

    # Both vectors must have the same length for the comparisons below; report
    # a mismatch explicitly instead of relying on np.dot's alignment error.
    if My_PC1.shape != Juicer_PC1.shape:
        raise ValueError(
            f"PC1 length mismatch for chromosome {chrom}: "
            f"computed {My_PC1.shape[0]} entries, reference has {Juicer_PC1.shape[0]}"
        )

    cos_sim = dot(My_PC1, Juicer_PC1) / (norm(My_PC1) * norm(Juicer_PC1))

    My_PC1_Pos = My_PC1 > 0
    Juicer_PC1_Pos = Juicer_PC1 > 0

    # Number of positions where one vector is positive and the other is not.
    Difference_Count_PC1 = int(np.count_nonzero(My_PC1_Pos != Juicer_PC1_Pos))

    # A negative cosine similarity means the two vectors point in opposite
    # directions, so the count is complemented (entries - count) to measure
    # disagreement as if one of them had been negated.
    if cos_sim < 0:
        Difference_Count_PC1 = My_PC1_Pos.shape[0] - Difference_Count_PC1

    summary_rows.append([
        cell_line,
        resolution,
        chrom,
        pca.explained_variance_ratio_[0],
        pca.explained_variance_ratio_[1],
        pca.explained_variance_ratio_[2],
        pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1],
        My_PC1_Pos.shape[0],
        Difference_Count_PC1,
        cos_sim
    ])

# Column names come from the schema dict defined in the setup cell.
output_df = pd.DataFrame(summary_rows, columns=list(chrom_info))

display(output_df)

Unnamed: 0,Cell_Line,Resolution,Chromosome,Explained_Variance_PC1,Explained_Variance_PC2,Explained_Variance_PC3,Sum_PC1_PC2,Entries_PC1,Difference_Count_PC1,Cosine_Similarity_PC1
0,GM12878,100Kb,1,0.825665,0.101433,0.038888,0.927098,2276,80,-0.98724
1,GM12878,100Kb,2,0.891768,0.069644,0.022725,0.961412,2390,73,-0.985915
2,GM12878,100Kb,3,0.852978,0.09698,0.024746,0.949958,1951,93,-0.973541
3,GM12878,100Kb,4,0.720087,0.259097,0.010518,0.979184,1882,88,0.97323
4,GM12878,100Kb,5,0.80718,0.130045,0.048374,0.937224,1780,53,0.985658
5,GM12878,100Kb,6,0.92076,0.038219,0.022058,0.958978,1679,40,-0.981395
6,GM12878,100Kb,7,0.815196,0.101787,0.039582,0.916983,1562,60,-0.982115
7,GM12878,100Kb,8,0.856942,0.086257,0.040318,0.943199,1434,35,0.981048
8,GM12878,100Kb,9,0.745002,0.197107,0.029316,0.942109,1214,14,0.992731
9,GM12878,100Kb,10,0.820951,0.114246,0.025792,0.935197,1321,43,-0.990348


In [6]:
# Add the '100Kb' sheet to the existing GM12878 workbook. mode='a' keeps the
# sheets already in the file; if_sheet_exists='replace' lets this cell be
# re-run without raising ValueError because the '100Kb' sheet is already there
# (the default policy is 'error').
with pd.ExcelWriter(
    '/home/jordan990301/PCA_Experiments/outputs/xlsx/GM12878.xlsx',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    output_df.to_excel(writer, sheet_name='100Kb')

### K562 1Mb

In [7]:
import numpy as np
import pandas as pd
import hicstraw
from sklearn.decomposition import PCA
from numpy import dot
from numpy.linalg import norm

# Run configuration for this section: the cell line and bin size being
# analysed, the .hic file to open, and the directories read from / written to.
cell_line, resolution = "K562", "1Mb"
hic_path = "/media/jordan990301/Samsung_T5/HiC_Datasets/Rao_2014/K562/MAPQGE30/GSE63525_K562_combined_30.hic"
output_path = f"/home/jordan990301/PCA_Experiments/outputs/{cell_line}/{resolution}"
input_path = f"/home/jordan990301/PCA_Experiments/outputs/{cell_line}/{resolution}/origin"

# Column layout of the per-chromosome summary table. Each column gets its own
# empty list, so the resulting DataFrame has the headers but no rows yet.
chrom_info = {
    column: []
    for column in (
        "Cell_Line",
        "Resolution",
        "Chromosome",
        "Explained_Variance_PC1",
        "Explained_Variance_PC2",
        "Explained_Variance_PC3",
        "Sum_PC1_PC2",
        "Entries_PC1",
        "Difference_Count_PC1",
        "Cosine_Similarity_PC1",
    )
}
output_df = pd.DataFrame(data=chrom_info)

In [8]:
# For every chromosome: run scikit-learn PCA on the Pearson matrix read from
# disk, compare its first component with the PC1 file stored next to it, and
# collect one summary row per chromosome into output_df.
hic = hicstraw.HiCFile(hic_path)

# Every chromosome name reported by the .hic file except "All" and "MT".
chrom_list = [
    chrom.name
    for chrom in hic.getChromosomes()
    if chrom.name not in ("All", "MT")
]

# Rows are gathered in a list and converted to a DataFrame once after the
# loop. Rebuilding output_df here (instead of appending to the frame created
# in the setup cell) keeps this cell idempotent: running it twice no longer
# duplicates every chromosome in the table.
summary_rows = []

for chrom in chrom_list:
    ### Calculated from juicer_tools
    pearson_df = pd.read_table(f"{input_path}/origin_pearson_chrom{chrom}.txt", header=None, sep=" ")
    # The last parsed column is discarded unconditionally.
    # NOTE(review): presumably an empty column caused by a trailing separator on each line; confirm.
    pearson_df.pop(pearson_df.columns[-1])

    # Remove rows, then columns, that contain nothing but NaN.
    pearson_df = pearson_df.dropna(axis=0, how="all").reset_index(drop=True)
    pearson_df = pearson_df.dropna(axis=1, how="all")

    # Round-trip through a bare array so the column labels restart at 0.
    numpy_pearson_df = pearson_df.values
    pearson_df = pd.DataFrame(numpy_pearson_df)

    #### Calc PCA
    # Request as many components as the matrix has rows.
    pca = PCA(n_components=len(pearson_df[0]))
    pca.fit(pearson_df)

    My_PC1 = pca.components_[0]
    np.savetxt(f'{output_path}/My_PC1/My_PC1_chrom{chrom}.txt', My_PC1, delimiter='\n', fmt='%1.4f')

    # Reference PC1: first column of the stored file, with all-NaN rows removed.
    Juicer_PC1 = pd.read_table(f"{input_path}/origin_pc1_chrom{chrom}.txt", header=None, sep=" ")
    Juicer_PC1 = Juicer_PC1.dropna(axis=0, how="all").reset_index(drop=True)
    Juicer_PC1 = Juicer_PC1[0]
    Juicer_PC1 = Juicer_PC1.to_numpy()
    np.savetxt(f'{output_path}/origin_dropna/origin_dropna_PC1_chrom{chrom}.txt', Juicer_PC1, delimiter='\n', fmt='%1.4f')

    # Both vectors must have the same length for the comparisons below; report
    # a mismatch explicitly instead of relying on np.dot's alignment error.
    if My_PC1.shape != Juicer_PC1.shape:
        raise ValueError(
            f"PC1 length mismatch for chromosome {chrom}: "
            f"computed {My_PC1.shape[0]} entries, reference has {Juicer_PC1.shape[0]}"
        )

    cos_sim = dot(My_PC1, Juicer_PC1) / (norm(My_PC1) * norm(Juicer_PC1))

    My_PC1_Pos = My_PC1 > 0
    Juicer_PC1_Pos = Juicer_PC1 > 0

    # Number of positions where one vector is positive and the other is not.
    Difference_Count_PC1 = int(np.count_nonzero(My_PC1_Pos != Juicer_PC1_Pos))

    # A negative cosine similarity means the two vectors point in opposite
    # directions, so the count is complemented (entries - count) to measure
    # disagreement as if one of them had been negated.
    if cos_sim < 0:
        Difference_Count_PC1 = My_PC1_Pos.shape[0] - Difference_Count_PC1

    summary_rows.append([
        cell_line,
        resolution,
        chrom,
        pca.explained_variance_ratio_[0],
        pca.explained_variance_ratio_[1],
        pca.explained_variance_ratio_[2],
        pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1],
        My_PC1_Pos.shape[0],
        Difference_Count_PC1,
        cos_sim
    ])

# Column names come from the schema dict defined in the setup cell.
output_df = pd.DataFrame(summary_rows, columns=list(chrom_info))

display(output_df)

Unnamed: 0,Cell_Line,Resolution,Chromosome,Explained_Variance_PC1,Explained_Variance_PC2,Explained_Variance_PC3,Sum_PC1_PC2,Entries_PC1,Difference_Count_PC1,Cosine_Similarity_PC1
0,K562,1Mb,1,0.848549,0.083723,0.045826,0.932272,230,5,-0.981095
1,K562,1Mb,2,0.914594,0.047185,0.01836,0.961778,242,9,-0.979839
2,K562,1Mb,3,0.867816,0.051487,0.037102,0.919303,196,10,0.958543
3,K562,1Mb,4,0.834354,0.137115,0.011668,0.971469,190,5,0.96745
4,K562,1Mb,5,0.71722,0.238233,0.018224,0.955453,179,7,0.971778
5,K562,1Mb,6,0.906882,0.050163,0.023149,0.957045,170,6,0.963228
6,K562,1Mb,7,0.854632,0.065232,0.037839,0.919864,158,7,-0.963174
7,K562,1Mb,8,0.933024,0.028109,0.020493,0.961133,145,4,0.968995
8,K562,1Mb,9,0.851956,0.099758,0.034651,0.951714,125,4,0.965678
9,K562,1Mb,10,0.860191,0.050861,0.04138,0.911052,134,2,-0.997024


In [9]:
# Create the K562 workbook in write mode and store the current summary table
# on its '1Mb' sheet.
with pd.ExcelWriter(path='/home/jordan990301/PCA_Experiments/outputs/xlsx/K562.xlsx', mode='w') as workbook:
    output_df.to_excel(excel_writer=workbook, sheet_name='1Mb')

### K562 100Kb

In [10]:
import numpy as np
import pandas as pd
import hicstraw
from sklearn.decomposition import PCA
from numpy import dot
from numpy.linalg import norm

# Run configuration for this section: the cell line and bin size being
# analysed, the .hic file to open, and the directories read from / written to.
cell_line, resolution = "K562", "100Kb"
hic_path = "/media/jordan990301/Samsung_T5/HiC_Datasets/Rao_2014/K562/MAPQGE30/GSE63525_K562_combined_30.hic"
output_path = f"/home/jordan990301/PCA_Experiments/outputs/{cell_line}/{resolution}"
input_path = f"/home/jordan990301/PCA_Experiments/outputs/{cell_line}/{resolution}/origin"

# Column layout of the per-chromosome summary table. Each column gets its own
# empty list, so the resulting DataFrame has the headers but no rows yet.
chrom_info = {
    column: []
    for column in (
        "Cell_Line",
        "Resolution",
        "Chromosome",
        "Explained_Variance_PC1",
        "Explained_Variance_PC2",
        "Explained_Variance_PC3",
        "Sum_PC1_PC2",
        "Entries_PC1",
        "Difference_Count_PC1",
        "Cosine_Similarity_PC1",
    )
}
output_df = pd.DataFrame(data=chrom_info)

In [11]:
# For every chromosome: run scikit-learn PCA on the Pearson matrix read from
# disk, compare its first component with the PC1 file stored next to it, and
# collect one summary row per chromosome into output_df.
hic = hicstraw.HiCFile(hic_path)

# Every chromosome name reported by the .hic file except "All" and "MT".
chrom_list = [
    chrom.name
    for chrom in hic.getChromosomes()
    if chrom.name not in ("All", "MT")
]

# Rows are gathered in a list and converted to a DataFrame once after the
# loop. Rebuilding output_df here (instead of appending to the frame created
# in the setup cell) keeps this cell idempotent: running it twice no longer
# duplicates every chromosome in the table.
summary_rows = []

for chrom in chrom_list:
    ### Calculated from juicer_tools
    pearson_df = pd.read_table(f"{input_path}/origin_pearson_chrom{chrom}.txt", header=None, sep=" ")
    # The last parsed column is discarded unconditionally.
    # NOTE(review): presumably an empty column caused by a trailing separator on each line; confirm.
    pearson_df.pop(pearson_df.columns[-1])

    # Remove rows, then columns, that contain nothing but NaN.
    pearson_df = pearson_df.dropna(axis=0, how="all").reset_index(drop=True)
    pearson_df = pearson_df.dropna(axis=1, how="all")

    # Round-trip through a bare array so the column labels restart at 0.
    numpy_pearson_df = pearson_df.values
    pearson_df = pd.DataFrame(numpy_pearson_df)

    #### Calc PCA
    # Request as many components as the matrix has rows.
    pca = PCA(n_components=len(pearson_df[0]))
    pca.fit(pearson_df)

    My_PC1 = pca.components_[0]
    np.savetxt(f'{output_path}/My_PC1/My_PC1_chrom{chrom}.txt', My_PC1, delimiter='\n', fmt='%1.4f')

    # Reference PC1: first column of the stored file, with all-NaN rows removed.
    Juicer_PC1 = pd.read_table(f"{input_path}/origin_pc1_chrom{chrom}.txt", header=None, sep=" ")
    Juicer_PC1 = Juicer_PC1.dropna(axis=0, how="all").reset_index(drop=True)
    Juicer_PC1 = Juicer_PC1[0]
    Juicer_PC1 = Juicer_PC1.to_numpy()
    np.savetxt(f'{output_path}/origin_dropna/origin_dropna_PC1_chrom{chrom}.txt', Juicer_PC1, delimiter='\n', fmt='%1.4f')

    # Both vectors must have the same length for the comparisons below; report
    # a mismatch explicitly instead of relying on np.dot's alignment error.
    if My_PC1.shape != Juicer_PC1.shape:
        raise ValueError(
            f"PC1 length mismatch for chromosome {chrom}: "
            f"computed {My_PC1.shape[0]} entries, reference has {Juicer_PC1.shape[0]}"
        )

    cos_sim = dot(My_PC1, Juicer_PC1) / (norm(My_PC1) * norm(Juicer_PC1))

    My_PC1_Pos = My_PC1 > 0
    Juicer_PC1_Pos = Juicer_PC1 > 0

    # Number of positions where one vector is positive and the other is not.
    Difference_Count_PC1 = int(np.count_nonzero(My_PC1_Pos != Juicer_PC1_Pos))

    # A negative cosine similarity means the two vectors point in opposite
    # directions, so the count is complemented (entries - count) to measure
    # disagreement as if one of them had been negated.
    if cos_sim < 0:
        Difference_Count_PC1 = My_PC1_Pos.shape[0] - Difference_Count_PC1

    summary_rows.append([
        cell_line,
        resolution,
        chrom,
        pca.explained_variance_ratio_[0],
        pca.explained_variance_ratio_[1],
        pca.explained_variance_ratio_[2],
        pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1],
        My_PC1_Pos.shape[0],
        Difference_Count_PC1,
        cos_sim
    ])

# Column names come from the schema dict defined in the setup cell.
output_df = pd.DataFrame(summary_rows, columns=list(chrom_info))

display(output_df)

Unnamed: 0,Cell_Line,Resolution,Chromosome,Explained_Variance_PC1,Explained_Variance_PC2,Explained_Variance_PC3,Sum_PC1_PC2,Entries_PC1,Difference_Count_PC1,Cosine_Similarity_PC1
0,K562,100Kb,1,0.886976,0.063079,0.028069,0.950055,2275,73,0.980658
1,K562,100Kb,2,0.94172,0.02918,0.011362,0.9709,2390,93,0.974582
2,K562,100Kb,3,0.930261,0.021275,0.017743,0.951536,1951,122,-0.961927
3,K562,100Kb,4,0.919786,0.054033,0.004721,0.973819,1882,71,0.964648
4,K562,100Kb,5,0.8664,0.091571,0.012747,0.957971,1779,83,-0.969466
5,K562,100Kb,6,0.937282,0.028133,0.015044,0.965415,1679,68,-0.968246
6,K562,100Kb,7,0.900795,0.045865,0.017853,0.946659,1562,92,-0.961959
7,K562,100Kb,8,0.960044,0.015603,0.00713,0.975647,1433,62,0.970806
8,K562,100Kb,9,0.904163,0.057242,0.023575,0.961405,1204,63,-0.958274
9,K562,100Kb,10,0.920591,0.029948,0.020362,0.950539,1323,58,0.984162


In [None]:
# Add the '100Kb' sheet to the existing K562 workbook. mode='a' keeps the
# sheets already in the file; if_sheet_exists='replace' lets this cell be
# re-run without raising ValueError because the '100Kb' sheet is already there
# (the default policy is 'error').
with pd.ExcelWriter(
    '/home/jordan990301/PCA_Experiments/outputs/xlsx/K562.xlsx',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    output_df.to_excel(writer, sheet_name='100Kb')