In [14]:
# %%bash
# CELL_LINE="GM12878"
# RESOLUTION="25Kb"

# for CHROM in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 "X" "Y"
# do
# mkdir -p "/home/jordan990301/PCA_Experiments/outputs/logs/EV1-covD_absSumMax/$CELL_LINE/$RESOLUTION/chr$CHROM"
# mkdir -p "/home/jordan990301/PCA_Experiments/outputs/logs/EV1-covD_absSumMin/$CELL_LINE/$RESOLUTION/chr$CHROM"
# done

# mkdir -p "/home/jordan990301/PCA_Experiments/outputs/plots/EV1-covD_absSumMax/$CELL_LINE/$RESOLUTION/line"
# mkdir -p "/home/jordan990301/PCA_Experiments/outputs/plots/EV1-covD_absSumMax/$CELL_LINE/$RESOLUTION/scatter"
# mkdir -p "/home/jordan990301/PCA_Experiments/outputs/plots/EV1-covD_absSumMin/$CELL_LINE/$RESOLUTION/line"
# mkdir -p "/home/jordan990301/PCA_Experiments/outputs/plots/EV1-covD_absSumMin/$CELL_LINE/$RESOLUTION/scatter"

In [15]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Print floating-point arrays in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [16]:
# Dataset selection: the cell line and the bin resolution to analyse.
cell_line, resolution = "GM12878", "25Kb"

# Figure width; it is paired with a fixed height of 5 when the plots are configured.
figsize = 100

# Directory holding the per-chromosome input files for this cell line and resolution.
input_path = "/".join(
    ["/media/jordan990301/Samsung_T5/HiC_Datasets/juicer_outputs", cell_line, resolution]
)

In [17]:
# Chromosomes to process. Only the first three are enabled here; for a full run use
# [str(number) for number in range(1, 23)] + ['X', 'Y'].
chrom_list = [str(number) for number in range(1, 4)]

In [18]:
# Empty summary tables, one per selection type ("Max" / "Min" absolute row sum).
# Both share the same column layout; the second is an independent copy of the first.
outputMax_df = pd.DataFrame(
    columns=[
        'cellLine',
        'resolution',
        'chromosome',
        "cov_selected_type(absSum)",
        "binsNum",
        "correctNum",
        "correctRate",
    ]
)
outputMin_df = outputMax_df.copy()

In [19]:
# For every chromosome, compare the sign pattern of Eigenvector 1 (EV1) with the rows of
# the covariance matrix that have the largest ("Max") and the smallest ("Min") absolute
# row sum. Each comparison writes the per-bin agreement to patterns.txt, contributes one
# summary row, and saves a scatter plot and a line plot.
output_root = "/home/jordan990301/PCA_Experiments/outputs"
summary_columns = ['cellLine', 'resolution', 'chromosome', "cov_selected_type(absSum)", "binsNum", "correctNum", "correctRate"]

# Position in the descending absSum ranking -> label used in output paths and the summary.
selection_labels = {0: "Max", -1: "Min"}

# Summary rows are gathered in plain lists and converted to DataFrames once after the
# loop. This avoids growing a DataFrame row by row and makes re-running this cell
# produce the same tables instead of appending duplicate rows.
summary_rows = {label: [] for label in selection_labels.values()}

# Figure defaults must be set BEFORE the first figure is created: rcParams are only read
# at figure creation, so setting them afterwards leaves the figure at its old size.
plt.rcParams["figure.figsize"] = [figsize, 5]
plt.rcParams["figure.autolayout"] = True

for chrom in chrom_list:
    # Read in the Pearson correlation matrix. The last column is discarded
    # (presumably an empty column caused by a trailing separator -- TODO confirm),
    # then rows and columns that are entirely NaN are removed.
    pearson_df = pd.read_table(f"{input_path}/pearson_chr{chrom}.txt", header=None, sep=" ").iloc[:, :-1]
    pearson_df = (
        pearson_df
        .dropna(axis=0, how="all")
        .reset_index(drop=True)
        .dropna(axis=1, how="all")
    )
    pearson_np = pearson_df.values  # Turn into numpy format

    # Read in Eigenvector 1 and flatten it into a 1D vector.
    EV1_df = pd.read_table(f"{input_path}/pc1_chr{chrom}.txt", header=None, sep=" ")
    EV1_df = EV1_df.dropna(axis=0, how="all").reset_index(drop=True)
    EV1_np = EV1_df.values.flatten()

    # The comparison needs a non-empty square matrix with one row per EV1 entry.
    # Report the skip instead of silently dropping the chromosome.
    n = len(EV1_np)
    if n == 0 or pearson_np.shape != (n, n):
        print(f"chr{chrom}: skipped, Pearson matrix shape {pearson_np.shape} does not match {n} EV1 entries")
        continue

    # Following the SVD steps: with x = pearson_np, set y = x' / sqrt(n); y' y is then
    # used as the covariance matrix of pearson_np.
    y_np = np.transpose(pearson_np) / np.sqrt(n)
    cov_np = np.matmul(np.transpose(y_np), y_np)

    # Main idea: rank the rows of the covariance matrix by the sum of their absolute
    # values, largest first. A stable sort keeps tied rows in their original order.
    cov_absSum = np.abs(cov_np).sum(axis=1)
    ranking = np.argsort(-cov_absSum, kind="stable")

    # Quantities that depend only on EV1 are computed once per chromosome.
    EV1_pos_np = EV1_np > 0
    EV1_Colors = ['b' if i else 'r' for i in EV1_pos_np]
    EV1_np_Norm = (EV1_np - np.mean(EV1_np)) / np.std(EV1_np)
    plot_x_axis = np.arange(1, n + 1)
    x_ticks = np.arange(0, n, 50)

    for sorted_index, cov_selected_type in selection_labels.items():
        # Pick the row with the largest (index 0) or smallest (index -1) absSum.
        cov_selected_np = cov_np[ranking[sorted_index]]

        # Flip the sign if the selected row is negatively correlated with EV1, so that
        # both vectors are compared in the same orientation.
        if np.corrcoef(cov_selected_np, EV1_np)[0][1] < 0:
            cov_selected_np = -cov_selected_np

        # Per-bin agreement between the sign of EV1 and the sign of the selected row.
        cov_selected_pos_np = cov_selected_np > 0
        EV1_pos_VS_cov_selected_pos_np = EV1_pos_np == cov_selected_pos_np

        experiment = f"EV1-covD_absSum{cov_selected_type}"
        log_path = f"{output_root}/logs/{experiment}/{cell_line}/{resolution}"
        linePlot_path = f"{output_root}/plots/{experiment}/{cell_line}/{resolution}/line"
        scatterPlot_path = f"{output_root}/plots/{experiment}/{cell_line}/{resolution}/scatter"

        # Create the output directories on demand so this cell does not depend on a
        # separate mkdir step having been run beforehand.
        for directory in (f"{log_path}/chr{chrom}", linePlot_path, scatterPlot_path):
            os.makedirs(directory, exist_ok=True)

        # One "True"/"False" line per bin.
        with open(f"{log_path}/chr{chrom}/patterns.txt", "w") as f:
            f.writelines(str(i) + '\n' for i in EV1_pos_VS_cov_selected_pos_np)

        correctNum = int(np.count_nonzero(EV1_pos_VS_cov_selected_pos_np))
        correctRate = correctNum / n
        summary_rows[cov_selected_type].append(
            [cell_line, resolution, f"chr{chrom}", cov_selected_type, n, correctNum, correctRate]
        )

        # Visualization 1: sign of the selected row (+1 / -1) per bin, coloured by the
        # sign of EV1 (blue = positive, red = non-positive).
        cov_selected_Dots = np.where(cov_selected_pos_np, 1, -1)
        fig, ax = plt.subplots()
        ax.set_xticks(x_ticks)
        ax.set_title(f"entry: {n}, correctNum = {correctNum}, correctRate={correctRate}", fontsize=50, loc='left')
        ax.scatter(plot_x_axis, cov_selected_Dots, c=EV1_Colors)
        fig.savefig(f'{scatterPlot_path}/{cell_line}_chr{chrom}.png')
        plt.close(fig)  # release the figure; many are created inside this loop

        # Visualization 2: z-score normalised EV1 (red) against the z-score normalised
        # selected row (blue).
        cov_pearson_np_Selected_Norm = (cov_selected_np - np.mean(cov_selected_np)) / np.std(cov_selected_np)
        fig, ax = plt.subplots()
        ax.set_xticks(x_ticks)
        ax.set_title(f"binsNum: {n}, correctNum = {correctNum}, correctRate={correctRate}", fontsize=50, loc='left')
        ax.plot(EV1_np_Norm, c='r')
        ax.plot(cov_pearson_np_Selected_Norm, c='b')
        fig.savefig(f'{linePlot_path}/{cell_line}_chr{chrom}.png')
        plt.close(fig)

# Build the two summary tables in one step from the collected rows.
outputMax_df = pd.DataFrame(summary_rows["Max"], columns=summary_columns)
outputMin_df = pd.DataFrame(summary_rows["Min"], columns=summary_columns)

<Figure size 10000x500 with 0 Axes>

In [20]:
# Stack the "Max" rows above the "Min" rows and export the combined table to a single
# Excel sheet named after the cell line (any existing file is overwritten).
output_df = pd.concat((outputMax_df, outputMin_df), ignore_index=True)
excel_path = f"/home/jordan990301/PCA_Experiments/outputs/logs/{resolution}_summary.xlsx"

output_df.to_excel(excel_path, sheet_name=cell_line)