In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
# Switch matplotlib to seaborn's default styling for every figure drawn below.
sns.set()

In [2]:
def read_np_file(pwd, filename):
    """Load the target-label matrix stored in a NumPy ``.npz`` archive.

    Parameters
    ----------
    pwd : str
        Directory prefix. It is concatenated with ``filename`` as-is, so it
        must already end with a path separator.
    filename : str
        Name of the ``.npz`` archive inside ``pwd``.

    Returns
    -------
    pandas.DataFrame
        The array stored under the ``'y_t_np'`` key, wrapped in a DataFrame
        with default integer row and column labels.

    Raises
    ------
    KeyError
        If the archive has no ``'y_t_np'`` entry.
    """
    # NOTE(security): allow_pickle=True lets the archive execute arbitrary code
    # while it is being unpickled -- only load files from a trusted source.
    # The context manager closes the underlying zip handle deterministically
    # instead of leaving it to garbage collection.
    with np.load(pwd + filename, allow_pickle=True) as data:
        y_target_label = pd.DataFrame(data['y_t_np'])
    return y_target_label

In [3]:
def cramers_corrected_stat(confusion_matrix, correction=True):
    """Bias-corrected Cramer's V for categorical-categorical association.

    Implements the correction from Wicher Bergsma, "A bias-correction for
    Cramer's V and Tschuprow's T", Journal of the Korean Statistical
    Society 42 (2013): 323-328.

    Parameters
    ----------
    confusion_matrix : pandas.DataFrame or 2-D array-like
        Contingency table of observed counts (e.g. from ``pd.crosstab``).
    correction : bool, default True
        Forwarded to ``scipy.stats.chi2_contingency``. When True (the
        historical behaviour of this function) SciPy applies Yates'
        continuity correction to 2x2 tables, which shrinks the statistic so
        that perfectly associated binary variables score below 1. Pass
        False to use the plain Pearson chi-square.

    Returns
    -------
    float
        The corrected statistic. 0.0 is returned when the table is
        degenerate (a single row or column, or so few observations that the
        corrected dimensions collapse), where the statistic is undefined and
        the previous implementation produced NaN.
    """
    import scipy.stats as ss

    observed = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(observed, correction=correction)[0]
    n = observed.sum()
    r, k = observed.shape

    # With at most one observation every (n - 1) denominator below is zero.
    if n <= 1:
        return 0.0

    phi2 = chi2 / n
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # A constant variable (r == 1 or k == 1) or n == r / n == k drives the
    # corrected dimension minus one to zero; report "no association" instead
    # of dividing by zero.
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return 0.0
    return float(np.sqrt(phi2corr / denominator))

In [4]:
def generate_correlation_matrix(y_data, label_names):
    """Build the pairwise association matrix between the columns of ``y_data``.

    Each off-diagonal cell holds ``cramers_corrected_stat`` evaluated on the
    contingency table (``pd.crosstab``) of the two corresponding columns; the
    diagonal is set to 1.

    Parameters
    ----------
    y_data : pandas.DataFrame
        One column per label, one row per sample.
    label_names : sequence
        Labels used for both the index and the columns of the result; its
        length must equal the number of columns of ``y_data``.

    Returns
    -------
    pandas.DataFrame
        Square float matrix labelled by ``label_names`` on both axes.
    """
    n_labels = y_data.shape[1]
    correlation_matrix = np.zeros(shape=(n_labels, n_labels))

    # The statistic depends on the table only through chi-square, the sample
    # count and the pair of table dimensions, so it is symmetric in the two
    # variables: evaluate each unordered pair once and mirror the value.
    for i in range(n_labels):
        for j in range(i + 1, n_labels):
            confusion_matrix = pd.crosstab(y_data.iloc[:, i], y_data.iloc[:, j])
            association = cramers_corrected_stat(confusion_matrix)
            correlation_matrix[i, j] = association
            correlation_matrix[j, i] = association
    np.fill_diagonal(correlation_matrix, 1)

    return pd.DataFrame(correlation_matrix, index=label_names, columns=label_names)

In [5]:
def draw_correlation_figure(correlation_matrix, save_fig_pwd = False, save_fig_filename = False):
    """Render ``correlation_matrix`` as an annotated heatmap and optionally save it.

    A new 30x30 inch matplotlib figure is opened and the matrix is drawn with
    seaborn using the "Blues" colour map, cell annotations, 0.5-wide cell
    borders and a colour scale capped at 1. The figure is left open.

    Parameters
    ----------
    correlation_matrix : array-like or pandas.DataFrame
        Values to plot.
    save_fig_pwd, save_fig_filename : str or False
        Directory prefix and file name. Only when both are truthy is the
        concatenated path printed and the figure written there at 600 dpi.

    Returns
    -------
    None
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(30, 30))
    heatmap_options = dict(linewidths=0.5, annot=True, vmax=1, cmap="Blues")
    sns.heatmap(correlation_matrix, **heatmap_options)

    # Nothing to write unless both parts of the destination were supplied.
    if not (save_fig_pwd and save_fig_filename):
        return

    target_path = save_fig_pwd + save_fig_filename
    print(target_path)
    plt.savefig(target_path, dpi=600)

In [6]:
def generate_label_coappearence_matrix(y_data, label_names):
    """Count, for every pair of labels, the samples in which both equal 1.

    Cell ``(i, j)`` of the result is the number of rows of ``y_data`` whose
    values in column ``i`` and column ``j`` are both exactly 1; the diagonal
    therefore holds the number of rows where each single label equals 1.

    Parameters
    ----------
    y_data : pandas.DataFrame
        One column per label, one row per sample.
    label_names : sequence
        Labels used for both the index and the columns of the result; its
        length must equal the number of columns of ``y_data``.

    Returns
    -------
    pandas.DataFrame
        Square integer matrix labelled by ``label_names`` on both axes.
    """
    # 1 where the entry is exactly 1, 0 otherwise (other values never count).
    is_positive = (y_data.values == 1).astype(int)

    # (labels x samples) @ (samples x labels): each product term is 1 only
    # when both labels are 1 in the same sample, so the sum is the pair count.
    label_pairs = is_positive.T @ is_positive

    return pd.DataFrame(label_pairs, index=label_names, columns=label_names)

In [7]:
def all_process_to_correlation(pwd, filename, save_fig_pwd, save_fig_filename, label_names):
    """Run the full pipeline: load labels, compute both matrices, draw the heatmap.

    Parameters
    ----------
    pwd, filename : str
        Directory prefix and file name handed to ``read_np_file``.
    save_fig_pwd, save_fig_filename : str or False
        Destination handed to ``draw_correlation_figure``.
    label_names : sequence
        Labels applied to the rows and columns of both result matrices.

    Returns
    -------
    tuple
        ``(correlation_matrix, label_pairs)`` as produced by
        ``generate_correlation_matrix`` and
        ``generate_label_coappearence_matrix``.
    """
    targets = read_np_file(pwd, filename)

    association = generate_correlation_matrix(targets, label_names)
    cooccurrence = generate_label_coappearence_matrix(targets, label_names)

    # Only the association matrix is plotted; the counts are just returned.
    draw_correlation_figure(association, save_fig_pwd, save_fig_filename)

    return association, cooccurrence

In [8]:
# Input archive holding the target-label matrix.
pwd = '/home/chujunyi/2_Program/2_output_file/2_multilabel/1_X_Y_data/'
filename = 'v2_NR_U_T_xy_np_rdkit_morgan_radius2.npz'

# Labels applied to the rows and columns of both result matrices,
# one per column of the loaded label matrix.
nr_label_names = [
    'hsa10062', 'hsa190', 'hsa2099', 'hsa2100', 'hsa2101',
    'hsa2103', 'hsa2104', 'hsa2908', 'hsa3172', 'hsa3174',
    'hsa367', 'hsa4306', 'hsa5241', 'hsa5465', 'hsa5467',
    'hsa5468', 'hsa5914', 'hsa5915', 'hsa5916', 'hsa6095',
    'hsa6096', 'hsa6097', 'hsa6256', 'hsa6257', 'hsa6258',
    'hsa7067', 'hsa7068', 'hsa7376', 'hsa7421', 'hsa8013',
    'hsa8856', 'hsa9970', 'hsa9971',
]

# Destination of the rendered heatmap.
save_fig_pwd = '/home/chujunyi/2_Program/2_output_file/2_multilabel/'
save_fig_filename = 'v2_NR_Updated_target_label_cram_v_correlation.pdf'

correlation_matrix, label_pairs = all_process_to_correlation(
    pwd=pwd,
    filename=filename,
    save_fig_pwd=save_fig_pwd,
    save_fig_filename=save_fig_filename,
    label_names=nr_label_names,
)

/home/chujunyi/2_Program/2_output_file/2_multilabel/v2_NR_Updated_target_label_cram_v_correlation.pdf


In [9]:
# Persist the pairwise co-appearance counts.
label_pairs.to_csv(
    path_or_buf='/home/chujunyi/2_Program/2_output_file/2_multilabel/NR_Updated_target_label__label_pair_intersection.csv'
)

# Persist the association matrix -- this is the table that goes into the paper.
correlation_matrix.to_csv(
    path_or_buf='/home/chujunyi/2_Program/2_output_file/2_multilabel/NR_Updated_target_label__label_correlation.csv'
)

In [10]:
### Chord diagram