In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Display names used for the two data blocks in figure labels.
a_name = 'TCGA-BRCA'
b_name = 'CCLE'

# Single place to re-point the notebook at another checkout or another run:
# every input path below is derived from these three values.
data_dir = '/home/baprice/Projects/00014_geneJIVE/data'
run_name = 'gj_tcgabrca_ccle_275+150'
run_prefix = f'{data_dir}/gj_tcgabrca_ccle/{run_name}/{run_name}'

# Input matrices (the "prejive" files) for each block.
a_path = f'{data_dir}/prejive/tcgabrca_prejive.csv'
b_path = f'{data_dir}/prejive/ccle_nohaem_prejive.csv'
# Joint and individual matrices written out for each block by the run above.
a_joint_path = f'{run_prefix}_tcgabrca_joint.csv'
b_joint_path = f'{run_prefix}_ccle_joint.csv'
a_indiv_path = f'{run_prefix}_tcgabrca_individual.csv'
b_indiv_path = f'{run_prefix}_ccle_individual.csv'

# The first CSV column is used as the row index of every frame.
a = pd.read_csv(a_path, index_col=0)
b = pd.read_csv(b_path, index_col=0)
a_joint = pd.read_csv(a_joint_path, index_col=0)
b_joint = pd.read_csv(b_joint_path, index_col=0)
a_indiv = pd.read_csv(a_indiv_path, index_col=0)
b_indiv = pd.read_csv(b_indiv_path, index_col=0)

In [None]:
import pickle

# Saved decomposition object for the same run as the CSVs loaded above.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# only load pickles that were produced by a trusted run of this project.
ajive_pickle_path = '/home/baprice/Projects/00014_geneJIVE/data/gj_tcgabrca_ccle/gj_tcgabrca_ccle_275+150/gj_tcgabrca_ccle_275+150_ajive.p'
with open(ajive_pickle_path, 'rb') as p:
    ajive = pickle.load(p)

## Figure 2A: Variation Explained Plot

In [None]:
def getVarianceExplained(original, joint, individual, label):
    """Break the variation of ``original`` into joint / individual / residual shares.

    Each share is the squared ``numpy.linalg.norm`` of the component divided by
    the squared norm of ``original``; the residual is whatever is left of 1.

    Parameters
    ----------
    original, joint, individual : array-like
        The data matrix and its joint and individual components.
    label : hashable
        Name given to the single column of the result.

    Returns
    -------
    pandas.DataFrame
        One column named ``label`` with rows ``'Joint'``, ``'Individual'`` and
        ``'Residual'``, holding percentages rounded to two decimals.
    """
    from numpy.linalg import norm

    total_ss = norm(original)**2
    joint_share = norm(joint)**2/total_ss
    indiv_share = norm(individual)**2/total_ss
    resid_share = 1-joint_share-indiv_share

    percentages = [
        np.round(joint_share*100, 2),
        np.round(indiv_share*100, 2),
        np.round(resid_share*100, 2),
    ]
    return pd.DataFrame({label: percentages}, index=['Joint', 'Individual', 'Residual'])

def plotVarianceExplained(df, figsize=(12, 8)):
    """Draw a stacked bar chart of variation shares, labelling every segment.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per bar and one column per stacked segment; the values are
        written onto the segments followed by a ``%`` sign.
    figsize : sequence of two numbers, optional
        Figure size forwarded to ``DataFrame.plot.bar``.

    Notes
    -----
    Changes the global seaborn theme (``context='talk'``, ``style='ticks'``)
    and shows the figure; nothing is returned.
    """
    import matplotlib.patheffects as PathEffects

    # sns.set already applies the 'ticks' style, so no separate set_style call.
    sns.set(context='talk', style='ticks')
    ax = df.plot.bar(stacked=True, figsize=figsize, table=False, rot='horizontal')

    # Reverse the legend so its order matches the bottom-to-top stacking.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(reversed(handles), reversed(labels))

    for rect in ax.patches:
        # Geometry of this bar segment.
        height = rect.get_height()
        width = rect.get_width()
        x = rect.get_x()
        y = rect.get_y()

        # The height of the segment is the data value and is used as the label.
        label_text = f'{height:.2f}'
        label_x = x + width - 0.16  # right-aligned, inset from the right edge
        label_y = y + height / 2    # vertically centred in the segment
        txt = ax.text(label_x, label_y, label_text + '%', ha='right', va='center',
                      fontsize=16, color='white', fontweight='black')
        # Thin dark outline keeps the white text readable on light segments.
        txt.set_path_effects([PathEffects.withStroke(linewidth=0.5, foreground='black')])

    ax.set_ylabel("% Variation Explained", fontsize=18)

    # The layout has to be adjusted BEFORE the figure is shown; calling
    # tight_layout() after show() no longer affects the rendered figure.
    plt.tight_layout()
    plt.show()
       
# Variation breakdown of each dataset; after the transpose every dataset is a
# row, i.e. one stacked bar in the plot.
a_var = getVarianceExplained(a, a_joint, a_indiv, a_name)
b_var = getVarianceExplained(b, b_joint, b_indiv, b_name)
plt_df = a_var.join(b_var).T
plotVarianceExplained(plt_df)

## Figure 2B: Joint-to-Individual Variation Ratio by Cell Line Tissue

In [None]:
# --- Block 'A' (TCGA-BRCA) ---------------------------------------------------
# Loadings scaled by their singular values, for the joint and individual parts.
tcgabrca_j_unnorm = ajive.blocks['A'].joint.svals()*ajive.blocks['A'].joint.loadings()
tcgabrca_i_unnorm = ajive.blocks['A'].individual.svals()*ajive.blocks['A'].individual.loadings()

# Per-sample proportions for block 'A'. These used to be stored in
# samp_joint_prop / samp_indiv_prop / prop_df and were then silently overwritten
# by the block 'B' values below; they now live under their own names.
a_sample_ss = np.sum(a.T**2, axis=0)
tcga_joint_prop = np.sum(tcgabrca_j_unnorm**2, axis=1)/a_sample_ss
tcga_indiv_prop = np.sum(tcgabrca_i_unnorm**2, axis=1)/a_sample_ss
tcga_prop_df = pd.DataFrame([tcga_joint_prop, tcga_indiv_prop], index=['Joint Proportion','Individual Proportion']).T

# --- Block 'B' (CCLE) --------------------------------------------------------
ccle_j_unnorm = ajive.blocks['B'].joint.svals()*ajive.blocks['B'].joint.loadings()
ccle_i_unnorm = ajive.blocks['B'].individual.svals()*ajive.blocks['B'].individual.loadings()

b_sample_ss = np.sum(b.T**2, axis=0)
samp_joint_prop = np.sum(ccle_j_unnorm**2, axis=1)/b_sample_ss
samp_indiv_prop = np.sum(ccle_i_unnorm**2, axis=1)/b_sample_ss

# Remove cell lines whose label occurs more than once. Every copy is removed
# (not just the repeats), which is what the original label-based drop did.
samp_joint_prop = samp_joint_prop[~samp_joint_prop.index.duplicated(keep=False)]
samp_indiv_prop = samp_indiv_prop[~samp_indiv_prop.index.duplicated(keep=False)]

prop_df = pd.DataFrame([samp_joint_prop, samp_indiv_prop], index=['Joint Proportion','Individual Proportion']).T
# Tissue = everything after the first underscore of the row label.
prop_df['TISSUE'] = prop_df.index.map(lambda i: '_'.join(i.split('_')[1:]))

prop_df['%Joint'] = prop_df['Joint Proportion']*100
prop_df['%Individual'] = prop_df['Individual Proportion']*100

# Vectorised division instead of a row-wise apply.
prop_df['Ratio'] = prop_df['Joint Proportion']/prop_df['Individual Proportion']

# Order tissues by their mean ratio. Only the numeric 'Ratio' column is
# averaged: taking the mean of the whole frame (which includes string columns)
# raises a TypeError on pandas >= 2.0.
tissue_sort = prop_df.groupby('TISSUE')['Ratio'].mean().sort_values().index.tolist()

plt.figure(figsize=[12,8], facecolor='w')
sns.boxplot(data=prop_df, x='TISSUE', y='Ratio', order=tissue_sort)
plt.xticks(size=12, rotation=55, ha='right', rotation_mode="anchor");
plt.title('Cell Line Tissue Population Best Represented by TCGA-BRCA')
plt.ylabel(r'$\frac{Joint\ Proportion}{Individual\ Proportion}$')


# Joint Statistic

In [None]:
def distance(x1, y1, a=-1, b=1, c=0):
    """Distance from the point (x1, y1) to the line a*x + b*y + c = 0.

    With the default coefficients the line is y = x.
    """
    offset = abs((a * x1 + b * y1 + c))
    normal_length = np.sqrt(a * a + b * b)
    return offset / normal_length

def joint_statistics(joint, individual, fudge=0.01):
    """Per-row difference between floored joint and individual variances.

    Parameters
    ----------
    joint, individual : pandas.DataFrame
        Matrices whose rows are compared; the variance is taken along
        ``axis=1`` with ``numpy.var``.
    fudge : float, optional
        Lower bound applied to each variance before subtracting, so rows with
        (near-)zero variance in either matrix cannot dominate. Defaults to the
        previously hard-coded 0.01.

    Returns
    -------
    pandas.Series
        ``clip(var(joint)) - clip(var(individual))``, one value per row.
    """
    joint_var = np.var(joint, axis=1).clip(lower=fudge)
    indiv_var = np.var(individual, axis=1).clip(lower=fudge)
    return joint_var - indiv_var

def permute_expected_statistics(joint, individual, n_perm=20, random_state=None):
    """Null distribution of sorted joint statistics from column permutations.

    The columns of ``joint`` and ``individual`` are pooled (suffixes ``_J`` and
    ``_I`` disambiguate shared names), shuffled, and split in half into a
    pseudo-joint and a pseudo-individual matrix. ``joint_statistics`` is
    evaluated on each split and its values are sorted in descending order.

    Parameters
    ----------
    joint, individual : pandas.DataFrame
        Matrices joined on their index.
    n_perm : int, optional
        Number of permutations.
    random_state : int, numpy SeedSequence/Generator or None, optional
        Seed for a reproducible set of permutations. ``None`` (default) keeps
        the previous behaviour of drawing from numpy's global random state.

    Returns
    -------
    pandas.DataFrame
        One row per permutation; columns are ranks (largest statistic first).
    """
    joined = joint.join(individual, lsuffix='_J', rsuffix='_I')
    columns = joined.columns.tolist()
    half = len(columns)//2

    if random_state is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation

    perms = []
    for _ in range(n_perm):
        col_perm = permute(columns)
        j_perm = joined[col_perm[:half]]
        i_perm = joined[col_perm[half:]]
        # Sorting discards the row identity: only the ranked values are kept.
        exp_stat = np.sort(joint_statistics(j_perm, i_perm))[::-1]
        perms.append(exp_stat)

    return pd.DataFrame(perms)

def permute_expected_statistics2(joint, individual, n_perm=20, random_state=None):
    """Null joint statistics from column permutations, kept per row.

    The columns of ``joint`` and ``individual`` are pooled (suffixes ``_J`` and
    ``_I`` disambiguate shared names), shuffled, and split in half into a
    pseudo-joint and a pseudo-individual matrix. ``joint_statistics`` is
    evaluated on each split; unlike ``permute_expected_statistics`` the values
    are not sorted, so they stay attached to their row.

    Parameters
    ----------
    joint, individual : pandas.DataFrame
        Matrices joined on their index.
    n_perm : int, optional
        Number of permutations.
    random_state : int, numpy SeedSequence/Generator or None, optional
        Seed for a reproducible set of permutations. ``None`` (default) keeps
        the previous behaviour of drawing from numpy's global random state.

    Returns
    -------
    pandas.DataFrame
        One row per input row and one column per permutation.
    """
    joined = joint.join(individual, lsuffix='_J', rsuffix='_I')
    columns = joined.columns.tolist()
    half = len(columns)//2

    if random_state is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation

    perms = []
    for _ in range(n_perm):
        col_perm = permute(columns)
        j_perm = joined[col_perm[:half]]
        i_perm = joined[col_perm[half:]]
        perms.append(joint_statistics(j_perm, i_perm))

    return pd.DataFrame(perms).T

def qvalueTable(observed, permutations, thresholds=2000, decimals=4):
    """Tabulate observed vs. permuted exceedance counts over a threshold grid.

    Parameters
    ----------
    observed : pandas.Series
        Observed statistics.
    permutations : pandas.DataFrame
        Statistics obtained under permutation.
    thresholds : int, optional
        Number of evenly spaced thresholds between 0 and the largest absolute
        observed statistic.
    decimals : int, optional
        Number of decimals the thresholds are rounded to.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``'threshold'``, ``'observed'``
        (observed values whose absolute value exceeds the threshold),
        ``'expected'`` (same count over ALL permuted values, pooled rather than
        averaged per permutation) and ``'fdr'`` =
        expected / (observed + expected). ``fdr`` is NaN where both counts
        are zero.
    """
    max_abs = np.max([np.abs(observed.min()), np.abs(observed.max())])
    grid = np.round(np.linspace(0, max_abs, num=thresholds), decimals=decimals)

    # Loop-invariant: absolute values are computed once, not per threshold.
    abs_observed = np.abs(observed)
    abs_permuted = np.abs(permutations).to_numpy()

    obs_sigs = []
    perm_sigs = []
    fdrs = []
    qs = []
    for q in grid:
        # Uses the `observed` argument; the earlier code referenced an
        # undefined name `obs` here.
        obs_sig = (abs_observed > q).sum()
        perm_sig = (abs_permuted > q).sum()
        total = obs_sig + perm_sig
        # Nothing exceeds the threshold: report NaN without a 0/0 warning.
        fdr = perm_sig/total if total else np.nan
        obs_sigs.append(obs_sig)
        perm_sigs.append(perm_sig)
        fdrs.append(fdr)
        qs.append(q)

    return_df = pd.DataFrame([qs, obs_sigs, perm_sigs, fdrs], index=['threshold','observed','expected','fdr']).T

    return return_df

def jstat_maplot(joint, individual):
    """Scatter the joint statistic (y) against the average statistic (x).

    The y values come from ``joint_statistics`` and the x values from
    ``average_j_stat``, both evaluated on ``joint`` and ``individual``. The
    y-range is made symmetric around zero and a horizontal line marks y = 0.
    Switches the seaborn context to ``'notebook'`` and opens a new figure.

    NOTE(review): the y-axis label reads log(sigma_J / sigma_I); confirm it
    matches what ``joint_statistics`` actually returns.
    """
    diff_stat = joint_statistics(joint, individual)
    mean_stat = average_j_stat(joint, individual)

    sns.set_context('notebook')
    plt.figure(figsize=[18,10])
    scatter = sns.scatterplot(y=diff_stat, x=mean_stat, alpha=0.35)

    # Use the larger of the two automatic limits on both sides of zero.
    y_low, y_high = scatter.axes.get_ylim()
    y_bound = max(abs(y_low), abs(y_high))
    plt.ylim([-y_bound, y_bound])

    plt.ylabel(r'$\log{\frac{\sigma_J}{\sigma_I}}$', size=30, rotation=0, labelpad=50)
    plt.xlabel(r'$\frac{1}{2}\log({\sigma_J\sigma_I})$', size=30, rotation=0)
    plt.axhline(0, color='k', zorder=1)
    
def average_j_stat(joint, individual):
    """Half the log of the product of the row-wise standard deviations.

    Standard deviations are taken along ``axis=1`` of ``joint`` and
    ``individual`` with ``numpy.std``; the result has one value per row.
    """
    joint_sd = np.std(joint, axis=1)
    indiv_sd = np.std(individual, axis=1)
    return 0.5 * np.log(joint_sd*indiv_sd)
