#  jDR Integration using AJIVE Template

### Example of using AJIVE to integrate TCGA-BRCA with CCLE across genes

In [1]:
import os
from pathlib import Path
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from jive.AJIVE import AJIVE
from jive.PCA import PCA
import warnings
import time, datetime
import pickle

## Data Paths and Settings

Data preparation:
- Genes in columns
- Genes with no expression in either matrix removed
- Genes not present in both matrices removed
- Columns (genes) matched across matrices
- Expression values upper quantile normalized and log-transformed 
- Genes were mean-centered
- Samples were mean-centered

Initial Signal Rank Selection:
- Use the bootstrap approach described in the paper, or determine the initial signal ranks using scree plots

In [2]:
#Input data locations: block A is read from a_path, block B from b_path
a_path, b_path = (
    'https://webshare.bioinf.unc.edu/public/baprice/AJIVE_jDR_Integration/TCGABRCAxCCLE/tcgabrca_pre-jdri.csv',
    'https://webshare.bioinf.unc.edu/public/baprice/AJIVE_jDR_Integration/TCGABRCAxCCLE/ccle_pre-jdri.csv',
)

#Initial signal ranks passed to AJIVE for block A (ra) and block B (rb)
ra, rb = 135, 35

#Labels used to build output file names, and the directory they are written to
a_name, b_name = 'tcgabrca', 'ccle'
name_prefix = 'jdri_tcgabrca-ccle'
output_dir = './output/'

## Read in input data

In [3]:
#Load both input matrices, using the first CSV column as the row index
a, b = (pd.read_csv(csv_path, index_col=0) for csv_path in (a_path, b_path))

In [4]:
#Preview the first rows of matrix A (loaded from a_path) as a sanity check
a.head()

Unnamed: 0,DDX11L1,WASH7P,RP11-34P13.3,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,RP11-34P13.8,CICP27,...,MT-CO2,MT-ATP8,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB
TCGA-BH-A0HY-01A-11R,-0.639297,-0.949359,-0.123983,-0.077398,-0.017432,-0.007055,-0.007908,0.00387,-0.16126,-0.098728,...,0.562768,0.69129,0.543548,0.205025,0.728795,0.72917,0.535604,0.04194,-0.248343,0.701796
TCGA-A2-A0YF-01A-21R,-0.616668,-0.116746,-0.123983,-0.077398,-0.017432,-0.007055,-0.007908,0.703331,0.950287,-0.419425,...,-0.198678,-0.231757,-0.119248,0.194086,-0.729839,0.155718,0.185972,-0.683557,-1.151444,-0.035726
TCGA-A2-A1G1-01A-21R,0.737667,-0.489838,-0.123983,-0.077398,-0.017432,-0.007055,-0.007908,-0.545296,-1.798553,-1.118488,...,-0.142097,0.157175,-0.091664,-0.213526,-0.506023,-0.021578,0.023225,0.230931,0.235601,-0.216279
TCGA-LL-A5YL-01A-12R,-0.639297,1.345017,0.127726,-0.077398,-0.017432,-0.007055,-0.007908,1.693426,0.783717,0.152178,...,-0.411455,-3.09391,-1.092732,-0.637256,-0.050363,-2.964207,-1.331103,-1.799112,-0.931833,-1.293489
TCGA-E2-A105-01A-11R,2.655376,0.470626,-0.123983,-0.077398,-0.017432,-0.007055,-0.007908,-0.297281,0.251407,0.571214,...,0.595887,0.349906,0.174117,0.376955,0.399411,0.761726,0.577637,-0.018062,-0.389095,0.283378


In [5]:
#Preview the first rows of matrix B (loaded from b_path) as a sanity check
b.head()

Unnamed: 0_level_0,DDX11L1,WASH7P,RP11-34P13.3,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,RP11-34P13.8,CICP27,...,MT-CO2,MT-ATP8,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB
CCLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22RV1_PROSTATE,0.575571,0.14913,-0.327859,-0.334919,-0.069974,-0.022882,-0.025035,0.847763,0.183748,0.907296,...,-0.282142,-2.083604,-0.273867,-0.126954,-0.857323,-0.92539,-0.256161,-0.024146,-0.567404,0.167963
2313287_STOMACH,-0.775893,-0.293381,-0.327859,-0.334919,-0.069974,-0.022882,-0.025035,1.068873,1.087933,-1.127,...,-0.280673,-2.52633,-0.243032,-0.276581,-1.041216,-1.320903,-0.198307,-0.346256,-0.601891,-0.160467
253J_URINARY_TRACT,-0.775893,0.431428,0.706149,2.496974,0.77823,0.488577,-0.025035,-0.577209,-0.334034,-0.559183,...,1.079786,0.873433,0.870774,0.692736,1.171786,0.80621,0.822191,0.629264,0.678264,0.349247
253JBV_URINARY_TRACT,0.24381,0.506233,0.647678,3.039768,0.980002,-0.022882,0.456894,-0.510602,0.047202,-0.367859,...,-0.076084,0.019612,-0.109078,-0.110159,0.175195,-0.13025,-0.147294,-0.051593,-0.145113,-0.600159
42MGBA_CENTRAL_NERVOUS_SYSTEM,-0.775893,0.511025,0.245783,-0.334919,-0.069974,-0.022882,-0.025035,-0.362404,0.20612,-0.676622,...,-0.334194,-0.082449,-0.200066,-0.763888,-0.732326,-0.485535,-0.535861,-0.463347,-0.220031,-0.322003


## Run AJIVE

#### AJIVE integrates across rows.

An *AssertionError* indicates that the two input matrices have a mismatched number of rows

Runtime depends on size of input matrices and initial signal ranks

TCGA-BRCA x CCLE ~ 3hrs

In [6]:
#Transpose matrices
#NOTE: a and b are rebound to their transposes, so re-running this cell
#without re-reading the input data flips them back to the original orientation
a = a.T
b = b.T

#Create output directory if it doesn't exist
#(exist_ok avoids the check-then-create race of a separate os.path.exists test)
os.makedirs(output_dir, exist_ok=True)

#Run AJIVE and report the wall-clock time spent constructing and fitting it
jive_start = time.time()
ajive = AJIVE(init_signal_ranks={'A': ra, 'B': rb})
ajive.fit(blocks={'A': a, 'B': b})
jive_end = time.time()
jive_time = str(datetime.timedelta(seconds=jive_end-jive_start))
print('AJIVE time: ' + jive_time)

#Save AJIVE Object
#(the context manager closes the file even if pickling raises)
with open(os.path.join(output_dir, name_prefix + '_ajive.p'), 'wb') as pickle_file:
    pickle.dump(ajive, pickle_file)

#Convert Reprojection Matrices to DataFrames
#Each matrix is labelled with the row/column labels of its transposed input block
a_joint = pd.DataFrame(ajive.blocks['A'].joint.full_, index=a.index, columns=a.columns)
a_individual = pd.DataFrame(ajive.blocks['A'].individual.full_, index=a.index, columns=a.columns)
b_joint = pd.DataFrame(ajive.blocks['B'].joint.full_, index=b.index, columns=b.columns)
b_individual = pd.DataFrame(ajive.blocks['B'].individual.full_, index=b.index, columns=b.columns)
a_noise = pd.DataFrame(ajive.blocks['A'].noise_, index=a.index, columns=a.columns)
b_noise = pd.DataFrame(ajive.blocks['B'].noise_, index=b.index, columns=b.columns)

#Save DataFrames
#Files are named <name_prefix>_<block name>-<component>.csv inside output_dir
for block_label, component, frame in (
    (a_name, 'noise', a_noise),
    (b_name, 'noise', b_noise),
    (a_name, 'joint', a_joint),
    (b_name, 'joint', b_joint),
    (a_name, 'individual', a_individual),
    (b_name, 'individual', b_individual),
):
    frame.to_csv(os.path.join(output_dir, name_prefix + '_' + block_label + '-' + component + '.csv'))


AJIVE time: 0:51:05.817891
