# Gene-wise Cohort Integration using AJIVE Template

In [12]:
import os
from pathlib import Path
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from jive.AJIVE import AJIVE
from jive.PCA import PCA
import warnings
import time, datetime
import pickle

## Data Paths and Settings

Data preparation:
- Genes in columns
- Genes with no expression in either matrix removed
- Genes not present in both matrices removed
- Columns (genes) matched across matrices
- Expression values upper quantile normalized and log-transformed 
- Genes were mean-centered
- Samples were mean-centered

Initial Signal Rank Selection:
- Use bootstrap approach described in paper or determine using scree plots

In [17]:
#Locations of the two input matrices (cohort A and cohort B)
a_path = 'https://webshare.bioinf.unc.edu/public/baprice/Genewise_Cohort_Integration/TCGABRCAxCCLE/tcgabrca_pre-gwci.csv'
b_path = 'https://webshare.bioinf.unc.edu/public/baprice/Genewise_Cohort_Integration/TCGABRCAxCCLE/ccle_pre-gwci.csv'

#Initial signal ranks for matrix A and matrix B, respectively
ra, rb = 135, 35

#Short cohort labels and common prefix used when naming output files
a_name, b_name = 'tcgabrca', 'ccle'
name_prefix = 'gwci_tcgabrca-ccle'

#Folder that receives every output file
output_dir = './output/'

## Read in input data

In [18]:
#Load matrix A, then matrix B, using the first CSV column as the row index
a, b = (pd.read_csv(csv_path, index_col=0) for csv_path in (a_path, b_path))

In [30]:
#Preview the first five rows of matrix A
a.head(n=5)

Unnamed: 0,DDX11L1,WASH7P,RP11-34P13.3,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,RP11-34P13.8,CICP27,...,MT-CO2,MT-ATP8,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB
TCGA-BH-A0HY-01A-11R,-0.639297,-0.949359,-0.123983,-0.077398,-0.017432,-0.007055,-0.007908,0.00387,-0.16126,-0.098728,...,0.562768,0.69129,0.543548,0.205025,0.728795,0.72917,0.535604,0.04194,-0.248343,0.701796
TCGA-A2-A0YF-01A-21R,-0.616668,-0.116746,-0.123983,-0.077398,-0.017432,-0.007055,-0.007908,0.703331,0.950287,-0.419425,...,-0.198678,-0.231757,-0.119248,0.194086,-0.729839,0.155718,0.185972,-0.683557,-1.151444,-0.035726
TCGA-A2-A1G1-01A-21R,0.737667,-0.489838,-0.123983,-0.077398,-0.017432,-0.007055,-0.007908,-0.545296,-1.798553,-1.118488,...,-0.142097,0.157175,-0.091664,-0.213526,-0.506023,-0.021578,0.023225,0.230931,0.235601,-0.216279
TCGA-LL-A5YL-01A-12R,-0.639297,1.345017,0.127726,-0.077398,-0.017432,-0.007055,-0.007908,1.693426,0.783717,0.152178,...,-0.411455,-3.09391,-1.092732,-0.637256,-0.050363,-2.964207,-1.331103,-1.799112,-0.931833,-1.293489
TCGA-E2-A105-01A-11R,2.655376,0.470626,-0.123983,-0.077398,-0.017432,-0.007055,-0.007908,-0.297281,0.251407,0.571214,...,0.595887,0.349906,0.174117,0.376955,0.399411,0.761726,0.577637,-0.018062,-0.389095,0.283378


In [31]:
#Preview the first five rows of matrix B
b.head(n=5)

Unnamed: 0_level_0,DDX11L1,WASH7P,RP11-34P13.3,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,RP11-34P13.8,CICP27,...,MT-CO2,MT-ATP8,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB
CCLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AU565_BREAST,1.614785,-0.964276,0.122092,-0.125271,-0.035807,-0.02642,-0.026807,-0.266645,-0.443138,0.42194,...,-0.223326,-0.124896,-0.203346,-1.054528,-0.268155,-0.140064,-0.169014,-0.37561,-0.244299,-0.543585
BT20_BREAST,1.465711,-1.012668,0.290662,-0.125271,-0.035807,-0.02642,-0.026807,0.127074,0.765924,1.033953,...,0.317843,0.071909,0.148781,0.134461,-0.356781,0.084648,0.127303,-0.111025,-0.01484,0.101115
BT474_BREAST,-1.036782,-0.209281,-0.348288,-0.125271,-0.035807,-0.02642,-0.026807,0.584599,0.444714,-1.034605,...,0.35401,-0.094392,0.138747,0.487778,0.14006,-0.122656,0.002683,0.114069,-0.027399,0.354121
BT483_BREAST,0.458421,0.285037,0.510951,0.14823,-0.035807,-0.02642,-0.026807,0.198408,-0.28065,-0.338366,...,0.292114,0.695395,0.480546,0.321053,0.694982,0.408865,0.287507,0.498307,0.707676,0.864065
BT549_BREAST,0.253864,-0.714756,-0.348288,-0.125271,-0.035807,-0.02642,-0.026807,-3.280847,-1.740915,-1.832638,...,-0.535056,-0.432776,-0.382815,-0.458983,-0.564023,-0.311215,-0.580568,-0.362071,-0.422679,-0.236125


## Run AJIVE

#### AJIVE integrates across rows.  Genes must be rows for GWCI

An *AssertionError* when running AJIVE indicates that the two matrices have a mismatched number of rows.

Runtime depends on size of input matrices and initial signal ranks

TCGA-BRCA x CCLE ~ 3hrs

In [32]:
#Transpose matrices so that genes become rows (AJIVE integrates across rows)
#NOTE: a and b are rebound in place, so running this cell a second time without
#re-reading the input data would flip them back to their original orientation
a = a.T
b = b.T

#Create output directory if it doesn't exist
#(exist_ok=True replaces the separate exists() check and cannot race with it)
os.makedirs(output_dir, exist_ok=True)

#Run AJIVE and report the elapsed wall-clock time
jive_start = time.time()
ajive = AJIVE(init_signal_ranks={'A': ra, 'B': rb})
ajive.fit(blocks={'A': a, 'B': b})
jive_end = time.time()
jive_time = str(datetime.timedelta(seconds=jive_end-jive_start))
print('AJIVE time: ' + jive_time)

#Save AJIVE Object
#The context manager closes the file even if pickling raises
with open(os.path.join(output_dir, name_prefix + '_ajive.p'), 'wb') as pickle_file:
    pickle.dump(ajive, pickle_file)

#Wrap the AJIVE result matrices as DataFrames labelled like the (transposed) inputs
a_joint = pd.DataFrame(ajive.blocks['A'].joint.full_, index=a.index, columns=a.columns)
a_individual = pd.DataFrame(ajive.blocks['A'].individual.full_, index=a.index, columns=a.columns)
b_joint = pd.DataFrame(ajive.blocks['B'].joint.full_, index=b.index, columns=b.columns)
b_individual = pd.DataFrame(ajive.blocks['B'].individual.full_, index=b.index, columns=b.columns)
a_noise = pd.DataFrame(ajive.blocks['A'].noise_, index=a.index, columns=a.columns)
b_noise = pd.DataFrame(ajive.blocks['B'].noise_, index=b.index, columns=b.columns)

#Save AJIVE matrices as <name_prefix>_<cohort>-<component>.csv in output_dir
for cohort_name, component_name, component_df in (
    (a_name, 'noise', a_noise),
    (b_name, 'noise', b_noise),
    (a_name, 'joint', a_joint),
    (b_name, 'joint', b_joint),
    (a_name, 'individual', a_individual),
    (b_name, 'individual', b_individual),
):
    component_df.to_csv(
        os.path.join(output_dir, name_prefix + '_' + cohort_name + '-' + component_name + '.csv')
    )


removing column 19
AJIVE time: 0:49:46.244961
