In [1]:
%load_ext autoreload
%autoreload 2

In [66]:
import numpy as np
import pandas as pd
import os
import pyNBS as nbs

# Preprocess data

In [67]:
# Annotated OV mutation file (tab-separated). NOTE(review): this is an absolute
# path on the original author's cluster; point it at a local copy to re-run.
fn = '/cellar/data/users/wzhang1984/Firehose/Firehose__2015_08_21/analyses/OV/Mutation_Assessor/OV-TP.maf.annotated'
df = pd.read_csv(fn, sep='\t')

In [68]:
# Keep only the rows whose `is_flank` and `is_silent` flags are both 0.
keep_rows = (df['is_flank'] == 0) & (df['is_silent'] == 0)
df = df[keep_rows]

In [69]:
# Derive the patient key from the first 12 characters of `patient`
# (presumably the patient-level part of the sample barcode -- TODO confirm).
# `assign` returns a new frame, so this does not write into a filtered slice
# (which can raise SettingWithCopyWarning) and is safe to re-run.
df = df.assign(pat=df['patient'].str[:12])

In [70]:
# Export the (pat, Hugo_Symbol) pairs as a tab-separated file without a
# header row or index column; this file is read back by nbs.load_mutation.
pat2mut = df[['pat', 'Hugo_Symbol']]
pat2mut.to_csv('../data/OV_pat2mut.txt', sep='\t', header=False, index=False)

In [71]:
# Copy the network file and the clinical table into ../data.
# shutil.copy raises if a source is missing or the destination is not
# writable, whereas os.system('cp ...') only returns an exit status (and the
# cell displays just the last one), so a failed copy could go unnoticed.
# NOTE(review): the sources are absolute paths on the original author's cluster.
import shutil

for src in ('/cellar/data/users/wzhang1984/forNBS/FIsInGene_031516_with_annotations.txt',
            '/cellar/data/users/wzhang1984/PCAWG/pat2clin4surv.txt'):
    shutil.copy(src, '../data')

0

# Network propagation (iteration)

In [72]:
# Load the interaction network copied into ../data above.
# nbs.load_network returns the network object and a gene -> index mapping;
# judging by the cell's log output it also removes self-loops/multi-edges,
# keeps the largest connected component and writes a node list under
# output_dir.
file_name = '../data/FIsInGene_031516_with_annotations.txt'
output_dir = '../data/'
network, gene2index=nbs.load_network(file_name,output_dir)

* Loading PPI...
	- Edges: 228919
	- Nodes: 12175
* Removing self-loops, multi-edges, and restricting to largest connected component...
	- Largest CC Edges: 228827
	- Largest CC Nodes: 12033
* Saving updated node list to file...


In [73]:
# Load the patient/gene mutation pairs written earlier, using gene2index from
# the network-loading cell. Relies on `output_dir` still being '../data/'.
# Judging by the log output, a patient list is also saved under output_dir.
file_name = '../data/OV_pat2mut.txt'
mutation_profile, pat2index = nbs.load_mutation(file_name,output_dir,gene2index)

	- Genes in adjacency matrix: 6079
* Saving patient list to file...


In [77]:
# Parameters handed to nbs.run_diffusion (presumably the random-walk restart
# probability and the convergence threshold on the per-iteration delta that
# the function prints -- TODO confirm against pyNBS).
rst_prob = 0.4
converge_rate = 0.0001

# run network propagation
pat_diff = nbs.run_diffusion(network, rst_prob, mutation_profile, converge_rate)

# Write the propagated matrix to disk. np.save is given the path directly so
# NumPy opens the file in binary mode itself; the previous text-mode
# open(..., 'w') handle fails on Python 3 and can corrupt the .npy on
# platforms that translate newlines. os.path.join also avoids the '//' that
# '{}/...'.format('../data/') produced.
np.save(os.path.join(output_dir, 'prop_pat_mut.npy'), pat_diff)

print('Finish propagating the data...')

0 iteration: delta is 3.65070073619
1 iteration: delta is 0.841077657404
2 iteration: delta is 0.176831939612
3 iteration: delta is 0.0786278752818
4 iteration: delta is 0.033845809479
5 iteration: delta is 0.0171747203311
6 iteration: delta is 0.00844417147056
7 iteration: delta is 0.00441606380949
8 iteration: delta is 0.0022625442709
9 iteration: delta is 0.00120190763346
10 iteration: delta is 0.000629437209068
11 iteration: delta is 0.000338617162376
12 iteration: delta is 0.000179910329278
13 iteration: delta is 9.78905215158e-05
Finish propagating the data...


# Network propagation (using PPR matrix)

In [None]:
# Load network (same file and output directory as the iterative-propagation
# section; repeated here so the PPR section can be run on its own).

file_name = '../data/FIsInGene_031516_with_annotations.txt'
output_dir = '../data/'
network, gene2index=nbs.load_network(file_name,output_dir)

In [None]:
# It takes a long time to compute the inverse matrix. But it only has to be done once.
# The third argument used to be `network_output_dir`, a name that is never
# defined in this notebook (NameError on a fresh kernel); `output_dir` from
# the network-loading cell is used instead.
# NOTE(review): the loading cell below expects 'ppr_0.5.npy' in that directory,
# so create_ppr_matrix presumably names its output after rst_prob -- confirm.

rst_prob = 0.5

PPR = nbs.create_ppr_matrix(network, rst_prob, output_dir)

In [None]:
# load PPR matrix
# Reads from `output_dir`, which this cell defines; the previous
# `network_output_dir` was never assigned anywhere in the notebook and raised
# NameError on a fresh kernel.

output_dir = '../data/'
PPR = np.load(os.path.join(output_dir, 'ppr_0.5.npy'))

In [None]:
# Load mutations (same input file as the iterative-propagation section).
# Requires `gene2index` from the network-loading cell.

output_dir = '../data/'
file_name = '../data/OV_pat2mut.txt'
mutation_profile, pat2index = nbs.load_mutation(file_name,output_dir,gene2index)

In [None]:
# Network propagation using the precomputed PPR matrix.

pat_diff = nbs.run_diffusion_PPR(PPR, mutation_profile)

# Write the propagated matrix to disk. np.save is given the path directly so
# NumPy opens the file in binary mode itself; the previous text-mode
# open(..., 'w') handle fails on Python 3 and can corrupt the .npy on
# platforms that translate newlines.
# NOTE: this is the same file the iterative-propagation section writes, so
# whichever section ran last determines what the clustering section loads.
np.save(os.path.join(output_dir, 'prop_pat_mut.npy'), pat_diff)

print('Finish propagating the data...')

# Clustering

In [11]:
def _second_column(path):
    """Return the second whitespace-separated field of every line in `path`."""
    with open(path) as handle:
        rows = [row.split() for row in handle.read().splitlines()]
    return [fields[1] for fields in rows]


# Reload the propagated matrix and label it with the patient / gene names
# taken from the index files in ../data.
prop_matrix = np.load('../data/prop_pat_mut.npy')
genes = _second_column('../data/index_genes')
pats = _second_column('../data/index_patients')
M_prop = pd.DataFrame(data=prop_matrix, index=pats, columns=genes)

In [12]:
# PCA of the propagated profile via the pyNBS helper. It returns three values,
# bound here as the projected data, the components and the explained-variance
# ratios (semantics inferred from the names -- TODO confirm against pyNBS).
M_prop_pca, pca_components, explained_variance_ratio = nbs.run_pca(M_prop)

In [78]:
# Sum of the first 100 explained-variance ratios; the clustering cell below
# uses the same first 100 columns of M_prop_pca.
explained_variance_ratio[:100].sum()

0.30268131715982532

In [94]:
# Cluster patients on the first 100 columns of M_prop_pca with the pyNBS
# spectral-clustering routine. The 10 is presumably the largest number of
# clusters tried (a 'K10' column is read from `labels` later) -- TODO confirm.
labels = nbs.run_clustering_mp(M_prop_pca.iloc[:,:100], 10, nbs.run_SpectralClustering)

In [None]:
# Number of patients assigned to each label in the K10 column.
labels.K10.value_counts()

# Survival analysis

In [95]:
# Survival analysis for the propagated clustering: passes the clinical table
# copied into ../data, the cluster labels, and '../data/survival/'
# (presumably the output directory -- TODO confirm against pyNBS).
nbs.run_coxph('../data/pat2clin4surv.txt', labels, '../data/survival/')

0

# Subnetworks

In [96]:
# Inputs for the subtype-signature / subnetwork step.
# Note: `output_dir` is rebound here to '../data/network'; cells that rely on
# '../data/' must reassign it before being re-run.
pat2mut_fn = '../data/OV_pat2mut.txt'
network_fn = '../data/FIsInGene_031516_with_annotations.txt'
output_dir = '../data/network'
test_n_processes = 24
ttest_fdr_cut = 0.3

# Raw (unpropagated) mutation matrix labelled like M_prop, plus the K10 labels.
M = pd.DataFrame(mutation_profile, index=pats, columns=genes)
K = labels['K10']

nbs.subnetwork_wrapper(M, M_prop, K, test_n_processes, pat2mut_fn, network_fn, output_dir, ttest_fdr_cut)

Finish ttest
Finish Fisher exact test
Summarizing subtype signatures
Summarizing subnetworks


0

# Clustering without propagation (scratch)

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering

In [160]:
# Raw (unpropagated) mutation profile as a patients x genes DataFrame.
M = pd.DataFrame(mutation_profile, index=pats, columns=genes)

In [161]:
# Centre every column by subtracting its mean.
column_means = M.mean()
M = M - column_means

In [164]:
# Fit a full PCA on the centred raw profile, then wrap both the projected
# data and the loadings in DataFrames labelled PC1, PC2, ...
pca = PCA()
pca.fit(M)

PCs = ['PC{}'.format(rank) for rank in range(1, pca.components_.shape[0] + 1)]
M_pca = pd.DataFrame(data=pca.transform(M), index=pats, columns=PCs)
pca_components_ = pd.DataFrame(data=pca.components_, index=PCs, columns=genes)

In [172]:
# Sum of the first 58 explained-variance ratios of the raw-profile PCA.
# NOTE(review): the clustering cell below uses only the first 13 columns of
# M_pca, not 58 -- confirm which cut-off is intended.
pca.explained_variance_ratio_[:58].sum()

0.30362309774442547

In [181]:
# Spectral clustering of the unpropagated profile for K = 2..10, using the
# first 13 principal components.
# random_state is fixed so that re-running the notebook reproduces the same
# labels; without it the estimator draws fresh randomness on every run.
# NOTE(review): per the scikit-learn docs, n_init only applies when
# assign_labels='kmeans', so it presumably has no effect with 'discretize'.
labels2 = []
for n_clusters in range(2, 11):
    print(n_clusters)
    cluster = SpectralClustering(affinity='nearest_neighbors', n_clusters=n_clusters, n_init=1000,
                                 eigen_solver='arpack', eigen_tol=0.0001, assign_labels='discretize',
                                 random_state=0)
    cluster.fit(M_pca.iloc[:, :13])
    labels2.append(cluster.labels_)

2
3
4
5
6
7
8
9
10


In [182]:
# Turn the list of per-K label vectors into a patients x K table with columns
# K2..K10. `labels2` is rebound from a list to a DataFrame here, so the
# clustering cell must be re-run before this cell is executed again.
k_columns = ['K{}'.format(k) for k in range(2, 11)]
labels2 = pd.DataFrame(data=np.transpose(labels2), index=M.index, columns=k_columns)

In [183]:
# Survival analysis for the clustering without propagation.
# NOTE(review): this passes the same '../data/survival/' path as the earlier
# run_coxph call on the propagated labels; if run_coxph writes results there,
# this call may overwrite them -- confirm, or use a separate directory.
nbs.run_coxph('../data/pat2clin4surv.txt', labels2, '../data/survival/')

0