# Graph-Sparse Logistic Regression applied to the real proteomics data from the TCGA/CPTAC Ovarian Cancer dataset. 

In [1]:
%pylab inline

import sys
import pickle
import pandas as pd
import networkx as nx

from sklearn.preprocessing import LabelEncoder

from matplotlib_venn import venn3, venn3_circles, venn2

# Root of the local gslr checkout. This is a machine-specific absolute path; the
# trailing slash matters because the paths below are built by plain string
# concatenation.
repo_path = '/Users/alex/Documents/gslr/'
# Tab-separated interactome edge list, read in section II.
interactome_path = repo_path + 'experiments/algorithms/pcsf/inbiomap_temp.tsv'

# Put the package directory on the import path before importing it; presumably
# gslr is not installed into the environment and is used straight from the checkout.
sys.path.append(repo_path + 'gslr/')
import gslr

Populating the interactive namespace from numpy and matplotlib


### I. Load Ovarian Cancer Proteomics Dataset

In [2]:
# All inputs live under one proteomics directory, so the location is spelled out
# once. To analyse a different cohort, change `cohort`: the previously
# commented-out alternatives, 'medullo' and 'brca', follow the same file naming.
proteomics_dir = '/Users/alex/Documents/proteomics/data_preparation/proteomics_data/'
cohort = 'ovarian'

# Expression matrix (samples as rows, proteins as columns); the first column of
# the file becomes the row index. Note: despite the .tsv extension the file is
# read with pandas' default comma separator, which is what the original did.
dataset = pd.read_csv(proteomics_dir + cohort + '_inbiomap_exp.tsv', index_col=0)

# One label per sample, indexed the same way (first column as row index).
labels = pd.read_csv(proteomics_dir + 'raw/' + cohort + '_labels.csv', index_col=0)

In [3]:
# Sanity check: both frames should have the same number of rows (one per sample).
(dataset.shape, labels.shape)

((206, 16349), (206, 1))

### II. Load Interactome

In [4]:
# The edge list has no header row, so the column names are supplied here:
# two endpoint proteins and the cost attached to the edge between them.
edge_columns = ['protein1', 'protein2', 'cost']
inbiomap_experimentally = pd.read_csv(
    interactome_path,
    sep='\t',
    header=None,
    names=edge_columns,
)
inbiomap_experimentally.head()

Unnamed: 0,protein1,protein2,cost
0,ZNF91,NDEL1,1.253
1,ZNF91,ELAVL1,1.254
2,ZNF91,SUMO1,1.245
3,ZNF91,SUMO3,1.245
4,ZNF91,CHMP5,1.241


In [5]:
# Map protein names to integer node ids.
# unstack() lays the two endpoint columns end to end (all of protein1, then all
# of protein2), factorize() assigns ids in order of first appearance, and the
# column-major ('F') reshape folds the flat codes back into one (source, target)
# row per edge.
protein_pairs = inbiomap_experimentally[["protein1", "protein2"]]
flat_codes, nodes = pd.factorize(protein_pairs.unstack())
edges = flat_codes.reshape(protein_pairs.shape, order='F')
edges

array([[    0,  1228],
       [    0,  1279],
       [    0,  4071],
       ..., 
       [14190, 14237],
       [14191, 14378],
       [14192, 14539]])

In [6]:
# Protein columns in the order stored in the expression file. Compare with
# `nodes` in the next cell: same length, different order.
dataset.columns

Index(['ZNF91', 'NDEL1', 'ELAVL1', 'SUMO1', 'SUMO3', 'CHMP5', 'UBC', 'HTT',
       'E2F4', 'ACP5',
       ...
       'SPANXN4', 'ZNF605', 'SERPINB10', 'ANKAR', 'RRH', 'DHH', 'CYSLTR1',
       'ZNF268', 'COL23A1', 'MEDAG'],
      dtype='object', length=16349)

In [7]:
# Protein names in node-id order (position i is the protein behind id i in `edges`).
nodes

Index(['ZNF91', 'ACP5', 'SLC27A2', 'PAX9', 'ADAM15', 'ELOVL2', 'DDX60L',
       'FGF7', 'CDHR5', 'LYPD3',
       ...
       'CNR2', 'GIG44', 'LINC00588', 'TAAR2', 'CHRNE', 'ANKAR', 'DHH',
       'CYSLTR1', 'COL23A1', 'MEDAG'],
      dtype='object', length=16349)

### IV. Prepare Dataset

In [8]:
# Reorder the protein columns to match node-id order, so that column i of X is
# the feature for node i in `edges`. Reindexing the column axis directly avoids
# the transpose -> reindex -> transpose round-trip (two full copies of the frame
# and a possible dtype upcast) and yields the same column order.
# NOTE(review): reindex fills any protein in `nodes` that is missing from the
# dataset with NaN; the matching lengths suggest none are missing, but confirm.
dataset = dataset.reindex(columns=nodes)
X = dataset.values
dataset.head()

Unnamed: 0,ZNF91,ACP5,SLC27A2,PAX9,ADAM15,ELOVL2,DDX60L,FGF7,CDHR5,LYPD3,...,CNR2,GIG44,LINC00588,TAAR2,CHRNE,ANKAR,DHH,CYSLTR1,COL23A1,MEDAG
PNNL-TCGA-09-1664,0.0,0.0,0.0,0.0,0.279,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PNNL-TCGA-13-1484,0.0,0.175,0.0,0.0,0.0,0.0,-0.443,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PNNL-TCGA-13-1488,0.0,0.0,0.0,0.0,-0.462,0.0,0.621,0.0,0.0,-0.398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PNNL-TCGA-13-1489,0.0,0.0,-0.267,0.0,0.0,0.0,-0.601,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PNNL-TCGA-13-1494,0.0,0.0,0.0117,0.0,0.053,0.0,-1.26,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Collapse the single-column label frame into a plain Python list, one entry per
# sample in row order. This rebinds `labels` from a DataFrame to a list, so the
# cell cannot be re-run without reloading the labels first.
labels = labels.values.ravel().tolist()

In [11]:
# Encode the raw class labels as consecutive integers 0..k-1. The fitted encoder
# is kept so the integer codes can be mapped back to the original labels.
labeler = LabelEncoder()
y = labeler.fit_transform(labels)
y

array([0, 1, 2, 0, 2, 2, 2, 2, 1, 0, 0, 0, 0, 1, 2, 2, 0, 3, 2, 4, 4, 0, 0,
       0, 2, 2, 4, 0, 0, 4, 3, 0, 0, 3, 3, 2, 0, 1, 2, 1, 3, 5, 4, 2, 0, 4,
       2, 4, 2, 0, 4, 0, 1, 2, 3, 3, 0, 4, 3, 3, 0, 2, 2, 2, 0, 1, 2, 4, 2,
       1, 1, 2, 0, 3, 3, 1, 0, 0, 2, 1, 0, 5, 0, 2, 0, 5, 1, 3, 2, 2, 2, 2,
       2, 2, 0, 3, 2, 1, 2, 2, 2, 1, 0, 0, 1, 1, 1, 3, 3, 2, 0, 0, 2, 2, 0,
       2, 1, 0, 4, 5, 3, 0, 0, 0, 3, 2, 5, 1, 1, 0, 3, 0, 1, 2, 1, 0, 3, 0,
       0, 0, 4, 0, 4, 2, 0, 4, 0, 1, 4, 0, 1, 0, 2, 0, 3, 2, 3, 2, 2, 0, 3,
       0, 2, 4, 1, 2, 0, 2, 1, 3, 2, 2, 0, 0, 3, 4, 1, 2, 1, 1, 1, 0, 0, 0,
       1, 3, 0, 0, 4, 0, 3, 0, 1, 3, 1, 3, 0, 0, 1, 0, 1, 0, 1, 3, 1, 4])

### V. Graph-Sparse Logistic Regression

In [12]:
# Problem dimensions: d features (one per interactome node) and c classes.
d = len(nodes)
# Take the class count from the fitted encoder rather than hard-coding it: the
# encoded labels above contain six distinct classes, so the previous fixed value
# of 2 gave the weight matrix the wrong number of rows.
c = len(labeler.classes_)

# Graph constraint settings passed through to gslr.
# NOTE(review): root=-1 / num_clusters=1 / pruning='strong' presumably request an
# unrooted, single-component solution with strong pruning; confirm against
# gslr.GraphOptions.
graph_opts = gslr.GraphOptions(edges=edges, root=-1, num_clusters=1, pruning='strong')

# Lower and upper sparsity targets handed to gslr (presumably bounds on the
# number of selected nodes; TODO confirm).
sparsity_low = 150
sparsity_high = 350

verbosity_level = 1

# One row per iteration, each row holding the same three candidate step sizes,
# giving a (num_steps, 3) array.
num_steps = 25
possible_steps = np.array([0.03, 0.1, 0.3])
steps = np.tile(possible_steps, (num_steps, 1))

# Start from an all-zero weight matrix: one row per class, one column per node.
W0 = np.zeros((c, d))

In [None]:
# Per-edge costs, in the same row order as `edges` (both are derived from
# inbiomap_experimentally).
edge_costs = inbiomap_experimentally.cost.values

# Fit the graph-sparse model. Returns the estimated weights and the losses
# reported by gslr; with verbosity_level=1 it prints progress per iteration.
W_hat, losses = gslr.gslr(
    X, y, W0,
    sparsity_low, sparsity_high,
    graph_opts, steps, verbosity_level,
    edge_costs=edge_costs,
    edge_costs_multiplier=6,
)

iteration 1:
