In [42]:

import numpy as np

import torch
import networkx as nx
import pandas as pd
from utils import CHANNEL_MARKERS


In [35]:
cd lungcancer

/Users/tiany4/Documents/00_mom_stuff/LungCancer


In [47]:
# Load the per-object feature table; the path is relative to the current
# working directory, so the notebook must be run from the project root.
df = pd.read_csv('data/features_withloc.csv')

In [48]:
# Keep only the rows whose `spot` column equals 1.
# NOTE(review): this rebinds `df`, so the unfiltered table is no longer
# reachable without re-running the load cell.
df = df[df.spot==1]

In [49]:
# Preview the first pair of location columns. The table also carries
# 'Location X.1' ... 'Location X.6' (and the matching Y columns);
# NOTE(review): presumably one coordinate pair per channel — confirm they agree.
df.loc[:,['Location X','Location Y']]

Unnamed: 0,Location X,Location Y
0,876.0,583.0
1,34.0,1211.0
2,551.5,3774.5
3,249.5,1577.5
4,1375.5,411.0
...,...,...
6217,3850.0,1477.5
6218,2617.0,779.5
6219,2187.5,3358.5
6220,1337.0,1768.5


In [45]:
# Distinct values of the `spot` column together with how often each occurs
# (returned as a (values, counts) pair).
np.unique(df.spot.values, return_counts=True)

(array([1]), array([6222]))

In [None]:
# Scratch cell (never executed): a pasted listing of the table's column names,
# kept for reference only. It binds no variable; the trailing comma makes the
# expression a one-element tuple wrapping the list.
['glass', 'spot', 'DAPI_mean', 'DAPI_std', 'DAPI_median', 'DAPI_mad',
       'DAPI_lower_quartile', 'DAPI_upper_quartile', 'Location X',
       'Location Y', 'PDGFRB_mean', 'PDGFRB_std', 'PDGFRB_median',
       'PDGFRB_mad', 'PDGFRB_lower_quartile', 'PDGFRB_upper_quartile',
       'Location X.1', 'Location Y.1', 'PDGFRA_mean', 'PDGFRA_std',
       'PDGFRA_median', 'PDGFRA_mad', 'PDGFRA_lower_quartile',
       'PDGFRA_upper_quartile', 'Location X.2', 'Location Y.2', 'FAP_mean',
       'FAP_std', 'FAP_median', 'FAP_mad', 'FAP_lower_quartile',
       'FAP_upper_quartile', 'Location X.3', 'Location Y.3', 'SMA_mean',
       'SMA_std', 'SMA_median', 'SMA_mad', 'SMA_lower_quartile',
       'SMA_upper_quartile', 'Location X.4', 'Location Y.4', 'PanEpiMask_mean',
       'PanEpiMask_std', 'PanEpiMask_median', 'PanEpiMask_mad',
       'PanEpiMask_lower_quartile', 'PanEpiMask_upper_quartile',
       'Location X.5', 'Location Y.5', 'PanEpiMask_dist_mean',
       'PanEpiMask_dist_std', 'PanEpiMask_dist_median', 'PanEpiMask_dist_mad',
       'PanEpiMask_dist_lower_quartile', 'PanEpiMask_dist_upper_quartile',
       'Location X.6', 'Location Y.6'],

In [46]:
df.columns

Index(['glass', 'spot', 'DAPI_mean', 'DAPI_std', 'DAPI_median', 'DAPI_mad',
       'DAPI_lower_quartile', 'DAPI_upper_quartile', 'Location X',
       'Location Y', 'PDGFRB_mean', 'PDGFRB_std', 'PDGFRB_median',
       'PDGFRB_mad', 'PDGFRB_lower_quartile', 'PDGFRB_upper_quartile',
       'Location X.1', 'Location Y.1', 'PDGFRA_mean', 'PDGFRA_std',
       'PDGFRA_median', 'PDGFRA_mad', 'PDGFRA_lower_quartile',
       'PDGFRA_upper_quartile', 'Location X.2', 'Location Y.2', 'FAP_mean',
       'FAP_std', 'FAP_median', 'FAP_mad', 'FAP_lower_quartile',
       'FAP_upper_quartile', 'Location X.3', 'Location Y.3', 'SMA_mean',
       'SMA_std', 'SMA_median', 'SMA_mad', 'SMA_lower_quartile',
       'SMA_upper_quartile', 'Location X.4', 'Location Y.4', 'PanEpiMask_mean',
       'PanEpiMask_std', 'PanEpiMask_median', 'PanEpiMask_mad',
       'PanEpiMask_lower_quartile', 'PanEpiMask_upper_quartile',
       'Location X.5', 'Location Y.5', 'PanEpiMask_dist_mean',
       'PanEpiMask_dist_std', 'PanEpi

In [None]:
def build_graphs_from_raw(file_path, max_dist=70):
    """Build an undirected networkx graph from a raw per-cell CSV file.

    Every CSV row becomes one node whose id is the row position. Two distinct
    nodes are connected when the Euclidean distance between their coordinates
    is strictly smaller than ``max_dist``.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts. All columns are
            parsed as float. The file must contain the columns named in
            ``CHANNEL_MARKERS`` as well as ``'CELL AREA'`` and
            ``'CELL VOLUMETRY'``.
        max_dist: Distance threshold (exclusive) below which two nodes are
            linked.

    Returns:
        nx.Graph: one node per row with attributes ``'coord'``, ``'marker'``
        and ``'size'``; each edge carries ``'distance'`` and ``'edge_type'``
        (0 when the distance is below 45, otherwise 1).
    """
    df = pd.read_csv(file_path, dtype=float)
    num_nodes = df.shape[0]

    # The graph object has to be created explicitly; without this line the
    # first use of G raises NameError.
    G = nx.Graph()
    G.add_nodes_from(range(num_nodes))

    # NOTE(review): coordinates are taken positionally from columns 1..3 —
    # confirm this matches the layout of the input file.
    coordinates = df.iloc[:, 1:4].values
    markers = df[CHANNEL_MARKERS].values
    sizes = df[['CELL AREA', 'CELL VOLUMETRY']].values

    node_features = {
        i: {'coord': coordinates[i], 'marker': markers[i], 'size': sizes[i]}
        for i in range(num_nodes)
    }
    nx.set_node_attributes(G, node_features)

    edge_features = {}
    for i in range(num_nodes):
        # Only pairs with j > i are visited: the graph is undirected, so each
        # pair needs a single distance computation and self-pairs never
        # produce an edge.
        for j in range(i + 1, num_nodes):
            # Euclidean distance between the two cells
            dist = np.linalg.norm(coordinates[i, :] - coordinates[j, :])
            if dist < max_dist:  # critical distance
                G.add_edge(i, j)
                edge_features[(i, j)] = {
                    "distance": dist,
                    # 0 = short-range link (< 45), 1 = longer link
                    "edge_type": 0 if dist < 45 else 1
                }
    nx.set_edge_attributes(G, edge_features)
    return G


def get_edges_max_neighbors(coordinates, threshold=5):
    '''Generate graph edges by linking every node to its closest neighbours.

    For each node the ``threshold + 1`` nodes with the smallest Euclidean
    distance are kept; the node itself (distance 0) is normally among them,
    which yields the self loop. Every kept pair is emitted in both
    directions, and a pair that was already emitted is not emitted again.

    Args:
        coordinates: 2-D numpy array with one row of coordinates per node.
        threshold: number of closest neighbours per node (self not counted).
            By default 5.

    Returns:
        torch.LongTensor of shape ``(2, num_edges)``. For an input with no
        rows the result has shape ``(2, 0)``.

    NOTE(review): a self loop is appended twice (once as ``[i, i]`` and once
    as its "reverse"); this is kept as-is so existing outputs do not change.
    '''
    num_nodes = coordinates.shape[0]
    edge_index = []  # list of [source, target] pairs, in emission order
    seen = set()     # same pairs as tuples, for O(1) duplicate checks

    for i in range(num_nodes):  # iterate nodes
        neighbors = []  # indices of the current closest candidates
        dists = []      # their distances to node i (parallel to neighbors)
        for j in range(num_nodes):
            # Euclidean distance
            dist = np.linalg.norm(coordinates[i, :] - coordinates[j, :])
            # fill up until threshold + 1 (self loop) candidates are held
            if len(dists) < threshold + 1:
                dists.append(dist)
                neighbors.append(j)
                continue
            # afterwards a new node only enters by evicting the farthest one
            worst = int(np.argmax(dists))
            if dist >= dists[worst]:
                continue
            dists[worst] = dist
            neighbors[worst] = j

        for n in neighbors:
            if (i, n) not in seen:
                seen.add((i, n))
                seen.add((n, i))
                edge_index.append([i, n])
                edge_index.append([n, i])

    # reshape(-1, 2) is a no-op for a non-empty list and turns the empty
    # case into a (0, 2) tensor, so the transpose is always (2, num_edges)
    edge_index = torch.tensor(edge_index, dtype=torch.long).reshape(-1, 2)
    # transpose the edge_index tensor to match GAE models
    return edge_index.t().contiguous()


def get_edges_threshold_distance(coordinates, max_dist=100):
    '''Generate graph edges by linking nodes that lie close to each other.

    Two nodes are connected when their Euclidean distance is strictly below
    ``max_dist``. A node is at distance 0 from itself, so for a positive
    ``max_dist`` every node also gets a self loop. Every pair is emitted in
    both directions, and a pair that was already emitted is not emitted again.

    Args:
        coordinates: 2-D numpy array with one row of coordinates per node.
        max_dist: critical distance (exclusive) for connecting two nodes.

    Returns:
        torch.LongTensor of shape ``(2, num_edges)``. When no edge is
        produced (e.g. an input with no rows) the result has shape ``(2, 0)``.

    NOTE(review): a self loop is appended twice (once as ``[i, i]`` and once
    as its "reverse"); this is kept as-is so existing outputs do not change.
    '''
    num_nodes = coordinates.shape[0]
    edge_index = []  # list of [source, target] pairs, in emission order
    seen = set()     # same pairs as tuples, for O(1) duplicate checks

    for i in range(num_nodes):  # iterate nodes
        for j in range(num_nodes):
            # Euclidean distance
            dist = np.linalg.norm(coordinates[i, :] - coordinates[j, :])
            if not dist < max_dist:  # outside the critical distance
                continue
            if (i, j) in seen:
                continue
            seen.add((i, j))
            seen.add((j, i))
            edge_index.append([i, j])
            edge_index.append([j, i])

    # reshape(-1, 2) is a no-op for a non-empty list and turns the empty
    # case into a (0, 2) tensor, so the transpose is always (2, num_edges)
    edge_index = torch.tensor(edge_index, dtype=torch.long).reshape(-1, 2)
    # transpose the edge_index tensor to match GAE models
    return edge_index.t().contiguous()
