# preliminary Cibersort Cluster Plots
```
aedavids@ucsc.edu
11/18/2022
```

ref:
- [python_louvain](https://python-louvain.readthedocs.io/en/latest/https://python-louvain.readthedocs.io/en/latest/)
- [kaggle louvain-algorithm](https://www.kaggle.com/code/lsjsj92/network-graph-with-louvain-algorithm/notebookhttps://www.kaggle.com/code/lsjsj92/network-graph-with-louvain-algorithm/notebook)
- [umap](https://umap-learn.readthedocs.io/en/latest/basic_usage.htmlhttps://umap-learn.readthedocs.io/en/latest/basic_usage.html)

In [1]:
from IPython.display import display
import community as community_louvain
import networkx as nx
import numpy as np
import pandas as pd
import pathlib as pl
from sklearn.metrics import pairwise_distances
import umap 

# load cibersort results
cibersortOut = pl.Path("/scratch/aedavids/cibersort.out/GTEx_TCGA_TrainGroupby_mixture-2022-10-18-07.40.54-PDT")
cibersortRet = cibersortOut.joinpath("CIBERSORTx_GTEx_TCGA_TrainGroupby_mixture-2022-10-18-07.40.54-PDT_Results.txt")
cibersortFractionsDF = pd.read_csv(cibersortRet, sep='\t')
print("cibersortFractionsDF.shape:{}".format(cibersortFractionsDF.shape))

# pull a small sample to debug algorithm
statCols = ['Mixture', 'P-value', 'Correlation', 'RMSE']
testFractionsDF = cibersortFractionsDF.drop(columns=statCols).sample(frac=0.1)
print("\ntestFractionsDF.shape:{}".format(testFractionsDF.shape))

testFractionsDF.iloc[:, 0:5].head()

  from .autonotebook import tqdm as notebook_tqdm
2022-11-18 18:14:19.416318: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-18 18:14:19.416409: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


cibersortFractionsDF.shape:(15801, 87)

testFractionsDF.shape:(1580, 83)


Unnamed: 0,ACC,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta
4886,0.000159,0.085722,0.0,0.0,0.032369
3782,0.0,0.192748,0.148297,0.0,0.08487
14633,0.0,0.119645,0.000938,0.0,0.073376
3770,0.0,0.0,0.133925,0.0,0.0
3801,0.000133,0.077444,0.004934,0.0,0.028633


In [2]:
#n_samples_X, n_features
distanceMatrixNP = pairwise_distances(testFractionsDF, metric='euclidean')
print(distanceMatrixNP.shape)
distanceMatrixNP[0:5, 0:2]

(1580, 1580)


array([[0.        , 0.42058879],
       [0.42058879, 0.        ],
       [0.57329621, 0.4982591 ],
       [0.55754933, 0.48503299],
       [0.7679776 , 0.72328519]])

In [3]:
def findNearestNeighborsForRow( rowNP, k):
    '''
    
    Notes: 
        The point can have multiple neighbors with the same distance
        
        Given B is a nearest neighbor of A does not imply that A is a nearest neighbor of B 
        
    arguments
        rowNP: type, NumpyArray
            a sample
            
        k: type int
            the number of nearest neighbors to find
    
     returns:
            a row in adjacency matrix format of k nearest neighbors as
            two numpy arrays. The values in the first array are the indices 
            for the row argument nearest neighbors. The values of the second
            array are the distances
    '''
    sortedIdx = np.argsort(rowNP)
    sortedDistances = rowNP[sortedIdx]

    # 
    # if we are running using results for true knn the diagonal will be 0
    # all other values will be > 0
    # if we are running bb-knn some the distance matrix is not square
    # that is to say we may or may not have a zero
    # 
    start = 0
    if sortedDistances[0] == 0:
        start = 1 

    neighborsDistances = sortedDistances[start: k + start]
    neighborsIdxs= sortedIdx[start: k + start]
    
    return (neighborsIdxs, neighborsDistances)

In [4]:
def testFindNearestNeighborsForRow():
    print("create unit test")
    testNP = np.arange(0,25).reshape((5,5))
    pairWiseDistanceTestNP = pairwise_distances(testNP, metric='euclidean')
    nearestNeighborsIdx, nearestNeighborsDistance = findNearestNeighborsForRow( pairWiseDistanceTestNP[2], k=2)
    print(nearestNeighborsIdx)
    print(nearestNeighborsDistance)
    
testFindNearestNeighborsForRow()

create unit test
[1 3]
[11.18033989 11.18033989]


In [5]:
def getNeighbors( pairwiseDistanceMatrixNP, k):
    '''
    arguments:
        pairwiseDistanceMatrix: type numpy
        
        k: type int, number of neighbors to find
        
    returns:
        (knnIndices, knnDistance)
        
        a row in adjacency matrix format of k nearest neighbors as
        two numpy matries. The values in knnIndices are idx 
        for the row argument nearest neighbors. The values in knnDistance
        are the distances
    '''
    n = pairwiseDistanceMatrixNP.shape[0]
    print("pairwiseDistanceMatrixNP.shape:{}".format(pairwiseDistanceMatrixNP.shape))
    knnIndices = np.zeros((n,k))
    knnDistance = np.zeros((n,k))
    for i in range(n):
        row = pairwiseDistanceMatrixNP[i,:]
        nearestNeighborsIdx, nearestNeighborsDistance = findNearestNeighborsForRow( row, k )
        knnIndices[i] = nearestNeighborsIdx
        knnDistance[i] = nearestNeighborsDistance

        # print("\n**** i:{}".format(i))
        # print("row:\n{}".format(row))
        # print("nearestNeighborsIdx:\n{}".format(nearestNeighborsIdx))
        # print("nearestNeighborsDistance:\n{}".format(nearestNeighborsDistance))
            
    return (knnIndices, knnDistance)
    

In [6]:
def testGetNeighbors():
    print("create unit test")
    testNP = np.arange(0,25).reshape((5,5))
    pairWiseDistanceTestNP = pairwise_distances(testNP, metric='euclidean')
    knnIndices, knnDistance = getNeighbors( pairWiseDistanceTestNP, k= 2)
    print("knnIndices\n{}".format(knnIndices))
    print("\nknnDistance\n{}".format(knnDistance))
    
testGetNeighbors()

create unit test
pairwiseDistanceMatrixNP.shape:(5, 5)
knnIndices
[[1. 2.]
 [0. 2.]
 [1. 3.]
 [2. 4.]
 [3. 2.]]

knnDistance
[[11.18033989 22.36067977]
 [11.18033989 11.18033989]
 [11.18033989 11.18033989]
 [11.18033989 11.18033989]
 [11.18033989 22.36067977]]


# run louvian 

In [None]:
# get edges and weight
edges = df_edges[['source', 'target']].values.tolist()
weights = [float(l) for l in df_edges.value.values.tolist()]

In [40]:
def createGraph(knnIndices, knnDistance):
    '''
    aedwip
    arguments:
        knnIndices
        
        knnDistance
        
    returns:
        aedwip
    '''    
    # create a list of directed edges and edge weights
    # an edge src is the origin knnIndices row idx, the targers are the values knnIndices[i]
    knnIndices = knnIndices.astype(int)
    nRows, nCols = knnIndices.shape
    edges = [0] * (nRows * nCols)
    weights = [0] * (nRows * nCols)
    edgeIdx = 0
    for i in range(nRows):
        row = knnIndices[i]
        src = i
        for j in range(nCols):
            target = row[j]
            edges[edgeIdx] = [src, target]
            weights[edgeIdx] = knnDistance[src, j]
            edgeIdx += 1
            
    print("\nedges")
    for i in range(5):
        start = i * 3
        print("{}".format(edges[start: start + 3]))
    print("\nweights")
    for i in range(5):
        start = i * 3
        print("{}".format(weights[start: start + 3]))
    
    # Create Graph
    graph = nx.Graph(directed=False)
    graph.add_edges_from(edges)
    print("")
    for cnt, a in enumerate(graph.edges(data=True)):
        print("cnt:{} a:{} weights[cnt]:{}".format(cnt, a, weights[cnt]))
        graph.edges[(a[0], a[1])]['weight'] = weights[cnt]   
        
    print("\n*********graph\n")
    print(graph)
    
    return graph
    

    
def testCreateGraph():
    print("create unit test")
    testNP = np.arange(0,25).reshape((5,5))
    pairWiseDistanceTestNP = pairwise_distances(testNP, metric='euclidean')
    knnIndices, knnDistance = getNeighbors( pairWiseDistanceTestNP, k= 3)
    print("knnIndices\n{}".format(knnIndices))
    print("\nknnDistance\n{}".format(knnDistance)) 
    
    # begin test
    print("\n**** begin createGraph()test")
    graph = createGraph(knnIndices, knnDistance)
    print("\n*********graph\n")
    print(graph)    
    
  
testCreateGraph()

create unit test
pairwiseDistanceMatrixNP.shape:(5, 5)
knnIndices
[[1. 2. 3.]
 [0. 2. 3.]
 [1. 3. 0.]
 [2. 4. 1.]
 [3. 2. 1.]]

knnDistance
[[11.18033989 22.36067977 33.54101966]
 [11.18033989 11.18033989 22.36067977]
 [11.18033989 11.18033989 22.36067977]
 [11.18033989 11.18033989 22.36067977]
 [11.18033989 22.36067977 33.54101966]]

**** begin createGraph()test

edges
[[0, 1], [0, 2], [0, 3]]
[[1, 0], [1, 2], [1, 3]]
[[2, 1], [2, 3], [2, 0]]
[[3, 2], [3, 4], [3, 1]]
[[4, 3], [4, 2], [4, 1]]

weights
[11.180339887498949, 22.360679774997898, 33.54101966249684]
[11.180339887498949, 11.180339887498949, 22.360679774997898]
[11.180339887498949, 11.180339887498949, 22.360679774997898]
[11.180339887498949, 11.180339887498949, 22.360679774997898]
[11.180339887498949, 22.360679774997898, 33.54101966249684]

cnt:0 a:(0, 1, {}) weights[cnt]:11.180339887498949
cnt:1 a:(0, 2, {}) weights[cnt]:22.360679774997898
cnt:2 a:(0, 3, {}) weights[cnt]:33.54101966249684
cnt:3 a:(1, 2, {}) weights[cnt]:11.18

In [50]:
def simpleLouvain(G):
    """ Louvain method github basic example"""
    partition = community_louvain.best_partition(G)
#     pos = graphviz_layout(G)
    
#     max_k_w = []
#     for com in set(partition.values()):
#         list_nodes = [nodes for nodes in partition.keys()
#                       if partition[nodes] == com]
#         max_k_w = max_k_w + [list_nodes]

    
#     node_mapping = {}
#     map_v = 0
#     for node in graph.nodes():
#         node_mapping[node] = map_v
#         map_v += 1

#     community_num_group = len(max_k_w)
#     color_list_community = [[] for i in range(len(G.nodes()))]
    
#     # color
#     for i in graph.nodes():
#         for j in range(community_num_group):
#             if i in max_k_w[j]:
#                 color_list_community[node_mapping[i]] = j
    
#     return graph, pos, color_list_community, community_num_group, max_k_w
    return partition


def testSimpleLouvain():
    testNP = np.arange(0,25).reshape((5,5))
    pairWiseDistanceTestNP = pairwise_distances(testNP, metric='euclidean')
    knnIndices, knnDistance = getNeighbors( pairWiseDistanceTestNP, k= 3)
    graph = createGraph(knnIndices, knnDistance)
    
    # begin test
    # The partition dictionary, with communities numbered from 0 to number of communities
    partitionDict = simpleLouvain(graph)
    
    for clusterId, members in partitionDict.items():
        print("\nclusterid:{}\n members:{}".format(clusterId,members))
        
    return partitionDict
    
partitionDict = testSimpleLouvain()

pairwiseDistanceMatrixNP.shape:(5, 5)

edges
[[0, 1], [0, 2], [0, 3]]
[[1, 0], [1, 2], [1, 3]]
[[2, 1], [2, 3], [2, 0]]
[[3, 2], [3, 4], [3, 1]]
[[4, 3], [4, 2], [4, 1]]

weights
[11.180339887498949, 22.360679774997898, 33.54101966249684]
[11.180339887498949, 11.180339887498949, 22.360679774997898]
[11.180339887498949, 11.180339887498949, 22.360679774997898]
[11.180339887498949, 11.180339887498949, 22.360679774997898]
[11.180339887498949, 22.360679774997898, 33.54101966249684]

cnt:0 a:(0, 1, {}) weights[cnt]:11.180339887498949
cnt:1 a:(0, 2, {}) weights[cnt]:22.360679774997898
cnt:2 a:(0, 3, {}) weights[cnt]:33.54101966249684
cnt:3 a:(1, 2, {}) weights[cnt]:11.180339887498949
cnt:4 a:(1, 3, {}) weights[cnt]:11.180339887498949
cnt:5 a:(1, 4, {}) weights[cnt]:22.360679774997898
cnt:6 a:(2, 3, {}) weights[cnt]:11.180339887498949
cnt:7 a:(2, 4, {}) weights[cnt]:11.180339887498949
cnt:8 a:(3, 4, {}) weights[cnt]:22.360679774997898

*********graph

Graph with 5 nodes and 9 edges

clusterid:

In [51]:
print(type(partitionDict))
print(partitionDict.keys())
for k in partitionDict.keys():
    print("partitionDict[{}]:{}".format(k, partitionDict[k]))

<class 'dict'>
dict_keys([0, 1, 2, 3, 4])
partitionDict[0]:0
partitionDict[1]:1
partitionDict[2]:0
partitionDict[3]:0
partitionDict[4]:1


In [60]:
# There are 2 best clustors, 0 and 1
# we started with 5 nodes
# nodes 0,2,3 are in cluster id 0
# notes 1,4   are in cluster id 1

max_k_w = []
# com stands for community
print(partitionDict.values())
for com in set(partitionDict.values()):
    print("com:{}".format(com))
    list_nodes = [nodes for nodes in partitionDict.keys()
                  if partitionDict[nodes] == com]
    max_k_w = max_k_w + [list_nodes]
    
print(max_k_w)

dict_values([0, 1, 0, 0, 1])
com:0
com:1
[[0, 2, 3], [1, 4]]


In [63]:
aedwip plot graph, is our interpertation correct

SyntaxError: invalid syntax (1463501384.py, line 1)

In [None]:
plot graph, is our interpertation correct

In [None]:
umap
create mulitple point plogs
color by gtex/tcga
cluster id, 
... 

In [25]:
testNP = np.arange(0,25).reshape((5,5))
print(testNP)
print(testNP[2,2])


[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]
12
