# Graph clustering using the Louvain algorithm

In [1]:
import pickle
import networkx as nx
import numpy as np

In [2]:
# Pin both global random number generators to one seed so that repeated runs
# of the notebook draw the same random numbers.
import random
import numpy as np

SEED = 42
random.seed(SEED)     # Python's built-in generator
np.random.seed(SEED)  # NumPy's global generator

In [3]:
# Load the pickled graph. The context manager closes the file handle as soon
# as unpickling finishes instead of leaving it open for the whole session.
# NOTE(review): pickle can execute arbitrary code; only load trusted files.
with open('../data/filtered_graph.pickle', 'rb') as graph_file:
    graph = pickle.load(graph_file)

In [7]:
# Load the pickled id -> name lookup tables. Each file is opened in a context
# manager so its handle is closed right after unpickling.
with open('../data/gene_id_to_name.pickle', 'rb') as gene_names_file:
    gene_id_to_name = pickle.load(gene_names_file)
with open('../data/protein_id_to_name.pickle', 'rb') as protein_names_file:
    protein_id_to_name = pickle.load(protein_names_file)

## Louvain Clustering

In [17]:
# Cluster the graph into communities.
# NOTE: despite the notebook title, the call below uses
# greedy_modularity_communities (greedy modularity maximisation), not
# louvain_communities, which is imported here but not called.
# weight=None is passed, so no edge attribute is used as a weight.
from networkx.algorithms.community import louvain_communities, greedy_modularity_communities
communities = greedy_modularity_communities(graph, weight=None)

In [18]:
len(communities)

15

## Calculate enrichment of genes in each cluster in the network

In [21]:
from pprint import pprint
import json
import requests

In [22]:
# Load the pickled collection of gene names known to the GO library; it is
# used below for membership tests. The context manager closes the file handle
# right after unpickling.
with open('../data/go_bio_process_2021_names.pickle', 'rb') as go_names_file:
    go_names = pickle.load(go_names_file)

In [23]:
len(go_names)

14937

In [24]:
def query_enrichr(names, gene_set_library, timeout=60):
    """Run an Enrichr enrichment query for a list of gene names.

    The gene list is first uploaded to the ``addList`` endpoint; the
    ``userListId`` from that reply is then used to request results from the
    ``enrich`` endpoint.

    Args:
        names: Iterable of gene name strings; they are sent newline-joined.
        gene_set_library: Gene-set library name, sent as the
            ``backgroundType`` query parameter.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        The decoded JSON body of the ``enrich`` response.

    Raises:
        Exception: If either request returns a non-OK status.
        requests.exceptions.Timeout: If a request exceeds ``timeout``.
    """
    base_url = 'https://maayanlab.cloud/Enrichr'

    # Upload the gene list as multipart form fields; a (None, value) tuple
    # sends a plain field with no filename attached.
    payload = {
        'list': (None, '\n'.join(names)),
        'description': (None, 'Example gene list'),
    }
    response = requests.post(base_url + '/addList', files=payload, timeout=timeout)
    if not response.ok:
        raise Exception('Error analyzing gene list')
    user_list_id = json.loads(response.text)['userListId']

    # Let requests build the query string so both values are URL-encoded.
    response = requests.get(
        base_url + '/enrich',
        params={'userListId': user_list_id, 'backgroundType': gene_set_library},
        timeout=timeout,
    )
    if not response.ok:
        raise Exception('Error fetching enrichment results')
    return json.loads(response.content)

In [25]:
def print_enrichment(response, gene_set_library, threshold=.01, pos_filter_words=None, neg_filter_words=None, n=None):
    """Print the significant, filter-matching terms of an enrichment response.

    Each row is indexed by position: ``enr[1]`` is the term name, ``enr[5]``
    the overlapping genes and ``enr[6]`` the value compared to ``threshold``
    (labelled by the module-level ``headers`` list).

    A row is printed only when all of the following hold:
      * ``float(enr[6]) < threshold``;
      * ``pos_filter_words`` is empty/None, or the term contains at least one
        of those words as a substring;
      * ``neg_filter_words`` is empty/None, or the term contains none of
        those words as a substring.

    Args:
        response: Mapping from library name to a list of result rows.
        gene_set_library: Key of the rows to print inside ``response``.
        threshold: Exclusive upper bound on ``enr[6]``.
        pos_filter_words: Optional substrings, one of which must be present.
        neg_filter_words: Optional substrings, none of which may be present.
        n: Only the first ``n`` rows are considered; None means all rows.
    """
    # Slicing with n=None keeps every row, so no explicit length is needed.
    for enr in response[gene_set_library][:n]:
        adjusted_pvalue = float(enr[6])
        term = enr[1]

        if not adjusted_pvalue < threshold:
            continue
        # The positive filter must hold on its own; a missing negative filter
        # must not override a failed positive match.
        if pos_filter_words and not any(word in term for word in pos_filter_words):
            continue
        if neg_filter_words and any(word in term for word in neg_filter_words):
            continue

        for j in (1, 6):
            print(headers[j], enr[j])
        print(', '.join(enr[5]))
        print('-')

In [26]:
# Name of the gene-set library: passed to query_enrichr for each community and
# used by print_enrichment as the key into each response.
gene_set_library = 'GO_Biological_Process_2021'

In [27]:
# Labels for the fields of one enrichment result row, by position.
# print_enrichment prints headers[j] next to enr[j] for the fields it shows.
headers = ['Rank', 'Term name', 'P-value', 'Z-score', 'Combined score', 'Overlapping genes', 
           'Adjusted p-value', 'Old p-value', 'Old adjusted p-value']

In [66]:
# (Re)load the pickled protein id -> name table. The context manager closes
# the file handle right after unpickling.
with open('../data/protein_id_to_name.pickle', 'rb') as protein_names_file:
    protein_id_to_name = pickle.load(protein_names_file)

In [67]:
# Load the pickled protein id -> synonyms table. The context manager closes
# the file handle right after unpickling.
with open('../data/protein_id_to_synonyms.pickle', 'rb') as synonyms_file:
    protein_id_to_synonyms = pickle.load(synonyms_file)

In [74]:
# For every community, translate its nodes into names found in go_names and
# run one Enrichr query per community.
responses = []    # (community index, Enrichr response) pairs
no_protein = []   # nodes with no entry in protein_id_to_synonyms
not_in_go = []    # (node, protein name) pairs with no synonym in go_names
matched = 0
total = 0
for community_index, community in enumerate(communities):
    gene_names = []
    for protein_id in community:
        total += 1
        if protein_id not in protein_id_to_synonyms:
            no_protein.append(protein_id)
            continue
        # Keep only the first synonym that appears in go_names.
        for synonym in protein_id_to_synonyms[protein_id]:
            if synonym in go_names:
                gene_names.append(synonym)
                matched += 1
                break
        else:
            not_in_go.append((protein_id, protein_id_to_name[protein_id]))
    response = query_enrichr(gene_names, gene_set_library)
    responses.append((community_index, response))
print(matched, total, len(no_protein), len(not_in_go))
    

252 271 0 19


In [75]:
# for p in not_in_go: print(p)

In [76]:
# Substring filters handed to print_enrichment below (matching is `word in term`):
# with both lists given, a term is shown only if it contains at least one
# positive word and none of the negative words.
positive_words = ['differentiation', 'development', 'signal', 'matrix', 'organization', 'proliferation', 'stem', 'pathway', 'morpho', 'mesoderm', 'endoderm', 'different', 'specification']
negative_words = ['transcription']

In [77]:
# Print a banner per community followed by its filtered enrichment terms,
# looking only at the first 10 rows of each response.
separator = "------------------------------------"
for block_id, response in responses:
    print(separator)
    print(f"BLOCK {block_id}")
    print(separator)
    print_enrichment(
        response,
        gene_set_library,
        pos_filter_words=positive_words,
        neg_filter_words=negative_words,
        n=10,
    )

------------------------------------
BLOCK 0
------------------------------------
Term name extracellular matrix organization (GO:0030198)
Adjusted p-value 1.7453883599159708e-07
VTN, MMP25, COL4A2, COL5A1, MMP15, MMP2, SERPINH1, A2M, MMP9
-
Term name endodermal cell differentiation (GO:0035987)
Adjusted p-value 2.2988642956325475e-07
VTN, COL4A2, MMP15, MMP2, MMP9
-
Term name endoderm formation (GO:0001706)
Adjusted p-value 2.855289674710371e-07
VTN, COL4A2, MMP15, MMP2, MMP9
-
------------------------------------
BLOCK 1
------------------------------------
------------------------------------
BLOCK 2
------------------------------------
Term name circulatory system development (GO:0072359)
Adjusted p-value 9.907843961233842e-08
TBX1, FOXC2, MEF2C, FOXC1, FOXF1, ISL1, MIXL1
-
Term name heart development (GO:0007507)
Adjusted p-value 5.215457476601786e-07
TBX1, FOXC2, MEF2C, FOXC1, FOXF1, ISL1, MIXL1
-
Term name mesenchymal cell differentiation (GO:0048762)
Adjusted p-value 1.21315860

## Export nodes with cluster assignments

In [39]:
# Write one CSV row per node: its id, a '/'-joined label built from
# protein_id_to_name, and the block (cluster) it belongs to. The context
# manager guarantees the file is flushed and closed even if a lookup or
# write raises part-way through.
# NOTE(review): `ids` is not defined in the visible cells, and
# `graph.vertex_properties` does not look like an attribute of the networkx
# graph clustered above (presumably left over from a graph-tool version of
# this notebook) -- confirm before re-running this cell.
with open('../data/connected_graph_nodes_clusters.csv', 'w') as cluster_file:
    cluster_file.write('Id,Label,HierarchicalCluster\n')
    for block, node_ids in ids.items():
        for node_id in node_ids:
            ensembl_id = graph.vertex_properties['id'][node_id]
            label = '/'.join(protein_id_to_name[ensembl_id])
            cluster_file.write(','.join([ensembl_id, label, str(block)]))
            cluster_file.write('\n')