# Graph clustering using the Louvain algorithm

In [1]:
import pickle
import networkx as nx
import numpy as np

In [2]:
# Set the random seed for reproducibility
import numpy as np
np.random.seed(42)
# Set the seed in python
import random
random.seed(42)

In [3]:
graph = pickle.load(open('../data/filtered_graph.pickle','rb'))

In [4]:
gene_id_to_name = pickle.load(open('../data/gene_id_to_name.pickle','rb'))
protein_id_to_name = pickle.load(open('../data/protein_id_to_name.pickle','rb'))

## Louvain Clustering

In [5]:
# Cluster the graph using the Louvain method
from networkx.algorithms.community import louvain_communities, greedy_modularity_communities
communities = greedy_modularity_communities(graph, weight=None)

In [6]:
len(communities)

17

## Calculate enrichment of genes in each cluster in the network

In [7]:
from pprint import pprint
import json
import requests

In [8]:
go_names = pickle.load(open('../data/go_bio_process_2021_names.pickle','rb'))

In [9]:
len(go_names)

14937

In [10]:
def query_enrichr(names, gene_set_library):
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList'
    genes_str = '\n'.join(names)
    description = 'Example gene list'
    payload = {
        'list': (None, genes_str),
        'description': (None, description)
    }

    response = requests.post(ENRICHR_URL, files=payload)
    if not response.ok:
        raise Exception('Error analyzing gene list')

    data = json.loads(response.text)
#     print(data)

    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/enrich'
    query_string = '?userListId=%s&backgroundType=%s'
    user_list_id = data['userListId']
    response = requests.get(
        ENRICHR_URL + query_string % (user_list_id, gene_set_library)
     )
    if not response.ok:
        raise Exception('Error fetching enrichment results')
    return json.loads(response.content)

In [11]:
def print_enrichment(response, gene_set_library, threshold=.01, pos_filter_words=None, neg_filter_words=None, n=None):
    if n is None:
        n = len(response[gene_set_library])
    
    for i,enr in enumerate(response[gene_set_library][:n]):
        pvalue = float(enr[6])
        term = enr[1]
        match=False
        if pvalue < threshold:
            if pos_filter_words:
                if any([word in term for word in pos_filter_words]):
                    match=True
            else:
                match=True
            if neg_filter_words:
                if any([word in term for word in neg_filter_words]):
                    match=False
            else:
                match=True
                    
        if match:
            for j in [1, 6]:
                print(headers[j], enr[j])
            print(', '.join(enr[5]))
            print('-')

In [12]:
gene_set_library = 'GO_Biological_Process_2021'

In [13]:
headers = ['Rank', 'Term name', 'P-value', 'Z-score', 'Combined score', 'Overlapping genes', 
           'Adjusted p-value', 'Old p-value', 'Old adjusted p-value']

In [14]:
protein_id_to_name = pickle.load(open('../data/protein_id_to_name.pickle','rb'))

In [15]:
protein_id_to_synonyms = pickle.load(open('../data/protein_id_to_synonyms.pickle','rb'))

In [16]:
responses = []
i=0
matched = 0
total = 0
no_protein = []
not_in_go = []
for genes in communities:
    # Get the names of the genes in the community
    gene_names = []
    for gene in genes:
        total+=1
        matched_to_go = False
        # print(i, gene)
        if gene in protein_id_to_synonyms:
            for name in protein_id_to_synonyms[gene]:
                if name in go_names and not matched_to_go:
                    # print(i, gene, name)
                    gene_names.append(name)
                    matched+=1
                    matched_to_go = True
            if not matched_to_go:    
                not_in_go.append((gene, protein_id_to_name[gene]))
        else:
            no_protein.append(gene)
    response = query_enrichr(gene_names, gene_set_library)
    responses.append((i, response))
    i+=1
print(matched, total, len(no_protein), len(not_in_go))
    

252 271 0 19


In [17]:
# for p in not_in_go: print(p)

In [18]:
positive_words = ['differentiation', 'development', 'signal', 'matrix', 'organization', 'proliferation', 'stem', 'pathway', 'morpho', 'mesoderm', 'endoderm', 'different', 'specification']
negative_words = ['transcription']

In [19]:
for block_id, response in responses:
    print("------------------------------------")
    print("BLOCK", block_id)
    print("------------------------------------")    
    print_enrichment(response, gene_set_library, pos_filter_words=positive_words, neg_filter_words=negative_words, n=10)

------------------------------------
BLOCK 0
------------------------------------
Term name negative regulation of myeloid cell differentiation (GO:0045638)
Adjusted p-value 0.0002934760768151128
HOXA9, ZFP36, MAFB
-
Term name endodermal cell fate specification (GO:0001714)
Adjusted p-value 0.0009919001646572165
NANOG, POU5F1
-
------------------------------------
BLOCK 1
------------------------------------
Term name circulatory system development (GO:0072359)
Adjusted p-value 6.315793405391344e-10
TBX1, FOXC2, MEF2C, FOXC1, FOXF1, SOX9, ISL1, MIXL1
-
Term name heart development (GO:0007507)
Adjusted p-value 4.420628936509694e-09
TBX1, FOXC2, MEF2C, FOXC1, FOXF1, SOX9, ISL1, MIXL1
-
Term name mesenchymal cell differentiation (GO:0048762)
Adjusted p-value 5.959829008250859e-09
MEF2C, FOXC1, FOXF2, TBX20, SOX9, ISL1
-
Term name endocardial cushion development (GO:0003197)
Adjusted p-value 1.227682201839613e-06
FOXF1, TBX20, SOX9, ISL1
-
Term name anterior/posterior pattern specification

## Export nodes with cluster assignments

In [29]:
pickle.dump(communities, open('../data/louvain_clusters.pickle','wb'))

## Export enrichments as pickle

In [30]:
threshold = .01

enrichments = []
for cluster,response in enumerate(responses):
    for enr in response[1][gene_set_library]:
        pvalue = float(enr[6])
        term = enr[1]
        genes = enr[5]
        enrichments.append((cluster, term, pvalue, genes))

pickle.dump(enrichments, open('../data/cluster_enrichments_louvain.pickle','wb'))