In [19]:
import requests
import json
from tqdm import tqdm

In [20]:
# Gene identifiers to look up; the next cell submits each entry, one request per gene,
# to the Reactome AnalysisService and stores the response under the same string.
genes = ['TSC22D1',
 'KLF1',
 'MAP2K6',
 'CEBPE',
 'RUNX1T1',
 'MAML2',
 'CBL',
 'PTPN9',
 'TGFBR2',
 'ETS2',
 'SGK1',
 'TBX3',
 'DUSP9',
 'SPI1',
 'ELMSAN1',
 'UBASH3B',
 'PTPN12',
 'FOXA1',
 'FOXA3',
 'IGDCC3',
 'BCORL1',
 'MEIS1',
 'GLB1L2',
 'IKZF3',
 'BAK1',
 'FEV',
 'MAP2K3',
 'SLC38A2',
 'SET',
 'LHX1',
 'TBX2',
 'SLC4A1',
 'RREB1',
 'ZNF318',
 'MAPK1',
 'COL2A1',
 'ZBTB25',
 'MAP4K5',
 'SLC6A9',
 'MIDN',
 'OSR2',
 'DLX2',
 'CBFA2T3',
 'HES7',
 'FOXL2',
 'AHR',
 'FOXO4',
 'RHOXF2BB',
 'S1PR2',
 'POU3F2',
 'LYL1',
 'IER5L',
 'CNN1',
 'CELF2',
 'JUN',
 'CEBPA',
 'MAP4K3',
 'ZC3HAV1',
 'CDKN1A',
 'UBASH3A',
 'PRTG',
 'PTPN1',
 'TP73',
 'MAP7D1',
 'FOSB',
 'C19orf26',
 'IRF1',
 'TMSB4X',
 'BPGM',
 'SAMD1',
 'HOXB9',
 'HOXC13',
 'CKS1B',
 'CLDN6',
 'KIF18B',
 'KIF2C',
 'BCL2L11',
 'COL1A1',
 'CEBPB',
 'FOXF1',
 'ZBTB1',
 'PLK4',
 'ARRDC3',
 'C3orf72',
 'KIAA1804',
 'HNF4A',
 'SNAI1',
 'KMT2A',
 'ISL2',
 'CSRNP1',
 'ARID1A',
 'CNNM4',
 'NCL',
 'ZBTB10',
 'STIL',
 'ATL1',
 'NIT1',
 'CDKN1B',
 'PTPN13',
 'HOXA13',
 'CITED1',
 'PRDM1',
 'HK2',
 'CDKN1C',
 'EGR1']

In [21]:
# Query the Reactome AnalysisService once per gene and keep the parsed JSON response
# in `result`, keyed by the gene identifier that was submitted.
REACTOME_URL = 'https://reactome.org/AnalysisService/identifier/{}/projection'
# Seconds to wait for the server; without a timeout a single stalled connection
# would block the whole loop indefinitely.
REQUEST_TIMEOUT = 30

# Headers and query parameters are the same for every gene, so build them once
# instead of on every iteration.
headers = {
    'accept': 'application/json',
}
params = {
    'interactors': 'true',
    'species': 'Homo sapiens',
    'pageSize': '20',
    'page': '1',
    'sortBy': 'ENTITIES_PVALUE',
    'order': 'ASC',
    'resource': 'TOTAL',
    'pValue': '1',
    'includeDisease': 'true',
}

result = {}
# Genes that produced no entry in `result`, mapped to the HTTP status code or the
# error message, so missing genes can be identified and retried after the run.
failed_genes = {}
for gene in tqdm(genes):
    try:
        response = requests.get(
            REACTOME_URL.format(gene),
            params=params,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        # Best effort: note the failure (including timeouts) and move on to the next gene.
        failed_genes[gene] = str(exc)
        print(f'{gene}: request failed ({exc})')
        continue

    if response.status_code != 200:
        # Report which gene failed, not just the bare status code.
        failed_genes[gene] = response.status_code
        print(f'{gene}: HTTP {response.status_code}')
        continue

    result[gene] = response.json()
    

100%|██████████████████████████████████████████████████████████████████| 105/105 [02:08<00:00,  1.23s/it]


In [22]:
# Write the collected per-gene responses to disk as JSON indented by four spaces.
with open('reactome_genes.json', 'w') as out_handle:
    out_handle.write(json.dumps(result, indent=4))

## Building the similarity/distance matrix

In [1]:
import json
import pandas as pd

In [2]:
# Read the saved Reactome responses back into `data` (indexed by gene below).
# NOTE(review): the fetch section writes 'reactome_genes.json' to the working directory,
# whereas this reads from ./little_data/ -- confirm the file is copied there first.
with open('./little_data/reactome_genes.json', 'r') as in_handle:
    data = json.loads(in_handle.read())

In [4]:
# Show the first pathway record stored for KLF1 to see which fields each record carries.
data['KLF1']['pathways'][0]

{'stId': 'R-HSA-8950505',
 'dbId': 8950505,
 'name': 'Gene and protein expression by JAK-STAT signaling after Interleukin-12 stimulation',
 'species': {'dbId': 48887, 'taxId': '9606', 'name': 'Homo sapiens'},
 'llp': True,
 'entities': {'resource': 'TOTAL',
  'total': 748,
  'found': 1,
  'ratio': 0.03128529005813711,
  'curatedTotal': 73,
  'curatedFound': 0,
  'interactorsTotal': 684,
  'interactorsFound': 1,
  'pValue': 0.03128529005813707,
  'fdr': 0.09845664812413735,
  'exp': []},
 'reactions': {'resource': 'TOTAL', 'total': 36, 'found': 1, 'ratio': 0.0024},
 'inDisease': False}

In [10]:
# Flat list of the dbId of every pathway record of every gene (duplicates kept).
pathways = [
    record['dbId']
    for symbol in list(data.keys())
    for record in data[symbol]['pathways']
]

In [20]:
# Number of distinct pathway dbIds collected across all genes.
len(set(pathways))

882

In [11]:
# Gene-by-pathway membership table. A cell is set to 1 only when the pathway record
# for that gene passes all three filters below; every other cell keeps the NaN the
# constructor put there.
pval = 0.05
gene_path = pd.DataFrame(index=list(data.keys()), columns=list(set(pathways)))
for gene, record in data.items():
    for path in record['pathways']:
        stats = path['entities']
        if not stats['pValue'] < pval:
            continue  # p-value is not below the cutoff
        if stats['found'] != 1:
            continue  # only records with exactly one found entity are kept
        if path['inDisease'] == True:
            gene_path.loc[gene, path['dbId']] = 1

In [12]:
# Column-wise sum counts the genes flagged for each pathway; show the largest count.
gene_path.sum(axis=0).max()

5

In [13]:
# Empty gene-by-gene frame (all NaN) with the genes of `data` on both axes.
gene_names = list(data.keys())
sim_mtx = pd.DataFrame(index=gene_names, columns=gene_names)

In [14]:
# Fill sim_mtx so that entry (gene1, gene2) is the number of pathways flagged for
# both genes; the diagonal therefore holds each gene's own pathway count.
#
# A pairwise Python loop would re-derive each gene's pathway list once per pair.
# The product of the 0/1 membership table with its transpose yields the same
# intersection sizes in a single vectorised step.
membership = gene_path.notna().astype(int)      # 1 where a pathway was flagged, else 0
shared_counts = membership.dot(membership.T)    # genes x genes, integer counts

# Preserve the row/column order of the pre-allocated frame. Every cell is populated
# by the product, so no fillna pass is required.
sim_mtx = shared_counts.loc[sim_mtx.index, sim_mtx.columns]

In [16]:
# Save the gene-by-gene matrix as CSV. Despite "distance" in the file name, the values
# are shared-pathway counts, i.e. larger numbers mean more similar genes.
sim_mtx.to_csv('reactome_distance_mtx_inDisease.csv')