In [13]:
import csv
import numpy as np
import requests

with open("../data/child_trait.tsv") as file:
    tsv_file = csv.reader(file, delimiter="\t")
    sign_values = []
    genes = []
    
    for line in tsv_file:
        if line[27] == 'P-VALUE':
            continue
        p_value = float(line[27])
        
        if p_value < 0.05:
            sign_l = (p_value, line[14])
            sign_values.append(sign_l)
            
            if ',' in line[14]:
                genes.extend(line[14].split(','))
            elif len(line[14]) == 0:
                continue
            elif ' - ' in line[14]:
                genes.append(line[14].replace(' - ', '-'))
            else:
                genes.append(line[14])
            
        
    unique_genes = list(dict.fromkeys(genes))
    
    f = open("../data/mapped_genes.txt", "w")
    for gene in unique_genes:
        f.write(gene + ', ')
    f.close()



#map the genes to proteins
string_api_url = "https://string-db.org/api"
output_format = "tsv-no-header"
method = "get_string_ids"
species = "9606"  




#genes = ['TIMP2', 'SELP', 'RBMS3', 'APOC1-APOC1P1', 'LINC02388-LRIG3', 'CELF1']
print('first 10 unique genes:', unique_genes[:10])
query = "%0d".join(unique_genes[:10])



request_url = f"{string_api_url}/{output_format}/{method}?identifiers={query}&species={species}"


response = requests.get(request_url)
protein_ids =[]


protein_mappings = response.text.split("\n")
for line in protein_mappings:
    if line.strip():
        fields = line.split("\t")
        if len(fields) > 1:
            protein_id = fields[1]
            protein_ids.append(protein_id)

print(protein_ids)

    

print('mapped unique genes (significant values): ', len(unique_genes))
#print('mapped unique genes (last 10)', unique_genes)

    


with open("../data/alzheimer_background.tsv") as file:
    tsv_file = csv.reader(file, delimiter="\t")
    l1 = []
    l2 = []
    for line in tsv_file:
        if line[27] == 'P-VALUE':
            continue
        p_value = float(line[27])
        t = (p_value, line[14])
        l1.append(t)
        if p_value < 0.05:
            sign_l = (p_value, line[14])
            l2.append(sign_l)
    
    #print('including background data:', l1[:10])
    print('number of tuples:', len(l1))
    print('number of sign values:', len(l2))
    
# line = {'p-value': 1, 'mapped-genes': [a, b c]}

first 10 unique genes: ['TIMP2', ' CEP295NL', 'SELP', 'RBMS3', 'TAFA5', 'LINC02388-LRIG3', 'PPP5D1P', ' PPP5D1P', 'KLHL36-USP10', 'CIMAP3']
['9606.ENSP00000262768', '9606.ENSP00000312767', '9606.ENSP00000263686', '9606.ENSP00000373277', '9606.ENSP00000383933']
mapped unique genes (significant values):  1919
number of tuples: 3134
number of sign values: 3134


In [1]:
import requests
import csv

string_api_url = "https://string-db.org/api"
output_format = "tsv-no-header"
method = "get_string_ids"
species = "9606"



with open("../data/mapped_genes.txt") as file:
    txt_file = csv.reader(file, delimiter="\t")
    genes = []
    
    for gene in txt_file:
        if gene:
            genes.append(gene[0])
    


genes = ['TIMP2', 'SELP', 'RBMS3', 'APOC1-APOC1P1', 'LINC02388-LRIG3', 'CELF1']  
query = "%0d".join(genes)  



request_url = f"{string_api_url}/{output_format}/{method}?identifiers={query}&species={species}"


response = requests.get(request_url)

protein_ids = []

protein_mappings = response.text.split("\n")
for line in protein_mappings:
    if line.strip():  
        fields = line.split("\t")
        if len(fields) > 1:  
            protein_id = fields[1]  
            protein_ids.append(protein_id)


print("Protein IDs:", protein_ids)
    


Protein IDs: ['9606.ENSP00000262768', '9606.ENSP00000263686', '9606.ENSP00000373277', '9606.ENSP00000436864']


In [16]:
import random

method = "network"
#string_ids = "9606.ENSP00000312767%0d9606.ENSP00000263686%0d9606.ENSP00000373277%0d9606.ENSP00000383933"  # Example protein IDs
string_ids = "9606.ENSP00000312767"

request_url = f"{string_api_url}/{output_format}/{method}?identifiers={string_ids}&species={species}"
response = requests.get(request_url)

# Parse the response to get edges
interactions = response.text.split("\n")
for interaction in interactions:
    print(interaction)  # Interactions between proteins

interaction_data = [line.split("\t") for line in interactions if line]

# Counting edges
num_edges = len(interaction_data)
print(f"Number of edges: {num_edges}")


# Assuming `all_proteins` is a list of all STRING protein IDs
#random_proteins = random.sample(all_proteins, len(genes))

9606.ENSP00000261778	9606.ENSP00000312767	TANGO6	CEP295NL	9606	0.447	0	0	0	0	0	0	0.447
9606.ENSP00000312767	9606.ENSP00000339179	CEP295NL	TSSK4	9606	0.432	0	0	0	0	0	0	0.432
9606.ENSP00000312767	9606.ENSP00000325978	CEP295NL	LRRC31	9606	0.501	0	0	0	0	0	0	0.501
9606.ENSP00000312767	9606.ENSP00000424711	CEP295NL	CLRN2	9606	0.53	0	0	0	0	0	0	0.53
9606.ENSP00000325978	9606.ENSP00000424711	LRRC31	CLRN2	9606	0.425	0	0	0	0.055	0	0	0.417

Number of edges: 5


In [None]:
import requests

STRING_API_URL = "https://string-db.org/api"
OUTPUT_FORMAT = "tsv-no-header"
METHOD = "get_string_ids"
SPECIES = "9606"


GWAS_API_URL = f"https://www.ebi.ac.uk/gwas/rest/api/efoTraits/search/findByEfoTraitContains?efoTrait={disease_name}"

In [29]:
import requests

STRING_API_URL = "https://string-db.org/api"
OUTPUT_FORMAT = "tsv-no-header"
METHOD = "get_string_ids"
SPECIES = "9606"


class Disease:
    def __init__(self, name, efo_id):
        self.name = name
        #self.efo_id = self.get_efo()
        self.efo_id = efo_id
        self.genes = []
        self.proteins = []
        self.interactions = []
        self.num_edges = None
    
    def get_efo(self):
        url = f"https://www.ebi.ac.uk/gwas/rest/api/efoTraits/search/findByEfoTraitContains?efoTrait={self.name}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            for efo_trait in data['_embedded']['efoTraits']:
                print(f"Name: {efo_trait['trait']}, EFO ID: {efo_trait['shortForm']}")
        else:
            print(f"Failed to retrieve data: {response.status_code}")
            
    def get_genes(self):
        apiUrl = 'https://www.ebi.ac.uk/gwas/rest/api'
        url = '%s/efoTraits/%s/associations?projection=associationByStudy' %(apiUrl, self.efo_id)
        return url
    
    def get_proteins(self):
        query = "%0d".join(self.genes)
        method = "get_string_ids"
        url = f"https://string-db.org/api/tsv-no-header/get_string_ids?identifiers={query}&species=9606"
        return url
    
    def get_clustering(self):
        method = "network"
        url = f"{STRING_API_URL}/{OUTPUT_FORMAT}/{method}?identifiers={self.proteins}&species={SPECIES}"
        return url
    
    def send_request(self, url):
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Failed to retrieve data: {response.status_code}")
            return None
        
    def quantify_clustering(self):
        #for gene retrieval
        response = self.send_request(self.get_genes())
        associations = response['_embedded']['associations']
        for association in associations:
            loci = association['loci']
            for locus in loci:
                author_reported_genes = locus['authorReportedGenes']
                for gene in author_reported_genes:
                    gene_name = gene['geneName']
                    self.genes.append(gene_name)
                    
        self.genes = list(dict.fromkeys(self.genes))
        print(f"Genes: {len(self.genes)}")
        
        #for protein mappings
        response = self.send_request(self.get_proteins())
        if response:
            protein_mappings = response.text.split("\n")
            for line in protein_mappings:
                if line.strip():  
                    fields = line.split("\t")
                    if len(fields) > 1:  
                        protein_id = fields[1]  
                        self.proteins.append(protein_id)
            self.proteins = list(dict.fromkeys(self.proteins))
            print(f"Proteins: {len(self.proteins)}")
        else:
            print("Failed to retrieve protein mappings.")
            
        '''for line in protein_mappings:
            if line.strip():  
                fields = line.split("\t")
                if len(fields) > 1:  
                    protein_id = fields[1]  
                    self.proteins.append(protein_id)'''
                    
        #for clustering the proteins
        response = self.send_request(self.get_clustering())
        interactions = response.text.split("\n")
        for interaction in interactions:
            print(interaction)
            self.interactions.append(interaction)
            
        interaction_data = [line.split("\t") for line in interactions if line]
        self.num_edges = len(interaction_data)
        print(f"Number of edges: {self.num_edges}")
        
        

disease1 = Disease("Alzheimer's", 'MONDO_0004975')
disease1.quantify_clustering()

Genes: 513


JSONDecodeError: Extra data: line 1 column 3 (char 2)

In [7]:

import requests
import pandas as pd
import random

STRING_API_URL = "https://string-db.org/api"
OUTPUT_FORMAT = "tsv-no-header"
SPECIES = "9606"
REQUIRED_SCORE = 700

aliases_df = pd.read_csv('../data/9606.protein.aliases.v12.0.txt', sep='\t')
ALL_PROTEINS = aliases_df['string_protein_id'].unique().tolist()

print(f"Total number of human proteins: {len(ALL_PROTEINS)}\n")

class Disease:
    def __init__(self, name, efo_id):
        self.name = name
        self.efo_id = efo_id
        self.genes = []
        self.proteins = []
        self.interactions = []
        self.num_edges = 0
        self.all_string_proteins = set(ALL_PROTEINS)
    
    def __str__(self):
        return self.name

    def send_request(self, url):
        response = requests.get(url)
        if response.status_code == 200:
            return response
        else:
            print(f"Failed to retrieve data: {response.status_code}\n")
            return None

    def get_genes(self):
        api_url = 'https://www.ebi.ac.uk/gwas/rest/api'
        url = f'{api_url}/efoTraits/{self.efo_id}/associations?projection=associationByStudy'
        return url

    def get_proteins(self, genes):
        query = "%0d".join(genes)
        url = f"{STRING_API_URL}/{OUTPUT_FORMAT}/get_string_ids?identifiers={query}&species={SPECIES}"
        print(f'retrieve proteins url: {url}\n')
        return url

    def batch_list(self, data_list, batch_size):
        for i in range(0, len(data_list), batch_size):
            yield data_list[i:i + batch_size]
    
    def get_clustering(self, required_score=700):
        #protein_batches = list(self.batch_list(self.proteins, 100))
        #urls = []
        #for batch in protein_batches:
        identifiers = "%0A".join(self.proteins)
        url = f"{STRING_API_URL}/{OUTPUT_FORMAT}/network?identifiers={identifiers}&species={SPECIES}&required_score={required_score}"
        #urls.append(url)
        print(f'protein clustering url: {url}\n')
        return url

    def get_random_edges(self, num_nodes, iterations=10, required_score=700):
        total_edges = 0

        '''for _ in range(iterations):
            random_num_edges = 0
            
            random_proteins = random.sample(list(self.all_string_proteins), num_nodes)
            random_protein_batches = list(self.batch_list(random_proteins, 100))
            urls = []
            for batch in random_protein_batches:
                identifiers = "%0A".join(batch)
                url = f"{STRING_API_URL}/{OUTPUT_FORMAT}/network?identifiers={identifiers}&species={SPECIES}"
                urls.append(url)

            for url in urls:
                response = self.send_request(url)
                if response:
                    interactions = response.text.split("\n")
                    for interaction in interactions:
                        if interaction.strip():
                            self.interactions.append(interaction)
                    interaction_data = [line.split("\t") for line in interactions if line]
                    random_num_edges += len(interaction_data)
                else:
                    print("Failed to retrieve clustering data.")'''
                    
        for _ in range(iterations):
            random_num_edges = 0
            
            random_proteins = random.sample(list(self.all_string_proteins), num_nodes)
            identifiers = "%0A".join(random_proteins)
            url = f"{STRING_API_URL}/{OUTPUT_FORMAT}/network?identifiers={identifiers}&species={SPECIES}&required_score={required_score}"
            response = self.send_request(url)
            if response:
                interactions = response.text.split("\n")
                for interaction in interactions:
                    if interaction.strip():
                        self.interactions.append(interaction)
                interaction_data = [line.split("\t") for line in interactions if line]
                random_num_edges += len(interaction_data)
            else:
                print("Failed to retrieve clustering data.")
        print(f'random edges url: {url}\n')
        
        return random_num_edges / iterations
    
    def get_ppi_enrichment(self):
        query = "%0d".join(self.proteins)
        url = f"{STRING_API_URL}/{OUTPUT_FORMAT}/ppi_enrichment?identifiers={query}&species={SPECIES}&required_score={REQUIRED_SCORE}"
        print(f'get ppi enrichment url: {url}\n')
        return url
        
    def quantify_clustering(self):
        # Retrieve genes
        response = self.send_request(self.get_genes())
        print(f'Get genes url: {self.get_genes()}')
        count = 0
        if response:
            associations = response.json()['_embedded']['associations']
            for association in associations:
                loci = association['loci']
                for locus in loci:
                    author_reported_genes = locus['authorReportedGenes']
                    for gene in author_reported_genes:
                        gene_name = gene['geneName']
                        count += 1
                        if count <= 800:
                            self.genes.append(gene_name)
            self.genes = list(dict.fromkeys(self.genes))
            print(f"Genes: {len(self.genes)}\n")
        
        # Retrieve protein mappings
        '''for gene_batch in self.batch_list(self.genes, 200):
            response = self.send_request(self.get_proteins(gene_batch))
            if response:
                protein_mappings = response.text.split("\n")
                for line in protein_mappings:
                    if line.strip(): 
                        fields = line.split("\t")
                        if len(fields) > 1:
                            protein_id = fields[1]  
                            self.proteins.append(protein_id)
                self.proteins = list(dict.fromkeys(self.proteins))
                print(f"Proteins: {len(self.proteins)}")
            else:
                print("Failed to retrieve protein mappings.")'''
        
        response = self.send_request(self.get_proteins(self.genes))
        if response:
            protein_mappings = response.text.split("\n")
            for line in protein_mappings:
                if line.strip(): 
                    fields = line.split("\t")
                    if len(fields) > 1:
                        protein_id = fields[1]  
                        self.proteins.append(protein_id)
            self.proteins = list(dict.fromkeys(self.proteins))
            print(f"Proteins: {len(self.proteins)}\n")
        else:
            print("Failed to retrieve protein mappings.")

        # For clustering the proteins
        '''clustering_urls = self.get_clustering()
        for url in clustering_urls:
            response = self.send_request(url)
            if response:
                interactions = response.text.split("\n")
                for interaction in interactions:
                    if interaction.strip():
                        self.interactions.append(interaction)
                interaction_data = [line.split("\t") for line in interactions if line]
                self.num_edges += len(interaction_data)
            else:
                print("Failed to retrieve clustering data.")
        print(f"Number of edges: {self.num_edges}")'''
        
        response = self.send_request(self.get_clustering())
        if response:
            interactions = response.text.split("\n")
            for interaction in interactions:
                if interaction.strip():
                    self.interactions.append(interaction)
            interaction_data = [line.split("\t") for line in interactions if line]
            self.num_edges += len(interaction_data)
        else:
            print("Failed to retrieve clustering data.")
        print(f"Number of edges: {self.num_edges}\n")
        
        # Compare to random set of protein clustering
        random_edges = self.get_random_edges(len(self.proteins))
        
        print(f"Average number of edges for random nodes: {random_edges}\n")

        if self.num_edges > random_edges:
            print("The disease genes show higher clustering than random nodes.")
        else:
            print("The disease genes do not show higher clustering than random nodes.")
        print('--------------------------------------------\n')
        
# Usage
disease1 = Disease("Alzheimer's Disease", 'MONDO_0004975')
print(disease1, '\n')
disease1.quantify_clustering()


disease2 = Disease("Parkinson Disease", 'MONDO_0005180')
print(disease2, '\n')
disease2.quantify_clustering()

disease3 = Disease("Celiac Disease", 'EFO_0001060')
print(disease3, '\n')
disease3.quantify_clustering()

disease4 = Disease("Schizophrenia", 'MONDO_0005090')
print(disease4, '\n')
disease4.quantify_clustering()


Total number of human proteins: 19699
Alzheimer's Disease
Get genes url: https://www.ebi.ac.uk/gwas/rest/api/efoTraits/MONDO_0004975/associations?projection=associationByStudy
Genes: 324
%0dCDON%0dPCNX%0dFANCD2%0dFANCD2OS%0dBCAM%0dPVR%0dEXOC3L2%0dMS4A3%0dCUGBP2%0dIL6R%0dTDRD10%0dSHE%0dUBE2Q1%0dADAR%0dCYB561%0dWTAPP1%0dMMP13%0dDCUN1D5%0dCCBP2%0dCCRL2%0dCCR5%0dCCR2%0dLTF%0dCCR3%0dC17orf28%0dCDR2L%0dSH2D4B%0dKANSL1%0dLRRC27A%0dMAPT%0dWNT3%0dKCNN3%0dSOX14%0dCLDN18%0dPGBD3P4%0dKCTD8%0dIL21%0dCETN4P%0dSERINC5%0dKRT18P45%0dCDC42SE2%0dRAPGEF6%0dFNIP1%0dACSL6%0dPFDN1%0dHBEGF%0dFAM20C%0dNPVF%0dUBA52P1%0dTAS2R62P%0dUSP6NL%0dMKI67%0dMGMT%0dMS4%0dMS4A6E%0dCRHR1%0dMACROD2%0dPPIAP17%0dDAPL1%0dOR7E89P%0dPDE1A%0dCPS1%0dRPS27P10%0dCADM2%0dFBXO40%0dLINC00290%0dMGC45800%0dOFCC1%0dRPL7AP36%0dPEX6%0dRPL3P8%0dIMMP2L%0dSORD%0dVN2R20P%0dRPL13AP7%0dIRF6%0dGAPDHP15%0dRBBP4P4%0dPMS2CL%0dZNF12%0dTHSD7A%0dTMEM106B%0dIL19%0dNCKAP5%0dCCDC85C%0dSDR9C7%0dALDH4A1%0dPKNOX2%0dNARS2%0dNFIC%0dCDC42EP3%0dPAK2%0dZPBP%0dADCY8%

In [2]:
# Quantify protein clustering for certain diseases
import requests
import pandas as pd
import random
from PIL import Image
from io import BytesIO

STRING_API_URL = "https://string-db.org/api"
OUTPUT_FORMAT = "tsv-no-header"
SPECIES = "9606"
REQUIRED_SCORE = 700

aliases_df = pd.read_csv('../data/9606.protein.aliases.v12.0.txt', sep='\t')
ALL_PROTEINS = aliases_df['string_protein_id'].unique().tolist()

print(f"Total number of human proteins: {len(ALL_PROTEINS)}\n")

class Disease:
    def __init__(self, name, efo_id):
        self.name = name
        self.efo_id = efo_id
        self.genes = []
        self.proteins = []
        self.interactions = []
        self.num_edges = 0
        self.all_string_proteins = set(ALL_PROTEINS)
    
    def __str__(self):
        return self.name

    def send_request(self, url):
        response = requests.get(url)
        if response.status_code == 200:
            return response
        else:
            print(f"Failed to retrieve data: {response.status_code}\n")
            return None
        
    def send_post_request(self, url, data):
        try:
            response = requests.post(url, data=data)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Failed to retrieve data: {e}")
            return None

    def get_genes(self):
        api_url = 'https://www.ebi.ac.uk/gwas/rest/api'
        url = f'{api_url}/efoTraits/{self.efo_id}/associations?projection=associationByStudy'
        return url

    def get_proteins(self, genes):
        query = "%0d".join(genes)
        url = f"{STRING_API_URL}/{OUTPUT_FORMAT}/get_string_ids?identifiers={query}&species={SPECIES}"
        print(f'retrieve proteins url: {url}\n')
        return url

    def batch_list(self, data_list, batch_size):
        for i in range(0, len(data_list), batch_size):
            yield data_list[i:i + batch_size]
    
    def get_clustering(self, proteins, method = 'network'):
        identifiers = "%0d".join(proteins)
        data = {'identifiers': identifiers,
                'species': SPECIES,
                'required_score': REQUIRED_SCORE
            }
        #url = f"{STRING_API_URL}/{OUTPUT_FORMAT}/network?identifiers={identifiers}&species={SPECIES}&required_score={required_score}"
        #print(f'protein clustering url: {url}\n')
        request_url = "/".join([STRING_API_URL, OUTPUT_FORMAT, method])
        return request_url, data
    
    def get_clustering_image(self, proteins, method = 'network'):
        identifiers = "%0d".join(proteins)
        data = {'identifiers': identifiers,
                'species': SPECIES,
                'required_score': REQUIRED_SCORE
            }
        #url = f"{STRING_API_URL}/{OUTPUT_FORMAT}/network?identifiers={identifiers}&species={SPECIES}&required_score={required_score}"
        #print(f'protein clustering url: {url}\n')
        request_url = "/".join([STRING_API_URL, 'image', method])
        return request_url, data

    def get_random_edges(self, num_nodes, iterations=10, required_score=700):
        total_edges = 0
                    
        for _ in range(iterations):
            random_num_edges = 0
            
            random_proteins = random.sample(list(self.all_string_proteins), num_nodes)
            #identifiers = "%0A".join(random_proteins)
            url, data = self.get_clustering(random_proteins)
            response = self.send_post_request(url, data)
            if response:
                interactions = response.text.split("\n")
                for interaction in interactions:
                    if interaction.strip():
                        self.interactions.append(interaction)
                interaction_data = [line.split("\t") for line in interactions if line]
                random_num_edges += len(interaction_data)
            else:
                print("Failed to retrieve clustering data.")
        print(f'random edges url: {url}\n')
        
        return random_num_edges / iterations
    
    def get_ppi_enrichment(self, method='ppi_enrichment'):
        identifiers = "%0d".join(self.proteins)
        data = {'identifiers': identifiers,
                'species': SPECIES,
                'required_score': REQUIRED_SCORE
            }
        request_url = "/".join([STRING_API_URL, OUTPUT_FORMAT, method])
        return request_url, data
        
    def quantify_clustering(self):
        # Retrieve genes
        response = self.send_request(self.get_genes())
        print(f'Get genes url: {self.get_genes()}')
        count = 0
        if response:
            associations = response.json()['_embedded']['associations']
            for association in associations:
                loci = association['loci']
                for locus in loci:
                    author_reported_genes = locus['authorReportedGenes']
                    for gene in author_reported_genes:
                        gene_name = gene['geneName']
                        count += 1
                        if count <= 800:
                            self.genes.append(gene_name)
            self.genes = list(dict.fromkeys(self.genes))
            print(f"Genes: {len(self.genes)}\n")
        
        # Retrieve protein mappings
        response = self.send_request(self.get_proteins(self.genes))
        if response:
            protein_mappings = response.text.split("\n")
            for line in protein_mappings:
                if line.strip(): 
                    fields = line.split("\t")
                    if len(fields) > 1:
                        protein_id = fields[1]  
                        self.proteins.append(protein_id)
            self.proteins = list(dict.fromkeys(self.proteins))
            print(f"Proteins: {len(self.proteins)}\n")
        else:
            print("Failed to retrieve protein mappings.")

        # For clustering the proteins
        url, data = self.get_clustering(self.proteins)
        response = self.send_post_request(url, data)
        if response:
            interactions = response.text.split("\n")
            for interaction in interactions:
                if interaction.strip():
                    self.interactions.append(interaction)
            interaction_data = [line.split("\t") for line in interactions if line]
            self.num_edges += len(interaction_data)
        else:
            print("Failed to retrieve clustering data.")
        print(f"Number of edges: {self.num_edges}\n")
        
        url, data = self.get_clustering_image(self.proteins)
        response = self.send_post_request(url, data)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        image.show()
        
        # Compare to random set of protein clustering
        random_edges = self.get_random_edges(len(self.proteins))
        
        print(f"Average number of edges for random nodes: {random_edges}\n")

        if self.num_edges > random_edges:
            print("The disease genes show higher clustering than random nodes.")
        else:
            print("The disease genes do not show higher clustering than random nodes.")
        
        # For ppi enrichment
        url, data = self.get_ppi_enrichment()
        response = self.send_request(url, data)
        if response:
            try:
                for line in response.text.strip().split("\n"):
                    columns = line.split("\t")
                    if len(columns) > 5:
                        pvalue = columns[5]
                        print("P-value:", pvalue)
                    else:
                        print("Unexpected response format:", line)
            except Exception as e:
                print(f"Error parsing response: {e}")
        else:
            print("Failed to retrieve PPI enrichment data.")
        print('--------------------------------------------\n')
        
# Usage
disease4 = Disease("Cataract", 'MONDO_0005129')
print(disease4, '\n')
disease4.quantify_clustering()

disease5 = Disease("Huntington Disease", 'MONDO_0007739')
print(disease5, '\n')
disease5.quantify_clustering()

disease6 = Disease("Gaucher Disease", 'MONDO_0018150')
print(disease6, '\n')
disease6.quantify_clustering()

disease3 = Disease("Schizophrenia", 'MONDO_0005090')
print(disease3, '\n')
disease3.quantify_clustering()

disease1 = Disease("Alzheimer's Disease", 'MONDO_0004975')
print(disease1, '\n')
disease1.quantify_clustering()


disease2 = Disease("Parkinson Disease", 'MONDO_0005180')
print(disease2, '\n')
disease2.quantify_clustering()





Total number of human proteins: 19699

Cataract 

Get genes url: https://www.ebi.ac.uk/gwas/rest/api/efoTraits/MONDO_0005129/associations?projection=associationByStudy
Genes: 73

retrieve proteins url: https://string-db.org/api/tsv-no-header/get_string_ids?identifiers=PPARD%0dCCDC102A%0dHSPA1A46%0dLINC02429%0dMIR548AG1%0dNR%0dCDKN2C%0dEFNA3%0dADCK3%0dSOX2%0d0T%0dHLA-DQB1%0dTHSD7A%0dIGFBP3%0dLOC730338%0dZNF800%0dLOC157273%0dTNKS%0dCDKN2B-AS1%0dDMRTA1%0dBAMBI%0dLINC01517%0dRIC8A%0dLOC338694%0dOCA2%0dWWP2%0dPLEKHH3%0dNPLOC4%0dMETRNL%0dJAG1%0dLOC101929395%0dSLC24A3%0dLRP1B%0dKYNU%0dOT%0dTNS3%0dCASP7%0d5' LOC338694%0dCAPRIN2%0dPLB1%0dCDKN2B%0dRBFOX1%0dSEPT9%0dCASZ1%0dADGRL2%0dFAM46C%0dLINC00970%0dTRIB2%0dDIRC3%0dRARB%0dATXN7%0dC4orf22%0dBMP3%0dKCNQ5%0dQKI%0dCREB5%0dBIN3%0dEGR3%0dPLCE1%0dODF3%0dBET1L%0dMVK%0dFAM222A%0dSTXBP6%0dBMP4%0dRORA%0dVPS13C%0dALDOA%0dRHOT1%0dRHBDL3%0dMIR2117HG%0dCPAMD8%0dHORMAD2&species=9606

Proteins: 59

Number of edges: 3

random edges url: https://string-db.org/ap

TypeError: Disease.send_request() takes 2 positional arguments but 3 were given

In [3]:
import pandas as pd
import networkx as nx

string_df = pd.read_csv('../data/9606.protein.links.v12.0.txt', sep=' ')

print(string_df.head())
high_confidence_df = string_df[string_df['combined_score'] >= 700]

G = nx.from_pandas_edgelist(
    high_confidence_df, 'protein1', 'protein2', edge_attr='combined_score'
)

print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

               protein1              protein2  combined_score
0  9606.ENSP00000000233  9606.ENSP00000356607             173
1  9606.ENSP00000000233  9606.ENSP00000427567             154
2  9606.ENSP00000000233  9606.ENSP00000253413             151
3  9606.ENSP00000000233  9606.ENSP00000493357             471
4  9606.ENSP00000000233  9606.ENSP00000324127             201
Number of nodes: 16201
Number of edges: 236930


In [None]:
import requests
import json

# Step 1: Set the base URL for the GWAS Catalog API
base_url = "https://www.ebi.ac.uk/gwas/rest/api"

# Step 2: Use the EFO ID for Alzheimer's Disease
efo_id = "MONDO_0004975"

# Step 3: Define the API endpoint for Alzheimer's associations
url = f"{base_url}/efoTraits/{efo_id}/associations?projection=associationByStudy"

# Step 4: Send the request to retrieve the data
response = requests.get(url)

# Step 5: Parse and save the response data
if response.status_code == 200:
    data = response.json()
    
    # Optionally, write the full data to a file
    with open("alzheimers_associations.json", "w") as f:
        json.dump(data, f, indent=4)
    
    print("Downloaded data successfully!")
else:
    print(f"Failed to retrieve data: {response.status_code}")


Downloaded data successfully!


In [17]:
# Importing required packages
import requests     # Manages data transfer from the GWAS Catalog REST API
import pandas as pd # Makes data handling easier
import json         # Hanling the returned data type called JSON
from collections import OrderedDict

# API Address:
apiUrl = 'https://www.ebi.ac.uk/gwas/rest/api'

# Accessing data for a single variant:
variant = 'MONDO_0004975'
#requestUrl = '%s/efoTraits/%s/associations?projection=associationByStudy' %(apiUrl, variant)
#requestUrl = '%s/efoTraits/%s/associations?projection=associationByStudy' %(apiUrl, variant)
requestUrl = 'https://www.ebi.ac.uk/gwas/rest/api/efoTraits/EFO_0001060/associations?projection=associationsBySnpSummary'
response = requests.get(requestUrl, headers={ "Content-Type" : "application/json"})

# The returned response is a "response" object, from which we have to extract and parse the information:
decoded = response.json()

genes = []

associations = data['_embedded']['associations']
count = 0
for association in associations:
    loci = association['loci']
    for locus in loci:
        author_reported_genes = locus['authorReportedGenes']
        for gene in author_reported_genes:
            gene_name = gene['geneName']
            #entrez_id = gene['entrezGeneIds'][0]['entrezGeneId']
            #ensembl_id = gene['ensemblGeneIds'][0]['ensemblGeneId']
            genes.append(gene_name)
            count = count + 1

unique_genes = list(dict.fromkeys(genes))
print(len(unique_genes))
print(len(genes))

f = open("../data/celiac_genes.txt", "w")
for gene in genes:
    f.write(gene + ', ')
f.close()
    

513
1562
