# Alliance Genome

In [4]:
# Environment Setup

In [3]:
import requests
from concurrent.futures import ThreadPoolExecutor
import json
import os
import pandas as pd
import numpy as np
import gzip
import io
from tqdm import tqdm

In [68]:
# Setting up data directories
tmp_data = os.path.join('./resources/tmp_data')
processed_data = os.path.join('./resources/processed_data')

Utility Functions

In [3]:

def scrape_page(url, page, limit):
    """
    Scrape a single page of records from the API.

    Args:
        url (str): The URL of the API endpoint.
        page (int): The page number to scrape.
        limit (int): The number of records per page.

    Returns:
        list: A list of records from the specified page.
    """
    params = {'page': page, 'limit': limit}
    response = requests.get(url, params=params)
    data = response.json()
    return data['results']

def scrape_all_records(url, limit=20, max_workers=10):
    """
    Scrape all records from the API.

    Args:
        url (str): The URL of the API endpoint.
        limit (int, optional): The number of records per page. Defaults to 20.
        max_workers (int, optional): The maximum number of concurrent requests. Defaults to 10.

    Returns:
        list: A list of all records from the API.
    """
    response = requests.get(url, params={'page': 1, 'limit': limit})
    data = response.json()
    total_pages = -(-data['total'] // limit)  # ceiling division

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(scrape_page, url, page, limit) for page in range(1, total_pages + 1)]
        all_records = [record for future in futures for record in future.result()]

    return all_records

In [4]:
def download_gzipped_json(uri):
    # Send a GET request to the URI
    response = requests.get(uri)

    # Check if the response was successful
    if response.status_code != 200:
        raise Exception(f"Failed to download file from {uri}")

    # Create a BytesIO object to hold the gzipped data
    gzipped_data = io.BytesIO(response.content)

    # Create a GzipFile object to decompress the data
    with gzip.GzipFile(fileobj=gzipped_data, mode='rb') as f:
        # Read the decompressed data into a string
        json_data = f.read().decode('utf-8')

    # Parse the JSON data into a Python object
    data = json.loads(json_data)

    return data

In [5]:
from enum import Enum, auto
import requests
import os
import json
from biobox_analytics.utils import download_gzipped_json, oxo_mapping
import io
import pandas as pd
from tqdm import tqdm

class AllianceDownloads(Enum):

    DISEASE_HUMAN = "https://fms.alliancegenome.org/download/DISEASE-ALLIANCE-JSON_HUMAN.json.gz"
    DISEASE_MOUSE = "https://fms.alliancegenome.org/download/DISEASE-ALLIANCE-JSON_MGI.json.gz"


class AllianceGenomeAdapter:

    def __init__(self):
        self._api = "https://www.alliancegenome.org/api/"
        self._disease_gene_associations = []
        self._prepare_mappers()
        self._download_disease_associations()

    def _prepare_mappers(self):

        
        human = pd.read_table("https://www.alliancegenome.org/api/geneMap/ensembl?species=NCBITaxon:9606", sep="\t")

        mouse = pd.read_table("https://www.alliancegenome.org/api/geneMap/ensembl?species=NCBITaxon:10090", sep="\t")

        self._gene_map = pd.concat([human, mouse])


    def _download_disease_associations(self):
        human_results = download_gzipped_json(AllianceDownloads.DISEASE_HUMAN.value)
        DISEASE_IDS = list(set([x.get('DOID') for x in human_results['data']]))

        # self._doid_mapping = oxo_mapping(DISEASE_IDS, ["EFO", "MONDO"])
        with open(os.path.join('./resources/mappings', 'doid_efo.mapping.json'), 'r') as f:
            
            self._doid_mapping = json.load(f)

        for _id in tqdm(DISEASE_IDS, desc="Collecting DiseaseGene Annotations"):
 
            try:
                harmonized_disease_id = self._doid_mapping.get(_id)[0]
            except:
                continue
            
            records = scrape_all_records(f"https://www.alliancegenome.org/api/disease/{_id}/genes?filter.species=Homo sapiens")
            associations = []
            for anno in records:
                association = self._format_disease_gene_annotation(anno, harmonized_disease_id)
                if association is not None:
                    associations.append(association)
            self._disease_gene_associations.extend(associations)

    def _hgnc2ensembl(self, hgnc):
        subset = self._gene_map[self._gene_map['Gene ID'] == hgnc]
        return subset['Ensembl ID'].values[0]
    
    def _format_disease_gene_annotation(self, anno, disease_id, fmt="edge"):

        relation_str = anno.get('generatedRelationString').replace('_', ' ')
        disease_qualifiers = anno.get('diseaseQualifiers', [])

        if len(disease_qualifiers) > 1:
            print(anno)
            raise Exception("Disease Qualifiers should not be more than one in length")
            
        if len(disease_qualifiers) == 1:
            disease_qualifier_str = disease_qualifiers[0].replace('_', ' ')
            relation_str = f'{relation_str} {disease_qualifier_str}'

        uuid = anno.get("uniqueId")

        references = anno.get('pubmedPubModIDs')
        evidenceCodes = anno.get('evidenceCodes', [])
        evidence = [x.get('abbreviation') for x in evidenceCodes]
        

        try:
            gene_hgnc = anno['subject']['modEntityId']
            ensembl_id = self._hgnc2ensembl(gene_hgnc)
        except:
            return

        output = {
            'from': {
                'uuid': ensembl_id
            },
            'to': {
                'uuid': disease_id
            },
            'label': relation_str,
            'properties': {
                'uuid': uuid,
                'references': references,
                'evidence': evidence,
                'citation_authority': 'Alliance of Genome Resources'
            }
        }
        return output
        

    
    # def get_nodes(self):
    #     """
    #     Returns a generator of node tuples for node types specified in the
    #     adapter constructor.
    #     """
    #     print("Generating Nodes")

    #     for da in self._disease_gene_associations:
            

In [6]:
adapter = AllianceGenomeAdapter()

Collecting DiseaseGene Annotations: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6127/6127 [46:03<00:00,  2.22it/s]


In [7]:
len(adapter._disease_gene_associations)

234908

In [14]:
adapter._disease_gene_associations[100]

{'from': {'uuid': 'ENSG00000156170'},
 'to': {'uuid': 'MONDO:0009723'},
 'label': 'is implicated via orthology',
 'properties': {'uuid': 'wBCpG48BJgBeDSVnk52w',
  'references': ['MGI:6194238'],
  'evidence': ['IEA'],
  'citation_authority': 'Alliance of Genome Resources'}}

In [9]:
with gzip.open(os.path.join('./resources/processed_data', 'alliance_disease_gene_edge.jsonl.gz'), 'wt') as f:
    for n in tqdm(adapter._disease_gene_associations):
        f.write(json.dumps(n) + '\n')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 234908/234908 [00:03<00:00, 74724.49it/s]


In [11]:
rel_labels = list(set([x.get('label') for x in adapter._disease_gene_associations]))
relationships = {
    x: {'from': 'Gene', 'to': 'Disease'} for x in rel_labels
}

alliance_gene_disease_associations_metadata = {
  "_meta": {
    "version": "0.1.0",
    "date_updated": "2024-05-20",
    "maintainer": "BioBox Analytics"
  },
  "key": "alliance_disease_gene",
  "name": "Alliance - Disease Gene Assocations",
  "description": "A consortium of 7 model organism databases (MODs) and the Gene Ontology (GO) Consortium whose goal is to provide an integrated view of their data to all biologists, clinicians and other interested parties. This data package contains edges between Genes and Diseases. It will require you to have loaded in EFO and MONDO ontologies.",
  "source": [
    {
      "uri": "https://www.alliancegenome.org/downloads",
      "type": "doc"
    },
    {
      "uri": "https://fms.alliancegenome.org/download/DISEASE-ALLIANCE-JSON_HUMAN.json.gz",
      "type": "data",
      "version": "7.1.0"
    }
  ],
  "concepts": {},
  "relationships": relationships
}

In [13]:
with open('./resources/processed_data/alliance_disease_gene_metadata.json', 'w') as outfile:
    json.dump(alliance_gene_disease_associations_metadata, outfile)

# Scratchpad

In [38]:
hmd_human_phenotype.head()

Unnamed: 0,0,1,2,3,4,5
0,A1BG,1,A1bg,MGI:2152878,,
1,A1CF,29974,A1cf,MGI:1917115,"MP:0005367, MP:0005369, MP:0005370, MP:0005376...",
2,A2M,2,A2m,MGI:2449119,,
3,A3GALT2,127550,A3galt2,MGI:2685279,,
4,A4GALT,53947,A4galt,MGI:3512453,"MP:0005376, MP:0005386, MP:0010768",


In [6]:
def download_tab_delimited_data(url, col_names):
    """Downloads tab-delimited data from a URL, skipping header lines and empty lines.

    Args:
        url: The URL of the tab-delimited file.

    Returns:
        A pandas DataFrame containing the data.
    """

    response = requests.get(url)
    response.raise_for_status()  # Raise an exception if the request failed

    # Filter out header lines and empty lines
    data_lines = [line for line in response.text.splitlines() 
                  if not line.startswith('#')]

    # Create a file-like object from the data

    data_io = io.StringIO('\n'.join(data_lines)) 

    # Read the data into a DataFrame, specifying the delimiter and quoting
    df = pd.read_table(data_io, sep='\t', names=col_names, header=None, skip_blank_lines=True, on_bad_lines='skip')

    return df



# MGI: Alleles and Genotypes

2 Alleles -> 1 Genotype - model of -> Disease

In [70]:
mgi_disease_mouse_model = download_tab_delimited_data(
    "https://www.informatics.jax.org/downloads/reports/MGI_DiseaseMouseModel.rpt",
    ['do_term', 'DOID', 'negated', 'allele_pairs', 'strain', 'allele_symbol',
     'allele_mgi_id', 'totalNumAlleleReferences', 'repo_id', 'mouse_genotype_rr_id', 'mouse_gene_symbol', 'mouse_gene_id', 'repo_id_gene'])

In [71]:
mgi_disease_mouse_model.head()

Unnamed: 0,do_term,DOID,negated,allele_pairs,strain,allele_symbol,allele_mgi_id,totalNumAlleleReferences,repo_id,mouse_genotype_rr_id,mouse_gene_symbol,mouse_gene_id,repo_id_gene
0,17-beta hydroxysteroid dehydrogenase 3 deficiency,DOID:0112248,,Hsd17b3<tm1.2Mpo>/Hsd17b3<tm1.2Mpo>,involves: 129S6/SvEvTac * C57BL/6N * C57BL/6NCrl,Hsd17b3<tm1.2Mpo>,MGI:6508689,1.0,,RRID:MGI:6508693,Hsd17b3,MGI:107177,EuMMCR:12655|EuMMCR:3397|EuMMCR:3398|MMRRC:069...
1,2-aminoadipic 2-oxoadipic aciduria,DOID:0111453,,Dhtkd1<em1Hpr>/Dhtkd1<em1Hpr>,involves: C57BL/6N * FVB/N,Dhtkd1<em1Hpr>,MGI:6268324,1.0,,RRID:MGI:6268326,Dhtkd1,MGI:2445096,NM-KI-00005|MMRRC:042792-MU|MMRRC:045216-MU|MM...
2,3-methylglutaconic aciduria type 3,DOID:0110004,,Opa3<m1Votr>/Opa3<m1Votr>,involves: C3H * C57BL/6JCrl,Opa3<m1Votr>,MGI:5312669,5.0,,RRID:MGI:5312681,Opa3,MGI:2686271,EuMMCR:25483|MMRRC:067719-MU|MMRRC:041675-MU|M...
3,"3-methylglutaconic aciduria with deafness, enc...",DOID:0110001,,Serac1<em1Bcgen>/Serac1<em1Bcgen>,C57BL/6N-Serac1<em1Bcgen>,Serac1<em1Bcgen>,MGI:7316653,1.0,,RRID:MGI:7316678,Serac1,MGI:2447813,CMMR:CMMR:ADPW|NM-KO-2100721|MMRRC:069450-MU|M...
4,46 XX gonadal dysgenesis,DOID:14450,,Fancl<gcd>/Fancl<gcd>,involves: C57BL/6J * CBA/J,Fancl<gcd>,MGI:1861640,5.0,,RRID:MGI:2660751,Fancl,MGI:1914280,EuMMCR:9783|EuMMCR:9782|MMRRC:044564-MU|MMRRC:...


Gather all the alleles as nodes first.

In [72]:
mgi_phenotypic_allele = download_tab_delimited_data('https://www.informatics.jax.org/downloads/reports/MGI_PhenotypicAllele.rpt',
                                                   ['mgi_allele_id', 'allele_symbol', 'allele_name', 'allele_type', 'allele_attribute', 'PMID', 'mgi_marker_id','marker_symbol', 'marker_refseq_id', 'marker_ensembl_id', 'mp_id', 'synonyms', 'marker_name'])

In [73]:
mgi_phenotypic_allele.head()

Unnamed: 0,mgi_allele_id,allele_symbol,allele_name,allele_type,allele_attribute,PMID,mgi_marker_id,marker_symbol,marker_refseq_id,marker_ensembl_id,mp_id,synonyms,marker_name
0,MGI:5603349,0610010K14Rik<tm1.1(KOMP)Vlcg>,"targeted mutation 1.1, Velocigene",Targeted,Null/knockout|Reporter,,MGI:1915609,0610010K14Rik,NM_001177601,ENSMUSG00000020831,"MP:0005386,MP:0010768",,RIKEN cDNA 0610010K14 gene
1,MGI:4452997,0610010K14Rik<tm1(KOMP)Vlcg>,"targeted mutation 1, Velocigene",Targeted,Null/knockout|Reporter,,MGI:1915609,0610010K14Rik,NM_001177601,ENSMUSG00000020831,,,RIKEN cDNA 0610010K14 gene
2,MGI:7425474,0610040J01Rik<em1(IMPC)Ccpcz>,"endonuclease-mediated mutation 1, Institute of...",Endonuclease-mediated,Null/knockout,,MGI:1923511,0610040J01Rik,NM_001417897,ENSMUSG00000060512,"MP:0005376,MP:0005378,MP:0005385,MP:0005387,MP...",,RIKEN cDNA 0610040J01 gene
3,MGI:3923426,1110038B12Rik<Gt(Ayu21-B141)Imeg>,"gene trap Ayu21-B141, Institute of Molecular E...",Gene trapped,,,MGI:1916013,1110038B12Rik,,ENSMUSG00000092203,,,RIKEN cDNA 1110038B12 gene
4,MGI:3923883,1110059E24Rik<Gt(Ayu21-W256)Imeg>,"gene trap Ayu21-W256, Institute of Molecular E...",Gene trapped,,,MGI:1913456,1110059E24Rik,NM_025423,ENSMUSG00000035171,,,RIKEN cDNA 1110059E24 gene


In [74]:
# Quick inspection
mgi_phenotypic_allele.describe(include='all')

Unnamed: 0,mgi_allele_id,allele_symbol,allele_name,allele_type,allele_attribute,PMID,mgi_marker_id,marker_symbol,marker_refseq_id,marker_ensembl_id,mp_id,synonyms,marker_name
count,108940,108940,108937,108940,99813,50495.0,108940,108940,89676,92229,44935,72449,108940
unique,108940,108867,42229,14,1048,,33118,33115,17298,17741,13572,70938,31283
top,MGI:5603349,Arhgap6/Hccs/Mid1<tm1Hzo>,"endonuclease-mediated mutation 1, GemPharmatec...",Targeted,Null/knockout,,MGI:104735,Gt(ROSA)26Sor,NM_013556,ENSMUSG00000086429,MP:0002873,"deltaNLS Wld<S>|deltaNLS<R213A,R215A> Wld<S>","gene trap ROSA 26, Philippe Soriano"
freq,1,3,8989,43851,37740,,1477,1477,247,1477,2068,6,1477
mean,,,,,,21256520.0,,,,,,,
std,,,,,,8454548.0,,,,,,,
min,,,,,,25645.0,,,,,,,
25%,,,,,,15139580.0,,,,,,,
50%,,,,,,20850370.0,,,,,,,
75%,,,,,,27916960.0,,,,,,,


In [75]:
mgi_phenotypic_allele['allele_type'].unique()

array(['Targeted', 'Endonuclease-mediated', 'Gene trapped',
       'Chemically induced (ENU)', 'Transgenic', 'Spontaneous',
       'Not Applicable', 'Transposon induced',
       'Chemically induced (other)', 'Radiation induced',
       'Chemically and radiation induced', 'QTL', 'Not Specified',
       'Other'], dtype=object)

In [76]:
allele_map = {}
for idx, row in tqdm(mgi_phenotypic_allele.iterrows()):
    d = row.dropna().to_dict()
    _id = d.get('mgi_allele_id')
    if _id in allele_map:
        continue
    else:
        allele_attribute = d.get('allele_attribute', None)
        mammalian_phenotype_id = d.get('mp_id', None)
        props = {
            'uuid': _id,
            'displayName': d.get('allele_symbol', None),
            'description': d.get('allele_name', None),
            'citations': f'PMID:{d.get("PMID", None)}' if d.get('PMID', None) is not None else None,
            'allele_type': d.get('allele_type', None),
            'allele_attribute': allele_attribute.split('|') if allele_attribute is not None else None,
            'marker_ensembl_id': d.get('marker_ensembl_id'),
            'marker_symbol': d.get('marker_symbol'),
            'marker_mgi_id': d.get('mgi_marker_id'),
            'mammalian_phenotype_id': mammalian_phenotype_id.split(',') if mammalian_phenotype_id is not None else None
        }

        
        n = {
            '_id': _id,
            'labels': ['Allele'],
            'properties': {
                k:v for k,v in props.items() if v is not None
            }
        }
        allele_map[_id] = n

108940it [00:13, 7906.16it/s]


## Creating Genotype Nodes

Next we need to construct all the Genotypes, but also make the edges for the allelic combinations.

In [77]:
mgi_gene_pheno = download_tab_delimited_data(
    'https://www.informatics.jax.org/downloads/reports/MGI_GenePheno.rpt',
    ['allelic_composition', 'allele_symbols', 'allele_ids', 'genetic_background', 'mp_id', 'PMID', 'mgi_marker_id', 'mgi_genotype_id']
)

In [78]:
mgi_gene_pheno.head()

Unnamed: 0,allelic_composition,allele_symbols,allele_ids,genetic_background,mp_id,PMID,mgi_marker_id,mgi_genotype_id
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,MGI:2166359
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,MGI:2166359
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,MGI:2166359
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,MGI:2166359
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,MGI:2166359


In [79]:
genotype_map = {}

In [80]:
genotype_subset = mgi_gene_pheno[['mgi_genotype_id', 'allelic_composition', 'genetic_background']].groupby('mgi_genotype_id').first()
for _id, row in tqdm(genotype_subset.iterrows()):
    d = row.to_dict()
    if _id in genotype_map:
        continue
    else:
        n = {
            '_id': _id,
            'labels': ['Genotype'],
            'properties': {
                'uuid': _id,
                'displayName': d.get('allelic_composition'),
                'description': '',
                'allelic_composition': d.get('allelic_composition'),
                'genetic_background': d.get('genetic_background'),
                'url': f"https://www.informatics.jax.org/allele/genoview/{_id}"
            }
        }
        genotype_map[_id] = n

48999it [00:02, 21341.51it/s]


In [81]:
# Subset out allele_id and genotype_id
geno2allele = mgi_gene_pheno[['allele_ids', 'mgi_genotype_id']]

In [82]:
geno2allele.head()

Unnamed: 0,allele_ids,mgi_genotype_id
0,MGI:1857242,MGI:2166359
1,MGI:1857242,MGI:2166359
2,MGI:1857242,MGI:2166359
3,MGI:1857242,MGI:2166359
4,MGI:1857242,MGI:2166359


In [83]:
geno2allele_edges = []

for name, group in geno2allele.groupby('mgi_genotype_id'):
    allele_ids = set()
    for allele_str in list(group['allele_ids'].unique()):
        allele_ids.update(allele_str.split('|'))
    
    
    for allele_id in list(allele_ids):
        e = {
            'from': {
                'uuid': name
            },
            'to': {
                'uuid': allele_id
            },
            'label': 'has allele',
            'properties': {}
        }
        geno2allele_edges.append(e)

In [84]:
geno2allele_edges[0]

{'from': {'uuid': 'MGI:2166359'},
 'to': {'uuid': 'MGI:1857242'},
 'label': 'has allele',
 'properties': {}}

Convert the DOIDs to EFOs where possible, and then explode out the genotypes for edges. Assumes EFO and MONDO are in.

In [85]:
mgi_disease_gene_model = download_tab_delimited_data('https://www.informatics.jax.org/downloads/reports/MGI_DiseaseGeneModel.rpt',
                                                    [
                                                        'human_gene_symbol',
                                                        'human_gene_name',
                                                        'hgnc_id',
                                                        'do_term',
                                                        'do_term_id',
                                                        'mouse_genotype_id',
                                                        'mouse_gene',
                                                        'mouse_mgi_id',
                                                        'facilities',
                                                        'repo_id'
                                                    ])

In [86]:
mgi_disease_gene_model.head()

Unnamed: 0,human_gene_symbol,human_gene_name,hgnc_id,do_term,do_term_id,mouse_genotype_id,mouse_gene,mouse_mgi_id,facilities,repo_id
0,A1CF,APOBEC1 complementation factor,HGNC:24086,gout,DOID:13189,,A1cf,MGI:1917115,APB|EMMA|EuMMCR|GPT|JAX|MMRRC,EuMMCR:8829|EuMMCR:8828|EM:11705|APB:7708|MMRR...
1,A2M,alpha-2-macroglobulin,HGNC:7,Alzheimer's disease,DOID:10652,MGI:3693439|MGI:2684657|MGI:3721938|MGI:575438...,A2m,MGI:2449119,CMMR|GPT|MMRRC|SMOC,CMMR:CMMR:ABAL|CMMR:CMMR:ABDJ|CMMR:CMMR:ABHA|N...
2,A2M,alpha-2-macroglobulin,HGNC:7,background diabetic retinopathy,DOID:13208,,A2m,MGI:2449119,CMMR|GPT|MMRRC|SMOC,CMMR:CMMR:ABAL|CMMR:CMMR:ABDJ|CMMR:CMMR:ABHA|N...
3,A2M,alpha-2-macroglobulin,HGNC:7,COVID-19,DOID:0080600,MGI:6402641|MGI:6455041|MGI:6431340|MGI:6415340,A2m,MGI:2449119,CMMR|GPT|MMRRC|SMOC,CMMR:CMMR:ABAL|CMMR:CMMR:ABDJ|CMMR:CMMR:ABHA|N...
4,A2M,alpha-2-macroglobulin,HGNC:7,multiple sclerosis,DOID:2377,MGI:3835028|MGI:6305814|MGI:4461062|MGI:542970...,A2m,MGI:2449119,CMMR|GPT|MMRRC|SMOC,CMMR:CMMR:ABAL|CMMR:CMMR:ABDJ|CMMR:CMMR:ABHA|N...


In [87]:
mgi_disease_gene_model.dropna(subset=['mouse_genotype_id'], inplace=True)

In [88]:
doid_grouped = mgi_disease_gene_model[['do_term_id', 'mouse_genotype_id']].groupby('do_term_id')

In [89]:
with open('./resources/mappings/doid_efo.mapping.json', 'r') as f:
    doid_efo_mapping = json.load(f)

In [90]:
len(doid_efo_mapping.keys())

6127

In [91]:
genotype2disease_edges = []

for doid, group in doid_grouped:
    if doid not in doid_efo_mapping:
        continue
    else:
        
        efo_hits = doid_efo_mapping[doid]
        if efo_hits is None:
            continue
        efo_id = efo_hits[0]
        genotype_ids = set()
        for genotype_str in list(group['mouse_genotype_id']):
            genotype_ids.update(genotype_str.split('|'))

        for genotype_id in list(genotype_ids):
            e = {
                'from': {
                    'uuid': genotype_id
                },
                'to': {
                    'uuid': efo_id
                },
                'label': 'is model of',
                'properties': {}
            }
            genotype2disease_edges.append(e)

In [92]:
genotype2disease_edges[10]

{'from': {'uuid': 'MGI:5314017'},
 'to': {'uuid': 'EFO:1001352'},
 'label': 'is model of',
 'properties': {}}

# Node and Edge Generation

In [93]:
nodes = []
edges = []

In [94]:
nodes.extend(genotype_map.values())
print(len(nodes))
edges.extend(geno2allele_edges)
print(len(edges))

48999
50361


In [95]:
nodes.extend(allele_map.values())
print(len(nodes))
edges.extend(genotype2disease_edges)
print(len(edges))

157939
57477


In [96]:

with gzip.open(os.path.join(processed_data, 'alliance_disease_models_node.jsonl.gz'), 'wt', encoding='utf-8') as outfile:
    for x in tqdm(nodes):
        outfile.write(json.dumps(x) + '\n')

with gzip.open(os.path.join(processed_data, 'alliance_disease_models_edge.jsonl.gz'), 'wt', encoding='utf-8') as outfile:
    for x in tqdm(edges):
        outfile.write(json.dumps(x) + '\n')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157939/157939 [00:03<00:00, 45774.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 57477/57477 [00:00<00:00, 122554.33it/s]


In [98]:
metadata = {
    "_meta": {
        "version": "0.1.3",
        "date_updated": "2024-02-17",
        "maintainer": "BioBox Analytics"
    },
    "key": "alliance_disease_models",
    "name": "Alliance Genome - Animal Models of Disease",
    "description": "Extraction of alleles-genotypes-diseases reports from MGI.",
    "source": [
        {
            "uri": "https://www.informatics.jax.org/",
            "type": "doc"
        },
        {
            "uri": "https://www.informatics.jax.org/downloads/reports/index.html#pheno",
            "type": "data",
            "version": "7.1.0"
        }
    ],
    "concepts": {
        "Allele": {
            "label": "Allele",
            "dbLabel": "Allele",
            "definition": "One of a set of sequence features known to exist at a particular genomic location. An allele is a seqeunce feature at a genomic location where variation occurs (i.e. where >1 different sequence is known to exist). An allele can span only the extent of sequence known to vary (e.g. a single base SNP, or short insertion), or it can span a larger extent that includes one or more variable features as proper parts (e.g. a 'gene allele' that spans the extent of an entire gene which contains several sequence alterations). Alleles can carry 'reference' or 'variant' sequence - depending on whether the its 'state' matches that considered to be the reference at that location. Alleles whose state differs from the reference are called 'variant alleles', and those that match the reference are called 'reference alleles'. What is considered the 'reference' state at a particular location may vary, depending on the context/goal of a particular analysis. A 'sequence alteration' is a 'variant allele' that varies along its entire extent (i.e every position varies from that of some defined reference sequence)."
        },
        "Genotype": {
            "label": "Genotype",
            "dbLabel": "Genotype",
            "definition": "The total sum of the genetic information of an organism that is known and relevant to the experiment being performed, including chromosomal, plasmid, viral or other genetic material which has been introduced into the organism either prior to or during the experiment. Information, making the distinction between the actual physical material (e.g. a cell) and the information about the genetic content (genotype)."
        }
    },
    "relationships": {
        "has allele": {
            "from": "Genotype",
            "to": "Allele"
        },
        "is model of": {
            "from": "Genotype",
            "to": "Disease"
        }
    }
}
with open(os.path.join(processed_data, 'alliance_disease_models_metadata.json'), 'w') as f:
    json.dump(metadata, f)

# Scratch

In [1]:
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt+ssc://pr-1645-neo4j-pr-1645.frzb.biobox.to:7687", auth=('neo4j', 'EpPRhKPyJUxgC6gv'))

In [4]:
res = requests.get('http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Attribute name = "ensembl_gene_id" /><Attribute name = "mmusculus_homolog_ensembl_gene" /></Dataset></Query>')

In [5]:
human_mouse_homolog = pd.read_table(io.StringIO(res.text), header=None)

In [6]:
human_mouse_homolog.columns = ['hsapiens', 'mmusculus']

In [7]:
human_mouse_homolog.dropna(subset='mmusculus', inplace=True)
human_mouse_homolog.set_index('mmusculus', inplace=True)

In [8]:
human_mouse_homolog.loc['ENSMUSG00000064341']['hsapiens']

'ENSG00000198888'

In [9]:
records, summary, _ = driver.execute_query(
    """
    MATCH (a:Allele)
    WHERE a.marker_ensembl_id is not null
    RETURN a.uuid, a.marker_ensembl_id
    """,
    database_="bx-d52b76a6-2e9b-4d53-894e-58e818cff0f2"
)

In [10]:
batch = []

for r in records:
    allele_id = r.get('a.uuid')
    mouse_gene_id = r.get('a.marker_ensembl_id')
 
    try:
        hsapien_id = human_mouse_homolog.loc[mouse_gene_id]['hsapiens']
        batch.append({'source': allele_id, 'target': hsapien_id})
    except:
        continue



In [11]:
len(batch)

87157

In [12]:
driver.execute_query(
    """
    UNWIND $batch as b
    MATCH (a:Allele {uuid: b.source})
    MATCH (g:Gene { uuid: b.target})
    MERGE (a)-[:`on target`]->(g)
    """,
    database_="bx-d52b76a6-2e9b-4d53-894e-58e818cff0f2",
    batch=batch
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x12f856490>, keys=[])

In [146]:
res = requests.post("https://pr-1625-rest.frzb.biobox.to/bioref/query/objects",
                   json={ "query": "(Genotype)-> is model of ->(Disease)<- is implicated in severity of <-(Gene)", "class": "Gene", "pagination": { "limit": 100, "offset": 0, "sortBy": { "key": "uuid", "descending": False } } },
                   headers= {
                       'x-biobox-orgid': 'bx-3079fc5a-1ed1-4e17-a482-e38efdbfe837',
                        'Authorization': f"Bearer {}"
                   })

In [148]:
res.json()

[{'class': 'Gene',
  'variable': 'g2',
  'statement': 'MATCH\n(g:`Genotype`)-[_r:`is model of`]->(d:`Disease`)<-[_r2:`is implicated in severity of`]-(g2:`Gene`)RETURN DISTINCT g2\nSKIP 0\nLIMIT 100',
  'countStatement': 'MATCH\n(g:`Genotype`)-[_r:`is model of`]->(d:`Disease`)<-[_r2:`is implicated in severity of`]-(g2:`Gene`)RETURN count(DISTINCT g2)',
  'total': 327,
  'offset': 0,
  'count': 100,
  'data': [{'biotype': 'protein_coding',
    'symbol': 'ALX4',
    'level': 'manually annotated loci',
    'displayName': 'ALX4',
    'start': 44260440,
    'source': 'HAVANA',
    'chr': '11',
    'uuid': 'ENSG00000052850',
    'strand': -1,
    'ensembl_gene_id_version': 'ENSG00000052850.8',
    'HGNC': 'HGNC:450',
    'end': 44310139,
    'tag': ['']},
   {'biotype': 'protein_coding',
    'symbol': 'ARID1A',
    'level': 'verified loci',
    'displayName': 'ARID1A',
    'start': 26693236,
    'source': 'HAVANA',
    'chr': '1',
    'uuid': 'ENSG00000117713',
    'strand': 1,
    'ensembl_g

In [149]:
[1, 2] + [3,4]

[1, 2, 3, 4]