# Get PanelApp genes
Download rare disease genes using the PanelApp API

## Preliminaries

In [120]:
import requests
import json
import pandas as pd
from IPython.display import clear_output

## Get PanelApp genes through the API

In [108]:
# List the endpoints exposed by the PanelApp API root.
# The timeout stops the cell hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors before the body is parsed as JSON.
r = requests.get("https://panelapp.genomicsengland.co.uk/api/v1/", timeout=30)
r.raise_for_status()
print(json.dumps(r.json(), sort_keys=True, indent=4))

{
    "activities": "https://panelapp.genomicsengland.co.uk/api/v1/activities/",
    "entities": "https://panelapp.genomicsengland.co.uk/api/v1/entities/",
    "genes": "https://panelapp.genomicsengland.co.uk/api/v1/genes/",
    "panels": "https://panelapp.genomicsengland.co.uk/api/v1/panels/",
    "panels/signedoff": "https://panelapp.genomicsengland.co.uk/api/v1/panels/signedoff/",
    "regions": "https://panelapp.genomicsengland.co.uk/api/v1/regions/",
    "strs": "https://panelapp.genomicsengland.co.uk/api/v1/strs/"
}


In [122]:
# A single requests.Session for the notebook; a session keeps connections
# open so they can be reused across requests made through it.
session = requests.Session()


def get_panel_app_genes(
    url="https://panelapp.genomicsengland.co.uk/api/v1/genes/", timeout=60
):
    """Yield every page of the PanelApp ``genes`` endpoint as a decoded JSON dict.

    Pagination is followed through each page's ``"next"`` field until that
    field is falsy. Requests go through the module-level ``session`` so the
    connection is reused between pages instead of being re-opened each time.

    Parameters
    ----------
    url : str
        URL of the first page to fetch.
    timeout : float
        Seconds to wait for each response before giving up.

    Yields
    ------
    dict
        The decoded JSON body of one page.

    Raises
    ------
    requests.HTTPError
        If a page comes back with an HTTP error status.
    """
    while url:
        # Progress indicator: with wait=True the printed URL stays visible
        # until the next one is printed, so only the current page is shown.
        print(url)
        clear_output(wait=True)
        response = session.get(url, timeout=timeout)
        # Fail loudly on an error status rather than parsing an error body.
        response.raise_for_status()
        page = response.json()
        yield page
        url = page["next"]


# Flatten each page's "results" records into a frame and stack the pages.
# ignore_index=True gives the combined frame a single unique 0..n-1 index
# instead of repeating each page's own row labels.
df = pd.concat(
    [pd.json_normalize(p, "results") for p in get_panel_app_genes()],
    ignore_index=True,
)

https://panelapp.genomicsengland.co.uk/api/v1/genes/?page=338


### Copy the dataframe to avoid re-downloading

In [124]:
# Snapshot of the downloaded frame, so the download cell need not be re-run.
df_copy = df.copy()

In [245]:
# Start again from the untouched snapshot before filtering.
df = df_copy.copy()

## Filter for genes of interest
Keep only genes which:
- Have confidence level "Amber" or "Green" (`confidence_level` of 2 or higher)
- Are in a GMS Rare Disease panel

In [246]:
# Cast the confidence level to int so the threshold comparison is numeric,
# and stringify the panel types so they can be searched as text.
df["confidence_level"] = df["confidence_level"].astype(int)
df["panel.types"] = df["panel.types"].astype(str)

# Levels 2 and above are the Amber/Green filter described in this section.
is_amber_or_green = df["confidence_level"].ge(2)
# NOTE(review): this is a substring match on the stringified value, so any
# panel type whose text contains "gms-rare-disease" also passes. Confirm
# that this is intended.
in_gms_rare_disease = df["panel.types"].str.contains("gms-rare-disease")

df = df[is_amber_or_green & in_gms_rare_disease]

## Tidy the gene data
Keep only the columns of interest. Most genes have multiple phenotype annotations; where rows are otherwise identical, it is not worth keeping all of that phenotype data, so only one phenotype is retained per otherwise-identical row.

In [247]:
# Source column -> short output name, in the order the columns should appear.
renames = {
    "gene_data.hgnc_symbol": "hgnc",
    "gene_data.ensembl_genes.GRch38.90.ensembl_id": "ensg",
    "confidence_level": "confidence",
    "phenotypes": "phenotype",
    "panel.name": "panel",
    "panel.disease_group": "disease_group",
    "mode_of_inheritance": "moi",
}
cols = list(renames)

# Select the columns of interest, explode the phenotypes column into one row
# per entry, then switch to the short column names.
df = df[cols].explode("phenotypes", ignore_index=True).rename(columns=renames)

# Rows that differ only in their phenotype collapse to the first one seen.
key_columns = df.columns.drop("phenotype").tolist()
df = df.drop_duplicates(subset=key_columns)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14976 entries, 0 to 31583
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   hgnc           14976 non-null  object
 1   ensg           14975 non-null  object
 2   confidence     14976 non-null  int64 
 3   phenotype      14764 non-null  object
 4   panel          14976 non-null  object
 5   disease_group  14976 non-null  object
 6   moi            14976 non-null  object
dtypes: int64(1), object(6)
memory usage: 936.0+ KB


In [248]:
# Show the rows that have no ENSG identifier (one gene).
df.loc[df["ensg"].isna()]

Unnamed: 0,hgnc,ensg,confidence,phenotype,panel,disease_group,moi
23612,HIST1H4E,,2,Tessadori-van Haaften neurodevelopmental syndr...,Intellectual disability - microarray and seque...,Neurology and neurodevelopmental disorders,"MONOALLELIC, autosomal or pseudoautosomal, imp..."


In [249]:
# The gene without an Ensembl ID is HIST1H4E.
# Its updated HGNC symbol is H4C5 and its Ensembl ID is ENSG00000276966,
# so both fields are patched by hand.

# Build the row selector once, before the symbol itself is rewritten.
is_hist1h4e = df["hgnc"].eq("HIST1H4E")
df.loc[is_hist1h4e, "ensg"] = "ENSG00000276966"
df.loc[is_hist1h4e, "hgnc"] = "H4C5"

## Save to output

In [253]:
# Write the tidied gene table as a tab-separated file, without the index.
output_path = "../outputs/panel_app_gms_rare_disease_genes.tsv"
df.to_csv(output_path, index=False, sep="\t")