In [1]:
# Import libraries
import pandas as pd
import numpy as np
import statsmodels.stats.multitest as multi
from collections import Counter
from statsmodels.stats.proportion import proportions_ztest
from scipy import stats
from math import log10
from pyhpo import Ontology
from HPOFunctions import *
_ = Ontology()

  from pandas.core import (


# -------------------------------------------------------------------

### **Import DECIPHER dataset & process**
- Identify all patients in DECIPHER with a pathogenic/likely pathogenic variant and at least 1 HPO term
- Get the number of terms per patient
- Get all propagated HPO IDs and terms for each patient

In [2]:
# Read DECIPHER csv
decipher = pd.read_csv('decipher.csv')

# Filter for pathogenicity
decipher = decipher[(decipher["pathogenicity"] == "Pathogenic") | (decipher["pathogenicity"] == "Likely pathogenic")]

# Get term frequency for each patient
decipher['HPOTermsFreq'] = decipher['phenotype_names'].apply(get_frequency)

# Remove patients with 0 HPO terms
decipher = decipher[decipher['phenotype_names'].notna()]

# Get propagated HPO IDs & term names
decipher["propagatedIDs"] = propagate_HPO_IDs(decipher["hpo_accessions"])
decipher["propagatedIDs"] = decipher["propagatedIDs"].apply('|'.join)
decipher['propagatedTerms'] = get_HPO_terms(decipher['propagatedIDs'])

### **Identify top-level HPO terms** 
 - Get all terms below 'Phenotypic abnormality'
 - Replace musculoskeletal system term with child terms

In [3]:
# Get terms directly below "Phenotypic abnormality"
HPOTopLevelIDs = []
for HPO in Ontology.get_hpo_object("Phenotypic abnormality").children:
    HPOTopLevelIDs.append(str(HPO)[0:10])

# Replace "Abnormality of the musculoskeletal system" with abnormalities of the skeletal system, musculature and connective tissue
HPOTopLevelIDs.remove("HP:0033127")
HPOTopLevelIDs.extend(["HP:0000924", "HP:0003549", "HP:0003011"])

# Convert HPO IDs to HPO names
topLevelTerms = []
for HPO in HPOTopLevelIDs:
    topLevelTerms.append(get_HPO_term(HPO))
print(f'{len(topLevelTerms)} Top-Level HPO Terms: {topLevelTerms}')

# Get list of lists of propagated HPO terms for each patient
HPOTerm_list = []
for HPOTerms in decipher["propagatedTerms"].tolist():
    HPOTerm_list.append(HPOTerms.split('|'))

# Find top-level terms from propagated terms for each patient
HPOTermsFiltered = []
for HPOTerms in HPOTerm_list:
    HPOTerms = [term for term in HPOTerms if term in topLevelTerms] 
    HPOTermsFiltered.append(HPOTerms)
decipher["topLevelTerms"] = HPOTermsFiltered

25 Top-Level HPO Terms: ['Abnormality of the respiratory system', 'Abnormality of prenatal development or birth', 'Abnormality of head or neck', 'Abnormal cellular phenotype', 'Abnormality of the digestive system', 'Abnormality of the cardiovascular system', 'Abnormality of the immune system', 'Constitutional symptom', 'Neoplasm', 'Abnormality of the thoracic cavity', 'Abnormality of the breast', 'Abnormality of the genitourinary system', 'Growth abnormality', 'Abnormality of the endocrine system', 'Abnormality of the ear', 'Abnormality of blood and blood-forming tissues', 'Abnormality of the nervous system', 'Abnormality of the integument', 'Abnormality of limbs', 'Abnormality of the eye', 'Abnormality of metabolism/homeostasis', 'Abnormality of the voice', 'Abnormality of the skeletal system', 'Abnormality of connective tissue', 'Abnormality of the musculature']


# -------------------------------------------------------------------

### **Filter DECIPHER using defined gene list**
- Read in pre-defined gene list
- Find unique patients with pathogenic or likely pathogenic variants in genes defined in the gene list, who have at least 1 HPO term
- Check for patients with multiple variants and remove duplicate patients

In [4]:
# Filter processed DECIPHER dataset using gene list
genelist = pd.read_csv('genelist.csv', header=0)
genelist = genelist["gene"].to_list()
decipherFiltered = decipher[decipher["gene"].isin(genelist)]

# Check which patients have more than 1 variant
duplicates = decipherFiltered[decipherFiltered.duplicated(subset='# patient_id')]
if len(duplicates) > 0:
    non_unique_patients = list(duplicates['# patient_id'])#.unique())
    print(f"Patients with non-unique variants: {non_unique_patients}\n")
    patient_genes = {}
    for patient_id in non_unique_patients:
        patient_data = decipherFiltered[decipherFiltered['# patient_id'] == patient_id]
        genes = list(patient_data['gene'].unique())
        patient_genes[patient_id] = genes
    for patient_id, genes in patient_genes.items():
        print(f"Patient {patient_id} has variants in genes: {genes}")
print('\nAll patients with multiple variants have variants within the same gene.')

# Remove duplicate patients
decipherFiltered = decipherFiltered.drop_duplicates(subset=['# patient_id'])

Patients with non-unique variants: [263014, 264461, 274394, 277208, 293763, 293763, 304945]

Patient 263014 has variants in genes: ['KMT2A']
Patient 264461 has variants in genes: ['KMT2A']
Patient 274394 has variants in genes: ['ACTL6B']
Patient 277208 has variants in genes: ['KMT2A']
Patient 293763 has variants in genes: ['BCOR']
Patient 304945 has variants in genes: ['KMT2A']

All patients with multiple variants have variants within the same gene.


### **Get percentage prevelance of each phenotype**
- Find all unique HPO terms across entire DECIPHER dataset
- Get the frequency and percentage occurance of each term across the gene list filtered dataset

In [5]:
# Get all unique HPO terms across whole DECIPHER dataset
allHPOTermsDecipher = set()
decipher['propagatedTerms'].str.split("|").apply(allHPOTermsDecipher.update)
allHPOTermsDecipher = list(allHPOTermsDecipher)
freqHPOTerms_decipher = len(allHPOTermsDecipher)

# Find patient frequency in filtered dataframe
patientFreq = len(decipherFiltered)

# Go through list of strings, to create list of lists
HPOTermsList = decipherFiltered["propagatedTerms"].tolist()
HPOTermsLonglist = []
for HPOTerms in HPOTermsList:
    HPOTerms = HPOTerms.split('|')
    HPOTermsLonglist.append(HPOTerms)

# Count number of terms across patients 
HPOTermsLonglistFlat = [item for sublist in HPOTermsLonglist for item in sublist]
HPOTermsFreq = Counter(HPOTermsLonglistFlat)

# Identify number of terms 
HPOTermsFreqAll = {}
for term in allHPOTermsDecipher:
    if term in HPOTermsFreq:
        HPOTermsFreqAll[term] = HPOTermsFreq[term]
    else:
        HPOTermsFreqAll[term] = 0

# Identify percentage of terms 
HPOTermsPercentAll = {}
for term, freq in HPOTermsFreqAll.items():
    term = str(Ontology.get_hpo_object(term))
    percent = (freq/patientFreq)*100
    HPOTermsPercentAll[term[13:]] = round(percent, 2)

# Get number of patients and unique genes in reverse filtered dataset
print(f"Patients: {decipherFiltered.groupby(['# patient_id']).size().count()}")
print(f"Genes: {decipherFiltered.groupby(['gene']).size().count()}")

# Save filtered dataset
decipherFiltered.to_csv('./DataHandlingOutput/decipherFiltered.csv')

Patients: 462
Genes: 38


# -------------------------------------------------------------------

### **Reverse filter DECIPHER using defined gene list**
- Find unique patients with pathogenic or likely pathogenic variants in genes that are not in the defined gene list, who have at least 1 HPO term
- Get the frequency and percentage occurance of each term across the reverse gene list filtered dataset

In [6]:
# Filter processed DECIPHER dataset using gene list
decipherOther = decipher[~decipher["gene"].isin(genelist)]
decipherOther = decipherOther.drop_duplicates(subset=['# patient_id'])

# Create list of HPO terms for each patient
otherHPOTermsList = decipherOther["propagatedTerms"].tolist()
otherpatientFreq = len(decipherOther)

# Go through list of strings, to create list of lists
otherHPOTermsLonglist = []
for HPOTerms in otherHPOTermsList:
    HPOTerms = HPOTerms.split('|')
    otherHPOTermsLonglist.append(HPOTerms)

# Count number of terms across patients 
otherHPOTermsLonglistFlat = [item for sublist in otherHPOTermsLonglist for item in sublist]
otherHPOTermsFreq = Counter(otherHPOTermsLonglistFlat)

# Identify number of terms 
otherHPOTermsFreqAll = {}
for term in allHPOTermsDecipher:
    if term in otherHPOTermsFreq:
        otherHPOTermsFreqAll[term] = otherHPOTermsFreq[term]
    else:
        otherHPOTermsFreqAll[term] = 0

# Identify percentage of terms 
otherHPOTermsPercentAll = {}
for term, freq in otherHPOTermsFreqAll.items():
    term = str(Ontology.get_hpo_object(term))
    percent = (freq/otherpatientFreq)*100
    otherHPOTermsPercentAll[term[13:]] = round(percent, 2)

# Get number of patients and unique genes in reverse filtered dataset
print(f"Patients: {decipherOther.groupby(['# patient_id']).size().count()}")
print(f"Genes: {decipherOther.groupby(['gene']).size().count()}")

Patients: 5070
Genes: 1037


# -------------------------------------------------------------------

### **Compare occurance of HPO terms between the two identified groups**
- Gather frequencies and percentages for both groups
- Using a two-proportions z-test and FDR (BH) correction from statsmodels, compare the proportion of patients in each group with each phenotype, to identify significantly increased/decreased phenotypes

In [7]:
# Create dataframe from dicts
genelistOtherHPO = pd.DataFrame({'HPOTerm': list(HPOTermsPercentAll.keys()), 
                                 'genelistPercent': list(HPOTermsPercentAll.values()), 
                                 'genelistFreq': list(HPOTermsFreqAll.values()), 
                                 'otherPercent': list(otherHPOTermsPercentAll.values()), 
                                 'otherFreq': list(otherHPOTermsFreqAll.values())})

# Get total frequency of each term
genelistOtherHPO["totalFreq"] = genelistOtherHPO["genelistFreq"] + genelistOtherHPO["otherFreq"] 

# Sort terms alphabetically
genelistOtherHPO = genelistOtherHPO.sort_values('HPOTerm')

# Loop through each HPO term and get p-value
for index, row in genelistOtherHPO.iterrows():
    HPOTerm = row["HPOTerm"]
    genelistFreq = row["genelistFreq"]
    otherFreq = row["otherFreq"]
    frequencies = np.array([genelistFreq, otherFreq])
    totals = np.array([patientFreq, otherpatientFreq])
    stat, pVal = proportions_ztest(count=frequencies, nobs=totals, alternative="two-sided")
    genelistOtherHPO.loc[index, "pVal"] = pVal

# Adjust p-value (BH) and identify significantly different terms at p<0.05
pVals = np.array(genelistOtherHPO["pVal"])
pVals[np.isnan(pVals)] = 1
p_adj = multi.fdrcorrection(pVals, alpha=0.05, method='indep')
genelistOtherHPO["pValAdj"] = p_adj[1]
genelistOtherHPO['significant'] = np.where(genelistOtherHPO['pValAdj']<0.05, "Y", "N")

# Add HPO IDs
ids = []
for term in genelistOtherHPO['HPOTerm']:
    id = get_HPO_id(term)
    ids.append(id)
genelistOtherHPO['HPO_id'] = ids

# Find percentage difference and log10 pvalue
genelistOtherHPO['delta'] = genelistOtherHPO['genelistPercent'] - genelistOtherHPO['otherPercent']
log10s = []
for pval in genelistOtherHPO['pValAdj']:
    log10s.append(-1*(log10(pval)))
genelistOtherHPO['-log10padj'] = log10s

# Get significant HPO terms with a delta of 3%
genelistOtherHPOSignificant = genelistOtherHPO[(genelistOtherHPO['significant']=="Y") & ((genelistOtherHPO['delta']>=3)|(genelistOtherHPO['delta']<=-3))]
genelistOtherHPOSignificant['change'] = np.where(genelistOtherHPOSignificant['genelistPercent']>genelistOtherHPOSignificant['otherPercent'], "Increase", "Decrease")

# Save as csv
genelistOtherHPOSignificant.to_csv('./DataHandlingOutput/genelistOtherHPOSignificant.csv')

# Find top-level terms
genelistOtherHPOTopLevel = genelistOtherHPO[genelistOtherHPO['HPOTerm'].isin(topLevelTerms)]
display(genelistOtherHPOTopLevel[['HPOTerm', 'genelistPercent', 'otherPercent', 'pValAdj']])

# Save top-level terms as csv
genelistOtherHPOTopLevel.to_csv('./DataHandlingOutput/genelistOtherHPOTopLevel.csv')

  zstat = value / std
  zstat = value / std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genelistOtherHPOSignificant['change'] = np.where(genelistOtherHPOSignificant['genelistPercent']>genelistOtherHPOSignificant['otherPercent'], "Increase", "Decrease")


Unnamed: 0,HPOTerm,genelistPercent,otherPercent,pValAdj
1666,Abnormal cellular phenotype,0.0,0.45,0.8163002
2076,Abnormality of blood and blood-forming tissues,1.3,2.07,0.8163002
3901,Abnormality of connective tissue,6.49,7.02,0.8163002
2902,Abnormality of head or neck,76.19,62.82,1.333825e-06
2662,Abnormality of limbs,39.83,30.28,0.001238733
3522,Abnormality of metabolism/homeostasis,4.98,4.32,0.8163002
1580,Abnormality of prenatal development or birth,3.9,4.58,0.8163002
943,Abnormality of the breast,3.25,3.35,0.9282816
1090,Abnormality of the cardiovascular system,14.94,14.26,0.8163002
243,Abnormality of the digestive system,26.84,16.67,4.040824e-06


# -------------------------------------------------------------------

### **Compare number of HPO terms per patient between the two identified groups**
- Identify and compare number of top-level HPO terms per patient between the two groups

In [8]:
# Count top-level HPO terms for each gene list patient
genelistTopLevel = decipherFiltered[["# patient_id", "topLevelTerms"]]
genelistTopLevel["topLevelTerms"] = genelistTopLevel["topLevelTerms"].apply('|'.join)
genelistTopLevel['topLevelTermsFreq'] = genelistTopLevel['topLevelTerms'].apply(get_frequency)

# Count top-level HPO terms for each patient in the rest of DECIPHER
otherTopLevel = decipherOther[["# patient_id", "topLevelTerms"]]
otherTopLevel["topLevelTerms"] = otherTopLevel["topLevelTerms"].apply('|'.join)
otherTopLevel['topLevelTermsFreq'] = otherTopLevel['topLevelTerms'].apply(get_frequency)

# Find number of patients with each HPO term frequency 
genelistCountsTopLevel = dict(Counter(genelistTopLevel['topLevelTermsFreq']))
otherCountsTopLevel = dict(Counter(otherTopLevel['topLevelTermsFreq']))

# Calculate percent of patients in each group with each frequency
genelistPercentTopLevel = {key: (value / patientFreq)*100 for key, value in genelistCountsTopLevel.items()}
otherPercentTopLevel = {key: (value / otherpatientFreq)*100 for key, value in otherCountsTopLevel.items()}

# Create dataframe comparing gene list patients with patients in the rest of DECIPHER
HPOTopLevelPercent = pd.DataFrame({"genelistPercent": pd.Series(genelistPercentTopLevel), "otherPercent": pd.Series(otherPercentTopLevel)})
display(HPOTopLevelPercent)

# Get array of frequencies for patients in each group
otherTopLevelArr = pd.array(otherTopLevel['topLevelTermsFreq'])
genelistTopLevelArr = pd.array(genelistTopLevel['topLevelTermsFreq'])

# Summary statistics
otherSumStats = (pd.DataFrame(otherTopLevelArr).agg(["count", "min", "max", "median", "mean", "skew"]).rename(columns={pd.DataFrame(otherTopLevelArr).columns[0]: 'otherlist'}).round(decimals=2))
genelistSumStats = pd.DataFrame(genelistTopLevelArr).agg(["count", "min", "max", "median", "mean", "skew"]).rename(columns={pd.DataFrame(genelistTopLevelArr).columns[0]: 'genelist'}).round(decimals=2)
display(pd.concat([otherSumStats, genelistSumStats], axis=1))

# Mann Whitney U test
MannWhitneyTopLevel = stats.mannwhitneyu(genelistTopLevelArr, otherTopLevelArr, alternative='two-sided')
print(f'\nP-value: {MannWhitneyTopLevel.pvalue}')

# Save top-level terms as csv
HPOTopLevelPercent.to_csv('./DataHandlingOutput/HPOTopLevelPercent.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genelistTopLevel["topLevelTerms"] = genelistTopLevel["topLevelTerms"].apply('|'.join)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genelistTopLevel['topLevelTermsFreq'] = genelistTopLevel['topLevelTerms'].apply(get_frequency)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  otherTopLevel["topLevelT

Unnamed: 0,genelistPercent,otherPercent
1,7.359307,14.911243
2,7.359307,9.072978
3,7.142857,13.708087
4,13.203463,15.088757
5,17.748918,14.280079
6,16.233766,11.775148
7,13.203463,9.112426
8,8.225108,5.147929
9,4.545455,3.136095
10,2.597403,2.169625


Unnamed: 0,otherlist,genelist
count,5070.0,462.0
min,1.0,1.0
max,18.0,14.0
median,4.0,5.0
mean,4.5,5.36
skew,0.55,0.26



P-value: 1.513148080718329e-13


- Identify and compare number of unpropagated terms per patient between the two groups.

In [9]:
# Find number of patients with each HPO term frequency 
genelistCountsUnprop = dict(Counter(decipherFiltered['HPOTermsFreq']))
otherCountsUnprop = dict(Counter(decipherOther['HPOTermsFreq']))

# Calculate percent of patients in each group with each frequency
genelistPercentUnprop = {key: (value / patientFreq)*100 for key, value in genelistCountsUnprop.items()}
otherPercentUnprop = {key: (value / otherpatientFreq)*100 for key, value in otherCountsUnprop.items()}

# Create percentage dataframe comparing gene list patients with patients in the rest of DECIPHER
HPOUnpropPercent = pd.DataFrame({"genelistPercent": pd.Series(genelistPercentUnprop), "otherPercent": pd.Series(otherPercentUnprop)})
HPOUnpropPercent = HPOUnpropPercent.fillna(0)

# Create frequency dataframe comparing gene list patients with patients in the rest of DECIPHER
HPOUnpropFreq = pd.DataFrame({"genelistFreq": pd.Series(genelistCountsUnprop), "otherFreq": pd.Series(otherCountsUnprop)})
HPOUnpropFreq = HPOUnpropFreq.fillna(0)

# Convert to histogram format
histogram = pd.DataFrame(columns = ["bin", "genelistPercent", "otherPercent"])
histogram["bin"] = ("1-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39")
histogram["genelistPercent"] = (sum(HPOUnpropPercent["genelistPercent"][0:4]), sum(HPOUnpropPercent["genelistPercent"][4:9]), sum(HPOUnpropPercent["genelistPercent"][9:14]), sum(HPOUnpropPercent["genelistPercent"][14:19]), sum(HPOUnpropPercent["genelistPercent"][19:24]), sum(HPOUnpropPercent["genelistPercent"][24:29]), sum(HPOUnpropPercent["genelistPercent"][29:34]), sum(HPOUnpropPercent["genelistPercent"][34:39]))
histogram["otherPercent"] = (sum(HPOUnpropPercent["otherPercent"][0:4]), sum(HPOUnpropPercent["otherPercent"][4:9]), sum(HPOUnpropPercent["otherPercent"][9:14]), sum(HPOUnpropPercent["otherPercent"][14:19]), sum(HPOUnpropPercent["otherPercent"][19:24]), sum(HPOUnpropPercent["otherPercent"][24:29]), sum(HPOUnpropPercent["otherPercent"][29:34]), sum(HPOUnpropPercent["otherPercent"][34:39]))
display(histogram)

# Get array of frequencies for patients in each group
otherUnpropArr = np.array(decipherOther['HPOTermsFreq'])
genelistUnpropArr = np.array(decipherFiltered['HPOTermsFreq'])

# Summary statistics
otherSumStats = (pd.DataFrame(otherUnpropArr).agg(["count", "min", "max", "median", "mean", "skew"]).rename(columns={pd.DataFrame(otherUnpropArr).columns[0]: 'otherlist'}).round(decimals=2))
genelistSumStats = (pd.DataFrame(genelistUnpropArr).agg(["count", "min", "max", "median", "mean", "skew"]).rename(columns={pd.DataFrame(genelistUnpropArr).columns[0]: 'genelist'}).round(decimals=2))
display(pd.concat([otherSumStats, genelistSumStats], axis=1))

# Mann-Whitney U test
MannWhitneyUnprop = stats.mannwhitneyu(genelistUnpropArr, otherUnpropArr, alternative='two-sided')
print(f'\nP-value: {MannWhitneyUnprop.pvalue}')

# Save top-level terms as csv
histogram.to_csv('./DataHandlingOutput/histogram.csv')

Unnamed: 0,bin,genelistPercent,otherPercent
0,1-4,22.727273,36.114398
1,5-9,45.887446,42.781065
2,10-14,23.160173,15.897436
3,15-19,5.194805,3.786982
4,20-24,1.731602,1.084813
5,25-29,0.649351,0.216963
6,30-34,0.649351,0.118343
7,35-39,0.0,0.0


Unnamed: 0,otherlist,genelist
count,5070.0,462.0
min,1.0,1.0
max,36.0,39.0
median,6.0,7.0
mean,6.63,8.01
skew,1.36,1.71



P-value: 1.5116338173521496e-10


# -------------------------------------------------------------------