In [None]:
# Import libraries
import pandas as pd
import numpy as np
from collections import Counter
from statsmodels.stats.proportion import proportions_ztest
from pyhpo import Ontology
_ = Ontology()

### Functions

In [None]:
def get_all_parents(hpo_id):
    """
    Get all ancestor terms of a HPO term

    Walks up the ontology from ``hpo_id`` and collects every term reachable
    through ``.parents``. Each ancestor is expanded only once, so ancestors
    shared by several branches are not traversed repeatedly.

    Parameters
    ----------
    hpo_id : str
        Query accepted by ``Ontology.get_hpo_object``.

    Returns
    -------
    set of str
        First 10 characters of ``str(term)`` for every ancestor
        (presumably the "HP:nnnnnnn" identifier; verify against pyhpo).
    """
    parents = set()
    pending = [Ontology.get_hpo_object(hpo_id)]
    while pending:
        term = pending.pop()
        for parent in term.parents:
            parent_id = str(parent)[:10]
            if parent_id in parents:
                # Already collected, so its own ancestors are queued or done
                continue
            parents.add(parent_id)
            pending.append(Ontology.get_hpo_object(parent_id))
    return parents

def get_all_children(hpo_id):
    """
    Collect every descendant of a HPO term by recursing through ``.children``

    NOTE(review): each descendant is recorded as ``str(child)[13:]``, whereas
    get_all_parents keeps ``str(parent)[:10]``; the two functions therefore
    return different parts of the term string. Confirm this is intended.
    """
    node = Ontology.get_hpo_object(hpo_id)
    descendants = set()
    for kid in node.children:
        label = str(kid)[13:]
        descendants.add(label)
        # Recurse using the same label as the lookup query
        descendants.update(get_all_children(label))
    return descendants

def get_hpo_term(hpo_id):
    """
    Look up a HPO term and return the part of its string form from
    character 13 onwards (presumably the term name), or None when the
    lookup result is falsy
    """
    found = Ontology.get_hpo_object(hpo_id)
    if not found:
        return None
    return str(found)[13:]


def get_hpo_id(hpo_id):
    """
    Look up a HPO term and return the first 10 characters of its string
    form (presumably the "HP:nnnnnnn" identifier), or None when the lookup
    result is falsy
    """
    found = Ontology.get_hpo_object(hpo_id)
    if not found:
        return None
    return str(found)[:10]


def get_frequency(hpo_terms):
    """
    Count the HPO terms in a '|'-separated string

    Parameters
    ----------
    hpo_terms : str, float or None
        '|'-separated terms. None, NaN and empty/blank strings mean
        "no terms recorded".

    Returns
    -------
    int
        Number of non-empty entries; 0 when there are none.
    """
    if hpo_terms is None:
        return 0
    # NaN is the only float that is not equal to itself
    if isinstance(hpo_terms, float) and hpo_terms != hpo_terms:
        return 0
    # Skip blank entries so '' (e.g. '|'.join([])) counts as 0 rather than 1
    return sum(1 for entry in str(hpo_terms).split('|') if entry.strip())


def filter_phenotype_name_list(input_list, parent_terms):
    """
    Return the entries of input_list that also occur in parent_terms,
    keeping their original order (and any repeats)
    """
    kept = []
    for candidate in input_list:
        if candidate in parent_terms:
            kept.append(candidate)
    return kept

# --------------------------------------------------------------------------------------------------------------

# **Propagating HPO terms in DECIPHER dataset**

### Import dataset

In [None]:
# Import DECIPHER dataset csv
decipher_data = pd.read_csv('decipher.csv', header=0)

# Filter for pathogenicity
decipher_data = decipher_data[decipher_data["pathogenicity"].isin(["Pathogenic", "Likely pathogenic"])]

# Remove patients with 0 HPO terms. The explicit copy makes the result an
# independent frame, so the columns added below (and in later cells) are not
# written onto a filtered view (avoids SettingWithCopyWarning).
decipher_data = decipher_data[decipher_data['phenotype_names'].notna()].copy()

# Create column showing number of HPO terms per patient (computed after the
# rows without terms have been removed)
decipher_data['HPO_terms_freq'] = decipher_data['phenotype_names'].apply(get_frequency)

### Propagate HPO term IDs

In [None]:
# Create list of strings containing HPO term IDs for each patient
phenotype_names = decipher_data["hpo_accessions"].tolist()

# Split each '|'-separated string, to create list of lists (one list per patient)
phenotype_name_list = [str(list_phenotypes).split('|') for list_phenotypes in phenotype_names]

# For each patient, gather the parents of every recorded term, then append the
# ones not already present. Parents are gathered from a snapshot of the
# recorded terms and appended once afterwards, so the list is never modified
# while it is being iterated. They are appended in sorted order so the output
# is the same on every run.
for phenotype_list in phenotype_name_list:
    recorded_terms = tuple(phenotype_list)
    new_phenotypes = set()
    for phenotype in recorded_terms:
        new_phenotypes |= get_all_parents(phenotype)
    phenotype_list.extend(sorted(new_phenotypes.difference(recorded_terms)))

# Add column with recorded + parent HPO IDs, joined by '|'
decipher_data["propagated_terms"] = ['|'.join(phenotype_list) for phenotype_list in phenotype_name_list]

### Get propagated term names

In [None]:
# Break the '|'-joined propagated IDs back into one list per patient
hpo_ids = decipher_data['propagated_terms'].str.split('|')

# Translate every ID with get_hpo_term and re-join the results per patient
hpo_terms = ['|'.join(get_hpo_term(hpo_id) for hpo_id in ids) for ids in hpo_ids]

# Store the translated terms next to the propagated IDs
decipher_data['propagated_names'] = hpo_terms

# --------------------------------------------------------------------------------------------------------------

# **Identifying top-level parent HPO terms**

### Create a list of parent terms

In [None]:
# IDs (first 10 characters of the string form) of the direct children of
# "Phenotypic abnormality"
term = Ontology.get_hpo_object("Phenotypic abnormality")
parent_terms = [str(child)[0:10] for child in term.children]

# Swap "Abnormality of the musculoskeletal system" (HP:0033127) for the
# abnormalities of the skeletal system, musculature and connective tissue
parent_terms += ["HP:0000924", "HP:0003549", "HP:0003011"]
parent_terms.remove("HP:0033127")

# HPO names matching parent_terms, in the same order
parent_names = [get_hpo_term(parent_id) for parent_id in parent_terms]

### Filter propagated_terms using parent terms list

In [None]:
# One '|'-joined string of propagated IDs per patient
phenotype_names = decipher_data["propagated_terms"].tolist()

# Split every string into a list of IDs
phenotype_name_list = [joined_ids.split('|') for joined_ids in phenotype_names]

# Keep only the IDs that are top-level parent terms
filtered_lists = [
    filter_phenotype_name_list(patient_ids, parent_terms)
    for patient_ids in phenotype_name_list
]

# Store the per-patient top-level terms (as lists) in decipher_data
decipher_data["parent_terms"] = filtered_lists

# --------------------------------------------------------------------------------------------------------------

# **Filtering DECIPHER dataset for gene list**

### Import gene list

In [None]:
# Load the gene list table (first row is the header)
gene_list = pd.read_csv('gene_list.csv', header=0)

# Gene symbols from the "gene" column, as a plain Python list
gene_names = gene_list["gene"].tolist()

### Filter dataset

In [None]:
# Filter using gene list
decipher_data_filtered = decipher_data[decipher_data["gene"].isin(gene_names)]

# Rows whose patient ID has already appeared in an earlier row
duplicates = decipher_data_filtered[decipher_data_filtered.duplicated(subset='# patient_id')]

# For every patient with more than 1 row, record which genes their variants
# are in (kept for inspection; an empty dict means no such patients)
patient_genes = {}
if len(duplicates) > 0:
    non_unique_patients = list(duplicates['# patient_id'])
    for patient_id in non_unique_patients:
        patient_data = decipher_data_filtered[decipher_data_filtered['# patient_id'] == patient_id]
        patient_genes[patient_id] = list(patient_data['gene'].unique())

# Remove duplicate patients (first row per patient is kept). The explicit copy
# lets later cells add columns without writing onto a filtered view.
decipher_data_filtered = decipher_data_filtered.drop_duplicates(subset=['# patient_id']).copy()

### Group by PcG/TrxG

In [None]:
# Rows of the gene list belonging to each membership group
membership = gene_list['group_membership']
PcG_genes = gene_list.loc[membership == "PcG"]
TrxG_genes = gene_list.loc[membership == "TrxG"]
PcG_TrxG_genes = gene_list.loc[membership == "PcG/TrxG"]

# Label every patient by the group of their gene.
# NOTE(review): any gene that is in neither the "PcG/TrxG" nor the "PcG" rows
# is labelled "TrxG" (TrxG_genes itself is not consulted) - confirm intended.
gene_is_dual = decipher_data_filtered['gene'].isin(PcG_TrxG_genes["gene"])
gene_is_PcG = decipher_data_filtered['gene'].isin(PcG_genes["gene"])
decipher_data_filtered["group"] = np.where(gene_is_dual, "PcG/TrxG",
                                  np.where(gene_is_PcG, "PcG", "TrxG"))

# Write the labelled table to disk
decipher_data_filtered.to_csv("filtered.csv")

### Total number of unique HPO terms

In [None]:
# Patients whose gene is not in the gene list, one row per patient.
# Built here from decipher_data because decipher_data_rev is only assigned in
# a later cell, so referring to it at this point fails on a fresh kernel.
rest_of_decipher = (
    decipher_data[~decipher_data["gene"].isin(gene_names)]
    .drop_duplicates(subset=['# patient_id'])
)

# Gene list patients and the rest of DECIPHER stacked together
decipher_data_combined = pd.concat([rest_of_decipher, decipher_data_filtered], axis=0)

# Unique HPO terms (by name) across both groups; sorted so that everything
# derived from this list has the same order on every run
all_terms_in_decipher = set()
for patient_terms in decipher_data_combined['propagated_names'].str.split("|"):
    all_terms_in_decipher.update(patient_terms)
all_terms_in_decipher = sorted(all_terms_in_decipher)
freq_terms_in_decipher = len(all_terms_in_decipher)
#print(f"Number of unique terms across DECIPHER: {freq_terms_in_decipher}")

### Find frequency and percentage of all HPO terms

In [None]:
# One '|'-joined string of propagated term names per gene list patient
gene_list_terms = decipher_data_filtered["propagated_names"].tolist()

# Number of gene list patients (denominator for the percentages)
gene_list_total_freq = len(decipher_data_filtered)

# Split every string into a list of names
gene_list_terms_list = [joined_names.split('|') for joined_names in gene_list_terms]

# Concatenate the per-patient lists into one long list
gene_list_flat_list_all = []
for patient_names in gene_list_terms_list:
    gene_list_flat_list_all.extend(patient_names)

# Occurrences of each name
gene_list_freq_count_all = Counter(gene_list_flat_list_all)

# Frequency for every term in all_terms_in_decipher (0 when the term never
# occurs in this group), in the order of all_terms_in_decipher
gene_list_term_freq_all = {
    name: gene_list_freq_count_all.get(name, 0) for name in all_terms_in_decipher
}

# Percentage of patients per term, rounded to 2 decimals and keyed by the
# string form of the looked-up term from character 13 onwards
gene_list_term_percent_all = {}
for name, count in gene_list_term_freq_all.items():
    looked_up = str(Ontology.get_hpo_object(name))
    gene_list_term_percent_all[looked_up[13:]] = round((count / gene_list_total_freq) * 100, 2)

# --------------------------------------------------------------------------------------------------------------

# **Reverse filtering DECIPHER dataset**

### Filter for genes not in gene list

In [None]:
# Rows whose gene is NOT in the gene list
gene_not_listed = ~decipher_data["gene"].isin(gene_names)
decipher_data_rev = decipher_data[gene_not_listed]

# Keep the first row of every patient
decipher_data_rev = decipher_data_rev.drop_duplicates(subset=['# patient_id'])

### Find frequency and percentage of all HPO terms

In [None]:
# One '|'-joined string of propagated term names per patient in the rest of DECIPHER
rev_terms = decipher_data_rev["propagated_names"].tolist()

# Number of patients in the rest of DECIPHER (denominator for the percentages)
rev_total_freq = len(decipher_data_rev)

# Split every string into a list of names
rev_terms_list = [joined_names.split('|') for joined_names in rev_terms]

# Concatenate the per-patient lists into one long list
rev_flat_list_all = []
for patient_names in rev_terms_list:
    rev_flat_list_all.extend(patient_names)

# Occurrences of each name
rev_freq_count_all = Counter(rev_flat_list_all)

# Frequency for every term in all_terms_in_decipher (0 when the term never
# occurs in this group), in the order of all_terms_in_decipher
rev_term_freq_all = {
    name: rev_freq_count_all.get(name, 0) for name in all_terms_in_decipher
}

# Percentage of patients per term, rounded to 2 decimals and keyed by the
# string form of the looked-up term from character 13 onwards
rev_term_percent_all = {}
for name, count in rev_term_freq_all.items():
    looked_up = str(Ontology.get_hpo_object(name))
    rev_term_percent_all[looked_up[13:]] = round((count / rev_total_freq) * 100, 2)

# --------------------------------------------------------------------------------------------------------------

# **HPO frequencies: comparing top-level and unpropagated terms (gene list vs rest of DECIPHER)**

### Compare frequency of top-level HPO terms per patient

In [None]:
# Count top-level HPO terms for each gene list patient.
# The count is the length of the per-patient list, so a patient with no
# top-level term gets 0 (counting the '|'-joined string would turn the empty
# string into 1). .copy() gives an independent frame to add columns to.
gene_list_top_level = decipher_data_filtered[["# patient_id", "parent_terms"]].copy()
gene_list_top_level['HPO_parent_terms_freq'] = gene_list_top_level['parent_terms'].apply(len)
gene_list_top_level["parent_terms"] = gene_list_top_level["parent_terms"].apply('|'.join)

# Count top-level HPO terms for each patient in the rest of DECIPHER
rev_top_level = decipher_data_rev[["# patient_id", "parent_terms"]].copy()
rev_top_level['HPO_parent_terms_freq'] = rev_top_level['parent_terms'].apply(len)
rev_top_level["parent_terms"] = rev_top_level["parent_terms"].apply('|'.join)

# Find number of patients with each HPO term frequency, as plain dictionaries
gene_list_counts_top_level = dict(Counter(gene_list_top_level['HPO_parent_terms_freq']))
rev_list_counts_top_level = dict(Counter(rev_top_level['HPO_parent_terms_freq']))

# Calculate percent of patients in each group with each frequency
gene_list_percent_top_level = {key: (value / gene_list_total_freq)*100 for key, value in gene_list_counts_top_level.items()}
rev_percent_top_level = {key: (value / rev_total_freq)*100 for key, value in rev_list_counts_top_level.items()}

# Create dataframe comparing gene list patients with patients in the rest of DECIPHER.
# A frequency seen in only one group means 0% in the other, so the missing
# values are filled with 0 (as is done for the unpropagated table).
top_level_hpo_percent = pd.DataFrame({"gene_list_percent": pd.Series(gene_list_percent_top_level), "rev_percent": pd.Series(rev_percent_top_level)})
top_level_hpo_percent = top_level_hpo_percent.fillna(0)

### Compare frequency of unpropagated HPO terms per patient

In [None]:
# Number of patients per "HPO terms recorded" count, as plain dictionaries
gene_list_counts_unpropagated = dict(Counter(decipher_data_filtered['HPO_terms_freq']))
rev_list_counts_unpropagated = dict(Counter(decipher_data_rev['HPO_terms_freq']))

# Turn the patient counts into percentages of each group
gene_list_percent_unpropagated = {}
for n_terms, n_patients in gene_list_counts_unpropagated.items():
    gene_list_percent_unpropagated[n_terms] = (n_patients / gene_list_total_freq)*100

rev_percent_unpropagated = {}
for n_terms, n_patients in rev_list_counts_unpropagated.items():
    rev_percent_unpropagated[n_terms] = (n_patients / rev_total_freq)*100

# Side-by-side table of both groups; a count seen in only one group becomes 0
unpropagated_hpo_percent = pd.DataFrame({
    "gene_list_percent": pd.Series(gene_list_percent_unpropagated),
    "rev_percent": pd.Series(rev_percent_unpropagated),
}).fillna(0)

### Create unpropagated HPO term frequency bins (for histogram)

In [None]:
# (first, last) number of HPO terms covered by each histogram bin, inclusive.
# Patients with more terms than the last bin are not included.
histogram_bin_edges = [(1, 4), (5, 9), (10, 14), (15, 19), (20, 24), (25, 29), (30, 34), (35, 39)]


def _sum_percent_in_bins(percent_by_term_count, bin_edges):
    """
    Sum a percentage Series (indexed by number of HPO terms) within each bin

    Rows are selected by index value rather than by position, so each bin
    holds exactly the term counts in its label even when some counts are
    missing from the index or the index is not sorted.
    """
    term_counts = percent_by_term_count.index
    return [
        percent_by_term_count[(term_counts >= first) & (term_counts <= last)].sum()
        for first, last in bin_edges
    ]


histogram = pd.DataFrame({
    "bin": [f"{first}-{last}" for first, last in histogram_bin_edges],
    "gene_list_percent": _sum_percent_in_bins(unpropagated_hpo_percent["gene_list_percent"], histogram_bin_edges),
    "rev_percent": _sum_percent_in_bins(unpropagated_hpo_percent["rev_percent"], histogram_bin_edges),
})

# --------------------------------------------------------------------------------------------------------------

# **HPO terms: testing for significance (gene list vs rest of DECIPHER)**

### Combine filtered and reverse filtered dictionaries into dataframe

In [None]:
# One row per HPO term. The four dictionaries are combined by position, so
# they must all have been built in the same term order.
gene_list_rev_hpo = pd.DataFrame({
    'hpo_term': list(gene_list_term_percent_all),
    'gene_list_percent': list(gene_list_term_percent_all.values()),
    'gene_list_freq': list(gene_list_term_freq_all.values()),
    'rev_percent': list(rev_term_percent_all.values()),
    'rev_freq': list(rev_term_freq_all.values()),
})

# Patients with the term across both groups, then order rows by term
gene_list_rev_hpo["total_freq"] = gene_list_rev_hpo[["gene_list_freq", "rev_freq"]].sum(axis=1)
gene_list_rev_hpo = gene_list_rev_hpo.sort_values(by='hpo_term')

### Calculate p-values using 2-proportions Z-test

In [None]:
# Calculate a two-sided 2-proportions Z-test p-value for each HPO term
# (patients with the term in the gene list group vs the rest of DECIPHER)
totals = np.array([gene_list_total_freq, rev_total_freq])
p_values = [
    proportions_ztest(count=np.array([gene_list_freq, rev_freq]), nobs=totals, alternative="two-sided")[1]
    for gene_list_freq, rev_freq in zip(gene_list_rev_hpo["gene_list_freq"], gene_list_rev_hpo["rev_freq"])
]
gene_list_rev_hpo["p_value"] = pd.Series(p_values, index=gene_list_rev_hpo.index, dtype=float)

# Adjust p-value (Bonferroni: multiply by the number of unique terms) and
# identify significantly different terms. The product is capped at 1 so the
# adjusted value remains a valid probability.
gene_list_rev_hpo["adj_p_value"] = (gene_list_rev_hpo["p_value"] * freq_terms_in_decipher).clip(upper=1)
gene_list_rev_hpo['significant'] = np.where(gene_list_rev_hpo['adj_p_value']<0.05, "Y", "N")

# Get significant HPO terms seen in more than 50 patients overall.
# .copy() gives an independent frame so the 'change' column is not written
# onto a filtered view (avoids SettingWithCopyWarning).
gene_list_rev_hpo_significant = gene_list_rev_hpo[(gene_list_rev_hpo['significant']=="Y") & (gene_list_rev_hpo['total_freq']>50)].copy()
gene_list_rev_hpo_significant['change'] = np.where(gene_list_rev_hpo_significant['gene_list_percent']>gene_list_rev_hpo_significant['rev_percent'], "Increase", "Decrease")
### Get top-level HPO terms

In [None]:
# Rows of the comparison table whose term is one of the top-level parent names
is_top_level_term = gene_list_rev_hpo['hpo_term'].isin(parent_names)
gene_list_rev_hpo_top_level = gene_list_rev_hpo[is_top_level_term]

# --------------------------------------------------------------------------------------------------------------

# **HPO terms: testing for significance (PcG vs TrxG)**

### Split filtered DECIPHER dataset into PcG and TrxG

In [None]:
# Patients labelled "PcG" and "TrxG"; rows with any other label are in neither
patient_group = decipher_data_filtered["group"]
PcG_decipher_data_filtered = decipher_data_filtered[patient_group.eq("PcG")]
TrxG_decipher_data_filtered = decipher_data_filtered[patient_group.eq("TrxG")]

### Unique HPO terms across PcG and TrxG

In [None]:
# PcG and TrxG patients stacked into one table
PcG_TrxG_combined = pd.concat([PcG_decipher_data_filtered, TrxG_decipher_data_filtered], axis=0)

# Unique propagated HPO IDs across both groups
all_terms_in_PcG_TrxG = set()
for patient_ids in PcG_TrxG_combined['propagated_terms'].str.split("|"):
    all_terms_in_PcG_TrxG.update(patient_ids)
all_terms_in_PcG_TrxG = list(all_terms_in_PcG_TrxG)

# Number of unique IDs
freq_terms_in_PcG_TrxG = len(all_terms_in_PcG_TrxG)
#print(f"Number of unique terms across PcG and TrxG: {freq_terms_in_PcG_TrxG}")

### Find frequency and percentage of terms in PcG

In [None]:
# One '|'-joined string of propagated HPO IDs per PcG patient
PcG_phenotype_names = PcG_decipher_data_filtered["propagated_terms"].tolist()

# Split every string into a list of IDs
PcG_phenotype_name_list = [joined_ids.split('|') for joined_ids in PcG_phenotype_names]

# Number of PcG patients (denominator for the percentages)
PcG_total_freq = len(PcG_decipher_data_filtered)

# Concatenate the per-patient lists into one long list
PcG_flat_list = []
for patient_ids in PcG_phenotype_name_list:
    PcG_flat_list.extend(patient_ids)

# Occurrences of each ID
PcG_freq_count = Counter(PcG_flat_list)

# Frequency for every ID in all_terms_in_PcG_TrxG (0 when absent from PcG)
PcG_term_id_freq = {
    term_id: PcG_freq_count.get(term_id, 0) for term_id in all_terms_in_PcG_TrxG
}

# Re-key the frequencies by the string form of the looked-up term from
# character 13 onwards, and add a "total" entry holding the patient count
PcG_term_freq = {}
for term_id, count in PcG_term_id_freq.items():
    looked_up = str(Ontology.get_hpo_object(term_id))
    PcG_term_freq[looked_up[13:]] = count
    # Set inside the loop so "total" keeps its position right after the first
    # term; the combining cell pairs this dictionary with others by position
    PcG_term_freq["total"] = PcG_total_freq

# Percentage of PcG patients per entry, rounded to 2 decimals
PcG_term_percent = {
    entry: round((count / PcG_total_freq) * 100, 2) for entry, count in PcG_term_freq.items()
}

### Find frequency and percentage of terms in TrxG

In [None]:
# One '|'-joined string of propagated HPO IDs per TrxG patient
TrxG_phenotype_names = TrxG_decipher_data_filtered["propagated_terms"].tolist()

# Split every string into a list of IDs
TrxG_phenotype_name_list = [joined_ids.split('|') for joined_ids in TrxG_phenotype_names]

# Number of TrxG patients (denominator for the percentages)
TrxG_total_freq = len(TrxG_decipher_data_filtered)

# Concatenate the per-patient lists into one long list
TrxG_flat_list = []
for patient_ids in TrxG_phenotype_name_list:
    TrxG_flat_list.extend(patient_ids)

# Occurrences of each ID
TrxG_freq_count = Counter(TrxG_flat_list)

# Frequency for every ID in all_terms_in_PcG_TrxG (0 when absent from TrxG)
TrxG_term_id_freq = {
    term_id: TrxG_freq_count.get(term_id, 0) for term_id in all_terms_in_PcG_TrxG
}

# Re-key the frequencies by the string form of the looked-up term from
# character 13 onwards, and add a "total" entry holding the patient count
TrxG_term_freq = {}
for term_id, count in TrxG_term_id_freq.items():
    looked_up = str(Ontology.get_hpo_object(term_id))
    TrxG_term_freq[looked_up[13:]] = count
    # Set inside the loop so "total" keeps its position right after the first
    # term; the combining cell pairs this dictionary with others by position
    TrxG_term_freq["total"] = TrxG_total_freq

# Percentage of TrxG patients per entry, rounded to 2 decimals
TrxG_term_percent = {
    entry: round((count / TrxG_total_freq) * 100, 2) for entry, count in TrxG_term_freq.items()
}

### Combine PcG and TrxG dictionaries into dataframe

In [None]:
# One row per entry of the PcG/TrxG dictionaries. They are combined by
# position, so all four must have been built in the same order.
PcG_TrxG_hpo = pd.DataFrame({
    'hpo_term': list(PcG_term_freq),
    'PcG_percent': list(PcG_term_percent.values()),
    'PcG_freq': list(PcG_term_freq.values()),
    'TrxG_percent': list(TrxG_term_percent.values()),
    'TrxG_freq': list(TrxG_term_freq.values()),
}).sort_values(by='hpo_term')

### Calculate p-values using 2-proportions Z-test

In [None]:
# Calculate a two-sided 2-proportions Z-test p-value for each row
# (patients with the term in PcG vs TrxG)
totals = np.array([PcG_total_freq, TrxG_total_freq])
p_values = [
    proportions_ztest(count=np.array([PcG_freq, TrxG_freq]), nobs=totals, alternative="two-sided")[1]
    for PcG_freq, TrxG_freq in zip(PcG_TrxG_hpo["PcG_freq"], PcG_TrxG_hpo["TrxG_freq"])
]
PcG_TrxG_hpo["p_value"] = pd.Series(p_values, index=PcG_TrxG_hpo.index, dtype=float)

# Adjust p-value (Bonferroni: multiply by the number of unique terms across
# PcG and TrxG) and identify significantly different terms. The product is
# capped at 1 so the adjusted value remains a valid probability.
PcG_TrxG_hpo["adj_p_value"] = (PcG_TrxG_hpo["p_value"] * freq_terms_in_PcG_TrxG).clip(upper=1)
PcG_TrxG_hpo['significant'] = np.where(PcG_TrxG_hpo['adj_p_value']<0.05, "Y", "N")
PcG_TrxG_hpo_significant = PcG_TrxG_hpo[(PcG_TrxG_hpo['significant']=="Y")]

# --------------------------------------------------------------------------------------------------------------

## **HPO top-level terms: three-way pairwise comparisons (PcG vs rest of DECIPHER & TrxG vs rest of DECIPHER)**

### Merge relevant dataframes

In [None]:
# Per-term frequencies for the rest of DECIPHER and for PcG / TrxG
rev_hpo_subset = gene_list_rev_hpo[["hpo_term", "rev_freq"]]
PcG_TrxG_hpo_subset = PcG_TrxG_hpo[["hpo_term", "PcG_freq", "TrxG_freq"]]

# Keep the terms present in both tables, then restrict to top-level terms.
# .copy() gives an independent frame, because the following cells add p-value
# columns to it (avoids SettingWithCopyWarning on a filtered view).
PcG_TrxG_rev_combined = pd.merge(rev_hpo_subset, PcG_TrxG_hpo_subset, on="hpo_term")
PcG_TrxG_rev_combined_top_level = PcG_TrxG_rev_combined[PcG_TrxG_rev_combined['hpo_term'].isin(parent_names)].copy()

### Compare TrxG with rest of DECIPHER

In [None]:
# Calculate a two-sided 2-proportions Z-test p-value for each top-level term
# (patients with the term in TrxG vs the rest of DECIPHER)
totals = np.array([TrxG_total_freq, rev_total_freq])
p_values = [
    proportions_ztest(count=np.array([TrxG_freq, rev_freq]), nobs=totals, alternative="two-sided")[1]
    for TrxG_freq, rev_freq in zip(PcG_TrxG_rev_combined_top_level["TrxG_freq"], PcG_TrxG_rev_combined_top_level["rev_freq"])
]
PcG_TrxG_rev_combined_top_level["TrxG_vs_rev_p_value"] = pd.Series(p_values, index=PcG_TrxG_rev_combined_top_level.index, dtype=float)

# Adjust p-value (Bonferroni) and identify significantly different terms.
# The product is capped at 1 so the adjusted value remains a valid probability.
# NOTE(review): the multiplier is the number of unique terms across DECIPHER,
# not the number of top-level terms tested here - confirm this is intended.
PcG_TrxG_rev_combined_top_level["TrxG_vs_rev_adj_p_value"] = (PcG_TrxG_rev_combined_top_level["TrxG_vs_rev_p_value"] * freq_terms_in_decipher).clip(upper=1)
PcG_TrxG_rev_combined_top_level['TrxG_vs_rev_significant'] = np.where(PcG_TrxG_rev_combined_top_level['TrxG_vs_rev_adj_p_value']<0.05, "Y", "N")

### Compare PcG with rest of DECIPHER

In [None]:
# Calculate a two-sided 2-proportions Z-test p-value for each top-level term
# (patients with the term in PcG vs the rest of DECIPHER)
totals = np.array([PcG_total_freq, rev_total_freq])
p_values = [
    proportions_ztest(count=np.array([PcG_freq, rev_freq]), nobs=totals, alternative="two-sided")[1]
    for PcG_freq, rev_freq in zip(PcG_TrxG_rev_combined_top_level["PcG_freq"], PcG_TrxG_rev_combined_top_level["rev_freq"])
]
PcG_TrxG_rev_combined_top_level["PcG_vs_rev_p_value"] = pd.Series(p_values, index=PcG_TrxG_rev_combined_top_level.index, dtype=float)

# Adjust p-value (Bonferroni) and identify significantly different terms.
# The product is capped at 1 so the adjusted value remains a valid probability.
# NOTE(review): the multiplier is the number of unique terms across DECIPHER,
# not the number of top-level terms tested here - confirm this is intended.
PcG_TrxG_rev_combined_top_level["PcG_vs_rev_adj_p_value"] = (PcG_TrxG_rev_combined_top_level["PcG_vs_rev_p_value"] * freq_terms_in_decipher).clip(upper=1)
PcG_TrxG_rev_combined_top_level['PcG_vs_rev_significant'] = np.where(PcG_TrxG_rev_combined_top_level['PcG_vs_rev_adj_p_value']<0.05, "Y", "N")

# --------------------------------------------------------------------------------------------------------------

## **Output filtered DECIPHER dataset with propagated terms as csv for clustering**

In [None]:
# Write the filtered table (index included) to disk
decipher_data_filtered.to_csv(path_or_buf="decipher_filtered.csv")

# --------------------------------------------------------------------------------------------------------------