In [40]:
"""
ScientificData论文图片绘制
单个基因数据桑吉图
"""

'\nScientificData论文图片绘制\n单个基因数据桑吉图\n'

In [41]:
import os

from collections import defaultdict

In [42]:
# Directory that holds the merged gene/variant tables.
base_path = '/mnt/disk1/xzyao/CancerPNRLE/merge_data'

# Tab-separated gene/variant table; passed as `merge_db_file` to the
# read_merge_db_file* functions in the driver cells further down.
AML_file = f'{base_path}/Acute Myeloid Leukemia.gene-var.tsv'


# GO term -> ancestor table, parsed with read_obo_level_file().
go_ancestor_file = '/mnt/disk1/xzyao/CancerPNRLE/result/芷涵GO-HPO分级/go_second_ancestors.txt'

# HPO term -> ancestor table, parsed with read_obo_level_file().
hpo_ancestor_file = '/mnt/disk1/xzyao/CancerPNRLE/result/芷涵GO-HPO分级/hpo_third_ancestors.txt'


# Directory containing the HTML templates; generated figures are written here too.
fig_save_path = '/mnt/disk1/xzyao/CancerPNRLE/ScientificData投稿富集分析结果/ScientificData_论文绘图/单基因桑吉图绘制'

# HTML template whose 'data_str' / 'link_str' placeholders are filled by replace_template().
template_html = f'{fig_save_path}/Sankey-template.html'

In [43]:

def read_obo_level_file(_go_level_file: str):
    """Load a term-id -> ancestor-label mapping from a tab-separated level file.

    The first line is treated as a header and skipped.  Every other line must
    split into exactly four tab-separated fields, otherwise it is ignored.
    Field 0 is the term id, field 2 the ancestor id and field 3 the ancestor
    name; field 1 is not used.

    A leading ``obsolete`` marker is stripped from the ancestor name.  Only the
    prefix is removed, so the word is preserved when it appears later in the
    name.

    Args:
        _go_level_file: path of the level file (read as UTF-8).

    Returns:
        dict mapping term id to ``'<ancestor_id>-<ancestor_name>'``.  If a term
        id appears on several lines, the last one wins.
    """
    obsolete_prefix = 'obsolete'
    term_to_ancestor = {}

    with open(_go_level_file, encoding='utf-8') as f:
        f.readline()  # skip the header line
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 4:
                continue
            term_id, _unused, ancestor_id, ancestor_name = fields

            if ancestor_name.startswith(obsolete_prefix):
                # Slice off the prefix instead of str.replace(), which would
                # also delete the word wherever else it occurs in the name.
                ancestor_name = ancestor_name[len(obsolete_prefix):].strip()

            term_to_ancestor[term_id] = f'{ancestor_id}-{ancestor_name}'

    return term_to_ancestor


In [44]:
# read ancestor file
# Load the term-id -> ancestor-label lookup tables for GO and HPO.
# Both are passed to the read_merge_db_file* functions in the driver cells.
go_to_ancestor = read_obo_level_file(go_ancestor_file)
hp_to_ancestor = read_obo_level_file(hpo_ancestor_file)

In [68]:
def replace_template(template_html: str, nodes: list, links: list, save_file: str):
    """Fill the Sankey HTML template with node and link data and save it.

    Every occurrence of the placeholder ``data_str`` is replaced by
    ``str(nodes)`` and every occurrence of ``link_str`` by ``str(links)``.

    The template is read completely before the output file is opened, so an
    unreadable template never leaves a truncated output file behind, and
    ``save_file`` may safely be the same path as ``template_html``.

    Args:
        template_html: path of the HTML template (read as UTF-8).
        nodes: node records, e.g. ``[{'name': ...}, ...]``.
        links: link records, e.g. ``[{'source': ..., 'target': ..., 'value': ...}]``.
        save_file: path of the HTML file to write (UTF-8).
    """
    with open(template_html, encoding='utf-8') as f:
        doc = f.read()

    # NOTE(review): str() embeds the Python repr of the data in the page; this
    # is kept so that generated files stay identical to earlier runs.
    doc = doc.replace('data_str', str(nodes))
    doc = doc.replace('link_str', str(links))

    with open(save_file, 'w', encoding='utf-8') as wf:
        wf.write(doc)
    print(f'{save_file} saved.')
    
    

In [69]:
def read_merge_db_file(merge_db_file,
                       go_to_ancestor: dict,
                       hpo_to_ancestor: dict,
                       use_ancestor=False,
                       selected_gene='',):
    """Build Sankey data: gene -> variant type -> variant -> GO/HPO term.

    ``merge_db_file`` is a tab-separated table with one header line and 16
    columns per row.  Five columns are used, by position: 4 (SNPs),
    5 (alterations), 6 (gene), 11 (GO terms) and 12 (HPO terms).  Multi-valued
    cells are separated by ``'; '`` and ``'-'`` marks an empty cell.

    Per row:
      * every SNP, plus every alteration starting with ``g.`` or ``c.``, is
        counted under the variant type ``'SNP'``;
      * every other alteration must look like ``'<type>:<sub type>'``;
        entries without exactly one ``':'`` or starting with ``GO:``, ``HP:``
        or ``MESH`` are skipped;
      * each variant is counted once per GO/HPO term of the row, so rows
        without any term contribute nothing.

    Args:
        merge_db_file: path of the table to read (UTF-8).
        go_to_ancestor: GO id -> ancestor label.
        hpo_to_ancestor: HPO id -> ancestor label.
        use_ancestor: if true, terms are replaced by their ancestor label and
            terms without a known ancestor are dropped; otherwise the terms
            are used as written in the table.
        selected_gene: if non-empty, only rows of this gene are used.

    Returns:
        ``(nodes, links)`` where ``nodes`` is a list of ``{'name': ...}`` in
        first-seen order and ``links`` a list of
        ``{'source', 'target', 'value'}`` dicts.

    Raises:
        ValueError: if a non-blank row does not have exactly 16 columns.
    """
    expected_columns = 16

    def ancestor_labels(cell, term_to_ancestor):
        # The id of a term is the text before the first '-' of the entry.
        labels = set()
        for term in cell.split('; '):
            if term == '-':
                continue
            label = term_to_ancestor.get(term.split('-')[0])
            if label:
                labels.add(label)
        return labels

    # gene to var_type
    gene_var_count = defaultdict(int)
    # var_type to sub_var_type
    var_sub_var_count = defaultdict(int)
    # sub_var_type to bp
    sub_var_bp_count = defaultdict(int)

    # Read the file that was passed in (the global AML_file is not consulted).
    with open(merge_db_file, encoding='utf-8') as f:
        f.readline()  # header
        for line_no, line in enumerate(f, start=2):
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines, e.g. at the end of the file
            fields = stripped.split('\t')
            if len(fields) != expected_columns:
                raise ValueError(
                    f'{merge_db_file}: line {line_no} has {len(fields)} columns, '
                    f'expected {expected_columns}')

            snps, alter, gene = fields[4], fields[5], fields[6]
            go, hpo = fields[11], fields[12]

            if selected_gene and gene != selected_gene:
                continue

            # Re-assign SNPs and alterations: g./c. notations count as SNPs.
            snp_set = {snp for snp in snps.split('; ') if snp != '-'}
            alter_set = set()
            for alt in alter.split('; '):
                if alt == '-':
                    continue
                if alt.startswith(('g.', 'c.')):
                    snp_set.add(alt)
                else:
                    alter_set.add(alt)

            if use_ancestor:
                bp_set = (ancestor_labels(go, go_to_ancestor)
                          | ancestor_labels(hpo, hpo_to_ancestor))
            else:
                bp_set = set(go.split('; ')) | set(hpo.split('; '))
            bp_set -= {'-', ''}

            # Sorted iteration keeps node/link order independent of string
            # hash randomisation, so repeated runs give identical output.
            bp_terms = sorted(bp_set)

            for snp in sorted(snp_set):
                snp = snp.strip()
                for bp in bp_terms:
                    gene_var_count[(gene, 'SNP')] += 1
                    var_sub_var_count[('SNP', snp)] += 1
                    sub_var_bp_count[(snp, bp)] += 1

            for var in sorted(alter_set):
                var = var.strip()
                parts = var.split(':')
                # Malformed entries (e.g. a bare 'c.175delGC' style value).
                if len(parts) != 2:
                    continue
                # Ontology ids that ended up in the alteration column.
                if var.startswith(('GO:', 'HP:', 'MESH')):
                    continue
                var_type, sub_var_type = parts

                for bp in bp_terms:
                    gene_var_count[(gene, var_type)] += 1
                    var_sub_var_count[(var_type, sub_var_type)] += 1
                    sub_var_bp_count[(sub_var_type, bp)] += 1

    nodes = []
    links = []
    seen_names = set()
    for layer in (gene_var_count, var_sub_var_count, sub_var_bp_count):
        for (source, target), value in layer.items():
            for name in (source, target):
                if name not in seen_names:
                    seen_names.add(name)
                    nodes.append({'name': name})
            # {"source": "Total", "target": "Environment", "value": 0.342284047256003}
            links.append({
                "source": source,
                "target": target,
                "value": value,
            })
    return nodes, links




In [104]:
def read_merge_db_file_add_bp_layer(merge_db_file,
                       go_to_ancestor: dict,
                       hpo_to_ancestor: dict,
                       use_ancestor=False,
                       selected_gene='',):
    """Build Sankey data whose last layer is the GO/HPO *ancestor* of each term.

    ``merge_db_file`` is a tab-separated table with one header line and 16
    columns per row.  Five columns are used, by position: 4 (SNPs),
    5 (alterations), 6 (gene), 11 (GO terms) and 12 (HPO terms).  Multi-valued
    cells are separated by ``'; '`` and ``'-'`` marks an empty cell.

    Only terms that have an entry in ``go_to_ancestor`` / ``hpo_to_ancestor``
    are used.  Each variant of a row is counted once per (term, ancestor)
    pair of that row.  SNPs and ``g.``/``c.`` alterations are counted under
    the variant type ``'SNP'``; other alterations must look like
    ``'<type>:<sub type>'`` and must not start with ``GO:``, ``HP:`` or
    ``MESH``.

    Args:
        merge_db_file: path of the table to read (UTF-8).
        go_to_ancestor: GO id -> ancestor label.
        hpo_to_ancestor: HPO id -> ancestor label.
        use_ancestor: accepted for signature compatibility with the sibling
            readers; it is not used because ancestors are always applied.
        selected_gene: if non-empty, only rows of this gene are used.

    Returns:
        ``(nodes, links, gene_var_count, var_sub_var_count, sub_var_bp_count,
        ancestor_bp_count)``.  ``nodes``/``links`` cover gene -> variant type
        -> variant -> ancestor.  The ancestor -> term counts are returned
        in ``ancestor_bp_count`` only and are not part of ``links``.  The four
        count objects are ``defaultdict(int)`` keyed by ``(source, target)``.

    Raises:
        ValueError: if a non-blank row does not have exactly 16 columns.
    """
    expected_columns = 16

    def term_ancestor_pairs(cell, term_to_ancestor):
        # The id of a term is the text before the first '-' of the entry.
        pairs = set()
        for term in cell.split('; '):
            if term == '-':
                continue
            ancestor = term_to_ancestor.get(term.split('-')[0])
            if ancestor:
                pairs.add((term, ancestor))
        return pairs

    # gene to var_type
    gene_var_count = defaultdict(int)
    # var_type to sub_var_type
    var_sub_var_count = defaultdict(int)
    # sub_var_type to bp ancestor
    sub_var_bp_count = defaultdict(int)
    # ancestor to bp
    ancestor_bp_count = defaultdict(int)

    # Read the file that was passed in (the global AML_file is not consulted).
    with open(merge_db_file, encoding='utf-8') as f:
        f.readline()  # header
        for line_no, line in enumerate(f, start=2):
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines, e.g. at the end of the file
            fields = stripped.split('\t')
            if len(fields) != expected_columns:
                raise ValueError(
                    f'{merge_db_file}: line {line_no} has {len(fields)} columns, '
                    f'expected {expected_columns}')

            snps, alter, gene = fields[4], fields[5], fields[6]
            go, hpo = fields[11], fields[12]

            if selected_gene and gene != selected_gene:
                continue

            # Re-assign SNPs and alterations: g./c. notations count as SNPs.
            snp_set = {snp for snp in snps.split('; ') if snp != '-'}
            alter_set = set()
            for alt in alter.split('; '):
                if alt == '-':
                    continue
                if alt.startswith(('g.', 'c.')):
                    snp_set.add(alt)
                else:
                    alter_set.add(alt)

            # (term, ancestor) tuples avoid the fragile 'term::ancestor'
            # string round trip; sorting makes the output order reproducible.
            bp_pairs = sorted(term_ancestor_pairs(go, go_to_ancestor)
                              | term_ancestor_pairs(hpo, hpo_to_ancestor))

            for snp in sorted(snp_set):
                snp = snp.strip()
                for bp, bp_ancestor in bp_pairs:
                    gene_var_count[(gene, 'SNP')] += 1
                    var_sub_var_count[('SNP', snp)] += 1
                    sub_var_bp_count[(snp, bp_ancestor)] += 1
                    ancestor_bp_count[(bp_ancestor, bp)] += 1

            for var in sorted(alter_set):
                var = var.strip()
                parts = var.split(':')
                # Malformed entries (e.g. a bare 'c.175delGC' style value).
                if len(parts) != 2:
                    continue
                # Ontology ids that ended up in the alteration column.
                if var.startswith(('GO:', 'HP:', 'MESH')):
                    continue
                var_type, sub_var_type = parts

                for bp, bp_ancestor in bp_pairs:
                    gene_var_count[(gene, var_type)] += 1
                    var_sub_var_count[(var_type, sub_var_type)] += 1
                    sub_var_bp_count[(sub_var_type, bp_ancestor)] += 1
                    ancestor_bp_count[(bp_ancestor, bp)] += 1

    nodes = []
    links = []
    seen_names = set()
    # ancestor_bp_count is deliberately left out of the Sankey links.
    for layer in (gene_var_count, var_sub_var_count, sub_var_bp_count):
        for (source, target), value in layer.items():
            for name in (source, target):
                if name not in seen_names:
                    seen_names.add(name)
                    nodes.append({'name': name})
            # {"source": "Total", "target": "Environment", "value": 0.342284047256003}
            links.append({
                "source": source,
                "target": target,
                "value": value,
            })
    return nodes, links, \
            gene_var_count, var_sub_var_count, \
            sub_var_bp_count, ancestor_bp_count


In [92]:
def read_merge_db_file_two_layer(merge_db_file,
                       go_to_ancestor: dict,
                       hpo_to_ancestor: dict,
                       use_ancestor=False,
                       selected_gene='',):
    
    
    node_set = set()

    # gene to var_type
    gene_var_count = defaultdict(int)
    # var_type to sub_var_type
    var_sub_var_count = defaultdict(int)
    # sub_var_type to bp
    sub_var_bp_count = defaultdict(int)
    
       
    with open(AML_file) as f:
        f.readline()
        for line in f:
            l = line.strip().split('\t')

            pmid, sent_id, sentence, rich_sent, snps, alter,\
            gene, events, trigger, include_even, event_genes,\
            go, hpo, mesh, un_norm_bp, basic_score = line.strip().split('\t')
            
            if selected_gene:
                if gene != selected_gene:
                    continue
        

            l_go_set = {go_term for go_term in go.split('; ')}
            l_hpo_set = {hpo_term for hpo_term in hpo.split('; ')}
            l_mesh_set = {mesh_term for mesh_term in mesh.split('; ')}
            
            # re-assignment snp and alteration 
            snp_set = set()
            alter_set = set()
            for snp in snps.split('; '):
                if snp == '-':
                    continue
                snp_set.add(snp)
            
            for alt in alter.split('; '):
                if alt == '-':
                    continue
                    
                if alt.startswith('g.') or alt.startswith('c.'):
                    snp_set.add(alt)
                else:
                    alter_set.add(alt)
            
            # get ancestor for go and hpo
            go_ancester_set = set()
            hpo_ancester_set = set()
            for go_term in go.split('; '):
                if go_term == '-':
                    continue
                go_id = go_term.split('-')[0]
                if go_to_ancestor.get(go_id):
                    go_ancester_set.add(go_to_ancestor[go_id])
            go_ancester = '; '.join(go_ancester_set) if go_ancester_set else '-'    
            
            for hpo_term in hpo.split('; '):
                if hpo_term == '-':
                    continue
                hpo_id = hpo_term.split('-')[0]
                if hpo_to_ancestor.get(hpo_id):
                    hpo_ancester_set.add(hpo_to_ancestor[hpo_id])
            hpo_ancester = '; '.join(hpo_ancester_set) if hpo_ancester_set else '-'
            
            
            #print(go_ancester)
            
            # start to process data for plot
            if use_ancestor:
                plot_go_set = set(go_ancester.split('; '))
                plot_hpo_set = set(hpo_ancester.split('; '))
            else:
                plot_go_set = set(go.split('; '))
                plot_hpo_set = set(hpo.split('; '))
            
            node_set.add(gene)
            
            
            bp_set = plot_go_set | plot_hpo_set


            if '-' in bp_set:
                bp_set.remove('-')
            if '' in bp_set:
                bp_set.remove('')

            node_set.update(bp_set)
            
            #print(snp_set)
            #print(alter_set)
            #print(bp_set)
            

            # fixme: normalized snps
            if snp_set:
                for snp in snp_set:
                    snp = snp.strip()

                    # fixme: some snp format is wrong in db file
                    #if not snp.startswith('rs') or snp == 'None' \
                        #or not snp.startswith('p.') or not snp.startswith('c.'):
                        #continue
                    node_set.add('SNP')
                    node_set.add(snp)

                    # sub_var -- bp
                    for bp in bp_set:
                        gene_var_count[ (gene, 'SNPs') ] += 1
                        #var_sub_var_count[ ('SNPs', snp) ] += 1

                        sub_var_bp_count[('SNPs', bp)] += 1

            if alter_set :
                for var in alter_set:
                    var = var.strip()
                    # fixme: c.175delGC in var col
                    if len(var.split(':')) != 2:
                        continue

                    # fixme: some wrong GO HP Mesh
                    if var.startswith("GO:") or var.startswith("HP:") or var.startswith('MESH'):
                        continue
                    var_type, sub_var_type = var.split(':')

                    node_set.add(var_type)
                    node_set.add(sub_var_type)


                    # sub_var -- bp
                    for bp in bp_set:
                        gene_var_count[ (gene, var_type) ] += 1

                        #var_sub_var_count[ (var_type, sub_var_type) ] += 1

                        sub_var_bp_count[(var_type, bp)] += 1


        # todo: only 5 sub_var_bp to show
        nodes = [{'name': node} for node in node_set]

        links = []
        nodes = []
        # for _key, value in (*gene_var_count.items(), *var_sub_var_count.items(), *sub_var_bp_count.items()):
        # sub_var_bp_count = {(sub_var, bp): num for (sub_var, bp), num in sub_var_bp_count.items() if num > 5}
        for (source, target), value in (*gene_var_count.items(), *sub_var_bp_count.items()):
        # for (source, target), value in (*gene_var_count.items(), *var_sub_var_count.items()):
            # fixme: for test
            if {'name': source} not in nodes:
                nodes.append({'name': source} )
            if {'name': target} not in nodes:
                nodes.append({'name': target} )

            # {"source": "Total", "target": "Environment", "value": 0.342284047256003}
            links.append({
                "source": source,
                "target": target,
                "value": value,
            })
    return nodes, links


In [133]:
def single_type_sankey(type_to_sub_type_count: dict,
                       temp_file: str, save_file: str,
                       skipped_sources=('SNP',)):
    """Render a one-level Sankey (type -> sub type) from a count dictionary.

    Pairs whose source equals their target are dropped, as are pairs whose
    source is listed in ``skipped_sources``.  The remaining pairs become
    links; nodes are collected in first-seen order.  In the template, the
    placeholder ``data_str`` is replaced by ``str(nodes)`` and ``link_str``
    by ``str(links)``.

    The template is read completely before the output file is opened, so
    ``save_file`` may be the same path as ``temp_file`` and a failed read
    never leaves a truncated output file behind.

    Args:
        type_to_sub_type_count: mapping ``(type, sub_type) -> count``.
        temp_file: path of the HTML template (read as UTF-8).
        save_file: path of the HTML file to write (UTF-8).
        skipped_sources: source names to leave out of the figure; defaults to
            ``('SNP',)``.
    """
    nodes = []
    links = []
    saved_node_set = set()

    for (node, sub_node), count in type_to_sub_type_count.items():
        if node == sub_node or node in skipped_sources:
            continue

        for name in (node, sub_node):
            if name not in saved_node_set:
                saved_node_set.add(name)
                nodes.append({'name': name})

        links.append({
            "source": node,
            "target": sub_node,
            "value": count,
        })

    with open(temp_file, encoding='utf-8') as f:
        doc = f.read()

    doc = doc.replace('data_str', str(nodes))
    doc = doc.replace('link_str', str(links))

    with open(save_file, 'w', encoding='utf-8') as wf:
        wf.write(doc)
    print(f'{save_file} saved.')
        

In [134]:
# Color record, 2023-07-17
# Gene #ffa500
# Alteration #a841da
# BP #89ce4f


In [1]:
# Build the Sankey data for one gene and save the figure by filling in the
# placeholders of the HTML template.
# This cell needs fig_save_path, AML_file, template_html, go_to_ancestor and
# hp_to_ancestor from the cells above; the NameError recorded below this cell
# came from running it in a kernel where fig_save_path was not yet defined.

# NOTE(review): 'uas_ancestor' looks like a typo for 'use_ancestor'; the name
# is kept because a later cell references it as well.
uas_ancestor = True
selected_gene = 'ERBB2'

html_save_file = f'{fig_save_path}/ERBB2.sankey-two-layer.html'
#html_save_file = f'{fig_save_path}/ERBB2.sankey-ancestor-bp.html'

# Alternative readers are kept commented out; only the two-layer one is active.
#nodes, links = read_merge_db_file(AML_file, go_to_ancestor, hp_to_ancestor, uas_ancestor, selected_gene)
nodes, links = read_merge_db_file_two_layer(AML_file, go_to_ancestor, hp_to_ancestor, uas_ancestor, selected_gene)
#nodes, links = read_merge_db_file_add_bp_layer(AML_file, go_to_ancestor, hp_to_ancestor, uas_ancestor, selected_gene)

replace_template(template_html, nodes, links, html_save_file)

NameError: name 'fig_save_path' is not defined

In [136]:
# Drill into the single-gene case:
# collect the detailed counts for each alteration type and each GO/HPO ancestor.
# Uses uas_ancestor and selected_gene set in the previous driver cell.
nodes, links, \
gene_var_count, var_sub_var_count, \
sub_var_bp_count, ancestor_bp_count = read_merge_db_file_add_bp_layer(AML_file, go_to_ancestor, hp_to_ancestor, uas_ancestor, selected_gene)


In [137]:
# Draw the stand-alone Sankey diagram for the variants (type -> sub type).
# fig_save_path is assigned again here with the same value as in the
# configuration cell.
fig_save_path = f'/mnt/disk1/xzyao/CancerPNRLE/ScientificData投稿富集分析结果/ScientificData_论文绘图/单基因桑吉图绘制'

var_temp_file = f'{fig_save_path}/Sankey-template_var.html'
var_sankey_save_file = f'{fig_save_path}/var-sankey.html'


# Three-argument call: this relies on the Sankey version of single_type_sankey
# defined above, not on the pie-chart version defined further down.
single_type_sankey(var_sub_var_count, var_temp_file, var_sankey_save_file)


/mnt/disk1/xzyao/CancerPNRLE/ScientificData投稿富集分析结果/ScientificData_论文绘图/单基因桑吉图绘制/var-sankey.html saved.


In [138]:
# Draw the stand-alone Sankey diagram for the BP terms (ancestor -> term).
# fig_save_path is assigned again here with the same value as in the
# configuration cell.
fig_save_path = f'/mnt/disk1/xzyao/CancerPNRLE/ScientificData投稿富集分析结果/ScientificData_论文绘图/单基因桑吉图绘制'

bp_temp_file = f'{fig_save_path}/Sankey-template_BP.html'
bp_sankey_save_file = f'{fig_save_path}/BP-sankey.html'


# Three-argument call: this relies on the Sankey version of single_type_sankey
# defined above, not on the pie-chart version defined further down.
single_type_sankey(ancestor_bp_count, bp_temp_file, bp_sankey_save_file)


/mnt/disk1/xzyao/CancerPNRLE/ScientificData投稿富集分析结果/ScientificData_论文绘图/单基因桑吉图绘制/BP-sankey.html saved.


In [216]:

def single_type_sankey(type_to_sub_type_count: dict,
                       temp_file: str, save_file: str,
                       one_level_color: str, second_level_color: str,
                       display_sub_node_num: int = 3):
    """Render a two-ring pie (parent -> children) from a count dictionary.

    Pairs whose parent equals their child are ignored.  For every parent, the
    children are ranked by count (largest first, ties keep input order) and
    the first ``display_sub_node_num`` are shown individually.  The counts of
    all remaining children are summed into one extra child named ``'...'``,
    which is only added when that sum is positive.  A parent's value is the
    sum of all of its children's counts.

    In the template, the placeholder ``data_str`` is replaced by
    ``str(pie_data)``.  The template is read completely before the output file
    is opened, so ``save_file`` may be the same path as ``temp_file``.

    Args:
        type_to_sub_type_count: mapping ``(parent, child) -> count``.
        temp_file: path of the HTML template (read as UTF-8).
        save_file: path of the HTML file to write (UTF-8).
        one_level_color: colour used for parent entries.
        second_level_color: colour used for child entries.
        display_sub_node_num: number of children listed per parent; values
            below zero are treated as zero.
    """
    # Group the counts per parent in a single pass, keeping first-seen order.
    children_by_node = {}
    for (node, sub_node), count in type_to_sub_type_count.items():
        if node == sub_node:
            continue
        sub_counts = children_by_node.setdefault(node, {})
        sub_counts[sub_node] = sub_counts.get(sub_node, 0) + count

    shown_limit = max(display_sub_node_num, 0)

    # generate pie data in echarts
    pie_data = []
    for node, sub_node_count in children_by_node.items():
        ranked_sub_nodes = sorted(sub_node_count, key=sub_node_count.get, reverse=True)

        children_list = [
            {
                "name": sub_node,
                "value": sub_node_count[sub_node],
                "itemStyle": {
                    "opacity": 0.8,
                    "color": second_level_color
                }
            }
            for sub_node in ranked_sub_nodes[:shown_limit]
        ]

        total_sub_node = sum(sub_node_count.values())
        display_count = sum(child["value"] for child in children_list)
        other_count = total_sub_node - display_count

        if other_count > 0:
            children_list.append({
                "name": '...',
                "value": other_count,
                "itemStyle": {
                    "opacity": 0.8,
                    "color": second_level_color,
                },
            })

        pie_data.append({
            "name": node,
            "itemStyle": {
                "opacity": 0.8,
                "color": one_level_color,
            },
            "value": display_count + other_count,
            "children": children_list
        })

    # replace the echarts pie template
    with open(temp_file, encoding='utf-8') as f:
        doc = f.read()

    doc = doc.replace('data_str', str(pie_data))

    with open(save_file, 'w', encoding='utf-8') as wf:
        wf.write(doc)
    print(f'{save_file} saved.')



In [217]:
# Draw the stand-alone pie chart for the variants.
# fig_save_path is assigned again here with the same value as in the
# configuration cell.
fig_save_path = f'/mnt/disk1/xzyao/CancerPNRLE/ScientificData投稿富集分析结果/ScientificData_论文绘图/单基因桑吉图绘制'

var_temp_file = f'{fig_save_path}/Pie-template.html'
var_sankey_save_file = f'{fig_save_path}/Alt-Pie.html'

# Number of sub types listed per variant type; the rest is pooled into '...'.
display_num = 10

# Six-argument call: uses the pie-chart version of single_type_sankey.
# The two colours are the parent and child colours.
single_type_sankey(var_sub_var_count, var_temp_file, var_sankey_save_file,
                  '#a841da', '#b965e1', display_num)


/mnt/disk1/xzyao/CancerPNRLE/ScientificData投稿富集分析结果/ScientificData_论文绘图/单基因桑吉图绘制/Alt-Pie.html saved.


In [223]:
# Draw the stand-alone pie chart for the BP terms.
# fig_save_path is assigned again here with the same value as in the
# configuration cell.
fig_save_path = f'/mnt/disk1/xzyao/CancerPNRLE/ScientificData投稿富集分析结果/ScientificData_论文绘图/单基因桑吉图绘制'

bp_temp_file = f'{fig_save_path}/Pie-template.html'
bp_sankey_save_file = f'{fig_save_path}/BP-Pie.html'

# Number of terms listed per ancestor; the rest is pooled into '...'.
display_num = 3

# Replace every '-' in the labels with a space (this affects all hyphens in a
# label, not only the one between the id and the name).
new_ancestor_bp_count = {(source.replace('-', ' '), target.replace('-', ' ')): count for (source,target), count in ancestor_bp_count.items()}


# Six-argument call: uses the pie-chart version of single_type_sankey.
# The two colours are the parent and child colours.
single_type_sankey(new_ancestor_bp_count, bp_temp_file, bp_sankey_save_file,
                  '#89ce4f', '#a9db7f', display_num)



/mnt/disk1/xzyao/CancerPNRLE/ScientificData投稿富集分析结果/ScientificData_论文绘图/单基因桑吉图绘制/BP-Pie.html saved.


In [224]:
# Inspect the (ancestor, term) pairs that go into the BP pie chart.
new_ancestor_bp_count.keys()

dict_keys([('HP:0001871 Abnormality of blood and blood forming tissues', 'HP:0004808 Acute myeloid leukemia'), ('HP:0001871 Abnormality of blood and blood forming tissues', 'HP:0002863 Myelodysplasia'), ('GO:0065007 biological regulation', 'GO:0023052 signaling'), ('GO:0009987 cellular process', 'GO:0006915 apoptotic process'), ('GO:0009987 cellular process', 'GO:0030154 cell differentiation'), ('HP:0002664 Neoplasm', 'HP:0003002 Breast carcinoma'), ('HP:0002664 Neoplasm', 'HP:0002664 Neoplasm'), ('GO:0009987 cellular process', 'GO:0038084 vascular endothelial growth factor signaling pathway'), ('GO:0008152 metabolic process', 'GO:0009056 catabolic process'), ('GO:0003824 catalytic activity', 'GO:0016791 phosphatase activity'), ('HP:0001871 Abnormality of blood and blood forming tissues', 'HP:0001909 Leukemia'), ('HP:0002086 Abnormality of the respiratory system', 'HP:0030358 Non small cell lung carcinoma'), ('GO:0003824 catalytic activity', 'GO:0005006 epidermal growth factor receptor

In [225]:
# List every term (with its count) recorded under one specific HPO ancestor.
for (source, target), value in new_ancestor_bp_count.items():
    if source == 'HP:0001871 Abnormality of blood and blood forming tissues':
        print(f'{target} {value}')

HP:0004808 Acute myeloid leukemia 10
HP:0002863 Myelodysplasia 4
HP:0001909 Leukemia 2
HP:0002665 Lymphoma 3
HP:0005523 Lymphoproliferative disorder 3
HP:0005506 Chronic myelogenous leukemia 4
HP:0006775 Multiple myeloma 1
HP:0002488 Acute leukemia 2
HP:0005547 Myeloproliferative disorder 2


In [166]:
# Inspect the raw (variant type, sub type) -> count mapping.
var_sub_var_count

defaultdict(int,
            {('copy number variation', 'deletion'): 24,
             ('expression changes', 'overexpression'): 62,
             ('expression changes', 'underexpression'): 101,
             ('copy number variation', 'amplification'): 62,
             ('point mutations', 'Mutations'): 258,
             ('expression changes', 'inactivation'): 12,
             ('copy number variation', 'fusion'): 19,
             ('expression changes', 'expression'): 26,
             ('epigenic marks', 'methylation'): 6,
             ('epigenic marks', 'phosphorylation'): 9,
             ('copy number variation', 'knockout'): 30,
             ('SNP', 'rs113488022'): 9,
             ('epigenic marks', 'epigenetics'): 13,
             ('expression changes', 'dysregulation'): 20,
             ('point mutations', 'insertion'): 7,
             ('SNP', 'p.A330L'): 1,
             ('SNP', 'p.I332E'): 1,
             ('point mutations', 'substitution'): 1,
             ('SNP', 'rs1468390238'): 1,


In [162]:
# Print the variant counts ordered by (variant type, sub type).
sort_var = sorted(var_sub_var_count.keys())

#print(sort_var)

for var in sort_var:
    print(f'{var}: {var_sub_var_count[var]}')


('SNP', 'c.1GATA>GATA'): 6
('SNP', 'c.2A>A'): 1
('SNP', 'c.6A>C'): 2
('SNP', 'p.A330L'): 1
('SNP', 'p.D9T'): 1
('SNP', 'p.E189R'): 1
('SNP', 'p.E234F'): 1
('SNP', 'p.E333A'): 1
('SNP', 'p.F243L'): 1
('SNP', 'p.F293I'): 6
('SNP', 'p.G4S'): 2
('SNP', 'p.I332E'): 1
('SNP', 'p.I767M'): 1
('SNP', 'p.K122N'): 6
('SNP', 'p.K27M'): 2
('SNP', 'p.K334A'): 1
('SNP', 'p.L234F'): 1
('SNP', 'p.L235V'): 2
('SNP', 'p.L792H'): 1
('SNP', 'p.P13K'): 1
('SNP', 'p.P396L'): 1
('SNP', 'p.R292P'): 1
('SNP', 'p.S239C'): 2
('SNP', 'p.S298A'): 1
('SNP', 'p.S37F'): 6
('SNP', 'p.S442C'): 1
('SNP', 'p.S4C'): 1
('SNP', 'p.S4E'): 1
('SNP', 'p.S6K'): 2
('SNP', 'p.S727A'): 2
('SNP', 'p.S727S'): 2
('SNP', 'p.T145D'): 1
('SNP', 'p.Y187L'): 1
('SNP', 'p.Y300L'): 1
('SNP', 'rs1057519826'): 1
('SNP', 'rs1057519860'): 1
('SNP', 'rs1057519861'): 2
('SNP', 'rs1057519919'): 2
('SNP', 'rs113488022'): 9
('SNP', 'rs11554273'): 2
('SNP', 'rs1178938625'): 2
('SNP', 'rs121434568'): 4
('SNP', 'rs121434569'): 7
('SNP', 'rs121434592'): 

In [123]:
# Print the term counts ordered by (ancestor, term).
sort_ancestor = sorted(ancestor_bp_count.keys())

#print(sort_var)

for ancestor in sort_ancestor:
    print(f'{ancestor}: {ancestor_bp_count[ancestor]}')

('GO:0002376-immune system process', 'GO:0002524-hypersensitivity'): 1
('GO:0002376-immune system process', 'GO:0006955-immune response'): 1
('GO:0002376-immune system process', 'GO:0042110-T cell activation'): 1
('GO:0002376-immune system process', 'GO:0042119-neutrophil activation'): 1
('GO:0003824-catalytic activity', 'GO:0003824-catalytic activity'): 1
('GO:0003824-catalytic activity', 'GO:0003904-deoxyribodipyrimidine photo-lyase activity'): 1
('GO:0003824-catalytic activity', 'GO:0004697-protein kinase C activity'): 1
('GO:0003824-catalytic activity', 'GO:0004707-MAP kinase activity'): 1
('GO:0003824-catalytic activity', 'GO:0005006-epidermal growth factor receptor activity'): 25
('GO:0003824-catalytic activity', 'GO:0005020-stem cell factor receptor activity'): 1
('GO:0003824-catalytic activity', 'GO:0008168-methyltransferase activity'): 1
('GO:0003824-catalytic activity', 'GO:0008177-succinate dehydrogenase (ubiquinone) activity'): 1
('GO:0003824-catalytic activity', 'GO:001630