# What is the distribution of alias alias collisions in Ensembl, HGNC, and NCBI?

## <a id='toc6_'></a>[How many unique primary gene symbols are there?](#toc0_)

### <a id='toc6_1_'></a>[Per Source](#toc0_)

In [139]:
import pandas as pd
import numpy as np
import plotly.express as px

In [140]:
def create_aa_collision_histogram(aa_collision_gene_xxxx_df: pd.DataFrame, source: str, xxxx_aa_collision_count: int):
    """Create a histogram of the frequencies at which aliases are shared

    :param aa_collision_gene_xxxx_df: A df of alias-alias collisions organized by primary gene symbol
    :param source: Representation of the source of the gene records
    :param xxxx_alias_count: Number of aliases total in the source
    :return: A histogram of the percentage of aliases that are shared between 2 genes, 3 genes, and so on
    """
    
    #Count the number of times each shared alias is used
    aa_collision_xxxx_count_df = aa_collision_gene_xxxx_df.pivot_table(
    index=["collision"], aggfunc="size"
    )
    aa_collision_xxxx_count_df = aa_collision_xxxx_count_df.reset_index()
    aa_collision_xxxx_count_df.rename(columns={0: "num_gene_records"}, inplace=True)
    aa_collision_xxxx_count_df = aa_collision_xxxx_count_df.sort_values(
        "num_gene_records", ascending=False)

    #Convert to csv
    aa_collision_xxxx_count_df.to_csv(f'../output/aa_collision_{source}_count_df.csv', index=True)

    #Count the frequency at which aliases are shared 
    aa_collision_xxxx_distribution_df = aa_collision_xxxx_count_df.pivot_table(
    index=["num_gene_records"], aggfunc="size"
    )
    aa_collision_xxxx_distribution_df = aa_collision_xxxx_distribution_df.reset_index()
    aa_collision_xxxx_distribution_df.rename(columns={0: "num_collision_symbol"}, inplace=True)
    aa_collision_xxxx_distribution_df["percent_collision_symbol"] = (
        aa_collision_xxxx_distribution_df["num_collision_symbol"] / xxxx_aa_collision_count
    ) * 100

    #Convert to csv
    aa_collision_xxxx_distribution_df.to_csv(f'../output/aa_collision_{source}_distribution_df.csv', index=True)

    #Create histogram df 
    xxxx_alias_count_histogram_df = aa_collision_xxxx_distribution_df.drop(
    "num_collision_symbol", axis=1)

    #Convert to csv
    xxxx_alias_count_histogram_df.to_csv(f'../output/{source}_alias_count_histogram_df.csv', index=True)

    return px.bar(xxxx_alias_count_histogram_df, x="num_gene_records", y="percent_collision_symbol")



In [141]:
mini_ensg_df = pd.read_csv(
    "../output/mini_ensg_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)
mini_ensg_df

Unnamed: 0.1,Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,0,ENSG00000210049,,7481,MTTF,MT-TF
1,1,ENSG00000210049,,7481,TRNF,MT-TF
2,2,ENSG00000211459,,7470,12S,MT-RNR1
3,3,ENSG00000211459,,7470,MOTS-C,MT-RNR1
4,4,ENSG00000211459,,7470,MTRNR1,MT-RNR1
...,...,...,...,...,...,...
95019,133058,ENSG00000197989,85028,30062,LINC00100,SNHG12
95020,133059,ENSG00000197989,85028,30062,PNAS-123,SNHG12
95021,133060,ENSG00000229388,,52502,LINC01715,TAF12-DT
95022,133062,ENSG00000274978,26824,10108,RNU11-1,RNU11


In [142]:
ensg_gene_symbol_set = set(mini_ensg_df["gene_symbol"])
ensg_gene_symbol_count = len(ensg_gene_symbol_set)

In [143]:
mini_hgnc_df = pd.read_csv(
    "../output/mini_hgnc_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)
mini_hgnc_df

Unnamed: 0.1,Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
0,0,5,,1,ENSG00000121410,A1BG
1,1,37133,FLJ23569,503538,ENSG00000268895,A1BG-AS1
2,2,24086,ACF,29974,ENSG00000148584,A1CF
3,3,24086,ASP,29974,ENSG00000148584,A1CF
4,4,24086,ACF64,29974,ENSG00000148584,A1CF
...,...,...,...,...,...,...
67578,67578,29027,KIAA0399,23140,ENSG00000074755,ZZEF1
67579,67579,29027,ZZZ4,23140,ENSG00000074755,ZZEF1
67580,67580,29027,FLJ10821,23140,ENSG00000074755,ZZEF1
67581,67581,24523,DKFZP564I052,26009,ENSG00000036549,ZZZ3


In [144]:
hgnc_gene_symbol_set = set(mini_hgnc_df["gene_symbol"])
hgnc_gene_symbol_count = len(hgnc_gene_symbol_set)

In [145]:
mini_ncbi_df = pd.read_csv(
    "../output/mini_ncbi_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)
mini_ncbi_df

Unnamed: 0.1,Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,0,1,A1BG,A1B,5,ENSG00000121410
1,0,1,A1BG,ABG,5,ENSG00000121410
2,0,1,A1BG,GAB,5,ENSG00000121410
3,0,1,A1BG,HYST2477,5,ENSG00000121410
4,1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...,...
92714,193575,8923215,trnD,,,
92715,193576,8923216,trnP,,,
92716,193577,8923217,trnA,,,
92717,193578,8923218,COX1,,,


In [146]:
ncbi_gene_symbol_set = set(mini_ncbi_df["gene_symbol"])
ncbi_gene_symbol_count = len(ncbi_gene_symbol_set)

In [147]:
unique_primary_symbol_summary_index = "HGNC", "ENSG", "NCBI"
unique_primary_symbol_summary = {
    "Number of Unique Primary Gene Symbols": [
        ensg_gene_symbol_count,
        hgnc_gene_symbol_count,
        ncbi_gene_symbol_count,
    ]
}
unique_primary_symbol_summary_df = pd.DataFrame(
    unique_primary_symbol_summary, index = unique_primary_symbol_summary_index
)
unique_primary_symbol_summary_df

Unnamed: 0,Number of Unique Primary Gene Symbols
HGNC,41164
ENSG,45646
NCBI,45727


### <a id='toc6_2_'></a>[All sources](#toc0_)

#### <a id='toc6_2_1_'></a>[How many primary symbols appear in all sources?](#toc0_)

In [148]:
all_sources_unique_primary_symbol_set = (
    ensg_gene_symbol_set
    & hgnc_gene_symbol_set
    & ncbi_gene_symbol_set
)
all_sources_unique_primary_symbol_count = len(all_sources_unique_primary_symbol_set)
all_sources_unique_primary_symbol_count

40930

Side Quest

### NCBI has almost 4x times the amount of unique primary gene symbols than the other sources. Why? What are they?

In [149]:
only_ncbi_gene_symbol_set = ncbi_gene_symbol_set - all_sources_unique_primary_symbol_set

In [150]:
len(only_ncbi_gene_symbol_set)

4797

In [151]:
only_ncbi_gene_symbol_set

{'RPL37P16',
 'ERVH-2',
 'ANIB6',
 'VN1R94P',
 'SPG41',
 'ACKR5',
 'HDLCQ5',
 'RPS5P6',
 'SNORD136',
 'DYZ1L2',
 'FUSE',
 'RPS10P25',
 'SNORD47',
 'COL1AR',
 'RLS4',
 'DEL17Q11.2',
 'MAFD9',
 'DYT23',
 'DELYQ11',
 'SCGB1B1P',
 'PBCA',
 'RPS20P8',
 'RPLP1P2',
 'ERVH-3',
 'HYPLIP2',
 'TRH-GTG1-4',
 'PIRC91',
 'PARK3',
 'TRR-ACG2-4',
 'KCNIP1-AS2',
 'TCO',
 'VN1R73P',
 'MIR10397',
 'RPS6P5',
 'RNA5-8SP3',
 'VENTXP8',
 'PPR2',
 'TRG-CCC1-1',
 'CDRT10',
 'ND4',
 'CNE6',
 'ARF2P',
 'RPS25P1',
 'MGR10',
 'IGLL2P',
 'KIRREL3-AS4',
 'CLLS5',
 'SALL4P3',
 'LINC02036',
 'DNM1P27',
 'RNU7-5P',
 'TRS-GCT3-1',
 'MSBP2',
 'RPL18AP4',
 'MIR9983',
 'TRL-TAA4-1',
 'LNC-WAL',
 'RPL19P8',
 'ERVE-2',
 'AIS4',
 'HSPD1P20',
 'POFUT3',
 'FRA2K',
 'RNA45SN2',
 'HEAT2',
 'TRNC',
 'DYZ1L24',
 'trnI',
 'TRM-CAT3-1',
 'TRN-GTT16-1',
 'CNE-2',
 'DFNA18',
 'SNORA101B',
 'TRK-TTT3-1',
 'TTTY29P',
 'ERVK-19',
 'TRW-CCA3-1',
 'NUP214-AS1',
 'RPS2P13',
 'LINC02859',
 'TRS-AGA2-5',
 'SLEN2',
 'BCYRN1P2',
 'LINC03055',
 '

#### Most of the symbols that are unique to NCBI (147,913/152,418-97%) begin with "LOC"

In [152]:
filtered_set = {gene for gene in only_ncbi_gene_symbol_set if not gene.startswith('LOC')}
len(filtered_set)

4797

In [153]:
filtered_set

{'RPL37P16',
 'ERVH-2',
 'ANIB6',
 'VN1R94P',
 'SPG41',
 'ACKR5',
 'HDLCQ5',
 'RPS5P6',
 'SNORD136',
 'DYZ1L2',
 'FUSE',
 'RPS10P25',
 'SNORD47',
 'COL1AR',
 'RLS4',
 'DEL17Q11.2',
 'MAFD9',
 'DYT23',
 'DELYQ11',
 'SCGB1B1P',
 'PBCA',
 'RPS20P8',
 'RPLP1P2',
 'ERVH-3',
 'HYPLIP2',
 'TRH-GTG1-4',
 'PIRC91',
 'PARK3',
 'TRR-ACG2-4',
 'KCNIP1-AS2',
 'TCO',
 'VN1R73P',
 'MIR10397',
 'RPS6P5',
 'RNA5-8SP3',
 'VENTXP8',
 'PPR2',
 'TRG-CCC1-1',
 'CDRT10',
 'ND4',
 'CNE6',
 'ARF2P',
 'RPS25P1',
 'MGR10',
 'IGLL2P',
 'KIRREL3-AS4',
 'CLLS5',
 'SALL4P3',
 'LINC02036',
 'DNM1P27',
 'RNU7-5P',
 'TRS-GCT3-1',
 'MSBP2',
 'RPL18AP4',
 'MIR9983',
 'TRL-TAA4-1',
 'LNC-WAL',
 'RPL19P8',
 'ERVE-2',
 'AIS4',
 'HSPD1P20',
 'POFUT3',
 'FRA2K',
 'RNA45SN2',
 'HEAT2',
 'TRNC',
 'DYZ1L24',
 'trnI',
 'TRM-CAT3-1',
 'TRN-GTT16-1',
 'CNE-2',
 'DFNA18',
 'SNORA101B',
 'TRK-TTT3-1',
 'TTTY29P',
 'ERVK-19',
 'TRW-CCA3-1',
 'NUP214-AS1',
 'RPS2P13',
 'LINC02859',
 'TRS-AGA2-5',
 'SLEN2',
 'BCYRN1P2',
 'LINC03055',
 '

### <a id='toc6_2_2_'></a>[How many unique symbols are found between all sources?](#toc0_)

In [154]:
bw_all_sources_unique_primary_symbol_df = pd.concat(
    [
        mini_ensg_df[["alias_symbol", "gene_symbol"]],
        mini_hgnc_df[["alias_symbol", "gene_symbol"]],
        mini_ncbi_df[["alias_symbol", "gene_symbol"]],
    ]
)

In [155]:
bw_all_sources_unique_primary_symbol_set = set(bw_all_sources_unique_primary_symbol_df["gene_symbol"])
bw_all_sources_unique_primary_symbol_count = len(bw_all_sources_unique_primary_symbol_set)
bw_all_sources_unique_primary_symbol_count

47368

# <a id='toc7_'></a>[How many unique aliases are there?](#toc0_)

## <a id='toc7_1_'></a>[Per Source](#toc0_)

subset= excludes genes without aliases, duplicates of primary-alias pairs, instances where the primary gene symbol is an alias for that gene record. created in alias-primary_collision_analysis notebook

In [156]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,7481.0,MTTF,MT-TF
1,ENSG00000210049,,7481.0,TRNF,MT-TF
2,ENSG00000211459,,7470.0,12S,MT-RNR1
3,ENSG00000211459,,7470.0,MOTS-C,MT-RNR1
4,ENSG00000211459,,7470.0,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,85028.0,30062.0,LINC00100,SNHG12
133059,ENSG00000197989,85028.0,30062.0,PNAS-123,SNHG12
133060,ENSG00000229388,,52502.0,LINC01715,TAF12-DT
133062,ENSG00000274978,26824.0,10108.0,RNU11-1,RNU11


In [157]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])
subset_genes_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
1,37133,FLJ23569,503538.0,ENSG00000268895,A1BG-AS1
2,24086,ACF,29974.0,ENSG00000148584,A1CF
3,24086,ASP,29974.0,ENSG00000148584,A1CF
4,24086,ACF64,29974.0,ENSG00000148584,A1CF
5,24086,ACF65,29974.0,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,29027,KIAA0399,23140.0,ENSG00000074755,ZZEF1
67579,29027,ZZZ4,23140.0,ENSG00000074755,ZZEF1
67580,29027,FLJ10821,23140.0,ENSG00000074755,ZZEF1
67581,24523,DKFZP564I052,26009.0,ENSG00000036549,ZZZ3


In [158]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])
subset_genes_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5.0,ENSG00000121410
0,1,A1BG,ABG,5.0,ENSG00000121410
0,1,A1BG,GAB,5.0,ENSG00000121410
0,1,A1BG,HYST2477,5.0,ENSG00000121410
1,2,A2M,A2MD,7.0,ENSG00000175899
...,...,...,...,...,...
193317,139281660,IFT70A-AS1,AGPS,58181.0,
193317,139281660,IFT70A-AS1,PDE11A,58181.0,
193324,139281667,BCAT1-DT,LNC-BCAT1,54396.0,
193398,139440214,LNCOB1,LNC-OB1,56209.0,


In [159]:
ensg_alias_symbol_set, ensg_alias_count = set(subset_genes_ensg_df["alias_symbol"]), len(set(subset_genes_ensg_df["alias_symbol"]))
hgnc_alias_symbol_set, hgnc_alias_count = set(subset_genes_hgnc_df["alias_symbol"]), len(set(subset_genes_hgnc_df["alias_symbol"]))
ncbi_alias_symbol_set, ncbi_alias_count = set(subset_genes_ncbi_df["alias_symbol"]), len(set(subset_genes_ncbi_df["alias_symbol"]))

In [160]:
unique_alias_summary_index = "HGNC", "ENSG", "NCBI"
unique_alias_summary = {
    "Number of Unique Aliases": [
        ensg_alias_count,
        hgnc_alias_count,
        ncbi_alias_count,
    ]
}
unique_alias_summary_df = pd.DataFrame(
    unique_alias_summary, index = unique_alias_summary_index
)
unique_alias_summary_df

Unnamed: 0,Number of Unique Aliases
HGNC,55361
ENSG,42918
NCBI,69178


## <a id='toc7_2_'></a>[All sources](#toc0_)

### <a id='toc7_2_1_'></a>[How many aliases appear in all sources?](#toc0_)

In [161]:
all_sources_unique_alias_set = (
    ensg_alias_symbol_set
    & hgnc_alias_symbol_set
    & ncbi_alias_symbol_set
)
all_sources_unique_alias_count = len(all_sources_unique_alias_set)
all_sources_unique_alias_count

30077

### <a id='toc7_2_2_'></a>[How many unique aliases are found between all sources?](#toc0_)

In [162]:
bw_all_sources_unique_alias_df = pd.concat(
    [
        subset_genes_ensg_df[["alias_symbol", "gene_symbol"]],
        subset_genes_hgnc_df[["alias_symbol", "gene_symbol"]],
        subset_genes_ncbi_df[["alias_symbol", "gene_symbol"]],
    ]
)

In [163]:
bw_all_sources_unique_alias_set = set(bw_all_sources_unique_alias_df["alias_symbol"])
bw_all_sources_unique_alias_count = len(bw_all_sources_unique_alias_set)
bw_all_sources_unique_alias_count

81578

# <a id='toc8_'></a>[How many gene records have an alias that is shared?](#toc0_)

## <a id='toc8_1_'></a>[Per Source](#toc0_)

In [164]:
aa_collision_gene_ensg_df = pd.read_csv(
    "../output/aa_collision_gene_ensg_df.csv", index_col=[0])
aa_collision_gene_ensg_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,SLC25A5,"2F1,ANT2,T2,T3",ENSG00000005022,10991.0,292.0,2F1,ENSG
1,KLRG1,"2F1,CLEC15A,MAFA,MAFA-L",ENSG00000139187,6380.0,10219.0,2F1,ENSG
2,S100A8,"60B8AG,CAGA,CFAG,CGLA,MRP-8,MRP8,P8,S100-A8",ENSG00000143546,10498.0,6279.0,60B8AG,ENSG
3,S100A9,"60B8AG,CAGB,CFAG,CGLB,LIAG,MAC387,MIF,MRP-14,M...",ENSG00000163220,10499.0,6280.0,60B8AG,ENSG
4,RNU6V,"87U6,LH87",ENSG00000206832,10230.0,,87U6,ENSG
...,...,...,...,...,...,...,...
3672,SLC30A8,"ZNT-8,ZNT8",ENSG00000164756,20303.0,169026.0,ZNT8,ENSG
3673,SLC30A1,"ZNT1,ZRC1",ENSG00000170385,11012.0,7779.0,ZRC1,ENSG
3674,SLC30A10,"DKFZP547M236,ZNT-10,ZNT10,ZNT8,ZRC1",ENSG00000196660,25355.0,55532.0,ZRC1,ENSG
3675,ZYG11B,"FLJ13456,ZYG11",ENSG00000162378,25820.0,79699.0,ZYG11,ENSG


In [165]:
ensg_aa_collision_primary_symbol_set = set(aa_collision_gene_ensg_df["gene_symbol"])
ensg_aa_collision_primary_symbol_count = len(ensg_aa_collision_primary_symbol_set)

In [166]:
aa_collision_gene_hgnc_df = pd.read_csv(
    "../output/aa_collision_gene_hgnc_df.csv", index_col=[0])
aa_collision_gene_hgnc_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,KLRG1,"MAFA,2F1,MAFA-L,CLEC15A",ENSG00000139187,6380,10219.0,2F1,HGNC
1,SLC25A5,"T2,2F1,T3",ENSG00000005022,10991,292.0,2F1,HGNC
2,S100A8,"P8,MRP8,MRP-8,60B8AG,CGLA,S100-A8",ENSG00000143546,10498,6279.0,60B8AG,HGNC
3,S100A9,"P14,MIF,NIF,LIAG,MRP14,MAC387,60B8AG,CGLB,MRP-...",ENSG00000163220,10499,6280.0,60B8AG,HGNC
4,RNU6V,"87U6,LH87",ENSG00000206832,10230,6071.0,87U6,HGNC
...,...,...,...,...,...,...,...
2869,ZNF22,"KOX15,HKR-T1,ZNF422,Zfp422",ENSG00000165512,13012,7570.0,ZNF422,HGNC
2870,SLC30A10,"DKFZp547M236,ZnT-10,ZRC1,ZNT8,ZNT10",ENSG00000196660,25355,55532.0,ZNT8,HGNC
2871,SLC30A8,"ZnT-8,ZNT8",ENSG00000164756,20303,169026.0,ZNT8,HGNC
2872,SLC30A1,ZRC1,ENSG00000170385,11012,7779.0,ZRC1,HGNC


In [167]:
hgnc_aa_collision_primary_symbol_set = set(aa_collision_gene_hgnc_df["gene_symbol"])
hgnc_aa_collision_primary_symbol_count = len(hgnc_aa_collision_primary_symbol_set)

In [168]:
aa_collision_gene_ncbi_df = pd.read_csv(
    "../output/aa_collision_gene_ncbi_df.csv", index_col=[0])
aa_collision_gene_ncbi_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,BMPR1A,"10q23del,ACVRLK3,ALK-3,ALK3,BMPR-1A,CD292,SKR5",ENSG00000107779,1076.0,657,10Q23DEL,NCBI
1,PTEN,"10q23del,BZS,CWS1,DEC,GLM2,MHAM,MMAC1,PTEN1,PT...",ENSG00000171862,9588.0,5728,10Q23DEL,NCBI
2,ALOX12,"12-LOX,12S-LOX,LOG12",ENSG00000108839,429.0,239,12-LOX,NCBI
3,ALOX15,"12-LOX,15-LOX,15-LOX-1,LOG15",ENSG00000161905,433.0,246,12-LOX,NCBI
4,AKR1C1,"2-ALPHA-HSD,20-ALPHA-HSD,C9,DD1,DD1/DD2,DDH,DD...",ENSG00000187134,384.0,1645,20-ALPHA-HSD,NCBI
...,...,...,...,...,...,...,...
8916,SLC30A10,"HMDPC,HMNDYT1,ZNT10,ZNT8,ZRC1,ZnT-10",ENSG00000196660,25355.0,55532,ZRC1,NCBI
8917,PEX13,"NALD,PBD11A,PBD11B,ZWS",ENSG00000162928,8855.0,5194,ZWS,NCBI
8918,PEX1,"HMLR1,PBD1A,PBD1B,ZWS,ZWS1",ENSG00000127980,8850.0,5189,ZWS,NCBI
8919,ZYG11B,ZYG11,ENSG00000162378,25820.0,79699,ZYG11,NCBI


In [169]:
ncbi_aa_collision_primary_symbol_set = set(aa_collision_gene_ncbi_df["gene_symbol"])
ncbi_aa_collision_primary_symbol_count = len(ncbi_aa_collision_primary_symbol_set)

In [170]:
aa_collision_primary_symbol_summary_index = "HGNC", "ENSG", "NCBI"
aa_collision_primary_symbol_summary= {
    "Number of Gene Records With a Shared Alias": [
        ensg_aa_collision_primary_symbol_count,
        hgnc_aa_collision_primary_symbol_count,
        ncbi_aa_collision_primary_symbol_count,
    ]
}
aa_collision_primary_symbol_summary_df = pd.DataFrame(
    aa_collision_primary_symbol_summary, index = aa_collision_primary_symbol_summary_index
)
aa_collision_primary_symbol_summary_df

Unnamed: 0,Number of Gene Records With a Shared Alias
HGNC,3121
ENSG,2530
NCBI,6053


These values differ from the total number of rows from the aa_collision_gene_dfs because each row indicates a collision. There are some gene records that are involved in multiple collisions. For example: gene A has the aliases A1, 1A, and AA. If all three aliases are used by other genes, then gene A would take up 3 rows in the the aa_collision_gene_df while only being one gene record. In the case of alias-alais collisions in NCBI, the aa_collision_gene_df has 8868 rows. There are a total of 4540 rows that are duplicates. Of those 4540 duplicates, 1685 are unique. 8868 - (4540-1685) = 6013

## <a id='toc8_2_'></a>[All Sources](#toc0_)

### <a id='toc8_2_1_'></a>[How many gene records have at least one shared alias in all sources?](#toc0_)

(intersection of gene records with a collision in all three sources)

In [171]:
all_sources_aa_collision_genes = (
    ensg_aa_collision_primary_symbol_set
    & hgnc_aa_collision_primary_symbol_set
    & ncbi_aa_collision_primary_symbol_set
)
len(all_sources_aa_collision_genes)

2320

### <a id='toc8_2_2_'></a>[How many unique gene records that have at least one shared alias are found between all sources?](#toc0_)

(union of gene records with a collision in all three sources)

In [172]:
bw_all_sources_aa_collision_df = pd.concat(
    [
        aa_collision_gene_ensg_df[["collision", "gene_symbol"]],
        aa_collision_gene_hgnc_df[["collision", "gene_symbol"]],
        aa_collision_gene_ncbi_df[["collision", "gene_symbol"]],
    ]
)

In [173]:
bw_all_sources_aa_collision_genes_set = set(bw_all_sources_aa_collision_df["gene_symbol"])
bw_all_sources_aa_collision_genes_count = len(bw_all_sources_aa_collision_genes_set)
bw_all_sources_aa_collision_genes_count

6241

# <a id='toc9_'></a>[How many alias symbols are being shared?](#toc0_)

## <a id='toc9_1_'></a>[Per Source](#toc0_)

In [174]:
ensg_aa_collision_set = set(aa_collision_gene_ensg_df["collision"])
ensg_aa_collision_count = len(ensg_aa_collision_set)

In [175]:
hgnc_aa_collision_set = set(aa_collision_gene_hgnc_df["collision"])
hgnc_aa_collision_count = len(hgnc_aa_collision_set)

In [176]:
ncbi_aa_collision_set = set(aa_collision_gene_ncbi_df["collision"])
ncbi_aa_collision_count = len(ncbi_aa_collision_set)

In [177]:
aa_collision_alias_symbol_summary_index = "HGNC", "ENSG", "NCBI"
aa_collision_alias_symbol_summary = {
    "Number of Shared Aliases": [
        ensg_aa_collision_count,
        hgnc_aa_collision_count,
        ncbi_aa_collision_count,
    ]
}
aa_collision_alias_symbol_summary_df = pd.DataFrame(
    aa_collision_alias_symbol_summary, index = aa_collision_alias_symbol_summary_index
)
aa_collision_alias_symbol_summary_df

Unnamed: 0,Number of Shared Aliases
HGNC,1615
ENSG,1250
NCBI,3698


## <a id='toc9_2_'></a>[All Sources](#toc0_)

### <a id='toc9_2_1_'></a>[How many aliases are shared in all sources?](#toc0_)

In [178]:
all_sources_aa_collision_aliases = (
    ensg_aa_collision_set
    & hgnc_aa_collision_set
    & ncbi_aa_collision_set
)
len(all_sources_aa_collision_aliases)

1135

### <a id='toc9_2_2_'></a>[How many unique shared aliases are found between all sources?](#toc0_)

In [179]:
bw_all_sources_aa_collision_aliases_set = set(bw_all_sources_aa_collision_df["collision"])
bw_all_sources_aa_collision_aliases_count = len(bw_all_sources_aa_collision_aliases_set)
bw_all_sources_aa_collision_aliases_count

3809

# <a id='toc10_'></a>[How common are the collision symbols?](#toc0_)

## <a id='toc10_1_'></a>[Per Source](#toc0_)

In [180]:
ensg_alias_symbol_set = set(subset_genes_ensg_df["alias_symbol"])
ensg_alias_count = len(ensg_alias_symbol_set)

In [181]:
create_aa_collision_histogram(aa_collision_gene_ensg_df, "ENSG", ensg_aa_collision_count)

In [182]:
aa_collision_ensg_count_df = pd.read_csv(
    "../output/aa_collision_ensg_count_df.csv", index_col=[0])

In [183]:
aa_collision_ensg_distribution_df = pd.read_csv(
    "../output/aa_collision_ensg_distribution_df.csv", index_col=[0])

In [184]:
ensg_alias_count_histogram_df = pd.read_csv(
    "../output/ensg_alias_count_histogram_df.csv", index_col=[0])

In [185]:
hgnc_alias_symbol_set = set(subset_genes_hgnc_df["alias_symbol"])
hgnc_alias_count = len(hgnc_alias_symbol_set)

In [186]:
create_aa_collision_histogram(aa_collision_gene_hgnc_df, "HGNC", hgnc_aa_collision_count)

In [187]:
aa_collision_hgnc_count_df = pd.read_csv(
    "../output/aa_collision_hgnc_count_df.csv", index_col=[0])

In [188]:
aa_collision_hgnc_distribution_df = pd.read_csv(
    "../output/aa_collision_hgnc_distribution_df.csv", index_col=[0])

In [189]:
hgnc_alias_count_histogram_df = pd.read_csv(
    "../output/hgnc_alias_count_histogram_df.csv", index_col=[0])

In [190]:
ncbi_alias_symbol_set = set(subset_genes_ncbi_df["alias_symbol"])
ncbi_alias_count = len(ncbi_alias_symbol_set)

In [191]:
create_aa_collision_histogram(aa_collision_gene_ncbi_df, "NCBI", ncbi_aa_collision_count)

In [192]:
aa_collision_ncbi_count_df = pd.read_csv(
    "../output/aa_collision_ncbi_count_df.csv", index_col=[0])

In [193]:
aa_collision_ncbi_distribution_df = pd.read_csv(
    "../output/aa_collision_ncbi_distribution_df.csv", index_col=[0])

In [194]:
ncbi_alias_count_histogram_df = pd.read_csv(
    "../output/ncbi_alias_count_histogram_df.csv", index_col=[0])

# <a id='toc11_'></a>[How many gene concept-alias relationships are there?](#toc0_)

## <a id='toc10_1_'></a>[Per Source](#toc0_)

In [195]:
ensg_primary_alias_pair_count = len(subset_genes_ensg_df)

In [196]:
hgnc_primary_alias_pair_count = len(subset_genes_hgnc_df)

In [197]:
ncbi_primary_alias_pair_count = len(subset_genes_ncbi_df)

In [198]:
primary_alias_pairs_summary_index = "HGNC", "ENSG", "NCBI"
primary_alias_pairs_summary = {
    "Number of Unique Gene Concept-Alias Pairs": [
        ensg_primary_alias_pair_count,
        hgnc_primary_alias_pair_count,
        ncbi_primary_alias_pair_count,
    ]
}
primary_alias_pairs_summary_df = pd.DataFrame(
    primary_alias_pairs_summary, index=primary_alias_pairs_summary_index
)
primary_alias_pairs_summary_df

Unnamed: 0,Number of Unique Gene Concept-Alias Pairs
HGNC,57423
ENSG,44542
NCBI,74401


## <a id='toc10_2_'></a>[All Sources](#toc0_)

### <a id='toc10_2_1_'></a>[How many unique gene-alias pairs are found between all sources?](#toc0_)

In [199]:
bw_all_sources_primary_alias_pairs_df = pd.concat(
    [
        subset_genes_ensg_df[["alias_symbol", "gene_symbol"]],
        subset_genes_hgnc_df[["alias_symbol", "gene_symbol"]],
        subset_genes_ncbi_df[["alias_symbol", "gene_symbol"]],
    ]
)

In [200]:
len(bw_all_sources_primary_alias_pairs_df)

176366

#### <a id='toc10_2_1_1_'></a>[Remove duplicate concept-alias pairs](#toc0_)

In [201]:
bw_all_sources_primary_alias_pairs_df = bw_all_sources_primary_alias_pairs_df.drop_duplicates(
    subset=["gene_symbol", "alias_symbol"], keep="first"
)

In [202]:
len(bw_all_sources_primary_alias_pairs_df)

87223

In [203]:
hgnc_prim_and_alias_gene_symbol_set = hgnc_alias_symbol_set.union(hgnc_gene_symbol_set)
len(hgnc_prim_and_alias_gene_symbol_set)

87988

In [204]:
ncbi_prim_and_alias_gene_symbol_set = ncbi_alias_symbol_set.union(ncbi_gene_symbol_set)
len(ncbi_prim_and_alias_gene_symbol_set)

113293

In [205]:
ensg_prim_and_alias_gene_symbol_set = ensg_alias_symbol_set.union(ensg_gene_symbol_set)
len(ensg_prim_and_alias_gene_symbol_set)

95907