# What is the distribution of alias alias collisions in Ensembl, HGNC, and NCBI?

## <a id='toc6_'></a>[How many unique primary gene symbols are there?](#toc0_)

### <a id='toc6_1_'></a>[Per Source](#toc0_)

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
def create_aa_collision_histogram(aa_collision_gene_xxxx_df: pd.DataFrame, source: str, xxxx_aa_collision_count: int):
    """Create a histogram of the frequencies at which aliases are shared

    :param aa_collision_gene_xxxx_df: A df of alias-alias collisions organized by primary gene symbol
    :param source: Representation of the source of the gene records
    :param xxxx_alias_count: Number of aliases total in the source
    :return: A histogram of the percentage of aliases that are shared between 2 genes, 3 genes, and so on
    """
    
    #Count the number of times each shared alias is used
    aa_collision_xxxx_count_df = aa_collision_gene_xxxx_df.pivot_table(
    index=["collision"], aggfunc="size"
    )
    aa_collision_xxxx_count_df = aa_collision_xxxx_count_df.reset_index()
    aa_collision_xxxx_count_df.rename(columns={0: "num_gene_records"}, inplace=True)
    aa_collision_xxxx_count_df = aa_collision_xxxx_count_df.sort_values(
        "num_gene_records", ascending=False)

    #Convert to csv
    aa_collision_xxxx_count_df.to_csv(f'../output/aa_collision_{source}_count_df.csv', index=True)

    #Count the frequency at which aliases are shared 
    aa_collision_xxxx_distribution_df = aa_collision_xxxx_count_df.pivot_table(
    index=["num_gene_records"], aggfunc="size"
    )
    aa_collision_xxxx_distribution_df = aa_collision_xxxx_distribution_df.reset_index()
    aa_collision_xxxx_distribution_df.rename(columns={0: "num_collision_symbol"}, inplace=True)
    aa_collision_xxxx_distribution_df["percent_collision_symbol"] = (
        aa_collision_xxxx_distribution_df["num_collision_symbol"] / xxxx_aa_collision_count
    ) * 100

    #Convert to csv
    aa_collision_xxxx_distribution_df.to_csv(f'../output/aa_collision_{source}_distribution_df.csv', index=True)

    #Create histogram df 
    xxxx_alias_count_histogram_df = aa_collision_xxxx_distribution_df.drop(
    "num_collision_symbol", axis=1)

    #Convert to csv
    xxxx_alias_count_histogram_df.to_csv(f'../output/{source}_alias_count_histogram_df.csv', index=True)

    return px.bar(xxxx_alias_count_histogram_df, x="num_gene_records", y="percent_collision_symbol")



In [3]:
mini_ensg_df = pd.read_csv(
    "../output/mini_ensg_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)
mini_ensg_df

Unnamed: 0.1,Unnamed: 0,ENSG_ID,gene_symbol,alias_symbol,HGNC_ID,NCBI_ID
0,0,ENSG00000210049,MT-TF,MTTF,7481,
1,1,ENSG00000210049,MT-TF,TRNF,7481,
2,2,ENSG00000211459,MT-RNR1,12S,7470,
3,3,ENSG00000211459,MT-RNR1,MOTS-C,7470,
4,4,ENSG00000211459,MT-RNR1,MTRNR1,7470,
...,...,...,...,...,...,...
117135,117135,ENSG00000200033,RNU6-403P,,47366,
117136,117136,ENSG00000228437,LINC02474,LNCSLCC1,53417,
117137,117137,ENSG00000228437,LINC02474,RP11-400N13.2,53417,
117138,117138,ENSG00000229463,LYST-AS1,LYST-IT2,41320,


In [4]:
ensg_gene_symbol_set = set(mini_ensg_df["gene_symbol"])
ensg_gene_symbol_count = len(ensg_gene_symbol_set)

In [5]:
mini_hgnc_df = pd.read_csv(
    "../output/mini_hgnc_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)
mini_hgnc_df

Unnamed: 0.1,Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
0,0,5,,1,ENSG00000121410,A1BG
1,1,37133,FLJ23569,503538,ENSG00000268895,A1BG-AS1
2,2,24086,ACF,29974,ENSG00000148584,A1CF
3,3,24086,ASP,29974,ENSG00000148584,A1CF
4,4,24086,ACF64,29974,ENSG00000148584,A1CF
...,...,...,...,...,...,...
67578,67578,29027,KIAA0399,23140,ENSG00000074755,ZZEF1
67579,67579,29027,ZZZ4,23140,ENSG00000074755,ZZEF1
67580,67580,29027,FLJ10821,23140,ENSG00000074755,ZZEF1
67581,67581,24523,DKFZP564I052,26009,ENSG00000036549,ZZZ3


In [6]:
hgnc_gene_symbol_set = set(mini_hgnc_df["gene_symbol"])
hgnc_gene_symbol_count = len(hgnc_gene_symbol_set)

In [7]:
mini_ncbi_df = pd.read_csv(
    "../output/mini_ncbi_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)
mini_ncbi_df

Unnamed: 0.1,Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,0,1,A1BG,A1B,5,ENSG00000121410
1,0,1,A1BG,ABG,5,ENSG00000121410
2,0,1,A1BG,GAB,5,ENSG00000121410
3,0,1,A1BG,HYST2477,5,ENSG00000121410
4,1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...,...
91920,193451,8923215,trnD,-,,
91921,193452,8923216,trnP,-,,
91922,193453,8923217,trnA,-,,
91923,193454,8923218,COX1,-,,


In [8]:
ncbi_gene_symbol_set = set(mini_ncbi_df["gene_symbol"])
ncbi_gene_symbol_count = len(ncbi_gene_symbol_set)

In [9]:
unique_primary_symbol_summary_index = "HGNC", "ENSG", "NCBI"
unique_primary_symbol_summary = {
    "Number of Unique Primary Gene Symbols": [
        ensg_gene_symbol_count,
        hgnc_gene_symbol_count,
        ncbi_gene_symbol_count,
    ]
}
unique_primary_symbol_summary_df = pd.DataFrame(
    unique_primary_symbol_summary, index = unique_primary_symbol_summary_index
)
unique_primary_symbol_summary_df

Unnamed: 0,Number of Unique Primary Gene Symbols
HGNC,41068
ENSG,45646
NCBI,45390


### <a id='toc6_2_'></a>[All sources](#toc0_)

#### <a id='toc6_2_1_'></a>[How many primary symbols appear in all sources?](#toc0_)

In [10]:
all_sources_unique_primary_symbol_set = (
    ensg_gene_symbol_set
    & hgnc_gene_symbol_set
    & ncbi_gene_symbol_set
)
all_sources_unique_primary_symbol_count = len(all_sources_unique_primary_symbol_set)
all_sources_unique_primary_symbol_count

40885

Side Quest

### NCBI has almost 4x times the amount of unique primary gene symbols than the other sources. Why? What are they?

In [11]:
only_ncbi_gene_symbol_set = ncbi_gene_symbol_set - all_sources_unique_primary_symbol_set

In [12]:
len(only_ncbi_gene_symbol_set)

4505

In [13]:
only_ncbi_gene_symbol_set

{'CYP17A1-AS1',
 'MGC15885',
 'ST7-OT3',
 'TRL-TAG1-1',
 'MHS3',
 'INTLQ1',
 'TRK-CTT16-1',
 'HBB-HS-107',
 'CILD8',
 'AD8',
 'MTATP6P28',
 'MNDEC',
 'MYCLP3',
 'VESTAR',
 'IGHV3-9',
 'IDDM18',
 'HDLBP-AS1',
 'DNM1P37',
 'CRSP8P',
 'TRY-GTA6-1',
 'MFSD4A-AS1',
 'RPL39P13',
 'BCC1',
 'SCZD13',
 'RPL34P5',
 'C6orf147',
 'ND4L',
 'SNORD150',
 'OR3A5P',
 'CRE3',
 'DEL8Q21.11',
 'DNM1P40',
 'PAND1',
 'TRL-CAG1-3',
 'ST3',
 'RNU7-72P',
 'LINC00856',
 'trnI',
 'SNAR-A6',
 'PIRC76',
 'CTRCT27',
 'TRQ-CTG1-2',
 'MYP15',
 'FRA1H',
 'SQTL2',
 'FRA2D',
 'MTND6P27',
 'EKD2',
 'RPL37AP5',
 'LINC00268',
 'DUX4L41',
 'IDDMX',
 'RPS15AP4',
 'CLA3',
 'LINC03039',
 'PSMC6P1',
 'TRG-TCC3-1',
 'HNRNPA1P75',
 'MRBC',
 'TRV-CAC1-6',
 'MTCO1P26',
 'TRA-AGC1-1',
 'HTGS',
 'TRS-AGA2-2',
 'SLC66A2P2',
 'RPS21P6',
 'FRA11F',
 'NIFKP5',
 'MCS+9.7',
 'RPS2P12',
 'PPIP5K1P1',
 'AAA3',
 'FGQTL2',
 'TRL-CAA4-1',
 'RPS6P1',
 'TRK-CTT8-1',
 'PCDHB@',
 'TJP1P1',
 'TRE-TTC7-1',
 'TRQ-CTG8-2',
 'HHC2:066650',
 'RPL9P22',
 

#### Most of the symbols that are unique to NCBI (147,913/152,418-97%) begin with "LOC"

In [14]:
filtered_set = {gene for gene in only_ncbi_gene_symbol_set if not gene.startswith('LOC')}
len(filtered_set)

4505

In [15]:
filtered_set

{'CYP17A1-AS1',
 'MGC15885',
 'TRL-TAG1-1',
 'ST7-OT3',
 'MHS3',
 'INTLQ1',
 'TRK-CTT16-1',
 'HBB-HS-107',
 'CILD8',
 'AD8',
 'MTATP6P28',
 'MNDEC',
 'MYCLP3',
 'VESTAR',
 'IGHV3-9',
 'IDDM18',
 'HDLBP-AS1',
 'DNM1P37',
 'CRSP8P',
 'TRY-GTA6-1',
 'MFSD4A-AS1',
 'RPL39P13',
 'BCC1',
 'SCZD13',
 'RPL34P5',
 'C6orf147',
 'ND4L',
 'SNORD150',
 'OR3A5P',
 'CRE3',
 'DEL8Q21.11',
 'DNM1P40',
 'PAND1',
 'TRL-CAG1-3',
 'ST3',
 'RNU7-72P',
 'LINC00856',
 'trnI',
 'SNAR-A6',
 'PIRC76',
 'CTRCT27',
 'TRQ-CTG1-2',
 'MYP15',
 'FRA1H',
 'SQTL2',
 'FRA2D',
 'MTND6P27',
 'EKD2',
 'RPL37AP5',
 'LINC00268',
 'DUX4L41',
 'IDDMX',
 'RPS15AP4',
 'CLA3',
 'LINC03039',
 'PSMC6P1',
 'TRG-TCC3-1',
 'HNRNPA1P75',
 'MRBC',
 'TRV-CAC1-6',
 'MTCO1P26',
 'TRA-AGC1-1',
 'HTGS',
 'TRS-AGA2-2',
 'SLC66A2P2',
 'RPS21P6',
 'FRA11F',
 'NIFKP5',
 'MCS+9.7',
 'RPS2P12',
 'PPIP5K1P1',
 'AAA3',
 'FGQTL2',
 'RPS6P1',
 'TRL-CAA4-1',
 'TRK-CTT8-1',
 'PCDHB@',
 'TJP1P1',
 'TRE-TTC7-1',
 'TRQ-CTG8-2',
 'HHC2:066650',
 'RPL9P22',
 

### <a id='toc6_2_2_'></a>[How many unique symbols are found between all sources?](#toc0_)

In [16]:
bw_all_sources_unique_primary_symbol_df = pd.concat(
    [
        mini_ensg_df[["alias_symbol", "gene_symbol"]],
        mini_hgnc_df[["alias_symbol", "gene_symbol"]],
        mini_ncbi_df[["alias_symbol", "gene_symbol"]],
    ]
)

In [17]:
bw_all_sources_unique_primary_symbol_set = set(bw_all_sources_unique_primary_symbol_df["gene_symbol"])
bw_all_sources_unique_primary_symbol_count = len(bw_all_sources_unique_primary_symbol_set)
bw_all_sources_unique_primary_symbol_count

46953

# <a id='toc7_'></a>[How many unique aliases are there?](#toc0_)

## <a id='toc7_1_'></a>[Per Source](#toc0_)

subset= excludes genes without aliases, duplicates of primary-alias pairs, instances where the primary gene symbol is an alias for that gene record. created in alias-primary_collision_analysis notebook

In [18]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,alias_symbol,HGNC_ID,NCBI_ID
0,ENSG00000210049,MT-TF,MTTF,7481.0,
1,ENSG00000210049,MT-TF,TRNF,7481.0,
2,ENSG00000211459,MT-RNR1,12S,7470.0,
3,ENSG00000211459,MT-RNR1,MOTS-C,7470.0,
4,ENSG00000211459,MT-RNR1,MTRNR1,7470.0,
...,...,...,...,...,...
117133,ENSG00000232679,LINC01705,ERLR,52493.0,105372950.0
117134,ENSG00000232679,LINC01705,RP11-400N13.3,52493.0,105372950.0
117136,ENSG00000228437,LINC02474,LNCSLCC1,53417.0,
117137,ENSG00000228437,LINC02474,RP11-400N13.2,53417.0,


In [19]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])
subset_genes_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
1,37133,FLJ23569,503538.0,ENSG00000268895,A1BG-AS1
2,24086,ACF,29974.0,ENSG00000148584,A1CF
3,24086,ASP,29974.0,ENSG00000148584,A1CF
4,24086,ACF64,29974.0,ENSG00000148584,A1CF
5,24086,ACF65,29974.0,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,29027,KIAA0399,23140.0,ENSG00000074755,ZZEF1
67579,29027,ZZZ4,23140.0,ENSG00000074755,ZZEF1
67580,29027,FLJ10821,23140.0,ENSG00000074755,ZZEF1
67581,24523,DKFZP564I052,26009.0,ENSG00000036549,ZZZ3


In [20]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])
subset_genes_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5.0,ENSG00000121410
0,1,A1BG,ABG,5.0,ENSG00000121410
0,1,A1BG,GAB,5.0,ENSG00000121410
0,1,A1BG,HYST2477,5.0,ENSG00000121410
1,2,A2M,A2MD,7.0,ENSG00000175899
...,...,...,...,...,...
190961,131840634,GLTC1,GLTC,56861.0,
193342,132532400,GABRA6-AS1,ARBAG,40248.0,
193377,133395150,LNCARGI,ARGI,56890.0,
193378,133834869,MLDHR,MP31,55481.0,


In [21]:
ensg_alias_symbol_set, ensg_alias_count = set(subset_genes_ensg_df["alias_symbol"]), len(set(subset_genes_ensg_df["alias_symbol"]))
hgnc_alias_symbol_set, hgnc_alias_count = set(subset_genes_hgnc_df["alias_symbol"]), len(set(subset_genes_hgnc_df["alias_symbol"]))
ncbi_alias_symbol_set, ncbi_alias_count = set(subset_genes_ncbi_df["alias_symbol"]), len(set(subset_genes_ncbi_df["alias_symbol"]))

In [22]:
unique_alias_summary_index = "HGNC", "ENSG", "NCBI"
unique_alias_summary = {
    "Number of Unique Aliases": [
        ensg_alias_count,
        hgnc_alias_count,
        ncbi_alias_count,
    ]
}
unique_alias_summary_df = pd.DataFrame(
    unique_alias_summary, index = unique_alias_summary_index
)
unique_alias_summary_df

Unnamed: 0,Number of Unique Aliases
HGNC,55213
ENSG,42918
NCBI,68634


## <a id='toc7_2_'></a>[All sources](#toc0_)

### <a id='toc7_2_1_'></a>[How many aliases appear in all sources?](#toc0_)

In [23]:
all_sources_unique_alias_set = (
    ensg_alias_symbol_set
    & hgnc_alias_symbol_set
    & ncbi_alias_symbol_set
)
all_sources_unique_alias_count = len(all_sources_unique_alias_set)
all_sources_unique_alias_count

29983

### <a id='toc7_2_2_'></a>[How many unique aliases are found between all sources?](#toc0_)

In [24]:
bw_all_sources_unique_alias_df = pd.concat(
    [
        subset_genes_ensg_df[["alias_symbol", "gene_symbol"]],
        subset_genes_hgnc_df[["alias_symbol", "gene_symbol"]],
        subset_genes_ncbi_df[["alias_symbol", "gene_symbol"]],
    ]
)

In [25]:
bw_all_sources_unique_alias_set = set(bw_all_sources_unique_alias_df["alias_symbol"])
bw_all_sources_unique_alias_count = len(bw_all_sources_unique_alias_set)
bw_all_sources_unique_alias_count

81002

# <a id='toc8_'></a>[How many gene records have an alias that is shared?](#toc0_)

## <a id='toc8_1_'></a>[Per Source](#toc0_)

In [26]:
aa_collision_gene_ensg_df = pd.read_csv(
    "../output/aa_collision_gene_ensg_df.csv", index_col=[0])
aa_collision_gene_ensg_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,KLRG1,"2F1,CLEC15A,MAFA,MAFA-L",ENSG00000139187,6380.0,10219.0,2F1,ENSG
1,SLC25A5,"2F1,ANT2,T2,T3",ENSG00000005022,10991.0,292.0,2F1,ENSG
2,S100A8,"60B8AG,CAGA,CFAG,CGLA,MRP-8,MRP8,P8,S100-A8",ENSG00000143546,10498.0,6279.0,60B8AG,ENSG
3,S100A9,"60B8AG,CAGB,CFAG,CGLB,LIAG,MAC387,MIF,MRP-14,M...",ENSG00000163220,10499.0,6280.0,60B8AG,ENSG
4,GNAI3,87U6,ENSG00000065135,4387.0,2773.0,87U6,ENSG
...,...,...,...,...,...,...,...
3674,SLC30A10,"DKFZP547M236,ZNT-10,ZNT10,ZNT8,ZRC1",ENSG00000196660,25355.0,55532.0,ZNT8,ENSG
3675,SLC30A10,"DKFZP547M236,ZNT-10,ZNT10,ZNT8,ZRC1",ENSG00000196660,25355.0,55532.0,ZRC1,ENSG
3676,SLC30A1,"ZNT1,ZRC1",ENSG00000170385,11012.0,7779.0,ZRC1,ENSG
3677,ZYG11B,"FLJ13456,ZYG11",ENSG00000162378,25820.0,79699.0,ZYG11,ENSG


In [27]:
ensg_aa_collision_primary_symbol_set = set(aa_collision_gene_ensg_df["gene_symbol"])
ensg_aa_collision_primary_symbol_count = len(ensg_aa_collision_primary_symbol_set)

In [28]:
aa_collision_gene_hgnc_df = pd.read_csv(
    "../output/aa_collision_gene_hgnc_df.csv", index_col=[0])
aa_collision_gene_hgnc_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,KLRG1,"MAFA,2F1,MAFA-L,CLEC15A",ENSG00000139187,6380,10219.0,2F1,HGNC
1,SLC25A5,"T2,2F1,T3",ENSG00000005022,10991,292.0,2F1,HGNC
2,S100A8,"P8,MRP8,MRP-8,60B8AG,CGLA,S100-A8",ENSG00000143546,10498,6279.0,60B8AG,HGNC
3,S100A9,"P14,MIF,NIF,LIAG,MRP14,MAC387,60B8AG,CGLB,MRP-...",ENSG00000163220,10499,6280.0,60B8AG,HGNC
4,RNU6V,"87U6,LH87",ENSG00000206832,10230,6071.0,87U6,HGNC
...,...,...,...,...,...,...,...
2869,ZNF22,"KOX15,HKR-T1,ZNF422,Zfp422",ENSG00000165512,13012,7570.0,ZNF422,HGNC
2870,SLC30A10,"DKFZp547M236,ZnT-10,ZRC1,ZNT8,ZNT10",ENSG00000196660,25355,55532.0,ZNT8,HGNC
2871,SLC30A8,"ZnT-8,ZNT8",ENSG00000164756,20303,169026.0,ZNT8,HGNC
2872,SLC30A1,ZRC1,ENSG00000170385,11012,7779.0,ZRC1,HGNC


In [29]:
hgnc_aa_collision_primary_symbol_set = set(aa_collision_gene_hgnc_df["gene_symbol"])
hgnc_aa_collision_primary_symbol_count = len(hgnc_aa_collision_primary_symbol_set)

In [30]:
aa_collision_gene_ncbi_df = pd.read_csv(
    "../output/aa_collision_gene_ncbi_df.csv", index_col=[0])
aa_collision_gene_ncbi_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,PTEN,"10q23del,BZS,CWS1,DEC,GLM2,MHAM,MMAC1,PTEN1,PT...",ENSG00000171862,9588.0,5728,10Q23DEL,NCBI
1,BMPR1A,"10q23del,ACVRLK3,ALK-3,ALK3,BMPR-1A,CD292,SKR5",ENSG00000107779,1076.0,657,10Q23DEL,NCBI
2,ALOX15,"12-LOX,15-LOX,15-LOX-1,LOG15",ENSG00000161905,433.0,246,12-LOX,NCBI
3,ALOX12,"12-LOX,12S-LOX,LOG12",ENSG00000108839,429.0,239,12-LOX,NCBI
4,AKR1C1,"2-ALPHA-HSD,20-ALPHA-HSD,C9,DD1,DD1/DD2,DDH,DD...",ENSG00000187134,384.0,1645,20-ALPHA-HSD,NCBI
...,...,...,...,...,...,...,...
8863,SLC30A10,"HMDPC,HMNDYT1,ZNT10,ZNT8,ZRC1,ZnT-10",ENSG00000196660,25355.0,55532,ZRC1,NCBI
8864,PEX1,"HMLR1,PBD1A,PBD1B,ZWS,ZWS1",ENSG00000127980,8850.0,5189,ZWS,NCBI
8865,PEX13,"NALD,PBD11A,PBD11B,ZWS",ENSG00000162928,8855.0,5194,ZWS,NCBI
8866,ZYG11A,ZYG11,ENSG00000203995,32058.0,440590,ZYG11,NCBI


In [31]:
ncbi_aa_collision_primary_symbol_set = set(aa_collision_gene_ncbi_df["gene_symbol"])
ncbi_aa_collision_primary_symbol_count = len(ncbi_aa_collision_primary_symbol_set)

In [32]:
aa_collision_primary_symbol_summary_index = "HGNC", "ENSG", "NCBI"
aa_collision_primary_symbol_summary= {
    "Number of Gene Records With a Shared Alias": [
        ensg_aa_collision_primary_symbol_count,
        hgnc_aa_collision_primary_symbol_count,
        ncbi_aa_collision_primary_symbol_count,
    ]
}
aa_collision_primary_symbol_summary_df = pd.DataFrame(
    aa_collision_primary_symbol_summary, index = aa_collision_primary_symbol_summary_index
)
aa_collision_primary_symbol_summary_df

Unnamed: 0,Number of Gene Records With a Shared Alias
HGNC,3113
ENSG,2530
NCBI,6013


These values differ from the total number of rows from the aa_collision_gene_dfs because each row indicates a collision. There are some gene records that are involved in multiple collisions. For example: gene A has the aliases A1, 1A, and AA. If all three aliases are used by other genes, then gene A would take up 3 rows in the the aa_collision_gene_df while only being one gene record. In the case of alias-alais collisions in NCBI, the aa_collision_gene_df has 8868 rows. There are a total of 4540 rows that are duplicates. Of those 4540 duplicates, 1685 are unique. 8868 - (4540-1685) = 6013

## <a id='toc8_2_'></a>[All Sources](#toc0_)

### <a id='toc8_2_1_'></a>[How many gene records have at least one shared alias in all sources?](#toc0_)

(intersection of gene records with a collision in all three sources)

In [33]:
all_sources_aa_collision_genes = (
    ensg_aa_collision_primary_symbol_set
    & hgnc_aa_collision_primary_symbol_set
    & ncbi_aa_collision_primary_symbol_set
)
len(all_sources_aa_collision_genes)

2319

### <a id='toc8_2_2_'></a>[How many unique gene records that have at least one shared alias are found between all sources?](#toc0_)

(union of gene records with a collision in all three sources)

In [34]:
bw_all_sources_aa_collision_df = pd.concat(
    [
        aa_collision_gene_ensg_df[["collision", "gene_symbol"]],
        aa_collision_gene_hgnc_df[["collision", "gene_symbol"]],
        aa_collision_gene_ncbi_df[["collision", "gene_symbol"]],
    ]
)

In [35]:
bw_all_sources_aa_collision_genes_set = set(bw_all_sources_aa_collision_df["gene_symbol"])
bw_all_sources_aa_collision_genes_count = len(bw_all_sources_aa_collision_genes_set)
bw_all_sources_aa_collision_genes_count

6192

# <a id='toc9_'></a>[How many alias symbols are being shared?](#toc0_)

## <a id='toc9_1_'></a>[Per Source](#toc0_)

In [36]:
ensg_aa_collision_set = set(aa_collision_gene_ensg_df["collision"])
ensg_aa_collision_count = len(ensg_aa_collision_set)

In [37]:
hgnc_aa_collision_set = set(aa_collision_gene_hgnc_df["collision"])
hgnc_aa_collision_count = len(hgnc_aa_collision_set)

In [38]:
ncbi_aa_collision_set = set(aa_collision_gene_ncbi_df["collision"])
ncbi_aa_collision_count = len(ncbi_aa_collision_set)

In [39]:
aa_collision_alias_symbol_summary_index = "HGNC", "ENSG", "NCBI"
aa_collision_alias_symbol_summary = {
    "Number of Shared Aliases": [
        ensg_aa_collision_count,
        hgnc_aa_collision_count,
        ncbi_aa_collision_count,
    ]
}
aa_collision_alias_symbol_summary_df = pd.DataFrame(
    aa_collision_alias_symbol_summary, index = aa_collision_alias_symbol_summary_index
)
aa_collision_alias_symbol_summary_df

Unnamed: 0,Number of Shared Aliases
HGNC,1617
ENSG,1250
NCBI,3673


## <a id='toc9_2_'></a>[All Sources](#toc0_)

### <a id='toc9_2_1_'></a>[How many aliases are shared in all sources?](#toc0_)

In [40]:
all_sources_aa_collision_aliases = (
    ensg_aa_collision_set
    & hgnc_aa_collision_set
    & ncbi_aa_collision_set
)
len(all_sources_aa_collision_aliases)

1131

### <a id='toc9_2_2_'></a>[How many unique shared aliases are found between all sources?](#toc0_)

In [41]:
bw_all_sources_aa_collision_aliases_set = set(bw_all_sources_aa_collision_df["collision"])
bw_all_sources_aa_collision_aliases_count = len(bw_all_sources_aa_collision_aliases_set)
bw_all_sources_aa_collision_aliases_count

3786

# <a id='toc10_'></a>[How common are the collision symbols?](#toc0_)

## <a id='toc10_1_'></a>[Per Source](#toc0_)

In [42]:
ensg_alias_symbol_set = set(subset_genes_ensg_df["alias_symbol"])
ensg_alias_count = len(ensg_alias_symbol_set)

In [43]:
create_aa_collision_histogram(aa_collision_gene_ensg_df, "ENSG", ensg_aa_collision_count)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [44]:
aa_collision_ensg_count_df = pd.read_csv(
    "../output/aa_collision_ensg_count_df.csv", index_col=[0])

In [45]:
aa_collision_ensg_distribution_df = pd.read_csv(
    "../output/aa_collision_ensg_distribution_df.csv", index_col=[0])

In [46]:
ensg_alias_count_histogram_df = pd.read_csv(
    "../output/ensg_alias_count_histogram_df.csv", index_col=[0])

In [47]:
hgnc_alias_symbol_set = set(subset_genes_hgnc_df["alias_symbol"])
hgnc_alias_count = len(hgnc_alias_symbol_set)

In [48]:
create_aa_collision_histogram(aa_collision_gene_hgnc_df, "HGNC", hgnc_aa_collision_count)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [49]:
aa_collision_hgnc_count_df = pd.read_csv(
    "../output/aa_collision_hgnc_count_df.csv", index_col=[0])

In [50]:
aa_collision_hgnc_distribution_df = pd.read_csv(
    "../output/aa_collision_hgnc_distribution_df.csv", index_col=[0])

In [51]:
hgnc_alias_count_histogram_df = pd.read_csv(
    "../output/hgnc_alias_count_histogram_df.csv", index_col=[0])

In [52]:
ncbi_alias_symbol_set = set(subset_genes_ncbi_df["alias_symbol"])
ncbi_alias_count = len(ncbi_alias_symbol_set)

In [53]:
create_aa_collision_histogram(aa_collision_gene_ncbi_df, "NCBI", ncbi_aa_collision_count)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [54]:
aa_collision_ncbi_count_df = pd.read_csv(
    "../output/aa_collision_ncbi_count_df.csv", index_col=[0])

In [55]:
aa_collision_ncbi_distribution_df = pd.read_csv(
    "../output/aa_collision_ncbi_distribution_df.csv", index_col=[0])

In [56]:
ncbi_alias_count_histogram_df = pd.read_csv(
    "../output/ncbi_alias_count_histogram_df.csv", index_col=[0])

# <a id='toc11_'></a>[How many gene concept-alias relationships are there?](#toc0_)

## <a id='toc10_1_'></a>[Per Source](#toc0_)

In [57]:
ensg_primary_alias_pair_count = len(subset_genes_ensg_df)

In [58]:
hgnc_primary_alias_pair_count = len(subset_genes_hgnc_df)

In [59]:
ncbi_primary_alias_pair_count = len(subset_genes_ncbi_df)

In [60]:
primary_alias_pairs_summary_index = "HGNC", "ENSG", "NCBI"
primary_alias_pairs_summary = {
    "Number of Unique Gene Concept-Alias Pairs": [
        ensg_primary_alias_pair_count,
        hgnc_primary_alias_pair_count,
        ncbi_primary_alias_pair_count,
    ]
}
primary_alias_pairs_summary_df = pd.DataFrame(
    primary_alias_pairs_summary, index=primary_alias_pairs_summary_index
)
primary_alias_pairs_summary_df

Unnamed: 0,Number of Unique Gene Concept-Alias Pairs
HGNC,57275
ENSG,44542
NCBI,73829


## <a id='toc10_2_'></a>[All Sources](#toc0_)

### <a id='toc10_2_1_'></a>[How many unique gene-alias pairs are found between all sources?](#toc0_)

In [61]:
bw_all_sources_primary_alias_pairs_df = pd.concat(
    [
        subset_genes_ensg_df[["alias_symbol", "gene_symbol"]],
        subset_genes_hgnc_df[["alias_symbol", "gene_symbol"]],
        subset_genes_ncbi_df[["alias_symbol", "gene_symbol"]],
    ]
)

In [62]:
len(bw_all_sources_primary_alias_pairs_df)

175646

#### <a id='toc10_2_1_1_'></a>[Remove duplicate concept-alias pairs](#toc0_)

In [63]:
bw_all_sources_primary_alias_pairs_df = bw_all_sources_primary_alias_pairs_df.drop_duplicates(
    subset=["gene_symbol", "alias_symbol"], keep="first"
)

In [64]:
len(bw_all_sources_primary_alias_pairs_df)

86552