# What is the distribution of alias alias collisions in Ensembl, HGNC, and NCBI?

## <a id='toc6_'></a>[How many unique primary gene symbols are there?](#toc0_)

### <a id='toc6_1_'></a>[Per Source](#toc0_)

In [52]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys
from pathlib import Path

In [53]:
# Add output/ to sys.path so you can import from it
sys.path.append(str(Path().resolve().parent / "output"))
from collision_analysis_shared_variables import *

In [54]:
def create_aa_collision_histogram(aa_collision_gene_xxxx_df: pd.DataFrame, source: str, xxxx_aa_collision_count: int):
    """Create a histogram of the frequencies at which aliases are shared

    :param aa_collision_gene_xxxx_df: A df of alias-alias collisions organized by primary gene symbol
    :param source: Representation of the source of the gene records
    :param xxxx_alias_count: Number of aliases total in the source
    :return: A histogram of the percentage of aliases that are shared between 2 genes, 3 genes, and so on
    """
    
    #Count the number of times each shared alias is used
    aa_collision_xxxx_count_df = aa_collision_gene_xxxx_df.pivot_table(
    index=["collision"], aggfunc="size"
    )
    aa_collision_xxxx_count_df = aa_collision_xxxx_count_df.reset_index()
    aa_collision_xxxx_count_df.rename(columns={0: "num_gene_records"}, inplace=True)
    aa_collision_xxxx_count_df = aa_collision_xxxx_count_df.sort_values(
        "num_gene_records", ascending=False)

    #Convert to csv
    aa_collision_xxxx_count_df.to_csv(f'../output/aa_collision_{source}_count_df.csv', index=True)

    #Count the frequency at which aliases are shared 
    aa_collision_xxxx_distribution_df = aa_collision_xxxx_count_df.pivot_table(
    index=["num_gene_records"], aggfunc="size"
    )
    aa_collision_xxxx_distribution_df = aa_collision_xxxx_distribution_df.reset_index()
    aa_collision_xxxx_distribution_df.rename(columns={0: "num_collision_symbol"}, inplace=True)
    aa_collision_xxxx_distribution_df["percent_collision_symbol"] = (
        aa_collision_xxxx_distribution_df["num_collision_symbol"] / xxxx_aa_collision_count
    ) * 100

    #Convert to csv
    aa_collision_xxxx_distribution_df.to_csv(f'../output/aa_collision_{source}_distribution_df.csv', index=True)

    #Create histogram df 
    xxxx_alias_count_histogram_df = aa_collision_xxxx_distribution_df.drop(
    "num_collision_symbol", axis=1)

    #Convert to csv
    xxxx_alias_count_histogram_df.to_csv(f'../output/{source}_alias_count_histogram_df.csv', index=True)

    return px.bar(xxxx_alias_count_histogram_df, x="num_gene_records", y="percent_collision_symbol")



In [55]:
mini_ensg_df = pd.read_csv(
    "../output/mini_ensg_df.csv", index_col=[0]
)
mini_ensg_df

Unnamed: 0.1,Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,0,ENSG00000210049,,HGNC:7481,MTTF,MT-TF
1,1,ENSG00000210049,,HGNC:7481,TRNF,MT-TF
2,2,ENSG00000211459,,HGNC:7470,12S,MT-RNR1
3,3,ENSG00000211459,,HGNC:7470,MOTS-C,MT-RNR1
4,4,ENSG00000211459,,HGNC:7470,MTRNR1,MT-RNR1
...,...,...,...,...,...,...
95019,133058,ENSG00000197989,GENE ID:85028,HGNC:30062,LINC00100,SNHG12
95020,133059,ENSG00000197989,GENE ID:85028,HGNC:30062,PNAS-123,SNHG12
95021,133060,ENSG00000229388,,HGNC:52502,LINC01715,TAF12-DT
95022,133062,ENSG00000274978,GENE ID:26824,HGNC:10108,RNU11-1,RNU11


In [56]:
mini_hgnc_df = pd.read_csv(
    "../output/mini_hgnc_df.csv", index_col=[0]
)
mini_hgnc_df

Unnamed: 0.1,Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,0,HGNC:100,ASIC1,BNaC2,GENE ID:41,ENSG00000110881
1,0,HGNC:100,ASIC1,hBNaC2,GENE ID:41,ENSG00000110881
2,1,HGNC:10000,RGS4,,GENE ID:5999,ENSG00000117152
3,2,HGNC:10001,RGS5,,GENE ID:8490,ENSG00000143248
4,3,HGNC:10002,RGS6,,GENE ID:9628,ENSG00000182732
...,...,...,...,...,...,...
66202,44232,HGNC:9997,RGS16,RGS-r,GENE ID:6004,ENSG00000143333
66203,44233,HGNC:9998,RGS2,,GENE ID:5997,ENSG00000116741
66204,44234,HGNC:9999,RGS3,C2PA,GENE ID:5998,ENSG00000138835
66205,44234,HGNC:9999,RGS3,FLJ20370,GENE ID:5998,ENSG00000138835


In [57]:
mini_ncbi_df = pd.read_csv(
    "../output/mini_ncbi_df.csv", index_col=[0]
)
mini_ncbi_df

Unnamed: 0.1,Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,0,GENE ID:1,A1BG,A1B,HGNC:5,ENSG00000121410
1,0,GENE ID:1,A1BG,ABG,HGNC:5,ENSG00000121410
2,0,GENE ID:1,A1BG,GAB,HGNC:5,ENSG00000121410
3,0,GENE ID:1,A1BG,HYST2477,HGNC:5,ENSG00000121410
4,1,GENE ID:2,A2M,A2MD,HGNC:7,ENSG00000175899
...,...,...,...,...,...,...
91372,193502,GENE ID:141732005,ADCY2-AS1,,HGNC:40064,
91373,193503,GENE ID:141732006,NSG2-AS1,,HGNC:41074,
91374,193504,GENE ID:141732007,ST18-AS1,,HGNC:58430,
91375,193505,GENE ID:141732008,MICAL2-AS1,,HGNC:58437,


### <a id='toc6_2_'></a>[All sources](#toc0_)

#### <a id='toc6_2_1_'></a>[How many primary symbols appear in all sources?](#toc0_)

In [58]:
primary_symbol_set_hgnc = set(mini_hgnc_df['gene_symbol'])
primary_symbol_set_ensg = set(mini_ensg_df['gene_symbol'])
primary_symbol_set_ncbi = set(mini_ncbi_df['gene_symbol'])

In [59]:
all_sources_unique_primary_symbol_set = (
    primary_symbol_set_hgnc
    & primary_symbol_set_ensg
    & primary_symbol_set_ncbi
)
all_sources_unique_primary_symbol_count = len(all_sources_unique_primary_symbol_set)
all_sources_unique_primary_symbol_count

40948

### <a id='toc6_2_2_'></a>[How many unique symbols are found between all sources?](#toc0_)

In [60]:
bw_all_sources_unique_primary_symbol_df = pd.concat(
    [
        mini_ensg_df[["alias_symbol", "gene_symbol"]],
        mini_hgnc_df[["alias_symbol", "gene_symbol"]],
        mini_ncbi_df[["alias_symbol", "gene_symbol"]],
    ]
)

In [61]:
bw_all_sources_unique_primary_symbol_set = set(bw_all_sources_unique_primary_symbol_df["gene_symbol"])
bw_all_sources_unique_primary_symbol_count = len(bw_all_sources_unique_primary_symbol_set)
bw_all_sources_unique_primary_symbol_count

44797

## <a id='toc7_2_'></a>[All sources](#toc0_)

### <a id='toc7_2_1_'></a>[How many aliases appear in all sources?](#toc0_)

In [62]:
alias_symbol_set_ensg = set(mini_ensg_df['alias_symbol'])
alias_symbol_set_hgnc = set(mini_hgnc_df['alias_symbol'])
alias_symbol_set_ncbi = set(mini_ncbi_df['alias_symbol'])

In [63]:
all_sources_unique_alias_set = (
    alias_symbol_set_ensg
    & alias_symbol_set_hgnc
    & alias_symbol_set_ncbi
)
all_sources_unique_alias_count = len(all_sources_unique_alias_set)
all_sources_unique_alias_count

23310

### <a id='toc7_2_2_'></a>[How many unique aliases are found between all sources?](#toc0_)

In [64]:
bw_all_sources_unique_alias_df = pd.concat(
    [
        mini_ensg_df[["alias_symbol", "gene_symbol"]],
        mini_hgnc_df[["alias_symbol", "gene_symbol"]],
        mini_ncbi_df[["alias_symbol", "gene_symbol"]],
    ]
)

In [65]:
bw_all_sources_unique_alias_set = set(bw_all_sources_unique_alias_df["alias_symbol"])
bw_all_sources_unique_alias_count = len(bw_all_sources_unique_alias_set)
bw_all_sources_unique_alias_count

93012

## <a id='toc8_2_'></a>[All Sources](#toc0_)

### <a id='toc8_2_1_'></a>[How many genes have at least one shared alias in all sources?](#toc0_)

(intersection of genes with a collision in all three sources)

In [66]:
ensg_aa_collision_primary_symbol_set = set(merged_alias_aa_collision_ensg_df["gene_symbol"])
hgnc_aa_collision_primary_symbol_set = set(merged_alias_aa_collision_hgnc_df["gene_symbol"])
ncbi_aa_collision_primary_symbol_set = set(merged_alias_aa_collision_ncbi_df["gene_symbol"])

In [67]:
all_sources_aa_collision_genes = (
    ensg_aa_collision_primary_symbol_set
    & hgnc_aa_collision_primary_symbol_set
    & ncbi_aa_collision_primary_symbol_set
)
len(all_sources_aa_collision_genes)

2017

### <a id='toc8_2_2_'></a>[How many unique gene records that have at least one shared alias are found between all sources?](#toc0_)

(union of gene records with a collision in all three sources)

In [68]:
bw_all_sources_aa_collision_df = pd.concat(
    [
        merged_alias_aa_collision_hgnc_df[["collision", "gene_symbol"]],
        merged_alias_aa_collision_ensg_df[["collision", "gene_symbol"]],
        merged_alias_aa_collision_ncbi_df[["collision", "gene_symbol"]],
    ]
)

In [69]:
bw_all_sources_aa_collision_genes_set = set(bw_all_sources_aa_collision_df["gene_symbol"])
bw_all_sources_aa_collision_genes_count = len(bw_all_sources_aa_collision_genes_set)
bw_all_sources_aa_collision_genes_count

5944

In [70]:
ensg_aa_collision_set = set(merged_alias_aa_collision_ensg_df["collision"])
ensg_aa_collision_count = len(ensg_aa_collision_set)

In [71]:
hgnc_aa_collision_set = set(merged_alias_aa_collision_hgnc_df["collision"])
hgnc_aa_collision_count = len(hgnc_aa_collision_set)

In [72]:
ncbi_aa_collision_set = set(merged_alias_aa_collision_ncbi_df["collision"])
ncbi_aa_collision_count = len(ncbi_aa_collision_set)

## <a id='toc9_2_'></a>[All Sources](#toc0_)

### <a id='toc9_2_1_'></a>[How many aliases are shared in all sources?](#toc0_)

In [73]:
all_sources_aa_collision_aliases = (
    hgnc_aa_collision_set
    & ensg_aa_collision_set
    & ncbi_aa_collision_set
)
len(all_sources_aa_collision_aliases)

912

### <a id='toc9_2_2_'></a>[How many unique shared aliases are found between all sources?](#toc0_)

In [74]:
bw_all_sources_aa_collision_aliases_set = set(bw_all_sources_aa_collision_df["collision"])
bw_all_sources_aa_collision_aliases_count = len(bw_all_sources_aa_collision_aliases_set)
bw_all_sources_aa_collision_aliases_count

3700

# <a id='toc10_'></a>[How common are the collision symbols?](#toc0_)

## <a id='toc10_1_'></a>[Per Source](#toc0_)

In [75]:
ensg_alias_symbol_set = set(mini_ensg_df[mini_ensg_df['ENSG_ID'].isin(aa_record_set_ensg)]["alias_symbol"])
ensg_alias_count = len(ensg_alias_symbol_set)

In [76]:
create_aa_collision_histogram(merged_alias_aa_collision_ensg_df, "ENSG", ensg_aa_collision_count)

In [77]:
aa_collision_ensg_count_df = pd.read_csv(
    "../output/aa_collision_ensg_count_df.csv", index_col=[0])

In [78]:
aa_collision_ensg_distribution_df = pd.read_csv(
    "../output/aa_collision_ensg_distribution_df.csv", index_col=[0])

In [79]:
ensg_alias_count_histogram_df = pd.read_csv(
    "../output/ensg_alias_count_histogram_df.csv", index_col=[0])

In [80]:
hgnc_alias_symbol_set = set(mini_hgnc_df[mini_hgnc_df['HGNC_ID'].isin(aa_record_set_hgnc)]["alias_symbol"])
hgnc_alias_count = len(hgnc_alias_symbol_set)

In [81]:
create_aa_collision_histogram(merged_alias_aa_collision_hgnc_df, "HGNC", hgnc_aa_collision_count)

In [82]:
aa_collision_hgnc_count_df = pd.read_csv(
    "../output/aa_collision_hgnc_count_df.csv", index_col=[0])

In [83]:
aa_collision_hgnc_distribution_df = pd.read_csv(
    "../output/aa_collision_hgnc_distribution_df.csv", index_col=[0])

In [84]:
hgnc_alias_count_histogram_df = pd.read_csv(
    "../output/hgnc_alias_count_histogram_df.csv", index_col=[0])

In [85]:
ncbi_alias_symbol_set = set(mini_ncbi_df[mini_ncbi_df['NCBI_ID'].isin(aa_record_set_ncbi)]["alias_symbol"])
ncbi_alias_count = len(ncbi_alias_symbol_set)

In [86]:
create_aa_collision_histogram(merged_alias_aa_collision_ncbi_df, "NCBI", ncbi_aa_collision_count)

In [87]:
aa_collision_ncbi_count_df = pd.read_csv(
    "../output/aa_collision_ncbi_count_df.csv", index_col=[0])

In [88]:
aa_collision_ncbi_distribution_df = pd.read_csv(
    "../output/aa_collision_ncbi_distribution_df.csv", index_col=[0])

In [89]:
ncbi_alias_count_histogram_df = pd.read_csv(
    "../output/ncbi_alias_count_histogram_df.csv", index_col=[0])

# <a id='toc11_'></a>[How many gene concept-alias relationships are there?](#toc0_)

## <a id='toc10_1_'></a>[Per Source](#toc0_)

In [90]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])

In [91]:
ensg_primary_alias_pair_count = len(subset_genes_ensg_df)

In [92]:
hgnc_primary_alias_pair_count = len(subset_genes_hgnc_df)

In [93]:
ncbi_primary_alias_pair_count = len(subset_genes_ncbi_df)

In [94]:
primary_alias_pairs_summary_index = "HGNC", "ENSG", "NCBI"
primary_alias_pairs_summary = {
    "Number of Unique Gene Concept-Alias Pairs": [
        ensg_primary_alias_pair_count,
        hgnc_primary_alias_pair_count,
        ncbi_primary_alias_pair_count,
    ]
}
primary_alias_pairs_summary_df = pd.DataFrame(
    primary_alias_pairs_summary, index=primary_alias_pairs_summary_index
)
primary_alias_pairs_summary_df

Unnamed: 0,Number of Unique Gene Concept-Alias Pairs
HGNC,95005
ENSG,66192
NCBI,91377


## <a id='toc10_2_'></a>[All Sources](#toc0_)

### <a id='toc10_2_1_'></a>[How many unique gene-alias pairs are found between all sources?](#toc0_)

In [95]:
bw_all_sources_primary_alias_pairs_df = pd.concat(
    [
        subset_genes_ensg_df[["alias_symbol", "gene_symbol"]],
        subset_genes_hgnc_df[["alias_symbol", "gene_symbol"]],
        subset_genes_ncbi_df[["alias_symbol", "gene_symbol"]],
    ]
)

In [96]:
len(bw_all_sources_primary_alias_pairs_df)

252574

#### <a id='toc10_2_1_1_'></a>[Remove duplicate concept-alias pairs](#toc0_)

In [97]:
bw_all_sources_primary_alias_pairs_df = bw_all_sources_primary_alias_pairs_df.drop_duplicates(
    subset=["gene_symbol", "alias_symbol"], keep="first"
)

In [98]:
len(bw_all_sources_primary_alias_pairs_df)

123067

In [99]:
hgnc_prim_and_alias_gene_symbol_set = hgnc_alias_symbol_set.union(primary_symbol_set_hgnc)
len(hgnc_prim_and_alias_gene_symbol_set)

49662

In [100]:
ncbi_prim_and_alias_gene_symbol_set = ncbi_alias_symbol_set.union(primary_symbol_set_ncbi)
len(ncbi_prim_and_alias_gene_symbol_set)

64543

In [101]:
ensg_prim_and_alias_gene_symbol_set = ensg_alias_symbol_set.union(primary_symbol_set_ensg)
len(ensg_prim_and_alias_gene_symbol_set)

50129