**Table of contents**<a id='toc0_'></a>    
- [ENSG](#toc1_)    
    - [How many total unique gene records are there in Ensembl](#toc1_1_1_)    
    - [Identify alias-alias collision symbols](#toc1_1_2_)    
- [HGNC](#toc2_)    
    - [How many total unique gene records are there in HGNC](#toc2_1_1_)    
    - [Identify alias-alias collision symbols](#toc2_1_2_)    
- [NCBI Info](#toc3_)    
    - [How many total unique gene records are there in NCBI Gene](#toc3_1_1_)    
    - [Identify alias-alias collision symbols](#toc3_1_2_)    
- [Merge to create Alias-Alias Collision Table- On Primary Gene Symbol](#toc4_)    
- [Merge to create Alias-Alias Collision Table- On Alias Symbol](#toc5_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [208]:
import pandas as pd
import numpy as np
import plotly.express as px

In [209]:
def create_aa_collision_df(subset_genes_xxxx_df: pd.DataFrame, merged_alias_xxxx_df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Create a df of alias-alias collision symbols 

    :param subset_genes_xxxx_df: Processed df of gene records
    :param source: Representation of the source of the gene records
    :return: A df of genes that share an alias with another gene
    """

    #Create df with genes that have an alias that can be found in another gene's alias set
    aa_collision_xxxx_df = subset_genes_xxxx_df.copy()
    aa_collision_xxxx_df = aa_collision_xxxx_df.dropna(subset=["alias_symbol"])
    aa_collision_xxxx_df['alias_symbol'] = aa_collision_xxxx_df['alias_symbol'].str.upper()

    aa_collision_xxxx_df["alias_duplicates"] = aa_collision_xxxx_df.duplicated(
    subset="alias_symbol", keep=False
    )
    aa_collision_xxxx_df = aa_collision_xxxx_df[aa_collision_xxxx_df["alias_duplicates"]]
    aa_collision_xxxx_df = aa_collision_xxxx_df.rename(
    columns={"alias_symbol": "collision"})
    aa_collision_xxxx_df = aa_collision_xxxx_df.drop(["alias_duplicates"], axis=1)
    aa_collision_xxxx_df = aa_collision_xxxx_df.sort_values("collision")

    #Add a source tag for future merging efforts
    aa_collision_xxxx_df["source"] = str(source)
    aa_collision_xxxx_df.to_csv(f'../output/single_alias_aa_collision_{source.lower()}_df.csv', index=True)

    #Create a secondary collision df that merges the alias symbols for each record
    columns_map = {
    'ENSG': ['NCBI_ID', 'HGNC_ID'],
    'HGNC': ['NCBI_ID', 'ENSG_ID'],
    'NCBI': ['HGNC_ID', 'ENSG_ID']
    }
    cols_of_interest = columns_map.get(source, [])

    merged_alias_aa_collision_xxxx_df = aa_collision_xxxx_df.drop(columns=cols_of_interest)

    merged_alias_aa_collision_xxxx_df = pd.merge(
        merged_alias_aa_collision_xxxx_df,
        merged_alias_xxxx_df[[f"{source}_ID"] + cols_of_interest + ['alias_symbol']],
        on=[f"{source}_ID"],
        how="left"
    )
    #Convert the df into a csv
    merged_alias_aa_collision_xxxx_df.to_csv(f'../output/merged_alias_aa_collision_{source.lower()}_df.csv', index=True)

    return merged_alias_aa_collision_xxxx_df.head()

# <a id='toc1_'></a>[ENSG](#toc0_)

In [210]:
mini_ensg_df = pd.read_csv(
    "../output/mini_ensg_df.csv", index_col=0,
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)

In [211]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])

In [212]:
merged_alias_ensg_df = pd.read_csv(
    "../output/merged_alias_ensg_df.csv", index_col=[0])

### <a id='toc1_1_1_'></a>[How many total unique gene records are there in Ensembl](#toc0_)

In [213]:
gene_record_set_ensg = set(mini_ensg_df['ENSG_ID'])
gene_record_count_ensg = len(gene_record_set_ensg)
gene_record_count_ensg

48401

### <a id='toc1_1_2_'></a>[Identify alias-alias collision symbols](#toc0_)

In [214]:
create_aa_collision_df(subset_genes_ensg_df, merged_alias_ensg_df, source="ENSG")

Unnamed: 0,ENSG_ID,collision,gene_symbol,source,NCBI_ID,HGNC_ID,alias_symbol
0,ENSG00000274945,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"
1,ENSG00000274193,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"
2,ENSG00000276779,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"
3,ENSG00000273498,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"
4,ENSG00000276979,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"


In [215]:
merged_alias_aa_collision_ensg_df = pd.read_csv(
    "../output/merged_alias_aa_collision_ensg_df.csv", index_col=[0])
merged_alias_aa_collision_ensg_df

Unnamed: 0,ENSG_ID,collision,gene_symbol,source,NCBI_ID,HGNC_ID,alias_symbol
0,ENSG00000274945,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"
1,ENSG00000274193,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"
2,ENSG00000276779,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"
3,ENSG00000273498,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"
4,ENSG00000276979,103AS,KIR2DL4,ENSG,3805,6332,"103AS,15.212,CD158D"
...,...,...,...,...,...,...,...
24640,ENSG00000186448,ZSCAN41,ZNF197,ENSG,10168110354863,12988,"D3S1363E,P18,ZKSCAN9,ZNF166,ZSCAN41"
24641,ENSG00000162714,ZSCAN49,ZNF496,ENSG,84838,23713,"MGC15548,ZKSCAN17,ZSCAN49"
24642,ENSG00000291884,ZSCAN49,ZNF496,ENSG,84838,23713,"MGC15548,ZKSCAN17,ZSCAN49"
24643,ENSG00000162378,ZYG11,ZYG11B,ENSG,79699,25820,"FLJ13456,ZYG11"


How many ambiguous symbols result from alias-alias collisions?

In [216]:
aa_collision_ambiguous_symbol_set_ensg = set(merged_alias_aa_collision_ensg_df["collision"])
aa_collision_ambiguous_symbol_count_ensg = len(aa_collision_ambiguous_symbol_set_ensg)
aa_collision_ambiguous_symbol_count_ensg

6751

How many records have at least one alias-alias collision (alias that matches another record's alias gene symbol)?

In [217]:
aa_record_set_ensg = set(merged_alias_aa_collision_ensg_df["ENSG_ID"])
aa_record_count_ensg = len(aa_record_set_ensg)
aa_record_count_ensg

9496

# <a id='toc2_'></a>[HGNC](#toc0_)

In [218]:
mini_hgnc_df = pd.read_csv(
    "../output/mini_hgnc_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)

In [219]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])

In [220]:
merged_alias_hgnc_df = pd.read_csv(
    "../output/merged_alias_hgnc_df.csv", index_col=[0])

### <a id='toc2_1_2_'></a>[Identify alias-alias collision symbols](#toc0_)

In [221]:
create_aa_collision_df(subset_genes_hgnc_df, merged_alias_hgnc_df, source="HGNC")

Unnamed: 0,HGNC_ID,gene_symbol,collision,source,NCBI_ID,ENSG_ID,alias_symbol
0,10991,SLC25A5,2F1,HGNC,292,ENSG00000005022,"2F1,T2,T3"
1,6380,KLRG1,2F1,HGNC,10219,ENSG00000139187,"2F1,CLEC15A,MAFA,MAFA-L"
2,6888,MAPKAPK3,3PK,HGNC,7867,ENSG00000114738,"3PK,3pK,MAPKAP3,MK-3,MK3"
3,6888,MAPKAPK3,3PK,HGNC,7867,ENSG00000114738,"3PK,3pK,MAPKAP3,MK-3,MK3"
4,10498,S100A8,60B8AG,HGNC,6279,ENSG00000143546,"60B8AG,CGLA,MRP-8,MRP8,P8,S100-A8"


In [222]:
merged_alias_aa_collision_hgnc_df = pd.read_csv(
    "../output/merged_alias_aa_collision_hgnc_df.csv", index_col=[0])
merged_alias_aa_collision_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,collision,source,NCBI_ID,ENSG_ID,alias_symbol
0,10991,SLC25A5,2F1,HGNC,292,ENSG00000005022,"2F1,T2,T3"
1,6380,KLRG1,2F1,HGNC,10219,ENSG00000139187,"2F1,CLEC15A,MAFA,MAFA-L"
2,6888,MAPKAPK3,3PK,HGNC,7867,ENSG00000114738,"3PK,3pK,MAPKAP3,MK-3,MK3"
3,6888,MAPKAPK3,3PK,HGNC,7867,ENSG00000114738,"3PK,3pK,MAPKAP3,MK-3,MK3"
4,10498,S100A8,60B8AG,HGNC,6279,ENSG00000143546,"60B8AG,CGLA,MRP-8,MRP8,P8,S100-A8"
...,...,...,...,...,...,...,...
2881,20672,PHF8,ZNF422,HGNC,23133,ENSG00000172943,"JHDM1F,KDM7B,KIAA1111,ZNF422"
2882,20303,SLC30A8,ZNT8,HGNC,169026,ENSG00000164756,"ZNT8,ZnT-8"
2883,25355,SLC30A10,ZNT8,HGNC,55532,ENSG00000196660,"DKFZp547M236,ZNT10,ZNT8,ZRC1,ZnT-10"
2884,11012,SLC30A1,ZRC1,HGNC,7779,ENSG00000170385,ZRC1


How many ambiguous symbols result from alias-alias collisions?

In [223]:
aa_collision_ambiguous_symbol_set_hgnc = set(merged_alias_aa_collision_hgnc_df["collision"])
aa_collision_ambiguous_symbol_count_hgnc = len(aa_collision_ambiguous_symbol_set_hgnc)
aa_collision_ambiguous_symbol_count_hgnc

1262

How many records have at least one alias-alias collision (alias that matches another record's alias gene symbol)?

In [224]:
aa_record_set_hgnc = set(merged_alias_aa_collision_hgnc_df["HGNC_ID"])
aa_record_count_hgnc = len(aa_record_set_hgnc)
aa_record_count_hgnc

2507

# <a id='toc3_'></a>[NCBI Info](#toc0_)

In [225]:
mini_ncbi_df = pd.read_csv(
    "../output/mini_ncbi_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)

In [226]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])

In [227]:
merged_alias_ncbi_df = pd.read_csv(
    "../output/merged_alias_ncbi_df.csv", index_col=[0])

### <a id='toc3_1_2_'></a>[Identify alias-alias collision symbols](#toc0_)

In [228]:
create_aa_collision_df(subset_genes_ncbi_df, merged_alias_ncbi_df, source="NCBI")

Unnamed: 0,NCBI_ID,gene_symbol,collision,source,HGNC_ID,ENSG_ID,alias_symbol
0,657,BMPR1A,10Q23DEL,NCBI,1076,ENSG00000107779,"10q23del,ACVRLK3,ALK-3,ALK3,BMPR-1A,CD292,SKR5"
1,5728,PTEN,10Q23DEL,NCBI,9588,ENSG00000171862,"10q23del,BZS,CWS1,DEC,GLM2,MHAM,MMAC1,PTEN1,PT..."
2,239,ALOX12,12-LOX,NCBI,429,ENSG00000108839,"12-LOX,12S-LOX,LOG12"
3,246,ALOX15,12-LOX,NCBI,433,ENSG00000161905,"12-LOX,15-LOX,15-LOX-1,LOG15"
4,1645,AKR1C1,20-ALPHA-HSD,NCBI,384,ENSG00000187134,"2-ALPHA-HSD,20-ALPHA-HSD,C9,DD1,DD1/DD2,DDH,DD..."


In [229]:
merged_alias_aa_collision_ncbi_df = pd.read_csv(
    "../output/merged_alias_aa_collision_ncbi_df.csv", index_col=[0])
merged_alias_aa_collision_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,collision,source,HGNC_ID,ENSG_ID,alias_symbol
0,657,BMPR1A,10Q23DEL,NCBI,1076,ENSG00000107779,"10q23del,ACVRLK3,ALK-3,ALK3,BMPR-1A,CD292,SKR5"
1,5728,PTEN,10Q23DEL,NCBI,9588,ENSG00000171862,"10q23del,BZS,CWS1,DEC,GLM2,MHAM,MMAC1,PTEN1,PT..."
2,239,ALOX12,12-LOX,NCBI,429,ENSG00000108839,"12-LOX,12S-LOX,LOG12"
3,246,ALOX15,12-LOX,NCBI,433,ENSG00000161905,"12-LOX,15-LOX,15-LOX-1,LOG15"
4,1645,AKR1C1,20-ALPHA-HSD,NCBI,384,ENSG00000187134,"2-ALPHA-HSD,20-ALPHA-HSD,C9,DD1,DD1/DD2,DDH,DD..."
...,...,...,...,...,...,...,...
8801,7779,SLC30A1,ZRC1,NCBI,11012,ENSG00000170385,"ZNT1,ZRC1"
8802,5189,PEX1,ZWS,NCBI,8850,ENSG00000127980,"HMLR1,PBD1A,PBD1B,ZWS,ZWS1"
8803,5194,PEX13,ZWS,NCBI,8855,ENSG00000162928,"NALD,PBD11A,PBD11B,ZWS"
8804,79699,ZYG11B,ZYG11,NCBI,25820,ENSG00000162378,ZYG11


How many ambiguous symbols result from alias-alias collisions?

In [230]:
aa_collision_ambiguous_symbol_set_ncbi = set(merged_alias_aa_collision_ncbi_df["collision"])
aa_collision_ambiguous_symbol_count_ncbi = len(aa_collision_ambiguous_symbol_set_ncbi)
aa_collision_ambiguous_symbol_count_ncbi

3651

How many records have at least one alias-alias collision (alias that matches another record's alias gene symbol)?

In [231]:
aa_record_set_ncbi = set(merged_alias_aa_collision_ncbi_df["NCBI_ID"])
aa_record_count_ncbi = len(aa_record_set_ncbi)
aa_record_count_ncbi

5972

# <a id='toc4_'></a>[Merge to create Alias-Alias Collision Table- On Primary Gene Symbol](#toc0_)

In [232]:
merged_aa_collision_gene_df = pd.concat(
    [
        merged_alias_aa_collision_hgnc_df[["gene_symbol", "alias_symbol", "ENSG_ID", "collision", "source"]],
        merged_alias_aa_collision_ncbi_df[["gene_symbol", "alias_symbol", "ENSG_ID", "collision", "source"]],
        merged_alias_aa_collision_ensg_df[["gene_symbol", "alias_symbol", "ENSG_ID", "collision", "source"]],
    ]
)
merged_aa_collision_gene_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,collision,source
0,SLC25A5,"2F1,T2,T3",ENSG00000005022,2F1,HGNC
1,KLRG1,"2F1,CLEC15A,MAFA,MAFA-L",ENSG00000139187,2F1,HGNC
2,MAPKAPK3,"3PK,3pK,MAPKAP3,MK-3,MK3",ENSG00000114738,3PK,HGNC
3,MAPKAPK3,"3PK,3pK,MAPKAP3,MK-3,MK3",ENSG00000114738,3PK,HGNC
4,S100A8,"60B8AG,CGLA,MRP-8,MRP8,P8,S100-A8",ENSG00000143546,60B8AG,HGNC
...,...,...,...,...,...
24640,ZNF197,"D3S1363E,P18,ZKSCAN9,ZNF166,ZSCAN41",ENSG00000186448,ZSCAN41,ENSG
24641,ZNF496,"MGC15548,ZKSCAN17,ZSCAN49",ENSG00000162714,ZSCAN49,ENSG
24642,ZNF496,"MGC15548,ZKSCAN17,ZSCAN49",ENSG00000291884,ZSCAN49,ENSG
24643,ZYG11B,"FLJ13456,ZYG11",ENSG00000162378,ZYG11,ENSG


In [233]:
merged_aa_collision_gene_df.to_csv(
    "../output/merged_aa_collision_gene_df.csv", index=False
)

In [234]:
merged_aa_collision_gene_df.loc[merged_aa_collision_gene_df.collision == "ALP"]

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,collision,source
86,ATRNL1,"ALP,FLJ45344,KIAA0534",ENSG00000107518,ALP,HGNC
87,ASRGL1,"ALP,ALP1,FLJ22316",ENSG00000162174,ALP,HGNC
88,SLPI,"ALK1,ALP,BLPI,HUSI,HUSI-I,WAP4,WFDC4",ENSG00000124107,ALP,HGNC
89,PDLIM3,ALP,ENSG00000154553,ALP,HGNC
90,CCL27,"ALP,CTACK,CTAK,ESkine,ILC,PESKY,skinkine",ENSG00000213927,ALP,HGNC
263,NAT10,"ALP,Kre33,NET43",ENSG00000135372,ALP,NCBI
264,ASRGL1,"ALP,ALP1,CRASH",ENSG00000162174,ALP,NCBI
265,CCL27,"ALP,CTACK,CTAK,ESKINE,ILC,PESKY,SCYA27",ENSG00000213927,ALP,NCBI
266,PDLIM3,ALP,ENSG00000154553,ALP,NCBI
267,ATRNL1,"ALP,bA338L11.1,bA454H24.1",ENSG00000107518,ALP,NCBI
