**Table of contents**<a id='toc0_'></a>    
- [ENSG](#toc1_)    
    - [How many total unique gene records are there in Ensembl](#toc1_1_1_)    
    - [Identify alias-alias collision symbols](#toc1_1_2_)    
- [HGNC](#toc2_)    
    - [How many total unique gene records are there in HGNC](#toc2_1_1_)    
    - [Identify alias-alias collision symbols](#toc2_1_2_)    
- [NCBI Info](#toc3_)    
    - [How many total unique gene records are there in NCBI Gene](#toc3_1_1_)    
    - [Identify alias-alias collision symbols](#toc3_1_2_)    
- [Merge to create Alias-Alias Collision Table- On Primary Gene Symbol](#toc4_)    
- [Merge to create Alias-Alias Collision Table- On Alias Symbol](#toc5_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [118]:
import pandas as pd
import numpy as np
import plotly.express as px

In [119]:
def create_aa_collision_df(subset_genes_xxxx_df: pd.DataFrame, merged_alias_xxxx_df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Create a df of alias-alias collision symbols 

    :param subset_genes_xxxx_df: Processed df of gene records
    :param source: Representation of the source of the gene records
    :return: A df of genes that share an alias with another gene
    """

    #Create df with genes that have an alias that can be found in another gene's alias set
    aa_collision_gene_xxxx_df = subset_genes_xxxx_df.copy()
    aa_collision_gene_xxxx_df["alias_duplicates"] = aa_collision_gene_xxxx_df.duplicated(
    subset="alias_symbol", keep=False
    )
    aa_collision_gene_xxxx_df = aa_collision_gene_xxxx_df[aa_collision_gene_xxxx_df["alias_duplicates"]]
    aa_collision_gene_xxxx_df = aa_collision_gene_xxxx_df.rename(
    columns={"alias_symbol": "collision"})
    aa_collision_gene_xxxx_df = aa_collision_gene_xxxx_df.drop(["alias_duplicates"], axis=1)
    aa_collision_gene_xxxx_df = aa_collision_gene_xxxx_df.sort_values("collision")

    #Add a source tag for future merging efforts
    aa_collision_gene_xxxx_df["source"] = str(source)

    #Create a secondary collision df that merges the alias symbols for each record
    aa_collision_gene_xxxx_df = pd.merge(aa_collision_gene_xxxx_df, merged_alias_xxxx_df, on=["ENSG_ID", "gene_symbol", "HGNC_ID"], how="left")
    aa_collision_gene_xxxx_df = aa_collision_gene_xxxx_df[["gene_symbol","alias_symbol","ENSG_ID","HGNC_ID","NCBI_ID","collision","source"]]

    #Convert the df into a csv
    aa_collision_gene_xxxx_df.to_csv(f'../output/aa_collision_gene_{source}_df.csv', index=True)

    #Create a secondary collision df that prioritizes the collision symbol
    aa_collision_alias_xxxx_df = aa_collision_gene_xxxx_df[
    ["collision", "gene_symbol", "ENSG_ID", "source"]
    ]
    aa_collision_alias_xxxx_df = aa_collision_alias_xxxx_df.map(str)
    aa_collision_alias_xxxx_df = (
    aa_collision_alias_xxxx_df.groupby("collision")
    .agg({"ENSG_ID": ", ".join, "gene_symbol": ", ".join, "source": "first"})
    .reset_index()
    )

    #Convert the df into a csv and save
    aa_collision_alias_xxxx_df.to_csv(f'../output/aa_collision_alias_{source}_df.csv', index=True)

    return subset_genes_xxxx_df.head(), aa_collision_gene_xxxx_df.head(), aa_collision_alias_xxxx_df.head()

# <a id='toc1_'></a>[ENSG](#toc0_)

In [120]:
mini_ensg_df = pd.read_csv(
    "../output/mini_ensg_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)
mini_ensg_df

Unnamed: 0.1,Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,0,ENSG00000210049,,7481,MTTF,MT-TF
1,1,ENSG00000210049,,7481,TRNF,MT-TF
2,2,ENSG00000211459,,7470,12S,MT-RNR1
3,3,ENSG00000211459,,7470,MOTS-C,MT-RNR1
4,4,ENSG00000211459,,7470,MTRNR1,MT-RNR1
...,...,...,...,...,...,...
95019,133058,ENSG00000197989,85028,30062,LINC00100,SNHG12
95020,133059,ENSG00000197989,85028,30062,PNAS-123,SNHG12
95021,133060,ENSG00000229388,,52502,LINC01715,TAF12-DT
95022,133062,ENSG00000274978,26824,10108,RNU11-1,RNU11


In [121]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,7481.0,MTTF,MT-TF
1,ENSG00000210049,,7481.0,TRNF,MT-TF
2,ENSG00000211459,,7470.0,12S,MT-RNR1
3,ENSG00000211459,,7470.0,MOTS-C,MT-RNR1
4,ENSG00000211459,,7470.0,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,85028.0,30062.0,LINC00100,SNHG12
133059,ENSG00000197989,85028.0,30062.0,PNAS-123,SNHG12
133060,ENSG00000229388,,52502.0,LINC01715,TAF12-DT
133062,ENSG00000274978,26824.0,10108.0,RNU11-1,RNU11


In [122]:
merged_alias_ensg_df = pd.read_csv(
    "../output/merged_alias_ensg_df.csv", index_col=[0])
merged_alias_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858.0,"T245,TM4SF6,TSPAN-6"
1,ENSG00000000005,TNMD,17757.0,"BRICD4,CHM1L,MYODULIN,TEM,TENDIN"
2,ENSG00000000419,DPM1,3005.0,"CDGIE,MPDS"
3,ENSG00000000457,SCYL3,19285.0,"PACE-1,PACE1"
4,ENSG00000000460,FIRRM,25565.0,"APOLO1,C1ORF112,FLIP,FLJ10706,MEICA1"
...,...,...,...,...
48400,ENSG00000310527,WASH9P,,
48401,ENSG00000310539,DDX11L2,,","
48402,ENSG00000310540,LINC02968,56008.0,AL683807.1
48403,ENSG00000310560,PAXX,27849.0,"C9ORF142,XLS"


### <a id='toc1_1_1_'></a>[How many total unique gene records are there in Ensembl](#toc0_)

By ENSG ID

In [123]:
ensg_gene_id_set = set(mini_ensg_df["ENSG_ID"])
len(ensg_gene_id_set)

48401

### <a id='toc1_1_2_'></a>[Identify alias-alias collision symbols](#toc0_)

In [124]:
create_aa_collision_df(subset_genes_ensg_df, merged_alias_ensg_df, source="ENSG")

(           ENSG_ID  NCBI_ID  HGNC_ID alias_symbol gene_symbol
 0  ENSG00000210049      NaN   7481.0         MTTF       MT-TF
 1  ENSG00000210049      NaN   7481.0         TRNF       MT-TF
 2  ENSG00000211459      NaN   7470.0          12S     MT-RNR1
 3  ENSG00000211459      NaN   7470.0       MOTS-C     MT-RNR1
 4  ENSG00000211459      NaN   7470.0       MTRNR1     MT-RNR1,
   gene_symbol                                       alias_symbol  \
 0     SLC25A5                                     2F1,ANT2,T2,T3   
 1       KLRG1                            2F1,CLEC15A,MAFA,MAFA-L   
 2      S100A8        60B8AG,CAGA,CFAG,CGLA,MRP-8,MRP8,P8,S100-A8   
 3      S100A9  60B8AG,CAGB,CFAG,CGLB,LIAG,MAC387,MIF,MRP-14,M...   
 4       RNU6V                                          87U6,LH87   
 
            ENSG_ID  HGNC_ID  NCBI_ID collision source  
 0  ENSG00000005022  10991.0    292.0       2F1   ENSG  
 1  ENSG00000139187   6380.0  10219.0       2F1   ENSG  
 2  ENSG00000143546  10498.0   627

In [125]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,7481.0,MTTF,MT-TF
1,ENSG00000210049,,7481.0,TRNF,MT-TF
2,ENSG00000211459,,7470.0,12S,MT-RNR1
3,ENSG00000211459,,7470.0,MOTS-C,MT-RNR1
4,ENSG00000211459,,7470.0,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,85028.0,30062.0,LINC00100,SNHG12
133059,ENSG00000197989,85028.0,30062.0,PNAS-123,SNHG12
133060,ENSG00000229388,,52502.0,LINC01715,TAF12-DT
133062,ENSG00000274978,26824.0,10108.0,RNU11-1,RNU11


In [126]:
aa_collision_gene_ensg_df = pd.read_csv(
    "../output/aa_collision_gene_ensg_df.csv", index_col=[0])
aa_collision_gene_ensg_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,SLC25A5,"2F1,ANT2,T2,T3",ENSG00000005022,10991.0,292.0,2F1,ENSG
1,KLRG1,"2F1,CLEC15A,MAFA,MAFA-L",ENSG00000139187,6380.0,10219.0,2F1,ENSG
2,S100A8,"60B8AG,CAGA,CFAG,CGLA,MRP-8,MRP8,P8,S100-A8",ENSG00000143546,10498.0,6279.0,60B8AG,ENSG
3,S100A9,"60B8AG,CAGB,CFAG,CGLB,LIAG,MAC387,MIF,MRP-14,M...",ENSG00000163220,10499.0,6280.0,60B8AG,ENSG
4,RNU6V,"87U6,LH87",ENSG00000206832,10230.0,,87U6,ENSG
...,...,...,...,...,...,...,...
3672,SLC30A8,"ZNT-8,ZNT8",ENSG00000164756,20303.0,169026.0,ZNT8,ENSG
3673,SLC30A1,"ZNT1,ZRC1",ENSG00000170385,11012.0,7779.0,ZRC1,ENSG
3674,SLC30A10,"DKFZP547M236,ZNT-10,ZNT10,ZNT8,ZRC1",ENSG00000196660,25355.0,55532.0,ZRC1,ENSG
3675,ZYG11B,"FLJ13456,ZYG11",ENSG00000162378,25820.0,79699.0,ZYG11,ENSG


In [127]:
aa_collision_alias_ensg_df = pd.read_csv(
    "../output/aa_collision_alias_ensg_df.csv", index_col=[0])
aa_collision_alias_ensg_df

Unnamed: 0,collision,ENSG_ID,gene_symbol,source
0,2F1,"ENSG00000005022, ENSG00000139187","SLC25A5, KLRG1",ENSG
1,60B8AG,"ENSG00000143546, ENSG00000163220","S100A8, S100A9",ENSG
2,87U6,"ENSG00000206832, ENSG00000065135","RNU6V, GNAI3",ENSG
3,9G8,"ENSG00000164609, ENSG00000115875","SLU7, SRSF7",ENSG
4,A1,"ENSG00000033627, ENSG00000163918, ENSG00000035...","ATP6V0A1, RFC4, RFC1, RFC2",ENSG
...,...,...,...,...
1610,ZIP4,"ENSG00000285243, ENSG00000120498","SLC39A4, TEX11",ENSG
1611,ZNF422,"ENSG00000165512, ENSG00000172943","ZNF22, PHF8",ENSG
1612,ZNT8,"ENSG00000196660, ENSG00000164756","SLC30A10, SLC30A8",ENSG
1613,ZRC1,"ENSG00000170385, ENSG00000196660","SLC30A1, SLC30A10",ENSG


# <a id='toc2_'></a>[HGNC](#toc0_)

In [128]:
mini_hgnc_df = pd.read_csv(
    "../output/mini_hgnc_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)
mini_hgnc_df

Unnamed: 0.1,Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
0,0,5,,1,ENSG00000121410,A1BG
1,1,37133,FLJ23569,503538,ENSG00000268895,A1BG-AS1
2,2,24086,ACF,29974,ENSG00000148584,A1CF
3,3,24086,ASP,29974,ENSG00000148584,A1CF
4,4,24086,ACF64,29974,ENSG00000148584,A1CF
...,...,...,...,...,...,...
67578,67578,29027,KIAA0399,23140,ENSG00000074755,ZZEF1
67579,67579,29027,ZZZ4,23140,ENSG00000074755,ZZEF1
67580,67580,29027,FLJ10821,23140,ENSG00000074755,ZZEF1
67581,67581,24523,DKFZP564I052,26009,ENSG00000036549,ZZZ3


In [129]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])
subset_genes_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
1,37133,FLJ23569,503538.0,ENSG00000268895,A1BG-AS1
2,24086,ACF,29974.0,ENSG00000148584,A1CF
3,24086,ASP,29974.0,ENSG00000148584,A1CF
4,24086,ACF64,29974.0,ENSG00000148584,A1CF
5,24086,ACF65,29974.0,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,29027,KIAA0399,23140.0,ENSG00000074755,ZZEF1
67579,29027,ZZZ4,23140.0,ENSG00000074755,ZZEF1
67580,29027,FLJ10821,23140.0,ENSG00000074755,ZZEF1
67581,24523,DKFZP564I052,26009.0,ENSG00000036549,ZZZ3


In [130]:
merged_alias_hgnc_df = pd.read_csv(
    "../output/merged_alias_hgnc_df.csv", index_col=[0])
merged_alias_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,"T245,TSPAN-6"
1,ENSG00000000005,TNMD,17757,"myodulin,ChM1L,tendin,TEM,BRICD4"
2,ENSG00000000419,DPM1,3005,"MPDS,CDGIE"
3,ENSG00000000457,SCYL3,19285,"PACE-1,PACE1"
4,ENSG00000000460,FIRRM,25565,"FLJ10706,Apolo1,FLIP,MEICA1"
...,...,...,...,...
45641,,ZNF97,13173,
45642,,ZNFP1,13181,
45643,,ZPAXP,51635,ZPX1P
45644,,ZRK,13193,


### <a id='toc2_1_1_'></a>[How many total unique gene records are there in HGNC](#toc0_)

By HGNC ID

In [131]:
hgnc_gene_id_set = set(mini_hgnc_df["HGNC_ID"])
len(hgnc_gene_id_set)

45646

### <a id='toc2_1_2_'></a>[Identify alias-alias collision symbols](#toc0_)

In [132]:
create_aa_collision_df(subset_genes_hgnc_df, merged_alias_hgnc_df, source="HGNC")

(   HGNC_ID alias_symbol   NCBI_ID          ENSG_ID gene_symbol
 1    37133     FLJ23569  503538.0  ENSG00000268895    A1BG-AS1
 2    24086          ACF   29974.0  ENSG00000148584        A1CF
 3    24086          ASP   29974.0  ENSG00000148584        A1CF
 4    24086        ACF64   29974.0  ENSG00000148584        A1CF
 5    24086        ACF65   29974.0  ENSG00000148584        A1CF,
   gene_symbol                                       alias_symbol  \
 0       KLRG1                            MAFA,2F1,MAFA-L,CLEC15A   
 1     SLC25A5                                          T2,2F1,T3   
 2      S100A8                  P8,MRP8,MRP-8,60B8AG,CGLA,S100-A8   
 3      S100A9  P14,MIF,NIF,LIAG,MRP14,MAC387,60B8AG,CGLB,MRP-...   
 4       RNU6V                                          87U6,LH87   
 
            ENSG_ID  HGNC_ID  NCBI_ID collision source  
 0  ENSG00000139187     6380  10219.0       2F1   HGNC  
 1  ENSG00000005022    10991    292.0       2F1   HGNC  
 2  ENSG00000143546    10498

In [133]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])

In [134]:
subset_genes_hgnc_df.loc[subset_genes_hgnc_df["gene_symbol"] == "NPY6R"]

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
36566,7959,PP2,4888.0,ENSG00000226306,NPY6R
36567,7959,NPY1RL,4888.0,ENSG00000226306,NPY6R
36568,7959,NPY6RP,4888.0,ENSG00000226306,NPY6R


In [135]:
aa_collision_gene_hgnc_df = pd.read_csv(
    "../output/aa_collision_gene_hgnc_df.csv", index_col=[0])

In [136]:
aa_collision_alias_hgnc_df = pd.read_csv(
    "../output/aa_collision_alias_hgnc_df.csv", index_col=[0])

# <a id='toc3_'></a>[NCBI Info](#toc0_)

In [137]:
mini_ncbi_df = pd.read_csv(
    "../output/mini_ncbi_df.csv",
    dtype={"HGNC_ID": pd.Int64Dtype(), "NCBI_ID": pd.Int64Dtype()},
)
mini_ncbi_df

Unnamed: 0.1,Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,0,1,A1BG,A1B,5,ENSG00000121410
1,0,1,A1BG,ABG,5,ENSG00000121410
2,0,1,A1BG,GAB,5,ENSG00000121410
3,0,1,A1BG,HYST2477,5,ENSG00000121410
4,1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...,...
92714,193575,8923215,trnD,,,
92715,193576,8923216,trnP,,,
92716,193577,8923217,trnA,,,
92717,193578,8923218,COX1,,,


In [138]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])
subset_genes_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5.0,ENSG00000121410
0,1,A1BG,ABG,5.0,ENSG00000121410
0,1,A1BG,GAB,5.0,ENSG00000121410
0,1,A1BG,HYST2477,5.0,ENSG00000121410
1,2,A2M,A2MD,7.0,ENSG00000175899
...,...,...,...,...,...
193317,139281660,IFT70A-AS1,AGPS,58181.0,
193317,139281660,IFT70A-AS1,PDE11A,58181.0,
193324,139281667,BCAT1-DT,LNC-BCAT1,54396.0,
193398,139440214,LNCOB1,LNC-OB1,56209.0,


In [139]:
merged_alias_ncbi_df = pd.read_csv(
    "../output/merged_alias_ncbi_df.csv", index_col=[0])
merged_alias_ncbi_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858.0,"T245,TM4SF6,TSPAN-6"
1,ENSG00000000005,TNMD,17757.0,"BRICD4,CHM1L,TEM"
2,ENSG00000000419,DPM1,3005.0,"CDGIE,MPDS"
3,ENSG00000000457,SCYL3,19285.0,"PACE-1,PACE1"
4,ENSG00000000460,FIRRM,25565.0,"Apolo1,C1orf112,FLIP,MEICA1"
...,...,...,...,...
45740,,trnS,,",,,"
45741,,trnT,,","
45742,,trnV,,","
45743,,trnW,,","


### <a id='toc3_1_1_'></a>[How many total unique gene records are there in NCBI Gene](#toc0_)

By ENSG ID

In [140]:
ncbi_gene_id_set = set(mini_ncbi_df["NCBI_ID"])
len(ncbi_gene_id_set)

45880

### <a id='toc3_1_2_'></a>[Identify alias-alias collision symbols](#toc0_)

In [141]:
create_aa_collision_df(subset_genes_ncbi_df, merged_alias_ncbi_df, source="NCBI")

(   NCBI_ID gene_symbol alias_symbol  HGNC_ID          ENSG_ID
 0        1        A1BG          A1B      5.0  ENSG00000121410
 0        1        A1BG          ABG      5.0  ENSG00000121410
 0        1        A1BG          GAB      5.0  ENSG00000121410
 0        1        A1BG     HYST2477      5.0  ENSG00000121410
 1        2         A2M         A2MD      7.0  ENSG00000175899,
   gene_symbol                                       alias_symbol  \
 0      BMPR1A     10q23del,ACVRLK3,ALK-3,ALK3,BMPR-1A,CD292,SKR5   
 1        PTEN  10q23del,BZS,CWS1,DEC,GLM2,MHAM,MMAC1,PTEN1,PT...   
 2      ALOX12                               12-LOX,12S-LOX,LOG12   
 3      ALOX15                       12-LOX,15-LOX,15-LOX-1,LOG15   
 4      AKR1C1  2-ALPHA-HSD,20-ALPHA-HSD,C9,DD1,DD1/DD2,DDH,DD...   
 
            ENSG_ID  HGNC_ID  NCBI_ID     collision source  
 0  ENSG00000107779   1076.0      657      10Q23DEL   NCBI  
 1  ENSG00000171862   9588.0     5728      10Q23DEL   NCBI  
 2  ENSG00000108839   

In [142]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])
subset_genes_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5.0,ENSG00000121410
0,1,A1BG,ABG,5.0,ENSG00000121410
0,1,A1BG,GAB,5.0,ENSG00000121410
0,1,A1BG,HYST2477,5.0,ENSG00000121410
1,2,A2M,A2MD,7.0,ENSG00000175899
...,...,...,...,...,...
193317,139281660,IFT70A-AS1,AGPS,58181.0,
193317,139281660,IFT70A-AS1,PDE11A,58181.0,
193324,139281667,BCAT1-DT,LNC-BCAT1,54396.0,
193398,139440214,LNCOB1,LNC-OB1,56209.0,


In [143]:
aa_collision_gene_ncbi_df = pd.read_csv(
    "../output/aa_collision_gene_ncbi_df.csv", index_col=[0])
aa_collision_gene_ncbi_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,BMPR1A,"10q23del,ACVRLK3,ALK-3,ALK3,BMPR-1A,CD292,SKR5",ENSG00000107779,1076.0,657,10Q23DEL,NCBI
1,PTEN,"10q23del,BZS,CWS1,DEC,GLM2,MHAM,MMAC1,PTEN1,PT...",ENSG00000171862,9588.0,5728,10Q23DEL,NCBI
2,ALOX12,"12-LOX,12S-LOX,LOG12",ENSG00000108839,429.0,239,12-LOX,NCBI
3,ALOX15,"12-LOX,15-LOX,15-LOX-1,LOG15",ENSG00000161905,433.0,246,12-LOX,NCBI
4,AKR1C1,"2-ALPHA-HSD,20-ALPHA-HSD,C9,DD1,DD1/DD2,DDH,DD...",ENSG00000187134,384.0,1645,20-ALPHA-HSD,NCBI
...,...,...,...,...,...,...,...
8916,SLC30A10,"HMDPC,HMNDYT1,ZNT10,ZNT8,ZRC1,ZnT-10",ENSG00000196660,25355.0,55532,ZRC1,NCBI
8917,PEX13,"NALD,PBD11A,PBD11B,ZWS",ENSG00000162928,8855.0,5194,ZWS,NCBI
8918,PEX1,"HMLR1,PBD1A,PBD1B,ZWS,ZWS1",ENSG00000127980,8850.0,5189,ZWS,NCBI
8919,ZYG11B,ZYG11,ENSG00000162378,25820.0,79699,ZYG11,NCBI


In [144]:
aa_collision_alias_ncbi_df = pd.read_csv(
    "../output/aa_collision_alias_ncbi_df.csv", index_col=[0])

# <a id='toc4_'></a>[Merge to create Alias-Alias Collision Table- On Primary Gene Symbol](#toc0_)

In [145]:
merged_aa_collision_gene_df = pd.concat(
    [
        aa_collision_gene_hgnc_df[["gene_symbol", "alias_symbol", "ENSG_ID", "collision", "source"]],
        aa_collision_gene_ncbi_df[["gene_symbol", "alias_symbol", "ENSG_ID", "collision", "source"]],
        aa_collision_gene_ensg_df[["gene_symbol", "alias_symbol", "ENSG_ID", "collision", "source"]],
    ]
)
merged_aa_collision_gene_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,collision,source
0,KLRG1,"MAFA,2F1,MAFA-L,CLEC15A",ENSG00000139187,2F1,HGNC
1,SLC25A5,"T2,2F1,T3",ENSG00000005022,2F1,HGNC
2,S100A8,"P8,MRP8,MRP-8,60B8AG,CGLA,S100-A8",ENSG00000143546,60B8AG,HGNC
3,S100A9,"P14,MIF,NIF,LIAG,MRP14,MAC387,60B8AG,CGLB,MRP-...",ENSG00000163220,60B8AG,HGNC
4,RNU6V,"87U6,LH87",ENSG00000206832,87U6,HGNC
...,...,...,...,...,...
3672,SLC30A8,"ZNT-8,ZNT8",ENSG00000164756,ZNT8,ENSG
3673,SLC30A1,"ZNT1,ZRC1",ENSG00000170385,ZRC1,ENSG
3674,SLC30A10,"DKFZP547M236,ZNT-10,ZNT10,ZNT8,ZRC1",ENSG00000196660,ZRC1,ENSG
3675,ZYG11B,"FLJ13456,ZYG11",ENSG00000162378,ZYG11,ENSG


In [146]:
merged_aa_collision_gene_df.to_csv(
    "../output/merged_aa_collision_gene_df.csv", index=False
)

In [147]:
merged_aa_collision_gene_df.loc[merged_aa_collision_gene_df.collision == "ALP"]

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,collision,source
82,ASRGL1,"FLJ22316,ALP1,ALP",ENSG00000162174,ALP,HGNC
83,ATHS,ALP,,ALP,HGNC
84,ATRNL1,"KIAA0534,FLJ45344,ALP",ENSG00000107518,ALP,HGNC
85,PDLIM3,ALP,ENSG00000154553,ALP,HGNC
86,CCL27,"ALP,ILC,CTACK,skinkine,ESkine,PESKY,CTAK",ENSG00000213927,ALP,HGNC
87,SLPI,"HUSI-I,ALK1,ALP,BLPI,HUSI,WAP4,WFDC4",ENSG00000124107,ALP,HGNC
275,NAT10,"ALP,Kre33,NET43",ENSG00000135372,ALP,NCBI
276,PDLIM3,ALP,ENSG00000154553,ALP,NCBI
277,ATHS,ALP,,ALP,NCBI
278,ALPP,"ALP,ALPI,IAP,PALP,PLAP,PLAP-1",ENSG00000163283,ALP,NCBI


# <a id='toc5_'></a>[Merge to create Alias-Alias Collision Table- On Alias Symbol](#toc0_)

In [148]:
merged_aa_collision_alias_df = pd.concat(
    [
        aa_collision_alias_hgnc_df[["collision", "gene_symbol", "ENSG_ID", "source"]],
        aa_collision_alias_ncbi_df[["collision", "gene_symbol", "ENSG_ID", "source"]],
        aa_collision_alias_ensg_df[["collision", "gene_symbol", "ENSG_ID", "source"]],
    ]
)
merged_aa_collision_alias_df

Unnamed: 0,collision,gene_symbol,ENSG_ID,source
0,2F1,"KLRG1, SLC25A5","ENSG00000139187, ENSG00000005022",HGNC
1,60B8AG,"S100A8, S100A9","ENSG00000143546, ENSG00000163220",HGNC
2,87U6,"RNU6V, GNAI3","ENSG00000206832, ENSG00000065135",HGNC
3,9G8,"SRSF7, SLU7","ENSG00000115875, ENSG00000164609",HGNC
4,A1,"ATP6V0A1, RFC1, RFC4, RFC2","ENSG00000033627, ENSG00000035928, ENSG00000163...",HGNC
...,...,...,...,...
1610,ZIP4,"SLC39A4, TEX11","ENSG00000285243, ENSG00000120498",ENSG
1611,ZNF422,"ZNF22, PHF8","ENSG00000165512, ENSG00000172943",ENSG
1612,ZNT8,"SLC30A10, SLC30A8","ENSG00000196660, ENSG00000164756",ENSG
1613,ZRC1,"SLC30A1, SLC30A10","ENSG00000170385, ENSG00000196660",ENSG


In [149]:
merged_aa_collision_alias_df["gene_symbol"] = merged_aa_collision_alias_df[
    "gene_symbol"
].str.split(",")
merged_aa_collision_alias_df["gene_symbol_count"] = [
    len(c) for c in merged_aa_collision_alias_df["gene_symbol"]
]
merged_aa_collision_alias_df = merged_aa_collision_alias_df.sort_values(
    by="gene_symbol_count", ascending=False
)
merged_aa_collision_alias_df

Unnamed: 0,collision,gene_symbol,ENSG_ID,source,gene_symbol_count
3621,VH,"[IGHV3-72, SLC7A4, IGHV3-64, IGHV3-53, IGH...","ENSG00000225698, ENSG00000099960, ENSG00000223...",NCBI,37
1409,H4-16,"[H4C15, H4C3, H4C4, H4C8, H4C16, H4C14, ...","ENSG00000270276, ENSG00000197061, ENSG00000277...",NCBI,14
1415,H4C15,"[H4C13, H4C9, H4C4, H4C5, H4C1, H4C14, H...","ENSG00000275126, ENSG00000276180, ENSG00000277...",NCBI,13
1421,H4C6,"[H4C11, H4C5, H4C14, H4C2, H4C15, H4C4, ...","ENSG00000197238, ENSG00000276966, ENSG00000270...",NCBI,13
1410,H4C1,"[H4C12, H4C15, H4C2, H4C6, H4C13, H4C4, ...","ENSG00000273542, ENSG00000270276, ENSG00000278...",NCBI,13
...,...,...,...,...,...
1146,FCP,"[HBFQTL2, FCP1]","nan, nan",NCBI,2
1145,FCHL2,"[HYPLIP2, APOB]","nan, ENSG00000084674",NCBI,2
1144,FCGR3,"[FCGR3A, FCGR3B]","ENSG00000203747, ENSG00000162747",NCBI,2
1143,FCGR2,"[FCGR2B, FCGR2A]","ENSG00000072694, ENSG00000143226",NCBI,2


In [150]:
merged_aa_collision_alias_df.loc[merged_aa_collision_alias_df["collision"] == "ASP"]

Unnamed: 0,collision,gene_symbol,ENSG_ID,source,gene_symbol_count
232,ASP,"[ATG5, A1CF, ROPN1L, ASIP, C3, ASPM, ASP...","ENSG00000057663, ENSG00000148584, ENSG00000145...",NCBI,8
82,ASP,"[TMPRSS11D, ROPN1L, ATG5, A1CF, ASPM, ASP...","ENSG00000153802, ENSG00000145491, ENSG00000057...",HGNC,7
112,ASP,"[ROPN1L, ASPA, ASIP, A1CF, ASPM, ATG5, T...","ENSG00000145491, ENSG00000108381, ENSG00000101...",ENSG,7


In [151]:
merged_aa_collision_alias_df.to_csv(
    "../output/merged_aa_collision_alias_df.csv", index=True, quoting=0
)

In [152]:
aa_collision_set = set(merged_aa_collision_alias_df["collision"].tolist())
len(aa_collision_set)

3809