### Import 

In [43]:
import pandas as pd
import numpy as np

In [44]:
def create_ap_collision_df(mini_xxxx_df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Create a df of alias-primary collision symbols 

    :param mini_xxxx_df: Processed df of gene records
    :param source: Representation of the source of the gene records
    :param split_on_character: Character that is used to seperate alias symbols in the mini_xxxx_df
    :return: A df of genes that share an alias with another gene
    """

    #Strip "HGNC" from HGNC ID
    mini_xxxx_df["HGNC_ID"] = mini_xxxx_df["HGNC_ID"].str.lstrip("HGNC:")

    #Remove placeholder LOC genes
    mini_xxxx_df = mini_xxxx_df[~mini_xxxx_df['gene_symbol'].str.startswith('LOC', na=False)]

    #Convert the df into a csv and save
    mini_xxxx_df.to_csv(f'../output/mini_{source.lower()}_df.csv', index=True) 

    #Make a new df where the alias symbols are merged together for each record
    merged_alias_xxxx_df = mini_xxxx_df.copy()
    merged_alias_xxxx_df["alias_symbol"] = merged_alias_xxxx_df["alias_symbol"].fillna("").astype(str)
    merged_alias_xxxx_df = (
    merged_alias_xxxx_df.groupby(
        ["ENSG_ID", "gene_symbol", "HGNC_ID"], dropna=False
    )["alias_symbol"]
    .apply(lambda x: ",".join(x.dropna())).reset_index())

    #Convert the df into a csv and save
    merged_alias_xxxx_df.to_csv(f'../output/merged_alias_{source}_df.csv', index=True) 

    #Make a set of the primary gene symbols
    xxxx_gene_symbol_set = set(mini_xxxx_df["gene_symbol"])

    #Drop genes without any aliases
    subset_genes_xxxx_df = mini_xxxx_df.copy()
    subset_genes_xxxx_df = subset_genes_xxxx_df.replace(" ", np.nan)
    subset_genes_xxxx_df = subset_genes_xxxx_df.replace("", np.nan)
    subset_genes_xxxx_df = subset_genes_xxxx_df.replace("-", np.nan)
    subset_genes_xxxx_df = subset_genes_xxxx_df.dropna(subset=["alias_symbol"])

    #Remove duplicate instances of primary gene symbol- alias pairs
    #(occur because the same primary gene symbol may have multiple different ENSG IDs, 
    #see gene RFLNB for example)
    subset_genes_xxxx_df["gene_symbol"] = subset_genes_xxxx_df["gene_symbol"].str.upper()
    subset_genes_xxxx_df["alias_symbol"] = subset_genes_xxxx_df["alias_symbol"].str.upper()
    subset_genes_xxxx_df = subset_genes_xxxx_df.drop_duplicates(subset=['gene_symbol', 'alias_symbol'], keep='first')

    #Remove instances where the primary gene symbol is an alias for that gene record
    #the pair would be the same symbol twice
    subset_genes_xxxx_df = subset_genes_xxxx_df[subset_genes_xxxx_df['gene_symbol'] != subset_genes_xxxx_df['alias_symbol']]

    #Convert the df into a csv and save
    subset_genes_xxxx_df.to_csv(f'../output/subset_genes_{source}_df.csv', index=True) 

    #Create df with genes that have an alias that can be found as another gene's primary gene symbol
    ap_collision_xxxx_df = subset_genes_xxxx_df.copy()
    ap_collision_xxxx_df.loc[:, "alias_symbol"] = ap_collision_xxxx_df["alias_symbol"].apply(lambda x: {x})
    ap_collision_xxxx_df.loc[:, "collision"] = ap_collision_xxxx_df["alias_symbol"].apply(
        lambda x: x & xxxx_gene_symbol_set
    )
    ap_collision_xxxx_df = ap_collision_xxxx_df[ap_collision_xxxx_df["collision"].apply(lambda x: len(x) > 0)]
    ap_collision_xxxx_df = ap_collision_xxxx_df.applymap(lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x)
    ap_collision_xxxx_df['NCBI_ID'] = ap_collision_xxxx_df['NCBI_ID'].fillna(0).astype(int)
    ap_collision_xxxx_df = ap_collision_xxxx_df.sort_values("collision")

    #Add a source tag for future merging efforts
    ap_collision_xxxx_df["source"] = str(source.upper())

    #Convert the df into a csv
    ap_collision_xxxx_df.to_csv(f'../output/single_alias_ap_collision_{source.lower()}_df.csv', index=True)

    #Create a secondary collision df that merges the alias symbols for each record
    merged_alias_ap_collision_xxxx_df = ap_collision_xxxx_df.drop(columns=['alias_symbol'])
    merged_alias_ap_collision_xxxx_df = pd.merge(merged_alias_ap_collision_xxxx_df, merged_alias_xxxx_df, on=["ENSG_ID", "gene_symbol", "HGNC_ID"], how="left")
    merged_alias_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df[["gene_symbol","alias_symbol","ENSG_ID","HGNC_ID","NCBI_ID","collision","source"]]

    #Test to make sure all collisions are in the alias list for the record
    test_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df.copy()
    test_ap_collision_xxxx_df['alias_symbol'] = test_ap_collision_xxxx_df['alias_symbol'].apply(lambda x: x if isinstance(x, list) else [x])
    test_ap_collision_xxxx_df['collision_in_alias'] = test_ap_collision_xxxx_df.apply(lambda row: row['collision'] in test_ap_collision_xxxx_df['alias_symbol'].values, axis=1)
    true_count = test_ap_collision_xxxx_df['collision_in_alias'].sum()

    if true_count == 0:
        #Convert the df into a csv and save
        merged_alias_ap_collision_xxxx_df.to_csv(f'../output/merged_alias_ap_collision_{source.lower()}_df.csv', index=True)
        print("All collisions are present in gene alias lists.")
    else:
        print("Some collisions are not present in gene alias lists.")

    return mini_xxxx_df.head(), subset_genes_xxxx_df.head(), ap_collision_xxxx_df.head()

# Ensembl

In [45]:
mini_ensg_df = pd.read_csv(
    "../input/ensg_biomart_gene20240626.txt", sep="\t",dtype={"NCBI gene (formerly Entrezgene) ID": pd.Int64Dtype()}
)
mini_ensg_df = mini_ensg_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Gene Synonym": "alias_symbol",
        "Gene name": "gene_symbol",
        "Gene stable ID": "ENSG_ID",
        "NCBI gene (formerly Entrezgene) ID": "NCBI_ID",
    }
)
mini_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,alias_symbol,HGNC_ID,NCBI_ID
0,ENSG00000210049,MT-TF,MTTF,HGNC:7481,
1,ENSG00000210049,MT-TF,TRNF,HGNC:7481,
2,ENSG00000211459,MT-RNR1,12S,HGNC:7470,
3,ENSG00000211459,MT-RNR1,MOTS-C,HGNC:7470,
4,ENSG00000211459,MT-RNR1,MTRNR1,HGNC:7470,
...,...,...,...,...,...
117135,ENSG00000200033,RNU6-403P,,HGNC:47366,
117136,ENSG00000228437,LINC02474,LNCSLCC1,HGNC:53417,
117137,ENSG00000228437,LINC02474,RP11-400N13.2,HGNC:53417,
117138,ENSG00000229463,LYST-AS1,LYST-IT2,HGNC:41320,


In [46]:
create_ap_collision_df(mini_ensg_df, "ENSG")

All collisions are present in gene alias lists.


  ap_collision_xxxx_df = ap_collision_xxxx_df.applymap(lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x)


(           ENSG_ID gene_symbol alias_symbol HGNC_ID  NCBI_ID
 0  ENSG00000210049       MT-TF         MTTF    7481     <NA>
 1  ENSG00000210049       MT-TF         TRNF    7481     <NA>
 2  ENSG00000211459     MT-RNR1          12S    7470     <NA>
 3  ENSG00000211459     MT-RNR1       MOTS-C    7470     <NA>
 4  ENSG00000211459     MT-RNR1       MTRNR1    7470     <NA>,
            ENSG_ID gene_symbol alias_symbol HGNC_ID  NCBI_ID
 0  ENSG00000210049       MT-TF         MTTF    7481     <NA>
 1  ENSG00000210049       MT-TF         TRNF    7481     <NA>
 2  ENSG00000211459     MT-RNR1          12S    7470     <NA>
 3  ENSG00000211459     MT-RNR1       MOTS-C    7470     <NA>
 4  ENSG00000211459     MT-RNR1       MTRNR1    7470     <NA>,
                ENSG_ID gene_symbol alias_symbol HGNC_ID  NCBI_ID collision  \
 33522  ENSG00000283293       RN7SK          7SK   10037   125050       7SK   
 51305  ENSG00000057252       SOAT1        ACAT1   11177     6646     ACAT1   
 29843  ENSG00000

In [47]:
mini_ensg_df= pd.read_csv(
    "../output/mini_ensg_df.csv", index_col=[0])
mini_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,alias_symbol,HGNC_ID,NCBI_ID
0,ENSG00000210049,MT-TF,MTTF,7481.0,
1,ENSG00000210049,MT-TF,TRNF,7481.0,
2,ENSG00000211459,MT-RNR1,12S,7470.0,
3,ENSG00000211459,MT-RNR1,MOTS-C,7470.0,
4,ENSG00000211459,MT-RNR1,MTRNR1,7470.0,
...,...,...,...,...,...
117135,ENSG00000200033,RNU6-403P,,47366.0,
117136,ENSG00000228437,LINC02474,LNCSLCC1,53417.0,
117137,ENSG00000228437,LINC02474,RP11-400N13.2,53417.0,
117138,ENSG00000229463,LYST-AS1,LYST-IT2,41320.0,


In [48]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,alias_symbol,HGNC_ID,NCBI_ID
0,ENSG00000210049,MT-TF,MTTF,7481.0,
1,ENSG00000210049,MT-TF,TRNF,7481.0,
2,ENSG00000211459,MT-RNR1,12S,7470.0,
3,ENSG00000211459,MT-RNR1,MOTS-C,7470.0,
4,ENSG00000211459,MT-RNR1,MTRNR1,7470.0,
...,...,...,...,...,...
117133,ENSG00000232679,LINC01705,ERLR,52493.0,105372950.0
117134,ENSG00000232679,LINC01705,RP11-400N13.3,52493.0,105372950.0
117136,ENSG00000228437,LINC02474,LNCSLCC1,53417.0,
117137,ENSG00000228437,LINC02474,RP11-400N13.2,53417.0,


In [49]:
subset_genes_ensg_df.loc[
    subset_genes_ensg_df["gene_symbol"] == "ARK2C"
]

Unnamed: 0,ENSG_ID,gene_symbol,alias_symbol,HGNC_ID,NCBI_ID
4932,ENSG00000141622,ARK2C,ARKL2,31696.0,494470.0
4933,ENSG00000141622,ARK2C,LNCAMPC,31696.0,494470.0
4934,ENSG00000141622,ARK2C,RNF111L2,31696.0,494470.0
4935,ENSG00000141622,ARK2C,RNF165,31696.0,494470.0


In [50]:
merged_alias_ap_collision_ensg_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ensg_df.csv", index_col=[0])
merged_alias_ap_collision_ensg_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,RN7SK,7SK,ENSG00000283293,10037.0,125050,7SK,ENSG
1,SOAT1,"ACAT,ACAT1,SOAT,STAT",ENSG00000057252,11177.0,6646,ACAT1,ENSG
2,SOAT2,ACAT2,ENSG00000167780,11178.0,8435,ACAT2,ENSG
3,NDUFAB1,"ACP,ACP1,FASN2A,SDAP",ENSG00000004779,7694.0,4706,ACP1,ENSG
4,ACTBP8,ACTBP2,ENSG00000220267,141.0,0,ACTBP2,ENSG
...,...,...,...,...,...,...,...
689,ZNF121,"D19S204,ZHC32,ZNF20",ENSG00000197961,12904.0,7675,ZNF20,ENSG
690,RNF141,"ZFP26,ZNF230",ENSG00000110315,21159.0,50862,ZNF230,ENSG
691,ZNF322P1,"ZNF322,ZNF322B",ENSG00000188801,14003.0,0,ZNF322,ENSG
692,ZNF106,"SH3BP3,ZFP106,ZNF474",ENSG00000103994,12886.0,64397,ZNF474,ENSG


### Make a set of the primary gene symbols

In [51]:
ensg_gene_symbol_set = set(mini_ensg_df["gene_symbol"])

In [52]:
total_number_ensembl_gene_symbols = len(ensg_gene_symbol_set)
total_number_ensembl_gene_symbols

41068

How many collisions are there? How many records are involved in one?


In [53]:
ensg_alias_primary_collision_set = set(merged_alias_ap_collision_ensg_df["collision"])
len(ensg_alias_primary_collision_set)

621

In [54]:
ensg_alias_primary_collision_primary_symbol_set = set(merged_alias_ap_collision_ensg_df["gene_symbol"])
len(ensg_alias_primary_collision_primary_symbol_set)

678

1. Why is the alias-gene collision set not the same length as the set of primary symbols with collisions ?
2. Why is the length of the alias-gene collison set shorter?
 - A priamry gene symbol with an alias-gene collision has an alias that matches a different gene's primary gene symbol.
 - Multiple genes can share a single alias (alias-alias collision)
 - If that shared alias is an alias-gene collision, then there will be more unique gene symbols in the set of primary symbols with collisions than the set of alias-gene collisions. 

# HGNC

## Set up table

In [55]:
mini_hgnc_df = pd.read_csv(
    "../input/hgnc_biomart_gene20240626.txt", sep="\t",dtype={"NCBI gene ID": pd.Int64Dtype()}
)
mini_hgnc_df = mini_hgnc_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Approved symbol": "gene_symbol",
        "Alias symbol": "alias_symbol",
        "Ensembl gene ID": "ENSG_ID",
        "NCBI gene ID": "NCBI_ID",
    }
)
mini_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
0,HGNC:5,,1,ENSG00000121410,A1BG
1,HGNC:37133,FLJ23569,503538,ENSG00000268895,A1BG-AS1
2,HGNC:24086,ACF,29974,ENSG00000148584,A1CF
3,HGNC:24086,ASP,29974,ENSG00000148584,A1CF
4,HGNC:24086,ACF64,29974,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,HGNC:29027,KIAA0399,23140,ENSG00000074755,ZZEF1
67579,HGNC:29027,ZZZ4,23140,ENSG00000074755,ZZEF1
67580,HGNC:29027,FLJ10821,23140,ENSG00000074755,ZZEF1
67581,HGNC:24523,DKFZP564I052,26009,ENSG00000036549,ZZZ3


In [56]:
create_ap_collision_df(mini_hgnc_df, "HGNC")

All collisions are present in gene alias lists.


  ap_collision_xxxx_df = ap_collision_xxxx_df.applymap(lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x)


(  HGNC_ID alias_symbol  NCBI_ID          ENSG_ID gene_symbol
 0       5          NaN        1  ENSG00000121410        A1BG
 1   37133     FLJ23569   503538  ENSG00000268895    A1BG-AS1
 2   24086          ACF    29974  ENSG00000148584        A1CF
 3   24086          ASP    29974  ENSG00000148584        A1CF
 4   24086        ACF64    29974  ENSG00000148584        A1CF,
   HGNC_ID alias_symbol  NCBI_ID          ENSG_ID gene_symbol
 1   37133     FLJ23569   503538  ENSG00000268895    A1BG-AS1
 2   24086          ACF    29974  ENSG00000148584        A1CF
 3   24086          ASP    29974  ENSG00000148584        A1CF
 4   24086        ACF64    29974  ENSG00000148584        A1CF
 5   24086        ACF65    29974  ENSG00000148584        A1CF,
       HGNC_ID alias_symbol  NCBI_ID          ENSG_ID gene_symbol collision  \
 42104   14947        AAVS1    54776  ENSG00000125503    PPP1R12C     AAVS1   
 56627   11177        ACAT1     6646  ENSG00000057252       SOAT1     ACAT1   
 56628   11178   

In [57]:
mini_hgnc_df= pd.read_csv(
    "../output/mini_hgnc_df.csv", index_col=[0])
mini_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
0,5,,1.0,ENSG00000121410,A1BG
1,37133,FLJ23569,503538.0,ENSG00000268895,A1BG-AS1
2,24086,ACF,29974.0,ENSG00000148584,A1CF
3,24086,ASP,29974.0,ENSG00000148584,A1CF
4,24086,ACF64,29974.0,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,29027,KIAA0399,23140.0,ENSG00000074755,ZZEF1
67579,29027,ZZZ4,23140.0,ENSG00000074755,ZZEF1
67580,29027,FLJ10821,23140.0,ENSG00000074755,ZZEF1
67581,24523,DKFZP564I052,26009.0,ENSG00000036549,ZZZ3


In [58]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])
subset_genes_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
1,37133,FLJ23569,503538.0,ENSG00000268895,A1BG-AS1
2,24086,ACF,29974.0,ENSG00000148584,A1CF
3,24086,ASP,29974.0,ENSG00000148584,A1CF
4,24086,ACF64,29974.0,ENSG00000148584,A1CF
5,24086,ACF65,29974.0,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,29027,KIAA0399,23140.0,ENSG00000074755,ZZEF1
67579,29027,ZZZ4,23140.0,ENSG00000074755,ZZEF1
67580,29027,FLJ10821,23140.0,ENSG00000074755,ZZEF1
67581,24523,DKFZP564I052,26009.0,ENSG00000036549,ZZZ3


In [59]:
subset_genes_hgnc_df.loc[
    subset_genes_hgnc_df["gene_symbol"] == "ARK2C"
]

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
2708,31696,ARKL2,494470.0,ENSG00000141622,ARK2C
2709,31696,RNF111L2,494470.0,ENSG00000141622,ARK2C
2711,31696,LNCAMPC,494470.0,ENSG00000141622,ARK2C


In [60]:
merged_alias_ap_collision_hgnc_df = pd.read_csv(
    "../output/merged_alias_ap_collision_hgnc_df.csv", index_col=[0])
merged_alias_ap_collision_hgnc_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,PPP1R12C,"DKFZP434D0412,p84,MBS85,p85,AAVS1",ENSG00000125503,14947,54776,AAVS1,HGNC
1,SOAT1,"ACAT,ACAT1",ENSG00000057252,11177,6646,ACAT1,HGNC
2,SOAT2,ACAT2,ENSG00000167780,11178,8435,ACAT2,HGNC
3,GLI3,"PAP-A,PAPA,PAPA1,PAPB,ACLS,PPDIV",ENSG00000106571,4319,2737,ACLS,HGNC
4,NDUFAB1,"SDAP,FASN2A,ACP,ACP1",ENSG00000004779,7694,4706,ACP1,HGNC
...,...,...,...,...,...,...,...
648,ZNF121,"ZHC32,ZNF20",ENSG00000197961,12904,7675,ZNF20,HGNC
649,RNF141,"ZFP26,ZNF230",ENSG00000110315,21159,50862,ZNF230,HGNC
650,ZNF106,"ZNF474,SH3BP3",ENSG00000103994,12886,64397,ZNF474,HGNC
651,ZFP1,"FLJ34243,ZNF475",ENSG00000184517,23328,162239,ZNF475,HGNC


### Make a set of the primary gene symbols

In [61]:
hgnc_gene_symbol_set = set(mini_hgnc_df["gene_symbol"])
# all_gene_symbols_set

In [62]:
total_number_hgnc_gene_symbols = len(hgnc_gene_symbol_set)
total_number_hgnc_gene_symbols

45646

How many collisions are there? How many records are involved in one?

In [63]:
hgnc_alias_primary_collision_set = set(merged_alias_ap_collision_hgnc_df["collision"])
len(hgnc_alias_primary_collision_set)

576

In [64]:
hgnc_alias_primary_collision_primary_symbol_set = set(merged_alias_ap_collision_hgnc_df["gene_symbol"])
len(hgnc_alias_primary_collision_primary_symbol_set)

637

# NCBI Info

In [65]:
mini_ncbi_df = pd.read_csv("../input/Homo_sapiens.gene_info20240627", sep="\t")

### Drop all columns besides ENSG_ID, gene_symbol, and alias_symbol

In [66]:
mini_ncbi_df = mini_ncbi_df[
["GeneID", "Symbol", "Synonyms", "dbXrefs"]
]
mini_ncbi_df = mini_ncbi_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Synonyms": "alias_symbol"}
)
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,dbXrefs
0,1,A1BG,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...
1,2,A2M,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...
2,3,A2MP1,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...
3,9,NAT1,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...
4,10,NAT2,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...
...,...,...,...,...
193451,8923215,trnD,-,-
193452,8923216,trnP,-,-
193453,8923217,trnA,-,-
193454,8923218,COX1,-,-


Split dbXrefs into individual columns

In [67]:
mini_ncbi_df = mini_ncbi_df.assign(
    MIM=np.nan,
    HGNC_ID=np.nan,
    ENSG_ID=np.nan,
    AllianceGenome=np.nan,
    MIRbase=np.nan,
    IMGTgene_db=np.nan,
    dash=np.nan,
    unknown=np.nan,
)

In [68]:
index_pos = 0

print(len(mini_ncbi_df))
while index_pos < len(mini_ncbi_df):
    xrefs = mini_ncbi_df["dbXrefs"][index_pos].split("|")

    for xref in xrefs:
        xref = xref.lower()
        if xref.startswith("mim:"):
            xref = xref.replace("mim:", "")
            mini_ncbi_df["MIM"][index_pos] = xref
        elif xref.startswith("hgnc:hgnc:"):
            xref = xref.replace("hgnc:hgnc:", "")
            mini_ncbi_df["HGNC_ID"][index_pos] = xref
        elif xref.startswith("ensembl:"):
            xref = xref.replace("ensembl:", "")
            mini_ncbi_df["ENSG_ID"][index_pos] = xref
        elif xref.startswith("alliancegenome:"):
            xref = xref.replace("alliancegenome:", "")
            mini_ncbi_df["AllianceGenome"][index_pos] = xref
        elif xref.startswith("mirbase"):
            xref = xref.replace("mirbase:", "")
            mini_ncbi_df["MIRbase"][index_pos] = xref
        elif xref.startswith("imgt/gene-db:"):
            xref = xref.replace("imgt/gene-db:", "")
            mini_ncbi_df["IMGTgene_db"][index_pos] = xref
        elif xref.startswith("-"):
            mini_ncbi_df["dash"][index_pos] = xref
        else:
            mini_ncbi_df["unknown"][index_pos] = xref

    index_pos += 1
    pass

print(index_pos)

193456


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  mini_ncbi_df["MIM"][index_pos] = xref
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["MIM"][index

193456


In [69]:
mini_ncbi_df["ENSG_ID"] = mini_ncbi_df["ENSG_ID"].str.replace("ensg", "ENSG", 1)

In [70]:
mini_ncbi_df = mini_ncbi_df.drop(
    [
        "AllianceGenome",
        "MIRbase",
        "IMGTgene_db",
        "dash",
        "unknown",
        "dbXrefs",
        "MIM",
    ],
    axis=1,
)
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B|ABG|GAB|HYST2477,5,ENSG00000121410
1,2,A2M,A2MD|CPAMD5|FWP007|S863-7,7,ENSG00000175899
2,3,A2MP1,A2MP,8,ENSG00000291190
3,9,NAT1,AAC1|MNAT|NAT-1|NATI,7645,ENSG00000171428
4,10,NAT2,AAC2|NAT-2|PNAT,7646,ENSG00000156006
...,...,...,...,...,...
193451,8923215,trnD,-,,
193452,8923216,trnP,-,,
193453,8923217,trnA,-,,
193454,8923218,COX1,-,,


In [71]:
mini_ncbi_df['alias_symbol'] = mini_ncbi_df['alias_symbol'].str.split('|')
mini_ncbi_df = mini_ncbi_df.explode('alias_symbol')
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5,ENSG00000121410
0,1,A1BG,ABG,5,ENSG00000121410
0,1,A1BG,GAB,5,ENSG00000121410
0,1,A1BG,HYST2477,5,ENSG00000121410
1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...
193451,8923215,trnD,-,,
193452,8923216,trnP,-,,
193453,8923217,trnA,-,,
193454,8923218,COX1,-,,


In [72]:
create_ap_collision_df(mini_ncbi_df, "NCBI")

All collisions are present in gene alias lists.


  ap_collision_xxxx_df = ap_collision_xxxx_df.applymap(lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x)


(   NCBI_ID gene_symbol alias_symbol HGNC_ID          ENSG_ID
 0        1        A1BG          A1B       5  ENSG00000121410
 0        1        A1BG          ABG       5  ENSG00000121410
 0        1        A1BG          GAB       5  ENSG00000121410
 0        1        A1BG     HYST2477       5  ENSG00000121410
 1        2         A2M         A2MD       7  ENSG00000175899,
    NCBI_ID gene_symbol alias_symbol HGNC_ID          ENSG_ID
 0        1        A1BG          A1B       5  ENSG00000121410
 0        1        A1BG          ABG       5  ENSG00000121410
 0        1        A1BG          GAB       5  ENSG00000121410
 0        1        A1BG     HYST2477       5  ENSG00000121410
 1        2         A2M         A2MD       7  ENSG00000175899,
        NCBI_ID gene_symbol alias_symbol HGNC_ID          ENSG_ID collision  \
 2826      3494       IGHA2          A2M    5479  ENSG00000211890       A2M   
 24278   404744   NPSR1-AS1         AAA1   22128  ENSG00000197085      AAA1   
 16430    89876  

In [73]:
mini_ncbi_df= pd.read_csv(
    "../output/mini_ncbi_df.csv", index_col=[0])
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5.0,ENSG00000121410
0,1,A1BG,ABG,5.0,ENSG00000121410
0,1,A1BG,GAB,5.0,ENSG00000121410
0,1,A1BG,HYST2477,5.0,ENSG00000121410
1,2,A2M,A2MD,7.0,ENSG00000175899
...,...,...,...,...,...
193451,8923215,trnD,-,,
193452,8923216,trnP,-,,
193453,8923217,trnA,-,,
193454,8923218,COX1,-,,


In [74]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])
subset_genes_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5.0,ENSG00000121410
0,1,A1BG,ABG,5.0,ENSG00000121410
0,1,A1BG,GAB,5.0,ENSG00000121410
0,1,A1BG,HYST2477,5.0,ENSG00000121410
1,2,A2M,A2MD,7.0,ENSG00000175899
...,...,...,...,...,...
190961,131840634,GLTC1,GLTC,56861.0,
193342,132532400,GABRA6-AS1,ARBAG,40248.0,
193377,133395150,LNCARGI,ARGI,56890.0,
193378,133834869,MLDHR,MP31,55481.0,


In [75]:
merged_alias_ap_collision_ncbi_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ncbi_df.csv", index_col=[0])
merged_alias_ap_collision_ncbi_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,IGHA2,A2M,ENSG00000211890,5479.0,3494,A2M,NCBI
1,NPSR1-AS1,AAA1,ENSG00000197085,22128.0,404744,AAA1,NCBI
2,CFAP91,"AAT1,AAT1alpha,C3orf15,CaM-IP2,MAATS1,SPATA26,...",ENSG00000183833,24010.0,89876,AAT1,NCBI
3,GPT,"AAT1,ALT,ALT1,GPT1,SGPT",ENSG00000167701,4552.0,2875,AAT1,NCBI
4,PPP1R12C,"AAVS1,LENG3,MBS85,p84,p85",ENSG00000125503,14947.0,54776,AAVS1,NCBI
...,...,...,...,...,...,...,...
2119,ZNF785,ZNF688,ENSG00000197162,26496.0,146540,ZNF688,NCBI
2120,ZP4,"ZBP,ZP1,ZP1B,ZPB,ZPB2,Zp-4",ENSG00000116996,15770.0,57829,ZP1,NCBI
2121,LMBR1,"ACHP,C7orf2,DIF14,LSS,PPD2,THYP,TPT,TPTPS,ZRS",ENSG00000105983,13243.0,64327,ZRS,NCBI
2122,ZNF446,"ZKSCAN20,ZSCAN30,ZSCAN52",ENSG00000083838,21036.0,55663,ZSCAN30,NCBI


### Make a set of primary gene symbols

In [76]:
ncbi_gene_symbol_set = set(mini_ncbi_df["gene_symbol"])

In [77]:
total_number_ncbi_gene_symbols = len(ncbi_gene_symbol_set)
total_number_ncbi_gene_symbols

45390

How many collisions are there? How many records are involved in one?

In [78]:
ncbi_alias_primary_collision_set = set(
    merged_alias_ap_collision_ncbi_df["collision"]
)
len(ncbi_alias_primary_collision_set)

1566

In [79]:
ncbi_alias_primary_collision_primary_symbol_set = set(
    merged_alias_ap_collision_ncbi_df["gene_symbol"]
)
len(ncbi_alias_primary_collision_primary_symbol_set)

1689

# Merge 3 sets together

In [80]:
merged_alias_primary_collisions_df = pd.concat(
    [
        merged_alias_ap_collision_hgnc_df[
            ["gene_symbol", "alias_symbol", "collision", "source"]
        ],
        merged_alias_ap_collision_ncbi_df[
            ["gene_symbol", "alias_symbol", "collision", "source"]
        ],
        merged_alias_ap_collision_ensg_df[
            ["gene_symbol", "alias_symbol", "collision", "source"]
        ],
    ]
)
merged_alias_primary_collisions_df

Unnamed: 0,gene_symbol,alias_symbol,collision,source
0,PPP1R12C,"DKFZP434D0412,p84,MBS85,p85,AAVS1",AAVS1,HGNC
1,SOAT1,"ACAT,ACAT1",ACAT1,HGNC
2,SOAT2,ACAT2,ACAT2,HGNC
3,GLI3,"PAP-A,PAPA,PAPA1,PAPB,ACLS,PPDIV",ACLS,HGNC
4,NDUFAB1,"SDAP,FASN2A,ACP,ACP1",ACP1,HGNC
...,...,...,...,...
689,ZNF121,"D19S204,ZHC32,ZNF20",ZNF20,ENSG
690,RNF141,"ZFP26,ZNF230",ZNF230,ENSG
691,ZNF322P1,"ZNF322,ZNF322B",ZNF322,ENSG
692,ZNF106,"SH3BP3,ZFP106,ZNF474",ZNF474,ENSG


In [81]:
merged_alias_primary_collisions_df.loc[
    merged_alias_primary_collisions_df["collision"] == "CFM1"
]

Unnamed: 0,gene_symbol,alias_symbol,collision,source
84,RFLNB,"MGC45871,RefilinB,Cfm1",CFM1,HGNC
215,RFLNB,"CFM1,FAM101B",CFM1,NCBI


In [83]:
duplicate_rows = merged_alias_primary_collisions_df[merged_alias_primary_collisions_df.duplicated(subset=['collision','source'], keep=False)]
duplicate_rows

Unnamed: 0,gene_symbol,alias_symbol,collision,source
13,DANCR,"ANCR,AGU2,lncRNA-ANCR",ANCR,HGNC
14,UBE3A,"AS,ANCR,E6-AP,FLJ26981",ANCR,HGNC
15,SLC25A23,"FLJ30339,MGC2615,APC2",APC2,HGNC
16,ANAPC2,"APC2,KIAA1406",APC2,HGNC
19,AREG,"AR,CRDGF",AR,HGNC
...,...,...,...,...
665,RNU4-5P,"RNU4P5,U4,U4/6",U4,ENSG
666,RNU4-4P,"RNU4P4,U4,U4/5",U4,ENSG
667,RNU4-1,"RNU4A,RNU4B2,U4,U4BL",U4,ENSG
668,RNU6-50P,"RNU6-50,RNU6P1,U6",U6,ENSG


In [84]:
duplicate_rows = merged_alias_primary_collisions_df[merged_alias_primary_collisions_df.duplicated(subset=['gene_symbol','source'], keep=False)]
duplicate_rows

Unnamed: 0,gene_symbol,alias_symbol,collision,source
28,GDNF,"ATF1,ATF2,HFB1-GDNF",ATF1,HGNC
29,GDNF,"ATF1,ATF2,HFB1-GDNF",ATF2,HGNC
62,TMPRSS4,"TMPRSS3,MT-SP2,CAP2",CAP2,HGNC
72,ACKR2,"CCR10,D6,CCR9",CCR10,HGNC
74,ACKR2,"CCR10,D6,CCR9",CCR9,HGNC
...,...,...,...,...
583,SLIT3,"MEGF5,SLIL2,SLIT-3,SLIT1,SLIT2",SLIT1,ENSG
584,SLIT3,"MEGF5,SLIL2,SLIT-3,SLIT1,SLIT2",SLIT2,ENSG
602,SPNS1,"HSPIN1,LAT,NRS,PP2030,SLC63A1,SPIN1,SPINL",SPIN1,ENSG
615,UXT,"ART-27,SKP2,STAP1",STAP1,ENSG


# Convert to csv

In [80]:
merged_alias_primary_collisions_df.to_csv(
    "../output/merged_alias_primary_collisions_df.csv", index=False
)

In [82]:
common_ap_collisions = (
    ncbi_alias_primary_collision_primary_symbol_set
    & hgnc_alias_primary_collision_primary_symbol_set
    & ensg_alias_primary_collision_primary_symbol_set
)
common_ap_collisions

{'ABCD1',
 'ACD',
 'ACKR2',
 'ACOD1',
 'ACTBP8',
 'ADRA1D',
 'AGXT',
 'AIFM2',
 'AKR1B1',
 'AKR1B10',
 'ALPK3',
 'AMH',
 'ANAPC2',
 'ANKRD37',
 'ANTXR1',
 'AOC1',
 'APEX1',
 'AREG',
 'ARHGAP21',
 'ARHGEF7',
 'ARID4A',
 'ART4',
 'ARTN',
 'ASIC2',
 'AURKAIP1',
 'AZIN2',
 'B3GNTL1',
 'BANF1P1',
 'BCAT2',
 'BRIP1',
 'BTF3P11',
 'BTN3A3',
 'BVES',
 'C1D',
 'C1QTNF1',
 'C6ORF89',
 'CACNA1A',
 'CADPS',
 'CADPS2',
 'CAPN5',
 'CARD16',
 'CCL13',
 'CCL14',
 'CCL15',
 'CCM2',
 'CD200R1',
 'CDH19',
 'CDH20',
 'CDKN2AIP',
 'CDPF1',
 'CELSR1',
 'CES1',
 'CFAP73',
 'CFH',
 'CHAF1B',
 'CHAMP1',
 'CHD6',
 'CHEK2',
 'CHORDC1',
 'CLASP1',
 'CLCF1',
 'CNGB1',
 'CNKSR2',
 'CNOT6',
 'CNRIP1',
 'CNTN1',
 'COASY',
 'COPS2',
 'COPS3',
 'CORIN',
 'COX7A2L',
 'CPA4',
 'CPAMD8',
 'CPNE1',
 'CPNE2',
 'CPPED1',
 'CREB3L4',
 'CSNK2A2',
 'CSTB',
 'CTDSP2',
 'CXCL10',
 'CXXC1',
 'CYCSP5',
 'CYP11B1',
 'CYP11B2',
 'CYP21A2',
 'CYP2A6',
 'DCAF5',
 'DCBLD2',
 'DDOST',
 'DDR2',
 'DDX11',
 'DDX18',
 'DEAF1',
 'DEFA6',
 'DE

In [83]:
len(common_ap_collisions)

496