### Import 

In [631]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [632]:
def create_ap_collision_df(mini_xxxx_df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Create a df of alias-primary collision symbols 

    :param mini_xxxx_df: Processed df of gene records
    :param source: Representation of the source of the gene records
    :param split_on_character: Character that is used to seperate alias symbols in the mini_xxxx_df
    :return: A df of genes that share an alias with another gene
    """

    #Strip "HGNC" from HGNC ID
    mini_xxxx_df["HGNC_ID"] = mini_xxxx_df["HGNC_ID"].str.lstrip("HGNC:")

    #Remove placeholder gene records (those with no primary gene symbol)
    mini_xxxx_df = mini_xxxx_df.dropna(subset=["gene_symbol"])

    #Remove placeholder LOC genes
    mini_xxxx_df = mini_xxxx_df[~mini_xxxx_df['gene_symbol'].str.startswith('LOC', na=False)]

    mini_xxxx_df['HGNC_ID'] = mini_xxxx_df['HGNC_ID'].fillna(0).astype(int)
    mini_xxxx_df['NCBI_ID'] = mini_xxxx_df['NCBI_ID'].fillna(0).astype(int)

    #mini_xxx_df
    ## no LOC genes (only present in NCBI)
    ## no records with no primary symbols (only present in ENSG)
    ## each alias on a separate row
    #Convert the df into a csv and save
    mini_xxxx_df.to_csv(f'../output/mini_{source.lower()}_df.csv', index=True) 

    #Make a new df where the alias symbols are merged together for each record
    merged_alias_xxxx_df = (
        mini_xxxx_df
        .copy()
        .fillna("")
        .groupby(f"{source}_ID", dropna=False)
        .agg(lambda col: ",".join(sorted(set(map(str, col)))))
        .reset_index()
    )

    #merged_alias_xxxx_df
    ##merging aliases associated with the same record (same gene record identifier) into a list
    #Convert the df into a csv and save
    merged_alias_xxxx_df.to_csv(f'../output/merged_alias_{source}_df.csv', index=True) 

    #Make a set of the primary gene symbols
    xxxx_gene_symbol_set = set(mini_xxxx_df["gene_symbol"])
    uppercased_xxxx_gene_symbol_set = {s.upper() for s in xxxx_gene_symbol_set}

    
    subset_genes_xxxx_df = mini_xxxx_df.copy()

    #Remove alias symbols that are an exact match to their respective primary gene symbol
    subset_genes_xxxx_df["alias_symbol_upper"] = subset_genes_xxxx_df["alias_symbol"].str.upper()
    subset_genes_xxxx_df["gene_symbol_upper"] = subset_genes_xxxx_df["gene_symbol"].str.upper()
    subset_genes_xxxx_df = subset_genes_xxxx_df[subset_genes_xxxx_df['gene_symbol_upper'] != subset_genes_xxxx_df['alias_symbol_upper']]
    subset_genes_xxxx_df = subset_genes_xxxx_df.drop(["alias_symbol_upper","gene_symbol_upper"], axis=1)

    #subset_genes_xxxx_df
    ## no aliases that match the primary gene symbol
    ## each alias on a separate row
    #Convert the df into a csv and save
    subset_genes_xxxx_df.to_csv(f'../output/subset_genes_{source}_df.csv', index=True) 

    #Create df with genes that have an alias that can be found as another gene's primary gene symbol
    ap_collision_xxxx_df = subset_genes_xxxx_df.copy()
    ap_collision_xxxx_df = ap_collision_xxxx_df.dropna(subset=["alias_symbol"])
    ap_collision_xxxx_df["alias_symbol_upper"] = ap_collision_xxxx_df["alias_symbol"].str.upper()

    #makes list of aliases into a set
    ap_collision_xxxx_df.loc[:, "alias_symbol_upper"] = ap_collision_xxxx_df["alias_symbol_upper"].apply(lambda x: {x})

    ap_collision_xxxx_df.loc[:, "collision"] = ap_collision_xxxx_df["alias_symbol_upper"].apply(
        lambda x: x & uppercased_xxxx_gene_symbol_set
    )
    ap_collision_xxxx_df = ap_collision_xxxx_df.drop("alias_symbol_upper", axis=1)
    ap_collision_xxxx_df = ap_collision_xxxx_df[ap_collision_xxxx_df["collision"].apply(lambda x: len(x) > 0)]
    #changes back the set of aliases to a list, and make the set of collisions to a list
    ap_collision_xxxx_df['collision'] = ap_collision_xxxx_df['collision'].apply(
        lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x
    )
    ap_collision_xxxx_df = ap_collision_xxxx_df.sort_values("collision")

    #Add a source tag for future merging efforts
    ap_collision_xxxx_df["source"] = str(source.upper())

    #ap_collision_xxxx_df
    ## only the gene records with an alias that matches another gene record's primary gene symbol
    ## each alias on a separate row
    #Convert the df into a csv
    ap_collision_xxxx_df.to_csv(f'../output/single_alias_ap_collision_{source.lower()}_df.csv', index=True)

    #Create a secondary collision df that merges the alias symbols for each record
    columns_map = {
        'ENSG': ['NCBI_ID', 'HGNC_ID'],
        'HGNC': ['NCBI_ID', 'ENSG_ID'],
        'NCBI': ['HGNC_ID', 'ENSG_ID']
    }

    cols_of_interest = columns_map.get(source, [])

    merged_alias_ap_collision_xxxx_df = ap_collision_xxxx_df.drop(columns=cols_of_interest + ['alias_symbol'])

    merged_alias_ap_collision_xxxx_df = pd.merge(
        merged_alias_ap_collision_xxxx_df,
        merged_alias_xxxx_df[[f"{source}_ID"] + cols_of_interest + ['alias_symbol']],
        on=[f"{source}_ID"],
        how="left"
    )
    merged_alias_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df.drop_duplicates(subset=[f"{source}_ID"])

    #Test to make sure all collisions are in the alias list for the record
    test_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df.copy()
    test_ap_collision_xxxx_df["alias_symbol_upper"] = test_ap_collision_xxxx_df["alias_symbol"].str.upper()   
    test_ap_collision_xxxx_df['alias_symbol_upper'] = test_ap_collision_xxxx_df['alias_symbol_upper'].apply(lambda x: x if isinstance(x, list) else [x])
    test_ap_collision_xxxx_df['collision_in_alias'] = test_ap_collision_xxxx_df.apply(lambda row: row['collision'] in test_ap_collision_xxxx_df['alias_symbol_upper'].values, axis=1)
    true_count = test_ap_collision_xxxx_df['collision_in_alias'].sum()
    test_ap_collision_xxxx_df = test_ap_collision_xxxx_df.drop("alias_symbol_upper", axis=1)

    if true_count == 0:
        #merged_alias_ap_collision_xxxx_df
        ## only the gene records with an alias that matches another gene record's primary gene symbol
        ## merging aliases associated with the same record (same gene record identifier) into a list
        #Convert the df into a csv and save
        merged_alias_ap_collision_xxxx_df.to_csv(f'../output/merged_alias_ap_collision_{source.lower()}_df.csv', index=True)
        print("All collisions are present in gene alias lists.")
    else:
        print("Some collisions are not present in gene alias lists.")

    return mini_xxxx_df.head()

# Ensembl

In [633]:
mini_ensg_df = pd.read_csv(
    "../input/ensg_biomart_gene20250625.txt", sep="\t",dtype={"NCBI gene (formerly Entrezgene) ID": pd.Int64Dtype()}
)
mini_ensg_df = mini_ensg_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Gene Synonym": "alias_symbol",
        "Gene name": "gene_symbol",
        "Gene stable ID": "ENSG_ID",
        "NCBI gene (formerly Entrezgene) ID": "NCBI_ID",
    }
)
mini_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,HGNC:7481,MTTF,MT-TF
1,ENSG00000210049,,HGNC:7481,TRNF,MT-TF
2,ENSG00000211459,,HGNC:7470,12S,MT-RNR1
3,ENSG00000211459,,HGNC:7470,MOTS-C,MT-RNR1
4,ENSG00000211459,,HGNC:7470,MTRNR1,MT-RNR1
...,...,...,...,...,...
133060,ENSG00000229388,,HGNC:52502,LINC01715,TAF12-DT
133061,ENSG00000289291,,,,
133062,ENSG00000274978,26824,HGNC:10108,RNU11-1,RNU11
133063,ENSG00000274978,26824,HGNC:10108,U11,RNU11


In [634]:
mini_ensg_df = mini_ensg_df.replace(" ", np.nan)
mini_ensg_df = mini_ensg_df.replace("", np.nan)
mini_ensg_df = mini_ensg_df.replace("-", np.nan)
mini_ensg_df = mini_ensg_df.replace("<NA>", np.nan)

how many gene records for the Ensembl data set before any kind of cleaning?

In [635]:
total_raw_gene_record_count_ensg = len(set(mini_ensg_df['ENSG_ID']))
total_raw_gene_record_count_ensg

86364

how many gene records have no primary gene symbol?

In [636]:
raw_no_primary_symbol_record_count_ensg = (
    mini_ensg_df[mini_ensg_df["gene_symbol"].isna()]
    .dropna(subset=["ENSG_ID"])
    ["ENSG_ID"]
    .nunique()
)
raw_no_primary_symbol_record_count_ensg

37963

how many gene records that have a primary gene symbol that is a LOC placeholder?

In [637]:
raw_loc_record_count_ensg = (
    mini_ensg_df[mini_ensg_df["gene_symbol"].str.startswith('LOC', na=False)&
    ~mini_ensg_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["ENSG_ID"])
    ["ENSG_ID"]
    .nunique()
)
raw_loc_record_count_ensg

0

In [638]:
create_ap_collision_df(mini_ensg_df, "ENSG")

All collisions are present in gene alias lists.


Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,0,7481,MTTF,MT-TF
1,ENSG00000210049,0,7481,TRNF,MT-TF
2,ENSG00000211459,0,7470,12S,MT-RNR1
3,ENSG00000211459,0,7470,MOTS-C,MT-RNR1
4,ENSG00000211459,0,7470,MTRNR1,MT-RNR1


In [639]:
mini_ensg_df = pd.read_csv(
    "../output/mini_ensg_df.csv", index_col=[0])

In [640]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,0,7481,MTTF,MT-TF
1,ENSG00000210049,0,7481,TRNF,MT-TF
2,ENSG00000211459,0,7470,12S,MT-RNR1
3,ENSG00000211459,0,7470,MOTS-C,MT-RNR1
4,ENSG00000211459,0,7470,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,85028,30062,LINC00100,SNHG12
133059,ENSG00000197989,85028,30062,PNAS-123,SNHG12
133060,ENSG00000229388,0,52502,LINC01715,TAF12-DT
133062,ENSG00000274978,26824,10108,RNU11-1,RNU11


In [641]:
ap_collision_ensg_df = pd.read_csv(
    "../output/single_alias_ap_collision_ensg_df.csv", index_col=[0])
ap_collision_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol,collision,source
24761,ENSG00000283293,125050,10037,7SK,RN7SK,7SK,ENSG
37365,ENSG00000057252,6646,11177,ACAT1,SOAT1,ACAT1,ENSG
17666,ENSG00000167780,8435,11178,ACAT2,SOAT2,ACAT2,ENSG
35063,ENSG00000004779,4706,7694,ACP1,NDUFAB1,ACP1,ENSG
47647,ENSG00000220267,0,141,ACTBP2,ACTBP8,ACTBP2,ENSG
...,...,...,...,...,...,...,...
53226,ENSG00000197961,7675,12904,ZNF20,ZNF121,ZNF20,ENSG
35374,ENSG00000110315,50862,21159,ZNF230,RNF141,ZNF230,ENSG
96229,ENSG00000188801,0,14003,ZNF322,ZNF322P1,ZNF322,ENSG
81813,ENSG00000103994,64397,12886,ZNF474,ZNF106,ZNF474,ENSG


In [642]:
merged_alias_ap_collision_ensg_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ensg_df.csv", index_col=[0])
merged_alias_ap_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,collision,source,NCBI_ID,HGNC_ID,alias_symbol
0,ENSG00000283293,RN7SK,7SK,ENSG,125050,10037,7SK
1,ENSG00000057252,SOAT1,ACAT1,ENSG,6646,11177,"ACAT,ACAT1,SOAT,STAT"
2,ENSG00000167780,SOAT2,ACAT2,ENSG,8435,11178,ACAT2
3,ENSG00000004779,NDUFAB1,ACP1,ENSG,4706,7694,"ACP,ACP1,FASN2A,SDAP"
4,ENSG00000220267,ACTBP8,ACTBP2,ENSG,0,141,ACTBP2
...,...,...,...,...,...,...,...
806,ENSG00000197961,ZNF121,ZNF20,ENSG,7675,12904,"D19S204,ZHC32,ZNF20"
807,ENSG00000110315,RNF141,ZNF230,ENSG,50862,21159,"ZFP26,ZNF230"
808,ENSG00000188801,ZNF322P1,ZNF322,ENSG,0,14003,"ZNF322,ZNF322B"
809,ENSG00000103994,ZNF106,ZNF474,ENSG,64397,12886,"SH3BP3,ZFP106,ZNF474"


how many gene records are in the Ensembl database?

In [643]:
gene_record_set_ensg = set(mini_ensg_df['ENSG_ID'])
gene_record_count_ensg = len(gene_record_set_ensg)
gene_record_count_ensg

48401

how many unique primary gene symbols are in the Ensembl data set?

In [644]:
primary_symbol_set_ensg = set(mini_ensg_df['gene_symbol'])
primary_symbol_count_ensg = len(primary_symbol_set_ensg)
primary_symbol_count_ensg

41164

how many unique alias gene symbols are in the Ensembl data set?

In [645]:
alias_symbol_set_ensg = set(mini_ensg_df['alias_symbol'])
alias_symbol_count_ensg = len(alias_symbol_set_ensg)
alias_symbol_count_ensg

55413

how many unique gene symbols in total are there in the Ensembl data set??

In [646]:
total_symbol_count_ensg = pd.concat([mini_ensg_df["gene_symbol"], mini_ensg_df["alias_symbol"]]).dropna().nunique()
total_symbol_count_ensg

95941

how many gene records have no alias symbols?

In [647]:
no_alias_symbol_record_count_ensg = (
    mini_ensg_df[mini_ensg_df["alias_symbol"].isna()]
    .dropna(subset=["ENSG_ID"])
    ["ENSG_ID"]
    .nunique()
)
no_alias_symbol_record_count_ensg

19363

how many gene records have a primary gene symbol that is a C#orf?

In [648]:
orf_record_set_ensg = set(
    mini_ensg_df[
        mini_ensg_df["gene_symbol"].str.contains(r'^C.*ORF', case=False, na=False) &
        ~mini_ensg_df["gene_symbol"].str.contains("-", na=False)
    ]
    .dropna(subset=["ENSG_ID"])["ENSG_ID"]
)

orf_record_count_ensg = len(orf_record_set_ensg)
orf_record_count_ensg

288

how many gene records that have a primary gene symbol that is a FAM placeholder?

In [649]:
fam_record_set_ensg = set(
    mini_ensg_df[
        mini_ensg_df["gene_symbol"].str.startswith("FAM", na=False) &
        ~mini_ensg_df["gene_symbol"].str.contains("-", na=False)
    ]
    .dropna(subset=["ENSG_ID"])["ENSG_ID"]
)

fam_record_count_ensg = len(fam_record_set_ensg)
fam_record_count_ensg

430

how many gene records that have a primary gene symbol that is a KIAA placeholder?

In [650]:
kiaa_record_set_ensg = set(
    mini_ensg_df[
        mini_ensg_df["gene_symbol"].str.startswith("KIAA", na=False) &
        ~mini_ensg_df["gene_symbol"].str.contains("-", na=False)
    ]
    .dropna(subset=["ENSG_ID"])["ENSG_ID"]
)

kiaa_record_count_ensg = len(kiaa_record_set_ensg)
kiaa_record_count_ensg

33

which gene records share a primary symbol with other gene records?

In [651]:
# groupby name and return a boolean of whether each has more than 1 unique ENSG ID
multi_primary = mini_ensg_df.groupby(["gene_symbol"]).ENSG_ID.nunique().gt(1)

num_rows = mini_ensg_df[mini_ensg_df.gene_symbol.isin(multi_primary[multi_primary].index)].shape[0]
print(num_rows)

# use loc to only see those values that have `True` in `multi_primary`:
mini_ensg_df.loc[mini_ensg_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol').iloc[6000:6060]

25007


Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
92891,ENSG00000138641,8916,4876,KIAA0032,HERC3
92878,ENSG00000287542,8916,0,,HERC3
52117,ENSG00000273529,388585,19764,BHLHB38,HES5
14696,ENSG00000197921,388585,19764,BHLHB38,HES5
4416,ENSG00000280680,55733,18270,MART-2,HHAT
4421,ENSG00000280680,55733,18270,SKN,HHAT
4418,ENSG00000280680,55733,18270,RASP,HHAT
4417,ENSG00000280680,55733,18270,MART2,HHAT
43282,ENSG00000054392,55733,18270,MART2,HHAT
43285,ENSG00000054392,55733,18270,SKI,HHAT


How many ambiguous symbols result from alias-primary collisions?


In [652]:
ap_collision_ambiguous_symbol_set_ensg = set(merged_alias_ap_collision_ensg_df["collision"])
ap_collision_ambiguous_symbol_count_ensg = len(ap_collision_ambiguous_symbol_set_ensg)
ap_collision_ambiguous_symbol_count_ensg

606

How many records have at least one alias-primary collision (alias that matches another record's primary gene symbol)?

In [653]:
ap_record_set_ensg = set(merged_alias_ap_collision_ensg_df["ENSG_ID"])
ap_record_count_ensg = len(ap_record_set_ensg)
ap_record_count_ensg

767

1. Why is the alias-gene collision set not the same length as the set of primary symbols with collisions ?
2. Why is the length of the alias-gene collison set shorter?
 - A gene record with an alias-primary collision has an alias that matches a different gene's primary gene symbol.
 - Multiple gene records can share a single alias (alias-alias collision)
 - If that shared alias is an alias-primary collision, then there will be more unique gene symbols in the set of primary symbols with collisions than the set of alias-primary collisions. 

# HGNC

## Set up table

In [654]:
hgnc_file_path = "../input/hgnc_biomart_gene20250625.txt"

mini_hgnc_df = pd.read_csv(
    hgnc_file_path, sep="\t"
)

# Rename columns
mini_hgnc_df = mini_hgnc_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Approved symbol": "gene_symbol",
        "Ensembl gene ID": "ENSG_ID",
    }
)

#structure and labeling in HGNC download files changed between 2024 amd 2025
if "Alias symbol" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"Alias symbol": "alias_symbol"})
elif "Alias symbols" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"Alias symbols": "alias_symbol"})
else:
    mini_hgnc_df["alias_symbol"] = pd.NA

if "NCBI gene ID" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"NCBI gene ID": "NCBI_ID"})
elif "NCBI Gene ID" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"NCBI Gene ID": "NCBI_ID"})
else:
    mini_hgnc_df["NCBI_ID"] = pd.NA   

mini_hgnc_df["NCBI_ID"] = mini_hgnc_df["NCBI_ID"].astype(pd.Int64Dtype())

# Extract date from filename and check if it is before June 25, 2025
match = re.search(r'(\d{8})', hgnc_file_path)
file_date = datetime.strptime(match.group(1), "%Y%m%d") if match else None
cutoff_date = datetime.strptime("20250625", "%Y%m%d")

# Apply list-splitting logic for newer files
if file_date and file_date >= cutoff_date:
    mini_hgnc_df['alias_symbol'] = (
        mini_hgnc_df['alias_symbol']
        .fillna('')
        .str.split(',')
        .apply(lambda x: [a.strip() for a in x if a.strip()])
    )
    mini_hgnc_df = mini_hgnc_df.explode('alias_symbol')

mini_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,HGNC:100,ASIC1,BNaC2,41,ENSG00000110881
0,HGNC:100,ASIC1,hBNaC2,41,ENSG00000110881
1,HGNC:10000,RGS4,,5999,ENSG00000117152
2,HGNC:10001,RGS5,,8490,ENSG00000143248
3,HGNC:10002,RGS6,,9628,ENSG00000182732
...,...,...,...,...,...
44232,HGNC:9997,RGS16,RGS-r,6004,ENSG00000143333
44233,HGNC:9998,RGS2,,5997,ENSG00000116741
44234,HGNC:9999,RGS3,C2PA,5998,ENSG00000138835
44234,HGNC:9999,RGS3,FLJ20370,5998,ENSG00000138835


In [655]:
mini_hgnc_df = mini_hgnc_df.replace(" ", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("-", np.nan)

how many gene records for the HGNC data set before any kind of cleaning?

In [656]:
total_raw_gene_record_set_hgnc = set(mini_hgnc_df['HGNC_ID'])
total_raw_gene_record_count_hgnc = len(total_raw_gene_record_set_hgnc)
total_raw_gene_record_count_hgnc

44235

how many gene records have no primary gene symbol?

In [657]:
raw_no_primary_symbol_record_count_hgnc = (
    mini_hgnc_df[mini_hgnc_df["gene_symbol"].isna()]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
    .nunique()
)
raw_no_primary_symbol_record_count_hgnc

0

how many gene records that have a primary gene symbol that is a LOC placeholder?

In [658]:
raw_loc_record_count_hgnc = (
    mini_hgnc_df[mini_hgnc_df["gene_symbol"].str.startswith('LOC', na=False)&
    ~mini_hgnc_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
    .nunique()
)
raw_loc_record_count_hgnc

0

In [659]:
create_ap_collision_df(mini_hgnc_df, "HGNC")

All collisions are present in gene alias lists.


Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,100,ASIC1,BNaC2,41,ENSG00000110881
0,100,ASIC1,hBNaC2,41,ENSG00000110881
1,10000,RGS4,,5999,ENSG00000117152
2,10001,RGS5,,8490,ENSG00000143248
3,10002,RGS6,,9628,ENSG00000182732


In [660]:
mini_hgnc_df = pd.read_csv(
    "../output/mini_hgnc_df.csv", index_col=[0])

In [661]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])
subset_genes_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,100,ASIC1,BNaC2,41,ENSG00000110881
0,100,ASIC1,hBNaC2,41,ENSG00000110881
1,10000,RGS4,,5999,ENSG00000117152
2,10001,RGS5,,8490,ENSG00000143248
3,10002,RGS6,,9628,ENSG00000182732
...,...,...,...,...,...
44232,9997,RGS16,RGS-r,6004,ENSG00000143333
44233,9998,RGS2,,5997,ENSG00000116741
44234,9999,RGS3,C2PA,5998,ENSG00000138835
44234,9999,RGS3,FLJ20370,5998,ENSG00000138835


In [662]:
merged_alias_ap_collision_hgnc_df = pd.read_csv(
    "../output/merged_alias_ap_collision_hgnc_df.csv", index_col=[0])
merged_alias_ap_collision_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,collision,source,NCBI_ID,ENSG_ID,alias_symbol
0,14947,PPP1R12C,AAVS1,HGNC,54776,ENSG00000125503,"AAVS1,DKFZP434D0412,MBS85,p84,p85"
1,11177,SOAT1,ACAT1,HGNC,6646,ENSG00000057252,"ACAT,ACAT1"
2,11178,SOAT2,ACAT2,HGNC,8435,ENSG00000167780,ACAT2
3,1612,CCRL2,ACKR5,HGNC,9034,ENSG00000121797,"ACKR5,CKRX,CRAM-A,CRAM-B,HCR"
4,7694,NDUFAB1,ACP1,HGNC,4706,ENSG00000004779,"ACP,ACP1,FASN2A,SDAP"
...,...,...,...,...,...,...,...
556,3147,ECEL1,XCE,HGNC,9427,ENSG00000171551,"DINE,XCE"
557,12904,ZNF121,ZNF20,HGNC,7675,ENSG00000197961,"ZHC32,ZNF20"
558,21159,RNF141,ZNF230,HGNC,50862,ENSG00000110315,"ZFP26,ZNF230"
559,12886,ZNF106,ZNF474,HGNC,64397,ENSG00000103994,"SH3BP3,ZNF474"


how many gene records are in the HGNC database?

In [663]:
gene_record_set_hgnc = set(mini_hgnc_df['HGNC_ID'])
gene_record_count_hgnc = len(gene_record_set_hgnc)
gene_record_count_hgnc

44235

how many unique primary gene symbols are in the HGNC data set?

In [664]:
primary_symbol_set_hgnc = set(mini_hgnc_df['gene_symbol'])
primary_symbol_count_hgnc = len(primary_symbol_set_hgnc)
primary_symbol_count_hgnc

44235

how many unique alias gene symbols are in the HGNC data set?

In [665]:
alias_symbol_set_hgnc = set(mini_hgnc_df['alias_symbol'])
alias_symbol_count_hgnc = len(alias_symbol_set_hgnc)
alias_symbol_count_hgnc

43075

how many unique gene symbols in total are there in the HGNC data set??

In [666]:
total_symbol_count_hgnc = len(set(mini_hgnc_df['gene_symbol']) | set(mini_hgnc_df['alias_symbol']))
total_symbol_count_hgnc

86842

how many gene records have no alias symbols?

In [667]:
no_alias_symbol_record_count_hgnc = (
    mini_hgnc_df[mini_hgnc_df["alias_symbol"].isna()]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
    .nunique()
)
no_alias_symbol_record_count_hgnc

21768

how many gene records have a primary gene symbol that is a C#orf?

In [668]:
orf_record_set_hgnc = set(
    mini_hgnc_df[
        mini_hgnc_df["gene_symbol"].str.contains(r'^C.*ORF', case=False, na=False) &
        ~mini_hgnc_df["gene_symbol"].str.contains("-", na=False)
    ]
    .dropna(subset=["HGNC_ID"])["HGNC_ID"]
)

orf_record_count_hgnc = len(orf_record_set_hgnc)
orf_record_count_hgnc

246

how many gene records that have a primary gene symbol that is a FAM placeholder?

In [669]:
fam_record_set_hgnc = set(
    mini_hgnc_df[mini_hgnc_df["gene_symbol"].str.startswith('FAM', na=False)&
    ~mini_hgnc_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
)
fam_record_count_hgnc = len(fam_record_set_hgnc)
fam_record_count_hgnc

371

how many gene records that have a primary gene symbol that is a KIAA placeholder?

In [670]:
kiaa_record_set_hgnc = set(
    mini_hgnc_df[mini_hgnc_df["gene_symbol"].str.startswith('KIAA', na=False)&
    ~mini_hgnc_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
)
kiaa_record_count_hgnc = len(kiaa_record_set_hgnc)
kiaa_record_count_hgnc

32

which gene records share a primary symbol with other gene records?

In [671]:
# groupby name and return a boolean of whether each has more than 1 unique Country
multi_primary = mini_hgnc_df.groupby(["gene_symbol"]).HGNC_ID.nunique().gt(1)

num_rows = mini_hgnc_df[mini_hgnc_df.gene_symbol.isin(multi_primary[multi_primary].index)].shape[0]
print(num_rows)

# use loc to only see those values that have `True` in `multi_country`:
multi_record_same_symbol_df = mini_hgnc_df.loc[mini_hgnc_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol')
multi_record_same_symbol_df.head(60)

0


Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID


How many ambiguous symbols result from alias-primary collisions?

In [672]:
ap_collision_ambiguous_symbol_set_hgnc = set(
    merged_alias_ap_collision_hgnc_df["collision"]
)
ap_collision_ambiguous_symbol_count_hgnc = len(ap_collision_ambiguous_symbol_set_hgnc)
ap_collision_ambiguous_symbol_count_hgnc

492

How many records have at least one alias-primary collision (alias that matches another record's primary gene symbol)?

In [673]:
ap_record_set_hgnc = set(merged_alias_ap_collision_hgnc_df["HGNC_ID"])
ap_record_count_hgnc = len(ap_record_set_hgnc)
ap_record_count_hgnc

547

# NCBI Info

In [674]:
ncbi_file_path = "../input/Homo_sapiens.gene_info20250625"

mini_ncbi_df = pd.read_csv(ncbi_file_path, sep="\t")
# Drop all columns besides ENSG_ID, gene_symbol, and alias_symbol
mini_ncbi_df = mini_ncbi_df[
["GeneID", "Symbol", "Synonyms", "dbXrefs"]
]
mini_ncbi_df = mini_ncbi_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Synonyms": "alias_symbol"})


Split dbXrefs into individual columns

In [675]:
mini_ncbi_df = mini_ncbi_df.assign(
    MIM=np.nan,
    HGNC_ID=np.nan,
    ENSG_ID=np.nan,
    AllianceGenome=np.nan,
    MIRbase=np.nan,
    IMGTgene_db=np.nan,
    dash=np.nan,
    unknown=np.nan,
)

In [676]:
index_pos = 0

print(len(mini_ncbi_df))
while index_pos < len(mini_ncbi_df):
    xrefs = mini_ncbi_df["dbXrefs"][index_pos].split("|")

    for xref in xrefs:
        xref = xref.lower()
        if xref.startswith("mim:"):
            xref = xref.replace("mim:", "")
            mini_ncbi_df["MIM"][index_pos] = xref
        elif xref.startswith("hgnc:hgnc:"):
            xref = xref.replace("hgnc:hgnc:", "")
            mini_ncbi_df["HGNC_ID"][index_pos] = xref
        elif xref.startswith("ensembl:"):
            xref = xref.replace("ensembl:", "")
            mini_ncbi_df["ENSG_ID"][index_pos] = xref
        elif xref.startswith("alliancegenome:"):
            xref = xref.replace("alliancegenome:", "")
            mini_ncbi_df["AllianceGenome"][index_pos] = xref
        elif xref.startswith("mirbase"):
            xref = xref.replace("mirbase:", "")
            mini_ncbi_df["MIRbase"][index_pos] = xref
        elif xref.startswith("imgt/gene-db:"):
            xref = xref.replace("imgt/gene-db:", "")
            mini_ncbi_df["IMGTgene_db"][index_pos] = xref
        elif xref.startswith("-"):
            mini_ncbi_df["dash"][index_pos] = xref
        else:
            mini_ncbi_df["unknown"][index_pos] = xref

    index_pos += 1
    pass

print(index_pos)

193580


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  mini_ncbi_df["MIM"][index_pos] = xref
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["MIM"][index

193580


In [677]:
mini_ncbi_df["ENSG_ID"] = mini_ncbi_df["ENSG_ID"].str.replace("ensg", "ENSG", 1)

In [678]:
mini_ncbi_df = mini_ncbi_df.drop(
    [
        "AllianceGenome",
        "MIRbase",
        "IMGTgene_db",
        "dash",
        "unknown",
        "dbXrefs",
        "MIM",
    ],
    axis=1,
)

need to remove phenotype records. This file (ncbi_records_w_property_pheno_only_20250630.txt) was generated through the NCBI Gene web page by searching for these parameters "phenotype only [Properties] AND "Homo sapiens"[porgn:__txid9606]"

In [679]:
ncbi_pheno_records_df = pd.read_csv("../input/ncbi_records_w_property_pheno_only_20250630.txt", sep="\t")

ncbi_pheno_records_df = ncbi_pheno_records_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Aliases": "alias_symbol"})

In [680]:
mini_ncbi_df = mini_ncbi_df[~mini_ncbi_df['NCBI_ID'].isin(ncbi_pheno_records_df['NCBI_ID'])]

In [681]:
# Extract date from filename and check if it is before June 25, 2025
match = re.search(r'(\d{8})', ncbi_file_path)
file_date = datetime.strptime(match.group(1), "%Y%m%d") if match else None
cutoff_date = datetime.strptime("20250625", "%Y%m%d")

# Apply list-splitting logic for newer files
if file_date and file_date >= cutoff_date:
    mini_ncbi_df['alias_symbol'] = (
        mini_ncbi_df['alias_symbol']
        .fillna('')
        .str.split('|')
        .apply(lambda x: [a.strip() for a in x if a.strip()])
    )
    mini_ncbi_df = mini_ncbi_df.explode('alias_symbol')
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5,ENSG00000121410
0,1,A1BG,ABG,5,ENSG00000121410
0,1,A1BG,GAB,5,ENSG00000121410
0,1,A1BG,HYST2477,5,ENSG00000121410
1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...
193575,8923215,trnD,-,,
193576,8923216,trnP,-,,
193577,8923217,trnA,-,,
193578,8923218,COX1,-,,


In [682]:
mini_ncbi_df = mini_ncbi_df.replace(" ", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("-", np.nan)

how many gene records for the NCBI data set before any kind of cleaning?

In [683]:
total_raw_gene_record_count_ncbi = mini_ncbi_df['NCBI_ID'].dropna().nunique()
total_raw_gene_record_count_ncbi

192410

how many gene records have no primary gene symbol?

In [684]:
raw_no_primary_symbol_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].isna()]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
raw_no_primary_symbol_record_count_ncbi

0

how many gene records that have a primary gene symbol that is a LOC placeholder?

In [685]:
raw_loc_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].str.startswith('LOC', na=False)&
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
raw_loc_record_count_ncbi

147698

Drop sub species records: "homo sapiens ssp denisova"[Organism] 

In [686]:
ncbi_denisova_records_df = pd.read_csv("../input/ncbi_denisova_20250630.txt", sep="\t")

ncbi_denisova_records_df = ncbi_denisova_records_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Aliases": "alias_symbol"})

In [687]:
mini_ncbi_df = mini_ncbi_df[~mini_ncbi_df['NCBI_ID'].isin(ncbi_denisova_records_df['NCBI_ID'])]

Drop sub species records: "homo sapiens neanderthalensis"[Organism] 

In [688]:
ncbi_neanderthalensis_records_df = pd.read_csv("../input/ncbi_neanderthalensis_20250630.txt", sep="\t")

ncbi_neanderthalensis_records_df = ncbi_neanderthalensis_records_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Aliases": "alias_symbol"})

In [689]:
mini_ncbi_df = mini_ncbi_df[~mini_ncbi_df['NCBI_ID'].isin(ncbi_neanderthalensis_records_df['NCBI_ID'])]

In [690]:
create_ap_collision_df(mini_ncbi_df, "NCBI")

All collisions are present in gene alias lists.


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5,ENSG00000121410
0,1,A1BG,ABG,5,ENSG00000121410
0,1,A1BG,GAB,5,ENSG00000121410
0,1,A1BG,HYST2477,5,ENSG00000121410
1,2,A2M,A2MD,7,ENSG00000175899


In [691]:
mini_ncbi_df = pd.read_csv(
    "../output/mini_ncbi_df.csv", index_col=[0])

In [692]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])
subset_genes_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5,ENSG00000121410
0,1,A1BG,ABG,5,ENSG00000121410
0,1,A1BG,GAB,5,ENSG00000121410
0,1,A1BG,HYST2477,5,ENSG00000121410
1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...
193502,141732005,ADCY2-AS1,,40064,
193503,141732006,NSG2-AS1,,41074,
193504,141732007,ST18-AS1,,58430,
193505,141732008,MICAL2-AS1,,58437,


In [693]:
merged_alias_ap_collision_ncbi_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ncbi_df.csv", index_col=[0])
merged_alias_ap_collision_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,collision,source,HGNC_ID,ENSG_ID,alias_symbol
0,3494,IGHA2,A2M,NCBI,5479,ENSG00000211890,A2M
1,54776,PPP1R12C,AAVS1,NCBI,14947,ENSG00000125503,"AAVS1,LENG3,MBS85,p84,p85"
2,6646,SOAT1,ACAT1,NCBI,11177,ENSG00000057252,"ACACT,ACAT,ACAT-1,ACAT1,SOAT,STAT"
3,8435,SOAT2,ACAT2,NCBI,11178,ENSG00000167780,"ACACT2,ACAT2,ARGP2"
4,9034,CCRL2,ACKR5,NCBI,1612,ENSG00000121797,"ACKR5,CKRX,CRAM,CRAM-A,CRAM-B,HCR"
...,...,...,...,...,...,...,...
2032,29903,CCDC106,ZNF581,NCBI,30181,ENSG00000173581,"HSU79303,ZNF581"
2033,146540,ZNF785,ZNF688,NCBI,26496,ENSG00000197162,ZNF688
2034,57829,ZP4,ZP1,NCBI,15770,ENSG00000116996,"ZBP,ZP1,ZP1B,ZPB,ZPB2,Zp-4"
2036,55663,ZNF446,ZSCAN30,NCBI,21036,ENSG00000083838,"ZKSCAN20,ZSCAN30,ZSCAN52"


how many gene records are in the NCBI database?

In [694]:
gene_record_set_ncbi = set(mini_ncbi_df['NCBI_ID'])
gene_record_count_ncbi = len(gene_record_set_ncbi)
gene_record_count_ncbi

44639

how many unique primary gene symbols are in the NCBI data set?

In [695]:
primary_symbol_set_ncbi = set(mini_ncbi_df['gene_symbol'])
primary_symbol_count_ncbi = len(primary_symbol_set_ncbi)
primary_symbol_count_ncbi

44541

how many unique alias gene symbols are in the NCBI data set?

In [696]:
alias_symbol_set_ncbi = set(mini_ncbi_df['alias_symbol'])
alias_symbol_count_ncbi = len(alias_symbol_set_ncbi)
alias_symbol_count_ncbi

69227

how many unique gene symbols in total are there in the NCBI data set??

In [697]:
total_symbol_count_ncbi = pd.concat([mini_ncbi_df["gene_symbol"], mini_ncbi_df["alias_symbol"]]).dropna().nunique()
total_symbol_count_ncbi

112294

how many gene records have no alias symbols?

In [698]:
no_alias_symbol_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["alias_symbol"].isna()]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
no_alias_symbol_record_count_ncbi

17380

how many gene records have a primary gene symbol that is a C#orf?

In [699]:
orf_record_set_ncbi = set(mini_ncbi_df[
    mini_ncbi_df['gene_symbol']
    .str.contains(r'^C.*ORF', case=False, na=False) &
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)
])
orf_record_count_ncbi = len(orf_record_set_ncbi)
orf_record_count_ncbi

5

how many gene records that have a primary gene symbol that is a FAM placeholder?

In [700]:
fam_record_set_ncbi = set(
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].str.startswith('FAM', na=False)&
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
)
fam_record_count_ncbi = len(fam_record_set_ncbi)
fam_record_count_ncbi

371

how many gene records that have a primary gene symbol that is a KIAA placeholder?

In [701]:
kiaa_record_set_ncbi = set(
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].str.startswith('KIAA', na=False)&
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
)
kiaa_record_count_ncbi = len(kiaa_record_set_ncbi)
kiaa_record_count_ncbi

33

which gene records share a primary symbol with other gene records?

In [702]:
# groupby name and return a boolean of whether each has more than 1 unique Country
multi_primary = mini_ncbi_df.groupby(["gene_symbol"]).NCBI_ID.nunique().gt(1)

num_rows = mini_ncbi_df[mini_ncbi_df.gene_symbol.isin(multi_primary[multi_primary].index)].shape[0]
print(num_rows)

# use loc to only see those values that have `True` in `multi_country`:
multi_record_same_symbol_df = mini_ncbi_df.loc[mini_ncbi_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol')
multi_record_same_symbol_df.head(60)

110


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
3550,4549,RNR1,MTRNR1,7470,
4762,6052,RNR1,,10082,
3551,4550,RNR2,MTRNR2,7471,
4763,6053,RNR2,,10083,
65892,124901565,TRNAA-AGC,,0,
65891,124901564,TRNAA-AGC,,0,
65890,124901563,TRNAA-AGC,,0,
65889,124901562,TRNAA-AGC,,0,
70782,124906586,TRNAA-AGC,,0,
65888,124901561,TRNAA-AGC,,0,


How many ambiguous symbols result from alias-primary collisions?

In [703]:
ap_collision_ambiguous_symbol_set_ncbi = set(
    merged_alias_ap_collision_ncbi_df["collision"]
)
ap_collision_ambiguous_symbol_count_ncbi = len(ap_collision_ambiguous_symbol_set_ncbi)
ap_collision_ambiguous_symbol_count_ncbi

1394

How many records have at least one alias-primary collision (alias that matches another record's primary gene symbol)?

In [704]:
ap_record_set_ncbi = set(merged_alias_ap_collision_ncbi_df["NCBI_ID"])
ap_record_count_ncbi = len(ap_record_set_ncbi)
ap_record_count_ncbi

1614

# Merge 3 sets together

In [705]:
merged_alias_primary_collisions_df = pd.concat(
    [
        merged_alias_ap_collision_hgnc_df[
            ["gene_symbol", "alias_symbol", "collision", "source", "ENSG_ID", "HGNC_ID", "NCBI_ID"]
        ],
        merged_alias_ap_collision_ncbi_df[
            ["gene_symbol", "alias_symbol", "collision", "source", "ENSG_ID", "HGNC_ID", "NCBI_ID"]
        ],
        merged_alias_ap_collision_ensg_df[
            ["gene_symbol", "alias_symbol", "collision", "source", "ENSG_ID", "HGNC_ID", "NCBI_ID"]
        ],
    ]
)
merged_alias_primary_collisions_df

Unnamed: 0,gene_symbol,alias_symbol,collision,source,ENSG_ID,HGNC_ID,NCBI_ID
0,PPP1R12C,"AAVS1,DKFZP434D0412,MBS85,p84,p85",AAVS1,HGNC,ENSG00000125503,14947,54776
1,SOAT1,"ACAT,ACAT1",ACAT1,HGNC,ENSG00000057252,11177,6646
2,SOAT2,ACAT2,ACAT2,HGNC,ENSG00000167780,11178,8435
3,CCRL2,"ACKR5,CKRX,CRAM-A,CRAM-B,HCR",ACKR5,HGNC,ENSG00000121797,1612,9034
4,NDUFAB1,"ACP,ACP1,FASN2A,SDAP",ACP1,HGNC,ENSG00000004779,7694,4706
...,...,...,...,...,...,...,...
806,ZNF121,"D19S204,ZHC32,ZNF20",ZNF20,ENSG,ENSG00000197961,12904,7675
807,RNF141,"ZFP26,ZNF230",ZNF230,ENSG,ENSG00000110315,21159,50862
808,ZNF322P1,"ZNF322,ZNF322B",ZNF322,ENSG,ENSG00000188801,14003,0
809,ZNF106,"SH3BP3,ZFP106,ZNF474",ZNF474,ENSG,ENSG00000103994,12886,64397


In [706]:
merged_alias_primary_collisions_df.loc[
    merged_alias_primary_collisions_df["collision"] == "KRAS"
]

Unnamed: 0,gene_symbol,alias_symbol,collision,source,ENSG_ID,HGNC_ID,NCBI_ID
964,NRAS,"ALPS4,CMNS,KRAS,N-ras,NCMS,NRAS1,NS6",KRAS,NCBI,ENSG00000213281,7989,4893


# Convert to csv

In [707]:
merged_alias_primary_collisions_df.to_csv(
    "../output/merged_alias_primary_collisions_df.csv", index=False
)

In [708]:
common_ap_collision_ambiguous_symbol_set = (
    ap_collision_ambiguous_symbol_set_ncbi
    & ap_collision_ambiguous_symbol_set_hgnc
    & ap_collision_ambiguous_symbol_set_ensg
)
common_ap_collision_ambiguous_symbol_set

{'ACAT1',
 'ACAT2',
 'ACP1',
 'ACTBP2',
 'ADA2',
 'ADRA1A',
 'AGT',
 'AIP',
 'ALB',
 'AMN',
 'APC2',
 'AR',
 'ARC',
 'ARG1',
 'ARHGAP10',
 'ARL1',
 'ARSB',
 'ASIP',
 'ATF1',
 'ATR',
 'AVP',
 'B3GNT8',
 'BACH1',
 'BAP1',
 'BCAM',
 'BCRP1',
 'BCRP2',
 'BDP1',
 'BNC1',
 'BRAP',
 'BRCC3',
 'BRI3',
 'BST1',
 'BTF3',
 'C6',
 'C7',
 'CA11',
 'CABP1',
 'CAD',
 'CAMP',
 'CAP1',
 'CAP2',
 'CAPS',
 'CAPS2',
 'CARF',
 'CAST',
 'CBLC',
 'CCR10',
 'CCR4',
 'CDH1',
 'CDH20',
 'CDH7',
 'CDKN1A',
 'CDS1',
 'CES2',
 'CHD5',
 'CHL1',
 'CHP1',
 'CIC',
 'CKLF',
 'CLC',
 'CLCP1',
 'CLIP4',
 'CNP',
 'CNR1',
 'CNR2',
 'COP1',
 'CPA3',
 'CPA6',
 'CPD',
 'CPN1',
 'CPN2',
 'CPS1',
 'CREB3',
 'CRIP1',
 'CRP',
 'CSN2',
 'CSN3',
 'CSNK2A1',
 'CST6',
 'CSTP1',
 'CTAGE4',
 'CTH',
 'DAO',
 'DAP',
 'DAP3',
 'DBI',
 'DBP',
 'DEF6',
 'DIO1',
 'DLC1',
 'DMC1',
 'DMP1',
 'DOK1',
 'DRC3',
 'DRG1',
 'DRP2',
 'DSC1',
 'DUSP1',
 'DUSP23',
 'DUSP26',
 'EAF2',
 'EBP',
 'ECD',
 'EIF2A',
 'ELF1',
 'ELK1',
 'ELN',
 'EMB',
 'EPO',
 

In [709]:
len(common_ap_collision_ambiguous_symbol_set)

434