### Import 

In [2415]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [2416]:
def create_ap_collision_df(mini_xxxx_df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Create a df of alias-primary collision symbols 

    :param mini_xxxx_df: Processed df of gene records
    :param source: Representation of the source of the gene records
    :param split_on_character: Character that is used to seperate alias symbols in the mini_xxxx_df
    :return: A df of genes that share an alias with another gene
    """

    #Strip "HGNC" from HGNC ID
    mini_xxxx_df["HGNC_ID"] = mini_xxxx_df["HGNC_ID"].str.lstrip("HGNC:")

    #Remove placeholder gene records (those with no primary gene symbol)
    mini_xxxx_df = mini_xxxx_df.replace(" ", np.nan)
    mini_xxxx_df = mini_xxxx_df.replace("", np.nan)
    mini_xxxx_df = mini_xxxx_df.replace("-", np.nan)
    mini_xxxx_df = mini_xxxx_df.dropna(subset=["gene_symbol"])

    #Remove placeholder LOC genes
    mini_xxxx_df = mini_xxxx_df[~mini_xxxx_df['gene_symbol'].str.startswith('LOC', na=False)]

    mini_xxxx_df['HGNC_ID'] = mini_xxxx_df['HGNC_ID'].fillna(0).astype(int)
    mini_xxxx_df['NCBI_ID'] = mini_xxxx_df['NCBI_ID'].fillna(0).astype(int)

    #mini_xxx_df
    ## no LOC genes (only present in NCBI)
    ## no records with no primary symbols (only present in ENSG)
    ## each alias on a separate row
    #Convert the df into a csv and save
    mini_xxxx_df.to_csv(f'../output/mini_{source.lower()}_df.csv', index=True) 

    #Make a new df where the alias symbols are merged together for each record
    merged_alias_xxxx_df = mini_xxxx_df.copy()
    merged_alias_xxxx_df["alias_symbol"] = merged_alias_xxxx_df["alias_symbol"].fillna("").astype(str)
    merged_alias_xxxx_df = (
    merged_alias_xxxx_df.groupby(
        [f"{source}_ID"], dropna=False
    )["alias_symbol"]
    .apply(lambda x: ",".join(x.dropna())).reset_index())

    #merged_alias_xxxx_df
    ##merging aliases associated with the same record (same gene record identifier) into a list
    #Convert the df into a csv and save
    merged_alias_xxxx_df.to_csv(f'../output/merged_alias_{source}_df.csv', index=True) 

    #Make a set of the primary gene symbols
    xxxx_gene_symbol_set = set(mini_xxxx_df["gene_symbol"])
    uppercased_xxxx_gene_symbol_set = {s.upper() for s in xxxx_gene_symbol_set}

    
    subset_genes_xxxx_df = mini_xxxx_df.copy()

    #Remove alias symbols that are an exact match to their respective primary gene symbol
    subset_genes_xxxx_df["alias_symbol_upper"] = subset_genes_xxxx_df["alias_symbol"].str.upper()
    subset_genes_xxxx_df["gene_symbol_upper"] = subset_genes_xxxx_df["gene_symbol"].str.upper()
    subset_genes_xxxx_df = subset_genes_xxxx_df[subset_genes_xxxx_df['gene_symbol_upper'] != subset_genes_xxxx_df['alias_symbol_upper']]
    subset_genes_xxxx_df = subset_genes_xxxx_df.drop(["alias_symbol_upper","gene_symbol_upper"], axis=1)

    #subset_genes_xxxx_df
    ## no aliases that match the primary gene symbol
    ## each alias on a separate row
    #Convert the df into a csv and save
    subset_genes_xxxx_df.to_csv(f'../output/subset_genes_{source}_df.csv', index=True) 

    #Create df with genes that have an alias that can be found as another gene's primary gene symbol
    ap_collision_xxxx_df = subset_genes_xxxx_df.copy()
    ap_collision_xxxx_df["alias_symbol_upper"] = ap_collision_xxxx_df["alias_symbol"].str.upper()

    #makes list of aliases into a set
    ap_collision_xxxx_df.loc[:, "alias_symbol_upper"] = ap_collision_xxxx_df["alias_symbol_upper"].apply(lambda x: {x})

    ap_collision_xxxx_df.loc[:, "collision"] = ap_collision_xxxx_df["alias_symbol_upper"].apply(
        lambda x: x & uppercased_xxxx_gene_symbol_set
    )
    ap_collision_xxxx_df = ap_collision_xxxx_df.drop("alias_symbol_upper", axis=1)
    ap_collision_xxxx_df = ap_collision_xxxx_df[ap_collision_xxxx_df["collision"].apply(lambda x: len(x) > 0)]
    #changes back the set of aliases to a list, and make the set of collisions to a list
    ap_collision_xxxx_df['collision'] = ap_collision_xxxx_df['collision'].apply(
        lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x
    )
    ap_collision_xxxx_df = ap_collision_xxxx_df.sort_values("collision")

    #Add a source tag for future merging efforts
    ap_collision_xxxx_df["source"] = str(source.upper())

    #ap_collision_xxxx_df
    ## only the gene records with an alias that matches another gene record's primary gene symbol
    ## each alias on a separate row
    #Convert the df into a csv
    ap_collision_xxxx_df.to_csv(f'../output/single_alias_ap_collision_{source.lower()}_df.csv', index=True)

    #Create a secondary collision df that merges the alias symbols for each record
    merged_alias_ap_collision_xxxx_df = ap_collision_xxxx_df.drop(columns=['alias_symbol'])
    merged_alias_ap_collision_xxxx_df = pd.merge(merged_alias_ap_collision_xxxx_df, merged_alias_xxxx_df, on=[f"{source}_ID"], how="left")
    merged_alias_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df[["gene_symbol","alias_symbol","ENSG_ID","HGNC_ID","NCBI_ID","collision","source"]]
    merged_alias_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df.drop_duplicates(subset=[f"{source}_ID"])

    #Test to make sure all collisions are in the alias list for the record
    test_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df.copy()
    test_ap_collision_xxxx_df["alias_symbol_upper"] = test_ap_collision_xxxx_df["alias_symbol"].str.upper()   
    test_ap_collision_xxxx_df['alias_symbol_upper'] = test_ap_collision_xxxx_df['alias_symbol_upper'].apply(lambda x: x if isinstance(x, list) else [x])
    test_ap_collision_xxxx_df['collision_in_alias'] = test_ap_collision_xxxx_df.apply(lambda row: row['collision'] in test_ap_collision_xxxx_df['alias_symbol_upper'].values, axis=1)
    true_count = test_ap_collision_xxxx_df['collision_in_alias'].sum()
    test_ap_collision_xxxx_df = test_ap_collision_xxxx_df.drop("alias_symbol_upper", axis=1)

    if true_count == 0:
        #merged_alias_ap_collision_xxxx_df
        ## only the gene records with an alias that matches another gene record's primary gene symbol
        ## merging aliases associated with the same record (same gene record identifier) into a list
        #Convert the df into a csv and save
        merged_alias_ap_collision_xxxx_df.to_csv(f'../output/merged_alias_ap_collision_{source.lower()}_df.csv', index=True)
        print("All collisions are present in gene alias lists.")
    else:
        print("Some collisions are not present in gene alias lists.")

    return mini_xxxx_df.head()

# Ensembl

In [2417]:
mini_ensg_df = pd.read_csv(
    "../input/ensg_biomart_gene20250625.txt", sep="\t",dtype={"NCBI gene (formerly Entrezgene) ID": pd.Int64Dtype()}
)
mini_ensg_df = mini_ensg_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Gene Synonym": "alias_symbol",
        "Gene name": "gene_symbol",
        "Gene stable ID": "ENSG_ID",
        "NCBI gene (formerly Entrezgene) ID": "NCBI_ID",
    }
)
mini_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,HGNC:7481,MTTF,MT-TF
1,ENSG00000210049,,HGNC:7481,TRNF,MT-TF
2,ENSG00000211459,,HGNC:7470,12S,MT-RNR1
3,ENSG00000211459,,HGNC:7470,MOTS-C,MT-RNR1
4,ENSG00000211459,,HGNC:7470,MTRNR1,MT-RNR1
...,...,...,...,...,...
133060,ENSG00000229388,,HGNC:52502,LINC01715,TAF12-DT
133061,ENSG00000289291,,,,
133062,ENSG00000274978,26824,HGNC:10108,RNU11-1,RNU11
133063,ENSG00000274978,26824,HGNC:10108,U11,RNU11


In [2418]:
mini_ensg_df = mini_ensg_df.replace(" ", np.nan)
mini_ensg_df = mini_ensg_df.replace("", np.nan)
mini_ensg_df = mini_ensg_df.replace("-", np.nan)
mini_ensg_df = mini_ensg_df.replace("<NA>", np.nan)

how many gene records for the Ensembl data set before any kind of cleaning?

In [2419]:
len(set(mini_ensg_df['ENSG_ID']))

86364

how many unique primary gene symbols are in the Ensembl data set?

In [2420]:
len(set(mini_ensg_df['gene_symbol']))

41165

how many unique alias gene symbols are in the Ensembl data set?

In [2421]:
len(set(mini_ensg_df['alias_symbol']))

55413

how many unique gene symbols in total are there in the Ensembl data set??

In [2422]:
len(set(mini_ensg_df['gene_symbol']) | set(mini_ensg_df['alias_symbol']))

95942

how many gene records have no primary gene symbol?

In [2423]:
unique_ensg_df = mini_ensg_df[mini_ensg_df['ENSG_ID'].duplicated(keep=False) == False]

# Count rows with missing gene symbols before dropping
no_symbol_count = mini_ensg_df["gene_symbol"].isna().sum()
no_symbol_count

38041

how many gene records have no alias symbols?

In [2424]:
# Now count rows in that subset where 'alias_symbol' is missing
no_alias_symbol_count = unique_ensg_df["alias_symbol"].isna().sum()
no_alias_symbol_count

57009

how many gene records have a primary gene symbol that is a C#orf?

In [2425]:
orf_rows = mini_ensg_df[
    mini_ensg_df['gene_symbol']
    .str.contains(r'^C.*ORF', case=False, na=False) &
    ~mini_ensg_df['gene_symbol']
    .str.contains('-', na=False)
]
len(orf_rows)

420

how many gene records that have a primary gene symbol that is a LOC placeholder?

In [2426]:
count_loc = mini_ensg_df['gene_symbol'].str.startswith('LOC', na=False).sum()
count_loc

0

how many gene records that have a primary gene symbol that is a FAM placeholder?

In [2427]:
count_FAM = mini_ensg_df['gene_symbol'].str.startswith('FAM', na=False).sum()
count_FAM

738

how many gene records that have a primary gene symbol that is a KIAA placeholder?

In [2428]:
count_KIAA = mini_ensg_df['gene_symbol'].str.startswith('KIAA', na=False).sum()
count_KIAA

51

which gene records share a primary symbol with other gene records?

In [2429]:
# groupby name and return a boolean of whether each has more than 1 unique ENSG ID
multi_primary = mini_ensg_df.groupby(["gene_symbol"]).ENSG_ID.nunique().gt(1)

# use loc to only see those values that have `True` in `multi_primary`:
mini_ensg_df.loc[mini_ensg_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol').iloc[6000:6060]

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
92891,ENSG00000138641,8916,HGNC:4876,KIAA0032,HERC3
92878,ENSG00000287542,8916,,,HERC3
52117,ENSG00000273529,388585,HGNC:19764,BHLHB38,HES5
14696,ENSG00000197921,388585,HGNC:19764,BHLHB38,HES5
4416,ENSG00000280680,55733,HGNC:18270,MART-2,HHAT
4421,ENSG00000280680,55733,HGNC:18270,SKN,HHAT
4418,ENSG00000280680,55733,HGNC:18270,RASP,HHAT
4417,ENSG00000280680,55733,HGNC:18270,MART2,HHAT
43282,ENSG00000054392,55733,HGNC:18270,MART2,HHAT
43285,ENSG00000054392,55733,HGNC:18270,SKI,HHAT


In [2430]:
create_ap_collision_df(mini_ensg_df, "ENSG")

All collisions are present in gene alias lists.


Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,0,7481,MTTF,MT-TF
1,ENSG00000210049,0,7481,TRNF,MT-TF
2,ENSG00000211459,0,7470,12S,MT-RNR1
3,ENSG00000211459,0,7470,MOTS-C,MT-RNR1
4,ENSG00000211459,0,7470,MTRNR1,MT-RNR1


In [2431]:
mini_ensg_df = pd.read_csv(
    "../output/mini_ensg_df.csv", index_col=[0])

In [2432]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,0,7481,MTTF,MT-TF
1,ENSG00000210049,0,7481,TRNF,MT-TF
2,ENSG00000211459,0,7470,12S,MT-RNR1
3,ENSG00000211459,0,7470,MOTS-C,MT-RNR1
4,ENSG00000211459,0,7470,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,85028,30062,LINC00100,SNHG12
133059,ENSG00000197989,85028,30062,PNAS-123,SNHG12
133060,ENSG00000229388,0,52502,LINC01715,TAF12-DT
133062,ENSG00000274978,26824,10108,RNU11-1,RNU11


In [2433]:
merged_alias_ap_collision_ensg_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ensg_df.csv", index_col=[0])
merged_alias_ap_collision_ensg_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,RN7SK,7SK,ENSG00000283293,10037,125050,7SK,ENSG
1,SOAT1,"ACAT,ACAT1,SOAT,STAT",ENSG00000057252,11177,6646,ACAT1,ENSG
2,SOAT2,ACAT2,ENSG00000167780,11178,8435,ACAT2,ENSG
3,NDUFAB1,"ACP,ACP1,FASN2A,SDAP",ENSG00000004779,7694,4706,ACP1,ENSG
4,ACTBP8,ACTBP2,ENSG00000220267,141,0,ACTBP2,ENSG
...,...,...,...,...,...,...,...
806,ZNF121,"D19S204,ZHC32,ZNF20",ENSG00000197961,12904,7675,ZNF20,ENSG
807,RNF141,"ZFP26,ZNF230",ENSG00000110315,21159,50862,ZNF230,ENSG
808,ZNF322P1,"ZNF322,ZNF322B",ENSG00000188801,14003,0,ZNF322,ENSG
809,ZNF106,"SH3BP3,ZFP106,ZNF474",ENSG00000103994,12886,64397,ZNF474,ENSG


how many gene records have an alias that is also a primary gene symbol of another gene?

In [2434]:
len(set(merged_alias_ap_collision_ensg_df["ENSG_ID"]))

767

### Make a set of the primary gene symbols

In [2435]:
ensg_gene_symbol_set = set(mini_ensg_df["gene_symbol"])

In [2436]:
total_number_ensg_gene_symbols = len(ensg_gene_symbol_set)
total_number_ensg_gene_symbols

41164

How many collisions are there? How many records are involved in one?


In [2437]:
ensg_alias_primary_collision_set = set(merged_alias_ap_collision_ensg_df["collision"])
len(ensg_alias_primary_collision_set)

606

In [2438]:
ensg_alias_primary_collision_primary_symbol_set = set(merged_alias_ap_collision_ensg_df["gene_symbol"])
len(ensg_alias_primary_collision_primary_symbol_set)

676

In [2439]:
ensg_alias_primary_collision_gene_record_set = set(merged_alias_ap_collision_ensg_df["ENSG_ID"])
len(ensg_alias_primary_collision_gene_record_set)

767

1. Why is the alias-gene collision set not the same length as the set of primary symbols with collisions ?
2. Why is the length of the alias-gene collison set shorter?
 - A gene record with an alias-primary collision has an alias that matches a different gene's primary gene symbol.
 - Multiple gene records can share a single alias (alias-alias collision)
 - If that shared alias is an alias-primary collision, then there will be more unique gene symbols in the set of primary symbols with collisions than the set of alias-primary collisions. 

# HGNC

## Set up table

In [2440]:
hgnc_file_path = "../input/hgnc_biomart_gene20250625.txt"

mini_hgnc_df = pd.read_csv(
    hgnc_file_path, sep="\t"
)

# Rename columns
mini_hgnc_df = mini_hgnc_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Approved symbol": "gene_symbol",
        "Ensembl gene ID": "ENSG_ID",
    }
)

#structure and labeling in HGNC download files changed between 2024 amd 2025
if "Alias symbol" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"Alias symbol": "alias_symbol"})
elif "Alias symbols" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"Alias symbols": "alias_symbol"})
else:
    mini_hgnc_df["alias_symbol"] = pd.NA

if "NCBI gene ID" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"NCBI gene ID": "NCBI_ID"})
elif "NCBI Gene ID" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"NCBI Gene ID": "NCBI_ID"})
else:
    mini_hgnc_df["NCBI_ID"] = pd.NA   

mini_hgnc_df["NCBI_ID"] = mini_hgnc_df["NCBI_ID"].astype(pd.Int64Dtype())

# Extract date from filename and check if it is before June 25, 2025
match = re.search(r'(\d{8})', hgnc_file_path)
file_date = datetime.strptime(match.group(1), "%Y%m%d") if match else None
cutoff_date = datetime.strptime("20250625", "%Y%m%d")

# Apply list-splitting logic for newer files
if file_date and file_date >= cutoff_date:
    mini_hgnc_df['alias_symbol'] = (
        mini_hgnc_df['alias_symbol']
        .fillna('')
        .str.split(',')
        .apply(lambda x: [a.strip() for a in x if a.strip()])
    )
    mini_hgnc_df = mini_hgnc_df.explode('alias_symbol')

mini_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,HGNC:100,ASIC1,BNaC2,41,ENSG00000110881
0,HGNC:100,ASIC1,hBNaC2,41,ENSG00000110881
1,HGNC:10000,RGS4,,5999,ENSG00000117152
2,HGNC:10001,RGS5,,8490,ENSG00000143248
3,HGNC:10002,RGS6,,9628,ENSG00000182732
...,...,...,...,...,...
44232,HGNC:9997,RGS16,RGS-r,6004,ENSG00000143333
44233,HGNC:9998,RGS2,,5997,ENSG00000116741
44234,HGNC:9999,RGS3,C2PA,5998,ENSG00000138835
44234,HGNC:9999,RGS3,FLJ20370,5998,ENSG00000138835


In [2441]:
mini_hgnc_df = mini_hgnc_df.replace(" ", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("-", np.nan)

how many gene records for the HGNC data set before any kind of cleaning?

In [2442]:
len(set(mini_hgnc_df['HGNC_ID']))

44235

how many unique primary gene symbols are in the HGNC data set?

In [2443]:
len(set(mini_hgnc_df['gene_symbol']))

44235

how many unique alias gene symbols are in the HGNC data set?

In [2444]:
len(set(mini_hgnc_df['alias_symbol']))

64842

how many unique gene symbols in total are there in the HGNC data set??

In [2445]:
len(set(mini_hgnc_df['gene_symbol']) | set(mini_hgnc_df['alias_symbol']))

108609

how many gene records have no primary gene symbol?

In [2446]:
unique_hcng_df = mini_hgnc_df[mini_hgnc_df['HGNC_ID'].duplicated(keep=False) == False]
# Count rows with missing gene symbols before dropping
no_symbol_count = mini_hgnc_df["gene_symbol"].isna().sum()
no_symbol_count

0

how many gene records have no alias symbols?

In [2447]:
no_alias_symbol_count = unique_hcng_df["alias_symbol"].isna().sum()
no_alias_symbol_count

21767

how many gene records have a primary gene symbol that is a C#orf?

In [2448]:
orf_rows = mini_hgnc_df[
    mini_hgnc_df['gene_symbol']
    .str.contains(r'^C.*ORF', case=False, na=False) &
    ~mini_hgnc_df['gene_symbol']
    .str.contains('-', na=False)
]
len(orf_rows)

344

how many gene records that have a primary gene symbol that is a LOC placeholder?

In [2449]:
count_loc = mini_hgnc_df['gene_symbol'].str.startswith('LOC', na=False).sum()
count_loc

0

how many gene records that have a primary gene symbol that is a FAM placeholder?

In [2450]:
count_FAM = mini_hgnc_df['gene_symbol'].str.startswith('FAM', na=False).sum()
count_FAM

477

how many gene records that have a primary gene symbol that is a KIAA placeholder?

In [2451]:
count_KIAA = mini_hgnc_df['gene_symbol'].str.startswith('KIAA', na=False).sum()
count_KIAA

47

which gene records share a primary symbol with other gene records?

In [2452]:
# groupby name and return a boolean of whether each has more than 1 unique Country
multi_primary = mini_hgnc_df.groupby(["gene_symbol"]).HGNC_ID.nunique().gt(1)

# use loc to only see those values that have `True` in `multi_primary`:
mini_hgnc_df.loc[mini_hgnc_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol').head(50)

Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID


In [2453]:
create_ap_collision_df(mini_hgnc_df, "HGNC")

All collisions are present in gene alias lists.


Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,100,ASIC1,BNaC2,41,ENSG00000110881
0,100,ASIC1,hBNaC2,41,ENSG00000110881
1,10000,RGS4,,5999,ENSG00000117152
2,10001,RGS5,,8490,ENSG00000143248
3,10002,RGS6,,9628,ENSG00000182732


In [2454]:
mini_hgnc_df = pd.read_csv(
    "../output/mini_hgnc_df.csv", index_col=[0])

In [2455]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])
subset_genes_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,100,ASIC1,BNaC2,41,ENSG00000110881
0,100,ASIC1,hBNaC2,41,ENSG00000110881
1,10000,RGS4,,5999,ENSG00000117152
2,10001,RGS5,,8490,ENSG00000143248
3,10002,RGS6,,9628,ENSG00000182732
...,...,...,...,...,...
44232,9997,RGS16,RGS-r,6004,ENSG00000143333
44233,9998,RGS2,,5997,ENSG00000116741
44234,9999,RGS3,C2PA,5998,ENSG00000138835
44234,9999,RGS3,FLJ20370,5998,ENSG00000138835


In [2456]:
merged_alias_ap_collision_hgnc_df = pd.read_csv(
    "../output/merged_alias_ap_collision_hgnc_df.csv", index_col=[0])
merged_alias_ap_collision_hgnc_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,PPP1R12C,"DKFZP434D0412,p84,MBS85,p85,AAVS1",ENSG00000125503,14947,54776,AAVS1,HGNC
1,SOAT1,"ACAT,ACAT1",ENSG00000057252,11177,6646,ACAT1,HGNC
2,SOAT2,ACAT2,ENSG00000167780,11178,8435,ACAT2,HGNC
3,CCRL2,"HCR,CRAM-B,CKRX,CRAM-A,ACKR5",ENSG00000121797,1612,9034,ACKR5,HGNC
4,NDUFAB1,"SDAP,FASN2A,ACP,ACP1",ENSG00000004779,7694,4706,ACP1,HGNC
...,...,...,...,...,...,...,...
556,ECEL1,"XCE,DINE",ENSG00000171551,3147,9427,XCE,HGNC
557,ZNF121,"ZHC32,ZNF20",ENSG00000197961,12904,7675,ZNF20,HGNC
558,RNF141,"ZFP26,ZNF230",ENSG00000110315,21159,50862,ZNF230,HGNC
559,ZNF106,"ZNF474,SH3BP3",ENSG00000103994,12886,64397,ZNF474,HGNC


how many gene records have an alias that is also a primary gene symbol of another gene?

In [2457]:
len(set(merged_alias_ap_collision_hgnc_df["HGNC_ID"]))

547

### Make a set of the primary gene symbols

In [2458]:
hgnc_gene_symbol_set = set(mini_hgnc_df["gene_symbol"])

In [2459]:
total_number_hgnc_gene_symbols = len(hgnc_gene_symbol_set)
total_number_hgnc_gene_symbols

44235

How many collisions are there? How many records are involved in one?

In [2460]:
hgnc_alias_primary_collision_set = set(merged_alias_ap_collision_hgnc_df["collision"])
len(hgnc_alias_primary_collision_set)

492

In [2461]:
hgnc_alias_primary_collision_primary_symbol_set = set(merged_alias_ap_collision_hgnc_df["gene_symbol"])
len(hgnc_alias_primary_collision_primary_symbol_set)

547

In [2462]:
hgnc_alias_primary_collision_gene_record_set = set(merged_alias_ap_collision_hgnc_df["HGNC_ID"])
len(hgnc_alias_primary_collision_gene_record_set)

547

# NCBI Info

In [2463]:
ncbi_file_path = "../input/Homo_sapiens.gene_info20250625"

mini_ncbi_df = pd.read_csv(ncbi_file_path, sep="\t")
# Drop all columns besides ENSG_ID, gene_symbol, and alias_symbol
mini_ncbi_df = mini_ncbi_df[
["GeneID", "Symbol", "Synonyms", "dbXrefs"]
]
mini_ncbi_df = mini_ncbi_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Synonyms": "alias_symbol"})


Split dbXrefs into individual columns

In [2464]:
mini_ncbi_df = mini_ncbi_df.assign(
    MIM=np.nan,
    HGNC_ID=np.nan,
    ENSG_ID=np.nan,
    AllianceGenome=np.nan,
    MIRbase=np.nan,
    IMGTgene_db=np.nan,
    dash=np.nan,
    unknown=np.nan,
)

In [2465]:
index_pos = 0

print(len(mini_ncbi_df))
while index_pos < len(mini_ncbi_df):
    xrefs = mini_ncbi_df["dbXrefs"][index_pos].split("|")

    for xref in xrefs:
        xref = xref.lower()
        if xref.startswith("mim:"):
            xref = xref.replace("mim:", "")
            mini_ncbi_df["MIM"][index_pos] = xref
        elif xref.startswith("hgnc:hgnc:"):
            xref = xref.replace("hgnc:hgnc:", "")
            mini_ncbi_df["HGNC_ID"][index_pos] = xref
        elif xref.startswith("ensembl:"):
            xref = xref.replace("ensembl:", "")
            mini_ncbi_df["ENSG_ID"][index_pos] = xref
        elif xref.startswith("alliancegenome:"):
            xref = xref.replace("alliancegenome:", "")
            mini_ncbi_df["AllianceGenome"][index_pos] = xref
        elif xref.startswith("mirbase"):
            xref = xref.replace("mirbase:", "")
            mini_ncbi_df["MIRbase"][index_pos] = xref
        elif xref.startswith("imgt/gene-db:"):
            xref = xref.replace("imgt/gene-db:", "")
            mini_ncbi_df["IMGTgene_db"][index_pos] = xref
        elif xref.startswith("-"):
            mini_ncbi_df["dash"][index_pos] = xref
        else:
            mini_ncbi_df["unknown"][index_pos] = xref

    index_pos += 1
    pass

print(index_pos)

193580


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  mini_ncbi_df["MIM"][index_pos] = xref
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["MIM"][index

193580


In [2466]:
mini_ncbi_df["ENSG_ID"] = mini_ncbi_df["ENSG_ID"].str.replace("ensg", "ENSG", 1)

In [2467]:
mini_ncbi_df = mini_ncbi_df.drop(
    [
        "AllianceGenome",
        "MIRbase",
        "IMGTgene_db",
        "dash",
        "unknown",
        "dbXrefs",
        "MIM",
    ],
    axis=1,
)

In [2468]:
# Extract date from filename and check if it is before June 25, 2025
match = re.search(r'(\d{8})', ncbi_file_path)
file_date = datetime.strptime(match.group(1), "%Y%m%d") if match else None
cutoff_date = datetime.strptime("20250625", "%Y%m%d")

# Apply list-splitting logic for newer files
if file_date and file_date >= cutoff_date:
    mini_ncbi_df['alias_symbol'] = (
        mini_ncbi_df['alias_symbol']
        .fillna('')
        .str.split('|')
        .apply(lambda x: [a.strip() for a in x if a.strip()])
    )
    mini_ncbi_df = mini_ncbi_df.explode('alias_symbol')
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5,ENSG00000121410
0,1,A1BG,ABG,5,ENSG00000121410
0,1,A1BG,GAB,5,ENSG00000121410
0,1,A1BG,HYST2477,5,ENSG00000121410
1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...
193575,8923215,trnD,-,,
193576,8923216,trnP,-,,
193577,8923217,trnA,-,,
193578,8923218,COX1,-,,


In [2469]:
mini_ncbi_df = mini_ncbi_df.replace(" ", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("-", np.nan)

how many gene records for the NCBI data set before any kind of cleaning?

In [2470]:
total_raw_gene_record_count_ncbi = mini_ncbi_df['NCBI_ID'].dropna().nunique()
total_raw_gene_record_count_ncbi

193580

how many unique primary gene symbols are in the NCBI data set?

In [2471]:
raw_primary_symbol_count_ncbi = mini_ncbi_df['gene_symbol'].dropna().nunique()
raw_primary_symbol_count_ncbi

193427

how many unique alias gene symbols are in the NCBI data set?

In [2472]:
raw_alias_symbol_count_ncbi = mini_ncbi_df['alias_symbol'].dropna().nunique()
raw_alias_symbol_count_ncbi

69702

how many unique gene symbols in total are there in the NCBI data set??

In [2473]:
raw_total_symbol_count_ncbi = pd.concat([mini_ncbi_df["gene_symbol"], mini_ncbi_df["alias_symbol"]]).dropna().nunique()
raw_total_symbol_count_ncbi

261531

how many gene records have no primary gene symbol?

In [2474]:
no_primary_symbol_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].isna()]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
no_primary_symbol_record_count_ncbi

0

how many gene records have no alias symbols?

In [2475]:
no_alias_symbol_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["alias_symbol"].isna()]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
no_alias_symbol_record_count_ncbi

165885

how many gene records have a primary gene symbol that is a C#orf?

In [2476]:
orf_rows = mini_ncbi_df[
    mini_ncbi_df['gene_symbol']
    .str.contains(r'^C.*ORF', case=False, na=False) &
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)
]
orf_record_count_ncbi = orf_rows["NCBI_ID"].nunique()
orf_record_count_ncbi

248

how many gene records that have a primary gene symbol that is a LOC placeholder?

In [2477]:
loc_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].str.startswith('LOC', na=False)]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
loc_record_count_ncbi

147700

how many gene records that have a primary gene symbol that is a FAM placeholder?

In [2478]:
fam_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].str.startswith('FAM', na=False)]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
fam_record_count_ncbi

396

how many gene records that have a primary gene symbol that is a KIAA placeholder?

In [2479]:
kiaa_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].str.startswith('KIAA', na=False)]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
kiaa_record_count_ncbi

37

which gene records share a primary symbol with other gene records?

In [2480]:
# groupby name and return a boolean of whether each has more than 1 unique Country
multi_primary = mini_ncbi_df.groupby(["gene_symbol"]).NCBI_ID.nunique().gt(1)

# use loc to only see those values that have `True` in `multi_country`:
mini_ncbi_df.loc[mini_ncbi_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol').head(60)

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
193532,6775087,12S rRNA,,,
193573,8923213,12S rRNA,,,
193519,6775074,ATP6,,,
3522,4508,ATP6,ATPase6,7414.0,
3522,4508,ATP6,MTATP6,7414.0,
193548,8923188,ATP6,,,
3523,4509,ATP8,ATPase8,7415.0,
3523,4509,ATP8,MTATP8,7415.0,
193571,8923211,ATP8,,,
193522,6775077,ATP8,,,


In [2481]:
create_ap_collision_df(mini_ncbi_df, "NCBI")

All collisions are present in gene alias lists.


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5,ENSG00000121410
0,1,A1BG,ABG,5,ENSG00000121410
0,1,A1BG,GAB,5,ENSG00000121410
0,1,A1BG,HYST2477,5,ENSG00000121410
1,2,A2M,A2MD,7,ENSG00000175899


In [2482]:
mini_ncbi_df = pd.read_csv(
    "../output/mini_ncbi_df.csv", index_col=[0])

In [2483]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])
subset_genes_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5,ENSG00000121410
0,1,A1BG,ABG,5,ENSG00000121410
0,1,A1BG,GAB,5,ENSG00000121410
0,1,A1BG,HYST2477,5,ENSG00000121410
1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...
193575,8923215,trnD,,0,
193576,8923216,trnP,,0,
193577,8923217,trnA,,0,
193578,8923218,COX1,,0,


In [2484]:
merged_alias_ap_collision_ncbi_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ncbi_df.csv", index_col=[0])
merged_alias_ap_collision_ncbi_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,IGHA2,A2M,ENSG00000211890,5479,3494,A2M,NCBI
1,NPSR1-AS1,AAA1,ENSG00000197085,22128,404744,AAA1,NCBI
2,CFAP91,"AAT1,AAT1alpha,C3orf15,CaM-IP2,MAATS1,SPATA26,...",ENSG00000183833,24010,89876,AAT1,NCBI
3,GPT,"AAT1,ALT,ALT1,GPT1,SGPT",ENSG00000167701,4552,2875,AAT1,NCBI
4,PPP1R12C,"AAVS1,LENG3,MBS85,p84,p85",ENSG00000125503,14947,54776,AAVS1,NCBI
...,...,...,...,...,...,...,...
2167,CCDC106,"HSU79303,ZNF581",ENSG00000173581,30181,29903,ZNF581,NCBI
2168,ZNF785,ZNF688,ENSG00000197162,26496,146540,ZNF688,NCBI
2169,ZP4,"ZBP,ZP1,ZP1B,ZPB,ZPB2,Zp-4",ENSG00000116996,15770,57829,ZP1,NCBI
2171,ZNF446,"ZKSCAN20,ZSCAN30,ZSCAN52",ENSG00000083838,21036,55663,ZSCAN30,NCBI


how many gene records have an alias that is also a primary gene symbol of another gene?

In [2485]:
len(set(merged_alias_ap_collision_ncbi_df["NCBI_ID"]))

1735

### Make a set of primary gene symbols

In [2486]:
ncbi_gene_symbol_set = set(mini_ncbi_df["gene_symbol"])

In [2487]:
total_number_ncbi_gene_symbols = len(ncbi_gene_symbol_set)
total_number_ncbi_gene_symbols

45727

How many collisions are there? How many records are involved in one?

In [2488]:
ncbi_alias_primary_collision_set = set(
    merged_alias_ap_collision_ncbi_df["collision"]
)
len(ncbi_alias_primary_collision_set)

1490

In [2489]:
ncbi_alias_primary_collision_primary_symbol_set = set(merged_alias_ap_collision_ncbi_df["gene_symbol"])
len(ncbi_alias_primary_collision_primary_symbol_set)

1735

In [2490]:
ncbi_alias_primary_collision_gene_record_set = set(
    merged_alias_ap_collision_ncbi_df["NCBI_ID"]
)
len(ncbi_alias_primary_collision_gene_record_set)

1735

# Merge 3 sets together

In [2491]:
merged_alias_primary_collisions_df = pd.concat(
    [
        merged_alias_ap_collision_hgnc_df[
            ["gene_symbol", "alias_symbol", "collision", "source", "ENSG_ID", "HGNC_ID", "NCBI_ID"]
        ],
        merged_alias_ap_collision_ncbi_df[
            ["gene_symbol", "alias_symbol", "collision", "source", "ENSG_ID", "HGNC_ID", "NCBI_ID"]
        ],
        merged_alias_ap_collision_ensg_df[
            ["gene_symbol", "alias_symbol", "collision", "source", "ENSG_ID", "HGNC_ID", "NCBI_ID"]
        ],
    ]
)
merged_alias_primary_collisions_df

Unnamed: 0,gene_symbol,alias_symbol,collision,source,ENSG_ID,HGNC_ID,NCBI_ID
0,PPP1R12C,"DKFZP434D0412,p84,MBS85,p85,AAVS1",AAVS1,HGNC,ENSG00000125503,14947,54776
1,SOAT1,"ACAT,ACAT1",ACAT1,HGNC,ENSG00000057252,11177,6646
2,SOAT2,ACAT2,ACAT2,HGNC,ENSG00000167780,11178,8435
3,CCRL2,"HCR,CRAM-B,CKRX,CRAM-A,ACKR5",ACKR5,HGNC,ENSG00000121797,1612,9034
4,NDUFAB1,"SDAP,FASN2A,ACP,ACP1",ACP1,HGNC,ENSG00000004779,7694,4706
...,...,...,...,...,...,...,...
806,ZNF121,"D19S204,ZHC32,ZNF20",ZNF20,ENSG,ENSG00000197961,12904,7675
807,RNF141,"ZFP26,ZNF230",ZNF230,ENSG,ENSG00000110315,21159,50862
808,ZNF322P1,"ZNF322,ZNF322B",ZNF322,ENSG,ENSG00000188801,14003,0
809,ZNF106,"SH3BP3,ZFP106,ZNF474",ZNF474,ENSG,ENSG00000103994,12886,64397


In [2492]:
merged_alias_primary_collisions_df.loc[
    merged_alias_primary_collisions_df["collision"] == "CFM1"
]

Unnamed: 0,gene_symbol,alias_symbol,collision,source,ENSG_ID,HGNC_ID,NCBI_ID
215,RFLNB,"CFM1,FAM101B",CFM1,NCBI,ENSG00000183688,28705,359845


In [2495]:
merged_alias_primary_collisions_df.loc[
    merged_alias_primary_collisions_df["collision"] == "KRAS"
]

Unnamed: 0,gene_symbol,alias_symbol,collision,source,ENSG_ID,HGNC_ID,NCBI_ID
1039,NRAS,"ALPS4,CMNS,KRAS,N-ras,NCMS,NRAS1,NS6",KRAS,NCBI,ENSG00000213281,7989,4893


# Convert to csv

In [2496]:
merged_alias_primary_collisions_df.to_csv(
    "../output/merged_alias_primary_collisions_df.csv", index=False
)

In [2497]:
common_ap_collisions = (
    ncbi_alias_primary_collision_primary_symbol_set
    & hgnc_alias_primary_collision_primary_symbol_set
    & ensg_alias_primary_collision_primary_symbol_set
)
common_ap_collisions

{'ABCD1',
 'ACD',
 'ACKR2',
 'ACOD1',
 'ACTBP8',
 'ADRA1D',
 'AGXT',
 'AIFM2',
 'AKR1B1',
 'AKR1B10',
 'ALPK3',
 'AMH',
 'ANAPC2',
 'ANKRD37',
 'ANTXR1',
 'AOC1',
 'APEX1',
 'AREG',
 'ARHGAP21',
 'ARHGEF7',
 'ARID4A',
 'ART4',
 'ARTN',
 'ASIC2',
 'AURKAIP1',
 'AZIN2',
 'BANF1P1',
 'BCAT2',
 'BRIP1',
 'BTF3P11',
 'BTN3A3',
 'C1D',
 'C1QTNF1',
 'C6orf89',
 'CACNA1A',
 'CADPS',
 'CADPS2',
 'CAPN5',
 'CARD16',
 'CCL13',
 'CCL14',
 'CCL15',
 'CCM2',
 'CD200R1',
 'CDH19',
 'CDH20',
 'CDPF1',
 'CELSR1',
 'CES1',
 'CFAP73',
 'CFH',
 'CHAF1B',
 'CHAMP1',
 'CHD6',
 'CHEK2',
 'CHORDC1',
 'CLASP1',
 'CLCF1',
 'CNGB1',
 'CNKSR2',
 'CNOT6',
 'CNRIP1',
 'CNTN1',
 'COASY',
 'COPS2',
 'COPS3',
 'CORIN',
 'COX7A2L',
 'CPA4',
 'CPAMD8',
 'CPNE1',
 'CPNE2',
 'CPPED1',
 'CREB3L4',
 'CSNK2A2',
 'CSTB',
 'CTDSP2',
 'CXCL10',
 'CXXC1',
 'CYCSP5',
 'CYP11B1',
 'CYP11B2',
 'CYP21A2',
 'CYP2A6',
 'DCAF5',
 'DCBLD2',
 'DDOST',
 'DDR2',
 'DDX11',
 'DDX18',
 'DEAF1',
 'DEFA6',
 'DEPDC1B',
 'DFFB',
 'DHX8',
 'DIDO1'

In [2498]:
len(common_ap_collisions)

486