### Import 

In [865]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [866]:
def create_ap_collision_df(mini_xxxx_df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Create a df of alias-primary collision symbols 

    :param mini_xxxx_df: Processed df of gene records
    :param source: Representation of the source of the gene records
    :param split_on_character: Character that is used to seperate alias symbols in the mini_xxxx_df
    :return: A df of genes that share an alias with another gene
    """

    #Add prefix for NCBI IDs
    mini_xxxx_df["NCBI_ID"] = mini_xxxx_df["NCBI_ID"].apply(
    lambda x: f"GENE ID:{int(x)}" if pd.notna(x) and x == int(x) else f"GENE ID:{x}" if pd.notna(x) else x
    )   

    #Remove placeholder gene records (those with no primary gene symbol)
    mini_xxxx_df = mini_xxxx_df.dropna(subset=["gene_symbol"])

    #Remove placeholder LOC genes
    mini_xxxx_df = mini_xxxx_df[~mini_xxxx_df['gene_symbol'].str.startswith('LOC', na=False)]

    # mini_xxxx_df['HGNC_ID'] = mini_xxxx_df['HGNC_ID'].fillna(0).astype(int)
    # mini_xxxx_df['NCBI_ID'] = mini_xxxx_df['NCBI_ID'].fillna(0).astype(int)

    #mini_xxx_df
    ## no LOC genes (only present in NCBI)
    ## no records with no primary symbols (only present in ENSG)
    ## each alias on a separate row
    #Convert the df into a csv and save
    mini_xxxx_df.to_csv(f'../output/mini_{source.lower()}_df.csv', index=True) 

    #Make a new df where the alias symbols are merged together for each record
    merged_alias_xxxx_df = (
        mini_xxxx_df
        .copy()
        .fillna("")
        .groupby(f"{source}_ID", dropna=False)
        .agg(lambda col: ",".join(sorted(set(map(str, col)))))
        .reset_index()
    )

    #merged_alias_xxxx_df
    ##merging aliases associated with the same record (same gene record identifier) into a list
    #Convert the df into a csv and save
    merged_alias_xxxx_df.to_csv(f'../output/merged_alias_{source}_df.csv', index=True) 

    #Make a set of the primary gene symbols
    xxxx_gene_symbol_set = set(mini_xxxx_df["gene_symbol"])
    uppercased_xxxx_gene_symbol_set = {s.upper() for s in xxxx_gene_symbol_set}

    
    subset_genes_xxxx_df = mini_xxxx_df.copy()

    #Remove alias symbols that are an exact match to their respective primary gene symbol
    subset_genes_xxxx_df["alias_symbol_upper"] = subset_genes_xxxx_df["alias_symbol"].str.upper()
    subset_genes_xxxx_df["gene_symbol_upper"] = subset_genes_xxxx_df["gene_symbol"].str.upper()
    subset_genes_xxxx_df['alias_symbol_upper'] = subset_genes_xxxx_df.apply(lambda row: np.nan if row['alias_symbol_upper'] == row['gene_symbol_upper'] else row['alias_symbol_upper'], axis=1)
    subset_genes_xxxx_df = subset_genes_xxxx_df.drop(["alias_symbol_upper","gene_symbol_upper"], axis=1)

    #subset_genes_xxxx_df
    ## no aliases that match the primary gene symbol
    ## each alias on a separate row
    #Convert the df into a csv and save
    subset_genes_xxxx_df.to_csv(f'../output/subset_genes_{source}_df.csv', index=True) 

    #Create df with genes that have an alias that can be found as another gene's primary gene symbol
    ap_collision_xxxx_df = subset_genes_xxxx_df.copy()
    ap_collision_xxxx_df = ap_collision_xxxx_df.dropna(subset=["alias_symbol"])
    ap_collision_xxxx_df["alias_symbol_upper"] = ap_collision_xxxx_df["alias_symbol"].str.upper()

    #makes list of aliases into a set
    ap_collision_xxxx_df.loc[:, "alias_symbol_upper"] = ap_collision_xxxx_df["alias_symbol_upper"].apply(lambda x: {x})

    ap_collision_xxxx_df.loc[:, "collision"] = ap_collision_xxxx_df["alias_symbol_upper"].apply(
        lambda x: x & uppercased_xxxx_gene_symbol_set
    )
    ap_collision_xxxx_df = ap_collision_xxxx_df.drop("alias_symbol_upper", axis=1)
    ap_collision_xxxx_df = ap_collision_xxxx_df[ap_collision_xxxx_df["collision"].apply(lambda x: len(x) > 0)]
    #changes back the set of aliases to a list, and make the set of collisions to a list
    ap_collision_xxxx_df['collision'] = ap_collision_xxxx_df['collision'].apply(
        lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x
    )
    ap_collision_xxxx_df = ap_collision_xxxx_df.sort_values("collision")

    #Add a source tag for future merging efforts
    ap_collision_xxxx_df["source"] = str(source.upper())

    #ap_collision_xxxx_df
    ## only the gene records with an alias that matches another gene record's primary gene symbol
    ## each alias on a separate row
    #Convert the df into a csv
    ap_collision_xxxx_df.to_csv(f'../output/single_alias_ap_collision_{source.lower()}_df.csv', index=True)

    #Create a secondary collision df that merges the alias symbols for each record
    columns_map = {
        'ENSG': ['NCBI_ID', 'HGNC_ID'],
        'HGNC': ['NCBI_ID', 'ENSG_ID'],
        'NCBI': ['HGNC_ID', 'ENSG_ID']
    }

    cols_of_interest = columns_map.get(source, [])

    merged_alias_ap_collision_xxxx_df = ap_collision_xxxx_df.drop(columns=cols_of_interest + ['alias_symbol'])

    merged_alias_ap_collision_xxxx_df = pd.merge(
        merged_alias_ap_collision_xxxx_df,
        merged_alias_xxxx_df[[f"{source}_ID"] + cols_of_interest + ['alias_symbol']],
        on=[f"{source}_ID"],
        how="left"
    )
    merged_alias_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df.drop_duplicates(subset=[f"{source}_ID"])

    #Test to make sure all collisions are in the alias list for the record
    test_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df.copy()
    test_ap_collision_xxxx_df["alias_symbol_upper"] = test_ap_collision_xxxx_df["alias_symbol"].str.upper()   
    test_ap_collision_xxxx_df['alias_symbol_upper'] = test_ap_collision_xxxx_df['alias_symbol_upper'].apply(lambda x: x if isinstance(x, list) else [x])
    test_ap_collision_xxxx_df['collision_in_alias'] = test_ap_collision_xxxx_df.apply(lambda row: row['collision'] in test_ap_collision_xxxx_df['alias_symbol_upper'].values, axis=1)
    true_count = test_ap_collision_xxxx_df['collision_in_alias'].sum()
    test_ap_collision_xxxx_df = test_ap_collision_xxxx_df.drop("alias_symbol_upper", axis=1)

    if true_count == 0:
        #merged_alias_ap_collision_xxxx_df
        ## only the gene records with an alias that matches another gene record's primary gene symbol
        ## merging aliases associated with the same record (same gene record identifier) into a list
        #Convert the df into a csv and save
        merged_alias_ap_collision_xxxx_df.to_csv(f'../output/merged_alias_ap_collision_{source.lower()}_df.csv', index=True)
        print("All collisions are present in gene alias lists.")
    else:
        print("Some collisions are not present in gene alias lists.")

    return mini_xxxx_df.head()

# Ensembl

In [867]:
mini_ensg_df = pd.read_csv(
    "../input/ensg_biomart_gene20250625.txt", sep="\t",dtype={"NCBI gene (formerly Entrezgene) ID": pd.Int64Dtype()}
)
mini_ensg_df = mini_ensg_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Gene Synonym": "alias_symbol",
        "Gene name": "gene_symbol",
        "Gene stable ID": "ENSG_ID",
        "NCBI gene (formerly Entrezgene) ID": "NCBI_ID",
    }
)
mini_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,HGNC:7481,MTTF,MT-TF
1,ENSG00000210049,,HGNC:7481,TRNF,MT-TF
2,ENSG00000211459,,HGNC:7470,12S,MT-RNR1
3,ENSG00000211459,,HGNC:7470,MOTS-C,MT-RNR1
4,ENSG00000211459,,HGNC:7470,MTRNR1,MT-RNR1
...,...,...,...,...,...
133060,ENSG00000229388,,HGNC:52502,LINC01715,TAF12-DT
133061,ENSG00000289291,,,,
133062,ENSG00000274978,26824,HGNC:10108,RNU11-1,RNU11
133063,ENSG00000274978,26824,HGNC:10108,U11,RNU11


In [868]:
mini_ensg_df = mini_ensg_df.replace(" ", np.nan)
mini_ensg_df = mini_ensg_df.replace("", np.nan)
mini_ensg_df = mini_ensg_df.replace("-", np.nan)
mini_ensg_df = mini_ensg_df.replace("<NA>", np.nan)

how many gene records for the Ensembl data set before any kind of cleaning?

In [869]:
total_raw_gene_record_count_ensg = len(set(mini_ensg_df['ENSG_ID']))
total_raw_gene_record_count_ensg

86364

how many gene records have no primary gene symbol?

In [870]:
raw_no_primary_symbol_record_count_ensg = (
    mini_ensg_df[mini_ensg_df["gene_symbol"].isna()]
    .dropna(subset=["ENSG_ID"])
    ["ENSG_ID"]
    .nunique()
)
raw_no_primary_symbol_record_count_ensg

37963

how many gene records that have a primary gene symbol that is a LOC placeholder?

In [871]:
raw_loc_record_count_ensg = (
    mini_ensg_df[mini_ensg_df["gene_symbol"].str.startswith('LOC', na=False)&
    ~mini_ensg_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["ENSG_ID"])
    ["ENSG_ID"]
    .nunique()
)
raw_loc_record_count_ensg

0

In [872]:
create_ap_collision_df(mini_ensg_df, "ENSG")

All collisions are present in gene alias lists.


Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,HGNC:7481,MTTF,MT-TF
1,ENSG00000210049,,HGNC:7481,TRNF,MT-TF
2,ENSG00000211459,,HGNC:7470,12S,MT-RNR1
3,ENSG00000211459,,HGNC:7470,MOTS-C,MT-RNR1
4,ENSG00000211459,,HGNC:7470,MTRNR1,MT-RNR1


In [873]:
mini_ensg_df = pd.read_csv(
    "../output/mini_ensg_df.csv", index_col=[0])

In [874]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,HGNC:7481,MTTF,MT-TF
1,ENSG00000210049,,HGNC:7481,TRNF,MT-TF
2,ENSG00000211459,,HGNC:7470,12S,MT-RNR1
3,ENSG00000211459,,HGNC:7470,MOTS-C,MT-RNR1
4,ENSG00000211459,,HGNC:7470,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,GENE ID:85028,HGNC:30062,LINC00100,SNHG12
133059,ENSG00000197989,GENE ID:85028,HGNC:30062,PNAS-123,SNHG12
133060,ENSG00000229388,,HGNC:52502,LINC01715,TAF12-DT
133062,ENSG00000274978,GENE ID:26824,HGNC:10108,RNU11-1,RNU11


In [876]:
ap_collision_ensg_df = pd.read_csv(
    "../output/single_alias_ap_collision_ensg_df.csv", index_col=[0])
ap_collision_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol,collision,source
24761,ENSG00000283293,GENE ID:125050,HGNC:10037,7SK,RN7SK,7SK,ENSG
37365,ENSG00000057252,GENE ID:6646,HGNC:11177,ACAT1,SOAT1,ACAT1,ENSG
17666,ENSG00000167780,GENE ID:8435,HGNC:11178,ACAT2,SOAT2,ACAT2,ENSG
35063,ENSG00000004779,GENE ID:4706,HGNC:7694,ACP1,NDUFAB1,ACP1,ENSG
47647,ENSG00000220267,,HGNC:141,ACTBP2,ACTBP8,ACTBP2,ENSG
...,...,...,...,...,...,...,...
53226,ENSG00000197961,GENE ID:7675,HGNC:12904,ZNF20,ZNF121,ZNF20,ENSG
35374,ENSG00000110315,GENE ID:50862,HGNC:21159,ZNF230,RNF141,ZNF230,ENSG
96229,ENSG00000188801,,HGNC:14003,ZNF322,ZNF322P1,ZNF322,ENSG
81813,ENSG00000103994,GENE ID:64397,HGNC:12886,ZNF474,ZNF106,ZNF474,ENSG


In [877]:
merged_alias_ap_collision_ensg_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ensg_df.csv", index_col=[0])
merged_alias_ap_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,collision,source,NCBI_ID,HGNC_ID,alias_symbol
0,ENSG00000283293,RN7SK,7SK,ENSG,GENE ID:125050,HGNC:10037,7SK
1,ENSG00000057252,SOAT1,ACAT1,ENSG,GENE ID:6646,HGNC:11177,"ACAT,ACAT1,SOAT,STAT"
2,ENSG00000167780,SOAT2,ACAT2,ENSG,GENE ID:8435,HGNC:11178,ACAT2
3,ENSG00000004779,NDUFAB1,ACP1,ENSG,GENE ID:4706,HGNC:7694,"ACP,ACP1,FASN2A,SDAP"
4,ENSG00000220267,ACTBP8,ACTBP2,ENSG,,HGNC:141,ACTBP2
...,...,...,...,...,...,...,...
825,ENSG00000197961,ZNF121,ZNF20,ENSG,GENE ID:7675,HGNC:12904,"D19S204,ZHC32,ZNF20"
826,ENSG00000110315,RNF141,ZNF230,ENSG,GENE ID:50862,HGNC:21159,"ZFP26,ZNF230"
827,ENSG00000188801,ZNF322P1,ZNF322,ENSG,,HGNC:14003,"ZNF322,ZNF322B"
828,ENSG00000103994,ZNF106,ZNF474,ENSG,GENE ID:64397,HGNC:12886,"SH3BP3,ZFP106,ZNF474"


how many gene records are in the Ensembl database?

In [878]:
gene_record_set_ensg = set(mini_ensg_df['ENSG_ID'])
gene_record_count_ensg = len(gene_record_set_ensg)
gene_record_count_ensg

48401

how many unique primary gene symbols are in the Ensembl data set?

In [880]:
primary_symbol_set_ensg = set(mini_ensg_df['gene_symbol'])
primary_symbol_count_ensg = len(primary_symbol_set_ensg)
primary_symbol_count_ensg

41164

how many unique alias gene symbols are in the Ensembl data set?

In [881]:
alias_symbol_set_ensg = set(mini_ensg_df['alias_symbol'])
alias_symbol_count_ensg = len(alias_symbol_set_ensg)
alias_symbol_count_ensg

55413

how many unique gene symbols in total are there in the Ensembl data set??

In [882]:
total_symbol_count_ensg = pd.concat([mini_ensg_df["gene_symbol"], mini_ensg_df["alias_symbol"]]).dropna().nunique()
total_symbol_count_ensg

95941

how many gene records have no alias symbols?

In [883]:
no_alias_symbol_record_count_ensg = (
    mini_ensg_df[mini_ensg_df["alias_symbol"].isna()]
    .dropna(subset=["ENSG_ID"])
    ["ENSG_ID"]
    .nunique()
)
no_alias_symbol_record_count_ensg

19363

In [884]:
(no_alias_symbol_record_count_ensg/gene_record_count_ensg)*100

40.00537178983905

how many gene records have a primary gene symbol that is a C#orf?

In [885]:
orf_record_set_ensg = set(
    mini_ensg_df[
        mini_ensg_df["gene_symbol"].str.contains(r'^C.*ORF', case=False, na=False) &
        ~mini_ensg_df["gene_symbol"].str.contains("-", na=False)
    ]
    .dropna(subset=["ENSG_ID"])["ENSG_ID"]
)

orf_record_count_ensg = len(orf_record_set_ensg)
orf_record_count_ensg

288

In [886]:
(orf_record_count_ensg/gene_record_count_ensg)*100

0.595029028325861

how many gene records that have a primary gene symbol that is a FAM placeholder?

In [887]:
fam_record_set_ensg = set(
    mini_ensg_df[
        mini_ensg_df["gene_symbol"].str.startswith("FAM", na=False) &
        ~mini_ensg_df["gene_symbol"].str.contains("-", na=False)
    ]
    .dropna(subset=["ENSG_ID"])["ENSG_ID"]
)

fam_record_count_ensg = len(fam_record_set_ensg)
fam_record_count_ensg

430

In [888]:
(fam_record_count_ensg/gene_record_count_ensg)*100

0.8884113964587508

In [889]:
fam_record_set_ensg_df = mini_ensg_df[mini_ensg_df['ENSG_ID'].isin(fam_record_set_ensg)].copy()
fam_record_set_ensg_df['fam_family'] = fam_record_set_ensg_df['gene_symbol'].str.extract(r'^FAM(\d+)')
unique_families_count_ensg = fam_record_set_ensg_df['fam_family'].unique()
print("Number of unique FAM families in ENSG:", len(unique_families_count_ensg))

Number of unique FAM families in ENSG: 106


how many gene records that have a primary gene symbol that is a KIAA placeholder?

In [890]:
kiaa_record_set_ensg = set(
    mini_ensg_df[
        mini_ensg_df["gene_symbol"].str.startswith("KIAA", na=False) &
        ~mini_ensg_df["gene_symbol"].str.contains("-", na=False)
    ]
    .dropna(subset=["ENSG_ID"])["ENSG_ID"]
)

kiaa_record_count_ensg = len(kiaa_record_set_ensg)
kiaa_record_count_ensg

33

In [891]:
(kiaa_record_count_ensg/gene_record_count_ensg)*100

0.06818040949567158

which gene records share a primary symbol with other gene records?

In [892]:
# groupby name and return a boolean of whether each has more than 1 unique ENSG ID
multi_primary = mini_ensg_df.groupby(["gene_symbol"]).ENSG_ID.nunique().gt(1)

num_rows = mini_ensg_df[mini_ensg_df.gene_symbol.isin(multi_primary[multi_primary].index)].shape[0]
print(num_rows)

# use loc to only see those values that have `True` in `multi_primary`:
mini_ensg_df.loc[mini_ensg_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol').iloc[6000:6060]

25007


Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
92891,ENSG00000138641,GENE ID:8916,HGNC:4876,KIAA0032,HERC3
92878,ENSG00000287542,GENE ID:8916,,,HERC3
52117,ENSG00000273529,GENE ID:388585,HGNC:19764,BHLHB38,HES5
14696,ENSG00000197921,GENE ID:388585,HGNC:19764,BHLHB38,HES5
4416,ENSG00000280680,GENE ID:55733,HGNC:18270,MART-2,HHAT
4421,ENSG00000280680,GENE ID:55733,HGNC:18270,SKN,HHAT
4418,ENSG00000280680,GENE ID:55733,HGNC:18270,RASP,HHAT
4417,ENSG00000280680,GENE ID:55733,HGNC:18270,MART2,HHAT
43282,ENSG00000054392,GENE ID:55733,HGNC:18270,MART2,HHAT
43285,ENSG00000054392,GENE ID:55733,HGNC:18270,SKI,HHAT


How many ambiguous symbols result from alias-primary collisions?


In [893]:
ap_collision_ambiguous_symbol_set_ensg = set(merged_alias_ap_collision_ensg_df["collision"])
ap_collision_ambiguous_symbol_count_ensg = len(ap_collision_ambiguous_symbol_set_ensg)
ap_collision_ambiguous_symbol_count_ensg

623

How many records have at least one alias-primary collision (alias that matches another record's primary gene symbol)?

In [894]:
ap_record_set_ensg = set(merged_alias_ap_collision_ensg_df["ENSG_ID"])
ap_record_count_ensg = len(ap_record_set_ensg)
ap_record_count_ensg

785

1. Why is the alias-gene collision set not the same length as the set of primary symbols with collisions ?
2. Why is the length of the alias-gene collison set shorter?
 - A gene record with an alias-primary collision has an alias that matches a different gene's primary gene symbol.
 - Multiple gene records can share a single alias (alias-alias collision)
 - If that shared alias is an alias-primary collision, then there will be more unique gene symbols in the set of primary symbols with collisions than the set of alias-primary collisions. 

# HGNC

## Set up table

In [895]:
hgnc_file_path = "../input/hgnc_biomart_gene20250625.txt"

mini_hgnc_df = pd.read_csv(
    hgnc_file_path, sep="\t"
)

# Rename columns
mini_hgnc_df = mini_hgnc_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Approved symbol": "gene_symbol",
        "Ensembl gene ID": "ENSG_ID",
    }
)

#structure and labeling in HGNC download files changed between 2024 amd 2025
if "Alias symbol" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"Alias symbol": "alias_symbol"})
elif "Alias symbols" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"Alias symbols": "alias_symbol"})
else:
    mini_hgnc_df["alias_symbol"] = pd.NA

if "NCBI gene ID" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"NCBI gene ID": "NCBI_ID"})
elif "NCBI Gene ID" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"NCBI Gene ID": "NCBI_ID"})
else:
    mini_hgnc_df["NCBI_ID"] = pd.NA   

mini_hgnc_df["NCBI_ID"] = mini_hgnc_df["NCBI_ID"].astype(pd.Int64Dtype())

# Extract date from filename and check if it is before June 25, 2025
match = re.search(r'(\d{8})', hgnc_file_path)
file_date = datetime.strptime(match.group(1), "%Y%m%d") if match else None
cutoff_date = datetime.strptime("20250625", "%Y%m%d")

# Apply list-splitting logic for newer files
if file_date and file_date >= cutoff_date:
    mini_hgnc_df['alias_symbol'] = (
        mini_hgnc_df['alias_symbol']
        .fillna('')
        .str.split(',')
        .apply(lambda x: [a.strip() for a in x if a.strip()])
    )
    mini_hgnc_df = mini_hgnc_df.explode('alias_symbol')

mini_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,HGNC:100,ASIC1,BNaC2,41,ENSG00000110881
0,HGNC:100,ASIC1,hBNaC2,41,ENSG00000110881
1,HGNC:10000,RGS4,,5999,ENSG00000117152
2,HGNC:10001,RGS5,,8490,ENSG00000143248
3,HGNC:10002,RGS6,,9628,ENSG00000182732
...,...,...,...,...,...
44232,HGNC:9997,RGS16,RGS-r,6004,ENSG00000143333
44233,HGNC:9998,RGS2,,5997,ENSG00000116741
44234,HGNC:9999,RGS3,C2PA,5998,ENSG00000138835
44234,HGNC:9999,RGS3,FLJ20370,5998,ENSG00000138835


In [896]:
mini_hgnc_df = mini_hgnc_df.replace(" ", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("-", np.nan)

how many gene records for the HGNC data set before any kind of cleaning?

In [897]:
total_raw_gene_record_set_hgnc = set(mini_hgnc_df['HGNC_ID'])
total_raw_gene_record_count_hgnc = len(total_raw_gene_record_set_hgnc)
total_raw_gene_record_count_hgnc

44235

how many gene records have no primary gene symbol?

In [898]:
raw_no_primary_symbol_record_count_hgnc = (
    mini_hgnc_df[mini_hgnc_df["gene_symbol"].isna()]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
    .nunique()
)
raw_no_primary_symbol_record_count_hgnc

0

how many gene records that have a primary gene symbol that is a LOC placeholder?

In [899]:
raw_loc_record_count_hgnc = (
    mini_hgnc_df[mini_hgnc_df["gene_symbol"].str.startswith('LOC', na=False)&
    ~mini_hgnc_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
    .nunique()
)
raw_loc_record_count_hgnc

0

In [900]:
create_ap_collision_df(mini_hgnc_df, "HGNC")

All collisions are present in gene alias lists.


Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,HGNC:100,ASIC1,BNaC2,GENE ID:41,ENSG00000110881
0,HGNC:100,ASIC1,hBNaC2,GENE ID:41,ENSG00000110881
1,HGNC:10000,RGS4,,GENE ID:5999,ENSG00000117152
2,HGNC:10001,RGS5,,GENE ID:8490,ENSG00000143248
3,HGNC:10002,RGS6,,GENE ID:9628,ENSG00000182732


In [901]:
mini_hgnc_df = pd.read_csv(
    "../output/mini_hgnc_df.csv", index_col=[0])

In [902]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])
subset_genes_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID
0,HGNC:100,ASIC1,BNaC2,GENE ID:41,ENSG00000110881
0,HGNC:100,ASIC1,hBNaC2,GENE ID:41,ENSG00000110881
1,HGNC:10000,RGS4,,GENE ID:5999,ENSG00000117152
2,HGNC:10001,RGS5,,GENE ID:8490,ENSG00000143248
3,HGNC:10002,RGS6,,GENE ID:9628,ENSG00000182732
...,...,...,...,...,...
44232,HGNC:9997,RGS16,RGS-r,GENE ID:6004,ENSG00000143333
44233,HGNC:9998,RGS2,,GENE ID:5997,ENSG00000116741
44234,HGNC:9999,RGS3,C2PA,GENE ID:5998,ENSG00000138835
44234,HGNC:9999,RGS3,FLJ20370,GENE ID:5998,ENSG00000138835


In [903]:
merged_alias_ap_collision_hgnc_df = pd.read_csv(
    "../output/merged_alias_ap_collision_hgnc_df.csv", index_col=[0])
merged_alias_ap_collision_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,collision,source,NCBI_ID,ENSG_ID,alias_symbol
0,HGNC:14947,PPP1R12C,AAVS1,HGNC,GENE ID:54776,ENSG00000125503,"AAVS1,DKFZP434D0412,MBS85,p84,p85"
1,HGNC:11177,SOAT1,ACAT1,HGNC,GENE ID:6646,ENSG00000057252,"ACAT,ACAT1"
2,HGNC:11178,SOAT2,ACAT2,HGNC,GENE ID:8435,ENSG00000167780,ACAT2
3,HGNC:1612,CCRL2,ACKR5,HGNC,GENE ID:9034,ENSG00000121797,"ACKR5,CKRX,CRAM-A,CRAM-B,HCR"
4,HGNC:7694,NDUFAB1,ACP1,HGNC,GENE ID:4706,ENSG00000004779,"ACP,ACP1,FASN2A,SDAP"
...,...,...,...,...,...,...,...
571,HGNC:3147,ECEL1,XCE,HGNC,GENE ID:9427,ENSG00000171551,"DINE,XCE"
572,HGNC:12904,ZNF121,ZNF20,HGNC,GENE ID:7675,ENSG00000197961,"ZHC32,ZNF20"
573,HGNC:21159,RNF141,ZNF230,HGNC,GENE ID:50862,ENSG00000110315,"ZFP26,ZNF230"
574,HGNC:12886,ZNF106,ZNF474,HGNC,GENE ID:64397,ENSG00000103994,"SH3BP3,ZNF474"


how many gene records are in the HGNC database?

In [904]:
gene_record_set_hgnc = set(mini_hgnc_df['HGNC_ID'])
gene_record_count_hgnc = len(gene_record_set_hgnc)
gene_record_count_hgnc

44235

how many unique primary gene symbols are in the HGNC data set?

In [905]:
primary_symbol_set_hgnc = set(mini_hgnc_df['gene_symbol'])
primary_symbol_count_hgnc = len(primary_symbol_set_hgnc)
primary_symbol_count_hgnc

44235

how many unique alias gene symbols are in the HGNC data set?

In [906]:
alias_symbol_set_hgnc = set(mini_hgnc_df['alias_symbol'])
alias_symbol_count_hgnc = len(alias_symbol_set_hgnc)
alias_symbol_count_hgnc

43075

how many unique gene symbols in total are there in the HGNC data set??

In [907]:
total_symbol_count_hgnc = len(set(mini_hgnc_df['gene_symbol']) | set(mini_hgnc_df['alias_symbol']))
total_symbol_count_hgnc

86842

how many gene records have no alias symbols?

In [908]:
no_alias_symbol_record_count_hgnc = (
    mini_hgnc_df[mini_hgnc_df["alias_symbol"].isna()]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
    .nunique()
)
no_alias_symbol_record_count_hgnc

21768

In [909]:
(no_alias_symbol_record_count_hgnc/gene_record_count_ensg)*100

44.974277390962996

how many gene records have a primary gene symbol that is a C#orf?

In [910]:
orf_record_set_hgnc = set(
    mini_hgnc_df[
        mini_hgnc_df["gene_symbol"].str.contains(r'^C.*ORF', case=False, na=False) &
        ~mini_hgnc_df["gene_symbol"].str.contains("-", na=False)
    ]
    .dropna(subset=["HGNC_ID"])["HGNC_ID"]
)

orf_record_count_hgnc = len(orf_record_set_hgnc)
orf_record_count_hgnc

246

In [911]:
(orf_record_count_hgnc/gene_record_count_ensg)*100

0.5082539616950063

how many gene records that have a primary gene symbol that is a FAM placeholder?

In [912]:
fam_record_set_hgnc = set(
    mini_hgnc_df[mini_hgnc_df["gene_symbol"].str.startswith('FAM', na=False)&
    ~mini_hgnc_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
)
fam_record_count_hgnc = len(fam_record_set_hgnc)
fam_record_count_hgnc

371

In [913]:
(fam_record_count_hgnc/gene_record_count_ensg)*100

0.7665130885725502

In [914]:
fam_record_set_hgnc_df = mini_hgnc_df[mini_hgnc_df['HGNC_ID'].isin(fam_record_set_hgnc)].copy()
fam_record_set_hgnc_df['fam_family'] = fam_record_set_hgnc_df['gene_symbol'].str.extract(r'^FAM(\d+)')
unique_families_count_hgnc = fam_record_set_hgnc_df['fam_family'].unique()
print("Number of unique FAM families in HGNC:", len(unique_families_count_hgnc))

Number of unique FAM families in HGNC: 107


how many gene records that have a primary gene symbol that is a KIAA placeholder?

In [915]:
kiaa_record_set_hgnc = set(
    mini_hgnc_df[mini_hgnc_df["gene_symbol"].str.startswith('KIAA', na=False)&
    ~mini_hgnc_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["HGNC_ID"])
    ["HGNC_ID"]
)
kiaa_record_count_hgnc = len(kiaa_record_set_hgnc)
kiaa_record_count_hgnc

32

In [916]:
(kiaa_record_count_hgnc/gene_record_count_ensg)*100

0.06611433648065122

which gene records share a primary symbol with other gene records?

In [917]:
# groupby name and return a boolean of whether each has more than 1 unique Country
multi_primary = mini_hgnc_df.groupby(["gene_symbol"]).HGNC_ID.nunique().gt(1)

num_rows = mini_hgnc_df[mini_hgnc_df.gene_symbol.isin(multi_primary[multi_primary].index)].shape[0]
print(num_rows)

# use loc to only see those values that have `True` in `multi_country`:
multi_record_same_symbol_df = mini_hgnc_df.loc[mini_hgnc_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol')
multi_record_same_symbol_df.head(60)

0


Unnamed: 0,HGNC_ID,gene_symbol,alias_symbol,NCBI_ID,ENSG_ID


How many ambiguous symbols result from alias-primary collisions?

In [918]:
ap_collision_ambiguous_symbol_set_hgnc = set(
    merged_alias_ap_collision_hgnc_df["collision"]
)
ap_collision_ambiguous_symbol_count_hgnc = len(ap_collision_ambiguous_symbol_set_hgnc)
ap_collision_ambiguous_symbol_count_hgnc

507

How many records have at least one alias-primary collision (alias that matches another record's primary gene symbol)?

In [919]:
ap_record_set_hgnc = set(merged_alias_ap_collision_hgnc_df["HGNC_ID"])
ap_record_count_hgnc = len(ap_record_set_hgnc)
ap_record_count_hgnc

562

# NCBI Gene

In [920]:
ncbi_file_path = "../input/Homo_sapiens.gene_info20250625"

mini_ncbi_df = pd.read_csv(ncbi_file_path, sep="\t")
# Drop all columns besides ENSG_ID, gene_symbol, and alias_symbol
mini_ncbi_df = mini_ncbi_df[
["GeneID", "Symbol", "Synonyms", "dbXrefs"]
]
mini_ncbi_df = mini_ncbi_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Synonyms": "alias_symbol"})


Split dbXrefs into individual columns

In [921]:
mini_ncbi_df = mini_ncbi_df.assign(
    MIM=np.nan,
    HGNC_ID=np.nan,
    ENSG_ID=np.nan,
    AllianceGenome=np.nan,
    MIRbase=np.nan,
    IMGTgene_db=np.nan,
    dash=np.nan,
    unknown=np.nan,
)

In [922]:
index_pos = 0

print(len(mini_ncbi_df))
while index_pos < len(mini_ncbi_df):
    xrefs = mini_ncbi_df["dbXrefs"][index_pos].split("|")

    for xref in xrefs:
        xref = xref.lower()
        if xref.startswith("mim:"):
            xref = xref.replace("mim:", "")
            mini_ncbi_df["MIM"][index_pos] = xref
        elif xref.startswith("hgnc:hgnc:"):
            xref = xref.replace("hgnc:hgnc:", "")
            mini_ncbi_df["HGNC_ID"][index_pos] = xref
        elif xref.startswith("ensembl:"):
            xref = xref.replace("ensembl:", "")
            mini_ncbi_df["ENSG_ID"][index_pos] = xref
        elif xref.startswith("alliancegenome:"):
            xref = xref.replace("alliancegenome:", "")
            mini_ncbi_df["AllianceGenome"][index_pos] = xref
        elif xref.startswith("mirbase"):
            xref = xref.replace("mirbase:", "")
            mini_ncbi_df["MIRbase"][index_pos] = xref
        elif xref.startswith("imgt/gene-db:"):
            xref = xref.replace("imgt/gene-db:", "")
            mini_ncbi_df["IMGTgene_db"][index_pos] = xref
        elif xref.startswith("-"):
            mini_ncbi_df["dash"][index_pos] = xref
        else:
            mini_ncbi_df["unknown"][index_pos] = xref

    index_pos += 1
    pass

print(index_pos)

193580


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  mini_ncbi_df["MIM"][index_pos] = xref
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["MIM"][index

193580


In [923]:
mini_ncbi_df["ENSG_ID"] = mini_ncbi_df["ENSG_ID"].str.replace("ensg", "ENSG", 1)

In [924]:
mini_ncbi_df = mini_ncbi_df.drop(
    [
        "AllianceGenome",
        "MIRbase",
        "IMGTgene_db",
        "dash",
        "unknown",
        "dbXrefs",
        "MIM",
    ],
    axis=1,
)

In [925]:
# Extract date from filename and check if it is before June 25, 2025
match = re.search(r'(\d{8})', ncbi_file_path)
file_date = datetime.strptime(match.group(1), "%Y%m%d") if match else None
cutoff_date = datetime.strptime("20250625", "%Y%m%d")

# Apply list-splitting logic for newer files
if file_date and file_date >= cutoff_date:
    mini_ncbi_df['alias_symbol'] = (
        mini_ncbi_df['alias_symbol']
        .fillna('')
        .str.split('|')
        .apply(lambda x: [a.strip() for a in x if a.strip()])
    )
    mini_ncbi_df = mini_ncbi_df.explode('alias_symbol')
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5,ENSG00000121410
0,1,A1BG,ABG,5,ENSG00000121410
0,1,A1BG,GAB,5,ENSG00000121410
0,1,A1BG,HYST2477,5,ENSG00000121410
1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...
193575,8923215,trnD,-,,
193576,8923216,trnP,-,,
193577,8923217,trnA,-,,
193578,8923218,COX1,-,,


In [926]:
mini_ncbi_df = mini_ncbi_df.replace(" ", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("-", np.nan)

how many gene records for the NCBI data set before any kind of cleaning?

In [927]:
total_raw_gene_record_count_ncbi = mini_ncbi_df['NCBI_ID'].dropna().nunique()
total_raw_gene_record_count_ncbi

193580

how many gene records have the "phenotype only" property?

In [928]:
# This file (ncbi_records_w_property_pheno_only_20250630.txt) was generated through the NCBI Gene web page by 
# searching for these parameters "phenotype only [Properties] AND "Homo sapiens"[porgn:__txid9606]"
ncbi_pheno_records_df = pd.read_csv("../input/ncbi_records_w_property_pheno_only_20250630.txt", sep="\t")

ncbi_pheno_records_df = ncbi_pheno_records_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Aliases": "alias_symbol"})

In [929]:
len(set(ncbi_pheno_records_df["NCBI_ID"]))

1170

In [930]:
# need to drop them
mini_ncbi_df = mini_ncbi_df[~mini_ncbi_df['NCBI_ID'].isin(ncbi_pheno_records_df['NCBI_ID'])]

how many gene records have no primary gene symbol?

In [931]:
raw_no_primary_symbol_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].isna()]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
raw_no_primary_symbol_record_count_ncbi

0

In [932]:
(raw_no_primary_symbol_record_count_ncbi/total_raw_gene_record_count_ncbi)*100

0.0

how many gene records that have a primary gene symbol that is a LOC placeholder?

In [933]:
raw_loc_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].str.startswith('LOC', na=False)&
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
raw_loc_record_count_ncbi

147698

how many gene records are sub species records: "homo sapiens ssp denisova"[Organism]

In [934]:
ncbi_denisova_records_df = pd.read_csv("../input/ncbi_denisova_20250630.txt", sep="\t")

ncbi_denisova_records_df = ncbi_denisova_records_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Aliases": "alias_symbol"})

In [935]:
len(set(ncbi_denisova_records_df["NCBI_ID"]))

36

In [936]:
mini_ncbi_df = mini_ncbi_df[~mini_ncbi_df['NCBI_ID'].isin(ncbi_denisova_records_df['NCBI_ID'])]

Drop sub species records: "homo sapiens neanderthalensis"[Organism] 

In [937]:
ncbi_neanderthalensis_records_df = pd.read_csv("../input/ncbi_neanderthalensis_20250630.txt", sep="\t")

ncbi_neanderthalensis_records_df = ncbi_neanderthalensis_records_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Aliases": "alias_symbol"})

In [938]:
len(set(ncbi_neanderthalensis_records_df["NCBI_ID"]))

37

In [939]:
mini_ncbi_df = mini_ncbi_df[~mini_ncbi_df['NCBI_ID'].isin(ncbi_neanderthalensis_records_df['NCBI_ID'])]

In [940]:
mini_ncbi_df["HGNC_ID"] = mini_ncbi_df["HGNC_ID"].apply(
    lambda x: f"HGNC:{int(x)}" if pd.notna(x) and x == int(x) else f"HGNC:{x}" if pd.notna(x) else x
    ) 

In [941]:
create_ap_collision_df(mini_ncbi_df, "NCBI")

All collisions are present in gene alias lists.


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,GENE ID:1,A1BG,A1B,HGNC:5,ENSG00000121410
0,GENE ID:1,A1BG,ABG,HGNC:5,ENSG00000121410
0,GENE ID:1,A1BG,GAB,HGNC:5,ENSG00000121410
0,GENE ID:1,A1BG,HYST2477,HGNC:5,ENSG00000121410
1,GENE ID:2,A2M,A2MD,HGNC:7,ENSG00000175899


In [942]:
mini_ncbi_df = pd.read_csv(
    "../output/mini_ncbi_df.csv", index_col=[0])

In [943]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])
subset_genes_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,GENE ID:1,A1BG,A1B,HGNC:5,ENSG00000121410
0,GENE ID:1,A1BG,ABG,HGNC:5,ENSG00000121410
0,GENE ID:1,A1BG,GAB,HGNC:5,ENSG00000121410
0,GENE ID:1,A1BG,HYST2477,HGNC:5,ENSG00000121410
1,GENE ID:2,A2M,A2MD,HGNC:7,ENSG00000175899
...,...,...,...,...,...
193502,GENE ID:141732005,ADCY2-AS1,,HGNC:40064,
193503,GENE ID:141732006,NSG2-AS1,,HGNC:41074,
193504,GENE ID:141732007,ST18-AS1,,HGNC:58430,
193505,GENE ID:141732008,MICAL2-AS1,,HGNC:58437,


In [944]:
merged_alias_ap_collision_ncbi_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ncbi_df.csv", index_col=[0])
merged_alias_ap_collision_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,collision,source,HGNC_ID,ENSG_ID,alias_symbol
0,GENE ID:3494,IGHA2,A2M,NCBI,HGNC:5479,ENSG00000211890,A2M
1,GENE ID:54776,PPP1R12C,AAVS1,NCBI,HGNC:14947,ENSG00000125503,"AAVS1,LENG3,MBS85,p84,p85"
2,GENE ID:6646,SOAT1,ACAT1,NCBI,HGNC:11177,ENSG00000057252,"ACACT,ACAT,ACAT-1,ACAT1,SOAT,STAT"
3,GENE ID:8435,SOAT2,ACAT2,NCBI,HGNC:11178,ENSG00000167780,"ACACT2,ACAT2,ARGP2"
4,GENE ID:9034,CCRL2,ACKR5,NCBI,HGNC:1612,ENSG00000121797,"ACKR5,CKRX,CRAM,CRAM-A,CRAM-B,HCR"
...,...,...,...,...,...,...,...
2032,GENE ID:29903,CCDC106,ZNF581,NCBI,HGNC:30181,ENSG00000173581,"HSU79303,ZNF581"
2033,GENE ID:146540,ZNF785,ZNF688,NCBI,HGNC:26496,ENSG00000197162,ZNF688
2034,GENE ID:57829,ZP4,ZP1,NCBI,HGNC:15770,ENSG00000116996,"ZBP,ZP1,ZP1B,ZPB,ZPB2,Zp-4"
2036,GENE ID:55663,ZNF446,ZSCAN30,NCBI,HGNC:21036,ENSG00000083838,"ZKSCAN20,ZSCAN30,ZSCAN52"


how many gene records are in the NCBI database?

In [945]:
gene_record_set_ncbi = set(mini_ncbi_df['NCBI_ID'])
gene_record_count_ncbi = len(gene_record_set_ncbi)
gene_record_count_ncbi

44639

how many unique primary gene symbols are in the NCBI data set?

In [946]:
primary_symbol_set_ncbi = set(mini_ncbi_df['gene_symbol'])
primary_symbol_count_ncbi = len(primary_symbol_set_ncbi)
primary_symbol_count_ncbi

44541

how many unique alias gene symbols are in the NCBI data set?

In [947]:
alias_symbol_set_ncbi = set(mini_ncbi_df['alias_symbol'])
alias_symbol_count_ncbi = len(alias_symbol_set_ncbi)
alias_symbol_count_ncbi

69227

how many unique gene symbols in total are there in the NCBI data set??

In [948]:
total_symbol_count_ncbi = pd.concat([mini_ncbi_df["gene_symbol"], mini_ncbi_df["alias_symbol"]]).dropna().nunique()
total_symbol_count_ncbi

112294

how many gene records have no alias symbols?

In [949]:
no_alias_symbol_record_count_ncbi = (
    mini_ncbi_df[mini_ncbi_df["alias_symbol"].isna()]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
    .nunique()
)
no_alias_symbol_record_count_ncbi

17380

In [950]:
(no_alias_symbol_record_count_ncbi/gene_record_count_ncbi)*100

38.934563946324964

how many gene records have a primary gene symbol that is a C#orf?

In [951]:
orf_record_set_ncbi = set(mini_ncbi_df[
    mini_ncbi_df['gene_symbol']
    .str.contains(r'^C.*ORF', case=False, na=False) &
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)
])
orf_record_count_ncbi = len(orf_record_set_ncbi)
orf_record_count_ncbi

5

In [952]:
(orf_record_count_ncbi/gene_record_count_ncbi)*100

0.011200967763614776

how many gene records that have a primary gene symbol that is a FAM placeholder?

In [953]:
fam_record_set_ncbi = set(
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].str.startswith('FAM', na=False)&
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
)
fam_record_count_ncbi = len(fam_record_set_ncbi)
fam_record_count_ncbi

371

In [954]:
(fam_record_count_ncbi/gene_record_count_ncbi)*100

0.8311118080602165

In [955]:
fam_record_set_ncbi_df = mini_ncbi_df[mini_ncbi_df['NCBI_ID'].isin(fam_record_set_ncbi)].copy()
fam_record_set_ncbi_df['fam_family'] = fam_record_set_ncbi_df['gene_symbol'].str.extract(r'^FAM(\d+)')
unique_families_count_ncbi = fam_record_set_ncbi_df['fam_family'].unique()
print("Number of unique FAM families in NCBI:", len(unique_families_count_ncbi))

Number of unique FAM families in NCBI: 107


how many gene records that have a primary gene symbol that is a KIAA placeholder?

In [956]:
kiaa_record_set_ncbi = set(
    mini_ncbi_df[mini_ncbi_df["gene_symbol"].str.startswith('KIAA', na=False)&
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)]
    .dropna(subset=["NCBI_ID"])
    ["NCBI_ID"]
)
kiaa_record_count_ncbi = len(kiaa_record_set_ncbi)
kiaa_record_count_ncbi

33

In [957]:
(kiaa_record_count_ncbi/gene_record_count_ncbi)*100

0.07392638723985752

which gene records share a primary symbol with other gene records?

In [958]:
# groupby name and return a boolean of whether each has more than 1 unique Country
multi_primary = mini_ncbi_df.groupby(["gene_symbol"]).NCBI_ID.nunique().gt(1)

num_rows = mini_ncbi_df[mini_ncbi_df.gene_symbol.isin(multi_primary[multi_primary].index)].shape[0]
print(num_rows)

# use loc to only see those values that have `True` in `multi_country`:
multi_record_same_symbol_df = mini_ncbi_df.loc[mini_ncbi_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol')
multi_record_same_symbol_df.head(60)

110


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
3550,GENE ID:4549,RNR1,MTRNR1,HGNC:7470,
4762,GENE ID:6052,RNR1,,HGNC:10082,
3551,GENE ID:4550,RNR2,MTRNR2,HGNC:7471,
4763,GENE ID:6053,RNR2,,HGNC:10083,
65892,GENE ID:124901565,TRNAA-AGC,,,
65891,GENE ID:124901564,TRNAA-AGC,,,
65890,GENE ID:124901563,TRNAA-AGC,,,
65889,GENE ID:124901562,TRNAA-AGC,,,
70782,GENE ID:124906586,TRNAA-AGC,,,
65888,GENE ID:124901561,TRNAA-AGC,,,


In [959]:
multi_record_same_symbol_df.iloc[60:110]

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
70154,GENE ID:124905918,TRNAG-GCC,,,
70157,GENE ID:124905921,TRNAG-GCC,,,
70159,GENE ID:124905923,TRNAG-GCC,,,
70089,GENE ID:124905853,TRNAG-GCC,,,
70161,GENE ID:124905925,TRNAG-GCC,,,
70163,GENE ID:124905927,TRNAG-GCC,,,
70165,GENE ID:124905929,TRNAG-GCC,,,
70167,GENE ID:124905931,TRNAG-GCC,,,
70169,GENE ID:124905933,TRNAG-GCC,,,
70087,GENE ID:124905851,TRNAG-GCC,,,


How many ambiguous symbols result from alias-primary collisions?

In [960]:
ap_collision_ambiguous_symbol_set_ncbi = set(
    merged_alias_ap_collision_ncbi_df["collision"]
)
ap_collision_ambiguous_symbol_count_ncbi = len(ap_collision_ambiguous_symbol_set_ncbi)
ap_collision_ambiguous_symbol_count_ncbi

1394

How many records have at least one alias-primary collision (alias that matches another record's primary gene symbol)?

In [961]:
ap_record_set_ncbi = set(merged_alias_ap_collision_ncbi_df["NCBI_ID"])
ap_record_count_ncbi = len(ap_record_set_ncbi)
ap_record_count_ncbi

1614

# Merge 3 sets together

In [962]:
merged_alias_primary_collisions_df = pd.concat(
    [
        merged_alias_ap_collision_hgnc_df[
            ["gene_symbol", "alias_symbol", "collision", "source", "ENSG_ID", "HGNC_ID", "NCBI_ID"]
        ],
        merged_alias_ap_collision_ncbi_df[
            ["gene_symbol", "alias_symbol", "collision", "source", "ENSG_ID", "HGNC_ID", "NCBI_ID"]
        ],
        merged_alias_ap_collision_ensg_df[
            ["gene_symbol", "alias_symbol", "collision", "source", "ENSG_ID", "HGNC_ID", "NCBI_ID"]
        ],
    ]
)
merged_alias_primary_collisions_df

Unnamed: 0,gene_symbol,alias_symbol,collision,source,ENSG_ID,HGNC_ID,NCBI_ID
0,PPP1R12C,"AAVS1,DKFZP434D0412,MBS85,p84,p85",AAVS1,HGNC,ENSG00000125503,HGNC:14947,GENE ID:54776
1,SOAT1,"ACAT,ACAT1",ACAT1,HGNC,ENSG00000057252,HGNC:11177,GENE ID:6646
2,SOAT2,ACAT2,ACAT2,HGNC,ENSG00000167780,HGNC:11178,GENE ID:8435
3,CCRL2,"ACKR5,CKRX,CRAM-A,CRAM-B,HCR",ACKR5,HGNC,ENSG00000121797,HGNC:1612,GENE ID:9034
4,NDUFAB1,"ACP,ACP1,FASN2A,SDAP",ACP1,HGNC,ENSG00000004779,HGNC:7694,GENE ID:4706
...,...,...,...,...,...,...,...
825,ZNF121,"D19S204,ZHC32,ZNF20",ZNF20,ENSG,ENSG00000197961,HGNC:12904,GENE ID:7675
826,RNF141,"ZFP26,ZNF230",ZNF230,ENSG,ENSG00000110315,HGNC:21159,GENE ID:50862
827,ZNF322P1,"ZNF322,ZNF322B",ZNF322,ENSG,ENSG00000188801,HGNC:14003,
828,ZNF106,"SH3BP3,ZFP106,ZNF474",ZNF474,ENSG,ENSG00000103994,HGNC:12886,GENE ID:64397


In [963]:
merged_alias_primary_collisions_df.loc[
    merged_alias_primary_collisions_df["collision"] == "KRAS"
]

Unnamed: 0,gene_symbol,alias_symbol,collision,source,ENSG_ID,HGNC_ID,NCBI_ID
964,NRAS,"ALPS4,CMNS,KRAS,N-ras,NCMS,NRAS1,NS6",KRAS,NCBI,ENSG00000213281,HGNC:7989,GENE ID:4893


# Convert to csv

In [964]:
merged_alias_primary_collisions_df.to_csv(
    "../output/merged_alias_primary_collisions_df.csv", index=False
)

In [965]:
common_ap_collision_ambiguous_symbol_set = (
    ap_collision_ambiguous_symbol_set_ncbi
    & ap_collision_ambiguous_symbol_set_hgnc
    & ap_collision_ambiguous_symbol_set_ensg
)
common_ap_collision_ambiguous_symbol_set

{'ACAT1',
 'ACAT2',
 'ACP1',
 'ACTBP2',
 'ADA2',
 'ADRA1A',
 'AGT',
 'AIP',
 'ALB',
 'AMN',
 'APC2',
 'AR',
 'ARC',
 'ARG1',
 'ARHGAP10',
 'ARL1',
 'ARSB',
 'ASIP',
 'ATF1',
 'ATR',
 'AVP',
 'B3GNT8',
 'BACH1',
 'BAP1',
 'BCAM',
 'BCRP1',
 'BCRP2',
 'BDP1',
 'BNC1',
 'BRAP',
 'BRCC3',
 'BRI3',
 'BST1',
 'BTF3',
 'C6',
 'C7',
 'CA11',
 'CABP1',
 'CAD',
 'CAMP',
 'CAP1',
 'CAP2',
 'CAPS',
 'CAPS2',
 'CARF',
 'CAST',
 'CBLC',
 'CCR10',
 'CCR4',
 'CDH1',
 'CDH20',
 'CDH7',
 'CDKN1A',
 'CDS1',
 'CES2',
 'CHD5',
 'CHL1',
 'CHP1',
 'CIC',
 'CKLF',
 'CLC',
 'CLCP1',
 'CLIP4',
 'CNP',
 'CNR1',
 'CNR2',
 'COP1',
 'CPA3',
 'CPA6',
 'CPD',
 'CPN1',
 'CPN2',
 'CPS1',
 'CREB3',
 'CRIP1',
 'CRP',
 'CSN2',
 'CSN3',
 'CSNK2A1',
 'CST6',
 'CSTP1',
 'CTAGE4',
 'CTH',
 'DAO',
 'DAP',
 'DAP3',
 'DBI',
 'DBP',
 'DEF6',
 'DIO1',
 'DLC1',
 'DMC1',
 'DMP1',
 'DOK1',
 'DRC3',
 'DRG1',
 'DRP2',
 'DSC1',
 'DUSP1',
 'DUSP23',
 'DUSP26',
 'EAF2',
 'EBP',
 'ECD',
 'EIF2A',
 'ELF1',
 'ELK1',
 'ELN',
 'EMB',
 'EPO',
 

In [966]:
len(common_ap_collision_ambiguous_symbol_set)

434

In [967]:
mini_ncbi_df[mini_ncbi_df['gene_symbol'].str.contains('@', na=False)].nunique()


NCBI_ID         11
gene_symbol     11
alias_symbol    22
HGNC_ID          7
ENSG_ID          0
dtype: int64

In [968]:
mini_ncbi_df[mini_ncbi_df['gene_symbol'].str.contains('@', na=False)]

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
2585,GENE ID:3197,HOXA@,HOX1@,HGNC:5098,
2598,GENE ID:3210,HOXB@,HOX2@,HGNC:5110,
2608,GENE ID:3220,HOXC@,HOX3@,HGNC:5121,
2618,GENE ID:3230,HOXD@,HOX4@,HGNC:5131,
2764,GENE ID:3438,IFN1@,IFNA,,
2811,GENE ID:3496,IGHDOR15@,IGD2,,
2811,GENE ID:3496,IGHDOR15@,IGHD/OR15,,
2811,GENE ID:3496,IGHDOR15@,IGHDY2,,
2828,GENE ID:3519,IGKV@,IGKV,,
2828,GENE ID:3519,IGKV@,IGKV1,,


In [969]:
mini_hgnc_df[mini_hgnc_df['gene_symbol'].str.contains('@', na=False)].nunique()

HGNC_ID         7
gene_symbol     7
alias_symbol    4
NCBI_ID         3
ENSG_ID         0
dtype: int64