### Import 

In [1026]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [1027]:
def create_ap_collision_df(mini_xxxx_df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Create a df of alias-primary collision symbols 

    :param mini_xxxx_df: Processed df of gene records
    :param source: Representation of the source of the gene records
    :param split_on_character: Character that is used to seperate alias symbols in the mini_xxxx_df
    :return: A df of genes that share an alias with another gene
    """

    #Strip "HGNC" from HGNC ID
    mini_xxxx_df["HGNC_ID"] = mini_xxxx_df["HGNC_ID"].str.lstrip("HGNC:")

    #Remove placeholder gene records (those with no primary gene symbol)
    mini_xxxx_df = mini_xxxx_df.replace(" ", np.nan)
    mini_xxxx_df = mini_xxxx_df.replace("", np.nan)
    mini_xxxx_df = mini_xxxx_df.replace("-", np.nan)
    mini_xxxx_df = mini_xxxx_df.dropna(subset=["gene_symbol"])

    #Remove placeholder LOC genes
    mini_xxxx_df = mini_xxxx_df[~mini_xxxx_df['gene_symbol'].str.startswith('LOC', na=False)]

    #mini_xxx_df
    ## no LOC genes (only present in NCBI)
    ## no records with no primary symbols (only present in ENSG)
    #Convert the df into a csv and save
    mini_xxxx_df.to_csv(f'../output/mini_{source.lower()}_df.csv', index=True) 

    #Make a new df where the alias symbols are merged together for each record
    merged_alias_xxxx_df = mini_xxxx_df.copy()
    merged_alias_xxxx_df["alias_symbol"] = merged_alias_xxxx_df["alias_symbol"].fillna("").astype(str)
    merged_alias_xxxx_df = (
    merged_alias_xxxx_df.groupby(
        ["ENSG_ID", "gene_symbol", "HGNC_ID"], dropna=False
    )["alias_symbol"]
    .apply(lambda x: ",".join(x.dropna())).reset_index())

    #Convert the df into a csv and save
    merged_alias_xxxx_df.to_csv(f'../output/merged_alias_{source}_df.csv', index=True) 

    #Make a set of the primary gene symbols
    xxxx_gene_symbol_set = set(mini_xxxx_df["gene_symbol"])
    uppercased_xxxx_gene_symbol_set = {s.upper() for s in xxxx_gene_symbol_set}

    
    subset_genes_xxxx_df = mini_xxxx_df.copy()

    # #Drop genes without any aliases
    # subset_genes_xxxx_df = subset_genes_xxxx_df.dropna(subset=["alias_symbol"])

    # # Remove duplicate instances of primary gene symbol- alias pairs
    # # (occur because the same primary gene symbol may have multiple different ENSG IDs, 
    # # see gene RFLNB for example)
    # subset_genes_xxxx_df["gene_symbol"] = subset_genes_xxxx_df["gene_symbol"].str.upper()
    # subset_genes_xxxx_df["alias_symbol"] = subset_genes_xxxx_df["alias_symbol"].str.upper()
    # subset_genes_xxxx_df = subset_genes_xxxx_df.drop_duplicates(subset=['gene_symbol', 'alias_symbol'], keep='first')

    #Remove alias symbols that are an exact match to their respective primary gene symbol
    subset_genes_xxxx_df = subset_genes_xxxx_df[subset_genes_xxxx_df['gene_symbol'] != subset_genes_xxxx_df['alias_symbol']]

    #Convert the df into a csv and save
    subset_genes_xxxx_df.to_csv(f'../output/subset_genes_{source}_df.csv', index=True) 

    #Create df with genes that have an alias that can be found as another gene's primary gene symbol
    ap_collision_xxxx_df = subset_genes_xxxx_df.copy()
    ap_collision_xxxx_df["alias_symbol_upper"] = ap_collision_xxxx_df["alias_symbol"].str.upper()
    ap_collision_xxxx_df.loc[:, "alias_symbol_upper"] = ap_collision_xxxx_df["alias_symbol_upper"].apply(lambda x: {x})
    ap_collision_xxxx_df.loc[:, "collision"] = ap_collision_xxxx_df["alias_symbol_upper"].apply(
        lambda x: x & uppercased_xxxx_gene_symbol_set
    )
    ap_collision_xxxx_df = ap_collision_xxxx_df.drop("alias_symbol_upper", axis=1)
    ap_collision_xxxx_df = ap_collision_xxxx_df[ap_collision_xxxx_df["collision"].apply(lambda x: len(x) > 0)]
    ap_collision_xxxx_df = ap_collision_xxxx_df.applymap(lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x)
    ap_collision_xxxx_df['NCBI_ID'] = ap_collision_xxxx_df['NCBI_ID'].fillna(0).astype(int)
    ap_collision_xxxx_df = ap_collision_xxxx_df.sort_values("collision")

    #Add a source tag for future merging efforts
    ap_collision_xxxx_df["source"] = str(source.upper())

    #Convert the df into a csv
    ap_collision_xxxx_df.to_csv(f'../output/single_alias_ap_collision_{source.lower()}_df.csv', index=True)

    #Create a secondary collision df that merges the alias symbols for each record
    merged_alias_ap_collision_xxxx_df = ap_collision_xxxx_df.drop(columns=['alias_symbol'])
    merged_alias_ap_collision_xxxx_df = pd.merge(merged_alias_ap_collision_xxxx_df, merged_alias_xxxx_df, on=["ENSG_ID", "gene_symbol", "HGNC_ID"], how="left")
    merged_alias_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df[["gene_symbol","alias_symbol","ENSG_ID","HGNC_ID","NCBI_ID","collision","source"]]
    merged_alias_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df.drop_duplicates(subset=[f"{source}_ID"])

    #Test to make sure all collisions are in the alias list for the record
    test_ap_collision_xxxx_df = merged_alias_ap_collision_xxxx_df.copy()
    test_ap_collision_xxxx_df['alias_symbol'] = test_ap_collision_xxxx_df['alias_symbol'].apply(lambda x: x if isinstance(x, list) else [x])
    test_ap_collision_xxxx_df['collision_in_alias'] = test_ap_collision_xxxx_df.apply(lambda row: row['collision'] in test_ap_collision_xxxx_df['alias_symbol'].values, axis=1)
    true_count = test_ap_collision_xxxx_df['collision_in_alias'].sum()

    if true_count == 0:
        #Convert the df into a csv and save
        merged_alias_ap_collision_xxxx_df.to_csv(f'../output/merged_alias_ap_collision_{source.lower()}_df.csv', index=True)
        print("All collisions are present in gene alias lists.")
    else:
        print("Some collisions are not present in gene alias lists.")

    return mini_xxxx_df.head(), subset_genes_xxxx_df.head(), ap_collision_xxxx_df.head()

# Ensembl

In [1028]:
mini_ensg_df = pd.read_csv(
    "../input/ensg_biomart_gene20250625.txt", sep="\t",dtype={"NCBI gene (formerly Entrezgene) ID": pd.Int64Dtype()}
)
mini_ensg_df = mini_ensg_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Gene Synonym": "alias_symbol",
        "Gene name": "gene_symbol",
        "Gene stable ID": "ENSG_ID",
        "NCBI gene (formerly Entrezgene) ID": "NCBI_ID",
    }
)
mini_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,HGNC:7481,MTTF,MT-TF
1,ENSG00000210049,,HGNC:7481,TRNF,MT-TF
2,ENSG00000211459,,HGNC:7470,12S,MT-RNR1
3,ENSG00000211459,,HGNC:7470,MOTS-C,MT-RNR1
4,ENSG00000211459,,HGNC:7470,MTRNR1,MT-RNR1
...,...,...,...,...,...
133060,ENSG00000229388,,HGNC:52502,LINC01715,TAF12-DT
133061,ENSG00000289291,,,,
133062,ENSG00000274978,26824,HGNC:10108,RNU11-1,RNU11
133063,ENSG00000274978,26824,HGNC:10108,U11,RNU11


In [1029]:
len(set(mini_ensg_df['ENSG_ID']))

86364

In [1030]:
len(set(mini_ensg_df['gene_symbol']))

41165

In [1031]:
len(set(mini_ensg_df['alias_symbol']))

55413

In [1032]:
len(set(mini_ensg_df['gene_symbol']) | set(mini_ensg_df['alias_symbol']))

95942

In [1033]:
mini_ensg_df = mini_ensg_df.replace(" ", np.nan)
mini_ensg_df = mini_ensg_df.replace("", np.nan)
mini_ensg_df = mini_ensg_df.replace("-", np.nan)

unique_ensg_df = mini_ensg_df[mini_ensg_df['ENSG_ID'].duplicated(keep=False) == False]

# Count rows with missing gene symbols before dropping
no_symbol_count = mini_ensg_df["gene_symbol"].isna().sum()
no_symbol_count

38041

In [1034]:
# Now count rows in that subset where 'alias_symbol' is missing
no_alias_symbol_count = unique_ensg_df["alias_symbol"].isna().sum()
no_alias_symbol_count

57009

In [1035]:
orf_rows = mini_ensg_df[
    mini_ensg_df['gene_symbol']
    .str.contains(r'^C.*ORF', case=False, na=False) &
    ~mini_ensg_df['gene_symbol']
    .str.contains('-', na=False)
]
len(orf_rows)

420

In [1036]:
count_loc = mini_ensg_df['gene_symbol'].str.startswith('LOC', na=False).sum()
count_loc

0

In [1037]:
count_FAM = mini_ensg_df['gene_symbol'].str.startswith('FAM', na=False).sum()
count_FAM

738

In [1038]:
count_KIAA = mini_ensg_df['gene_symbol'].str.startswith('KIAA', na=False).sum()
count_KIAA

51

In [1039]:
# groupby name and return a boolean of whether each has more than 1 unique ENSG ID
multi_primary = mini_ensg_df.groupby(["gene_symbol"]).ENSG_ID.nunique().gt(1)

# use loc to only see those values that have `True` in `multi_primary`:
mini_ensg_df.loc[mini_ensg_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol').iloc[6000:6060]

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
92891,ENSG00000138641,8916,HGNC:4876,KIAA0032,HERC3
92878,ENSG00000287542,8916,,,HERC3
52117,ENSG00000273529,388585,HGNC:19764,BHLHB38,HES5
14696,ENSG00000197921,388585,HGNC:19764,BHLHB38,HES5
4416,ENSG00000280680,55733,HGNC:18270,MART-2,HHAT
4421,ENSG00000280680,55733,HGNC:18270,SKN,HHAT
4418,ENSG00000280680,55733,HGNC:18270,RASP,HHAT
4417,ENSG00000280680,55733,HGNC:18270,MART2,HHAT
43282,ENSG00000054392,55733,HGNC:18270,MART2,HHAT
43285,ENSG00000054392,55733,HGNC:18270,SKI,HHAT


In [1040]:
create_ap_collision_df(mini_ensg_df, "ENSG")

All collisions are present in gene alias lists.


  ap_collision_xxxx_df = ap_collision_xxxx_df.applymap(lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x)


(           ENSG_ID  NCBI_ID HGNC_ID alias_symbol gene_symbol
 0  ENSG00000210049     <NA>    7481         MTTF       MT-TF
 1  ENSG00000210049     <NA>    7481         TRNF       MT-TF
 2  ENSG00000211459     <NA>    7470          12S     MT-RNR1
 3  ENSG00000211459     <NA>    7470       MOTS-C     MT-RNR1
 4  ENSG00000211459     <NA>    7470       MTRNR1     MT-RNR1,
            ENSG_ID  NCBI_ID HGNC_ID alias_symbol gene_symbol
 0  ENSG00000210049     <NA>    7481         MTTF       MT-TF
 1  ENSG00000210049     <NA>    7481         TRNF       MT-TF
 2  ENSG00000211459     <NA>    7470          12S     MT-RNR1
 3  ENSG00000211459     <NA>    7470       MOTS-C     MT-RNR1
 4  ENSG00000211459     <NA>    7470       MTRNR1     MT-RNR1,
                ENSG_ID  NCBI_ID HGNC_ID alias_symbol gene_symbol collision  \
 24761  ENSG00000283293   125050   10037          7SK       RN7SK       7SK   
 37365  ENSG00000057252     6646   11177        ACAT1       SOAT1     ACAT1   
 17666  ENSG00000

In [1041]:
mini_ensg_df= pd.read_csv(
    "../output/mini_ensg_df.csv", index_col=[0])
mini_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,7481.0,MTTF,MT-TF
1,ENSG00000210049,,7481.0,TRNF,MT-TF
2,ENSG00000211459,,7470.0,12S,MT-RNR1
3,ENSG00000211459,,7470.0,MOTS-C,MT-RNR1
4,ENSG00000211459,,7470.0,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,85028.0,30062.0,LINC00100,SNHG12
133059,ENSG00000197989,85028.0,30062.0,PNAS-123,SNHG12
133060,ENSG00000229388,,52502.0,LINC01715,TAF12-DT
133062,ENSG00000274978,26824.0,10108.0,RNU11-1,RNU11


In [1042]:
subset_genes_ensg_df = pd.read_csv(
    "../output/subset_genes_ensg_df.csv", index_col=[0])
subset_genes_ensg_df

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
0,ENSG00000210049,,7481.0,MTTF,MT-TF
1,ENSG00000210049,,7481.0,TRNF,MT-TF
2,ENSG00000211459,,7470.0,12S,MT-RNR1
3,ENSG00000211459,,7470.0,MOTS-C,MT-RNR1
4,ENSG00000211459,,7470.0,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,85028.0,30062.0,LINC00100,SNHG12
133059,ENSG00000197989,85028.0,30062.0,PNAS-123,SNHG12
133060,ENSG00000229388,,52502.0,LINC01715,TAF12-DT
133062,ENSG00000274978,26824.0,10108.0,RNU11-1,RNU11


In [1043]:
mini_ensg_df.loc[mini_ensg_df["gene_symbol"] == "HLA-DQB2"]

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
45334,ENSG00000232629,3120.0,4945.0,HLA-DXB,HLA-DQB2
68189,ENSG00000230675,3120.0,4945.0,HLA-DXB,HLA-DQB2
73816,ENSG00000228813,3120.0,4945.0,HLA-DXB,HLA-DQB2
74541,ENSG00000224305,3120.0,4945.0,HLA-DXB,HLA-DQB2
76050,ENSG00000228254,3120.0,4945.0,HLA-DXB,HLA-DQB2
76102,ENSG00000229493,3120.0,4945.0,HLA-DXB,HLA-DQB2
77670,ENSG00000196610,3120.0,4945.0,HLA-DXB,HLA-DQB2
78251,ENSG00000226165,3120.0,4945.0,HLA-DXB,HLA-DQB2


In [1044]:
subset_genes_ensg_df.loc[
    subset_genes_ensg_df["gene_symbol"] == "ARK2C"
]

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,gene_symbol
4202,ENSG00000141622,494470.0,31696.0,ARKL2,ARK2C
4203,ENSG00000141622,494470.0,31696.0,LNCAMPC,ARK2C
4204,ENSG00000141622,494470.0,31696.0,RNF111L2,ARK2C
4205,ENSG00000141622,494470.0,31696.0,RNF165,ARK2C


In [1045]:
merged_alias_ap_collision_ensg_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ensg_df.csv", index_col=[0])
merged_alias_ap_collision_ensg_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,RN7SK,7SK,ENSG00000283293,10037.0,125050,7SK,ENSG
1,SOAT1,"ACAT,ACAT1,SOAT,STAT",ENSG00000057252,11177.0,6646,ACAT1,ENSG
2,SOAT2,ACAT2,ENSG00000167780,11178.0,8435,ACAT2,ENSG
3,NDUFAB1,"ACP,ACP1,FASN2A,SDAP",ENSG00000004779,7694.0,4706,ACP1,ENSG
4,ACTBP8,ACTBP2,ENSG00000220267,141.0,0,ACTBP2,ENSG
...,...,...,...,...,...,...,...
807,ZNF121,"D19S204,ZHC32,ZNF20",ENSG00000197961,12904.0,7675,ZNF20,ENSG
808,RNF141,"ZFP26,ZNF230",ENSG00000110315,21159.0,50862,ZNF230,ENSG
809,ZNF322P1,"ZNF322,ZNF322B",ENSG00000188801,14003.0,0,ZNF322,ENSG
810,ZNF106,"SH3BP3,ZFP106,ZNF474",ENSG00000103994,12886.0,64397,ZNF474,ENSG


In [1046]:
merged_alias_ap_collision_ensg_df.head(60)

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,RN7SK,7SK,ENSG00000283293,10037.0,125050,7SK,ENSG
1,SOAT1,"ACAT,ACAT1,SOAT,STAT",ENSG00000057252,11177.0,6646,ACAT1,ENSG
2,SOAT2,ACAT2,ENSG00000167780,11178.0,8435,ACAT2,ENSG
3,NDUFAB1,"ACP,ACP1,FASN2A,SDAP",ENSG00000004779,7694.0,4706,ACP1,ENSG
4,ACTBP8,ACTBP2,ENSG00000220267,141.0,0,ACTBP2,ENSG
5,TADA2A,"ADA2,ADA2A,HADA2,TADA2L",ENSG00000277104,11531.0,6871,ADA2,ENSG
6,TADA2A,"ADA2,ADA2A,HADA2,TADA2L",ENSG00000276234,11531.0,6871,ADA2,ENSG
7,ADAM28,"ADAM23,EMDCII,MDC-LM,MDC-LS",ENSG00000042980,206.0,10863,ADAM23,ENSG
8,ADCY8,"AC8,ADCY3,HBAC1",ENSG00000155897,239.0,114,ADCY3,ENSG
9,ADRA1D,"ADRA1,ADRA1A,ADRA1R",ENSG00000171873,280.0,146,ADRA1A,ENSG


In [1047]:
len(set(merged_alias_ap_collision_ensg_df["gene_symbol"]))

677

In [1048]:
len(set(merged_alias_ap_collision_ensg_df["ENSG_ID"]))

768

### Make a set of the primary gene symbols

In [1049]:
ensg_gene_symbol_set = set(mini_ensg_df["gene_symbol"])

In [1050]:
total_number_ensg_gene_symbols = len(ensg_gene_symbol_set)
total_number_ensg_gene_symbols

41164

How many collisions are there? How many records are involved in one?


In [1051]:
ensg_alias_primary_collision_set = set(merged_alias_ap_collision_ensg_df["collision"])
len(ensg_alias_primary_collision_set)

607

In [1052]:
ensg_alias_primary_collision_primary_symbol_set = set(merged_alias_ap_collision_ensg_df["gene_symbol"])
len(ensg_alias_primary_collision_primary_symbol_set)

677

1. Why is the alias-gene collision set not the same length as the set of primary symbols with collisions ?
2. Why is the length of the alias-gene collison set shorter?
 - A priamry gene symbol with an alias-gene collision has an alias that matches a different gene's primary gene symbol.
 - Multiple genes can share a single alias (alias-alias collision)
 - If that shared alias is an alias-gene collision, then there will be more unique gene symbols in the set of primary symbols with collisions than the set of alias-gene collisions. 

# HGNC

## Set up table

In [1053]:
file_path = "../input/hgnc_biomart_gene20240626.txt"

mini_hgnc_df = pd.read_csv(
    file_path, sep="\t"
)

# Rename columns
mini_hgnc_df = mini_hgnc_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Approved symbol": "gene_symbol",
        "Alias symbols": "alias_symbol",
        "Ensembl gene ID": "ENSG_ID",
    }
)

#structure and labeling in HGNC download files changed between 2024 amd 2025
if "Alias symbol" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"Alias symbol": "alias_symbol"})
elif "Alias symbols" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"Alias symbols": "alias_symbol"})
else:
    mini_hgnc_df["alias_symbol"] = pd.NA

if "NCBI gene ID" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"NCBI gene ID": "NCBI_ID"})
elif "NCBI Gene ID" in mini_hgnc_df.columns:
    mini_hgnc_df = mini_hgnc_df.rename(columns={"NCBI Gene ID": "NCBI_ID"})
else:
    mini_hgnc_df["NCBI_ID"] = pd.NA   

mini_hgnc_df["NCBI_ID"] = mini_hgnc_df["NCBI_ID"].astype(pd.Int64Dtype())

# Extract date from filename and check if it is before June 25, 2025
match = re.search(r'(\d{8})', file_path)
file_date = datetime.strptime(match.group(1), "%Y%m%d") if match else None
cutoff_date = datetime.strptime("20250625", "%Y%m%d")

# Apply list-splitting logic for newer files
if file_date and file_date >= cutoff_date:
    mini_hgnc_df['alias_symbol'] = (
        mini_hgnc_df['alias_symbol']
        .fillna('')
        .str.split(',')
        .apply(lambda x: [a.strip() for a in x if a.strip()])
    )
    mini_hgnc_df = mini_hgnc_df.explode('alias_symbol')

mini_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
0,HGNC:5,,1,ENSG00000121410,A1BG
1,HGNC:37133,FLJ23569,503538,ENSG00000268895,A1BG-AS1
2,HGNC:24086,ACF,29974,ENSG00000148584,A1CF
3,HGNC:24086,ASP,29974,ENSG00000148584,A1CF
4,HGNC:24086,ACF64,29974,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,HGNC:29027,KIAA0399,23140,ENSG00000074755,ZZEF1
67579,HGNC:29027,ZZZ4,23140,ENSG00000074755,ZZEF1
67580,HGNC:29027,FLJ10821,23140,ENSG00000074755,ZZEF1
67581,HGNC:24523,DKFZP564I052,26009,ENSG00000036549,ZZZ3


In [1054]:
len(set(mini_hgnc_df['HGNC_ID']))

45646

In [1055]:
len(set(mini_hgnc_df['gene_symbol']))

45646

In [1056]:
len(set(mini_hgnc_df['alias_symbol']))

43187

In [1057]:
len(set(mini_hgnc_df['gene_symbol']) | set(mini_hgnc_df['alias_symbol']))

88307

In [1058]:
mini_hgnc_df = mini_hgnc_df.replace(" ", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("-", np.nan)
unique_hcng_df = mini_hgnc_df[mini_hgnc_df['HGNC_ID'].duplicated(keep=False) == False]
# Count rows with missing gene symbols before dropping
no_symbol_count = mini_hgnc_df["gene_symbol"].isna().sum()
no_symbol_count

0

In [1059]:
no_alias_symbol_count = unique_hcng_df["alias_symbol"].isna().sum()
no_alias_symbol_count

22999

In [1060]:
orf_rows = mini_hgnc_df[
    mini_hgnc_df['gene_symbol']
    .str.contains(r'^C.*ORF', case=False, na=False) &
    ~mini_hgnc_df['gene_symbol']
    .str.contains('-', na=False)
]
len(orf_rows)

470

In [1061]:
count_loc = mini_hgnc_df['gene_symbol'].str.startswith('LOC', na=False).sum()
count_loc

0

In [1062]:
count_FAM = mini_hgnc_df['gene_symbol'].str.startswith('FAM', na=False).sum()
count_FAM

494

In [1063]:
count_KIAA = mini_hgnc_df['gene_symbol'].str.startswith('KIAA', na=False).sum()
count_KIAA

46

In [1064]:
mask = mini_hgnc_df['gene_symbol'].str.contains('CCV', na=False) 

# Use the boolean mask to filter the DataFrame
filtered_df = mini_hgnc_df[mask]
filtered_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
6727,HGNC:1624,CTRCT8,,,CCV


In [1065]:
create_ap_collision_df(mini_hgnc_df, "HGNC")

All collisions are present in gene alias lists.


  ap_collision_xxxx_df = ap_collision_xxxx_df.applymap(lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x)


(  HGNC_ID alias_symbol  NCBI_ID          ENSG_ID gene_symbol
 0       5          NaN        1  ENSG00000121410        A1BG
 1   37133     FLJ23569   503538  ENSG00000268895    A1BG-AS1
 2   24086          ACF    29974  ENSG00000148584        A1CF
 3   24086          ASP    29974  ENSG00000148584        A1CF
 4   24086        ACF64    29974  ENSG00000148584        A1CF,
   HGNC_ID alias_symbol  NCBI_ID          ENSG_ID gene_symbol
 0       5          NaN        1  ENSG00000121410        A1BG
 1   37133     FLJ23569   503538  ENSG00000268895    A1BG-AS1
 2   24086          ACF    29974  ENSG00000148584        A1CF
 3   24086          ASP    29974  ENSG00000148584        A1CF
 4   24086        ACF64    29974  ENSG00000148584        A1CF,
       HGNC_ID alias_symbol  NCBI_ID          ENSG_ID gene_symbol collision  \
 42104   14947        AAVS1    54776  ENSG00000125503    PPP1R12C     AAVS1   
 56627   11177        ACAT1     6646  ENSG00000057252       SOAT1     ACAT1   
 56628   11178   

In [1066]:
mini_hgnc_df= pd.read_csv(
    "../output/mini_hgnc_df.csv", index_col=[0])
mini_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
0,5,,1.0,ENSG00000121410,A1BG
1,37133,FLJ23569,503538.0,ENSG00000268895,A1BG-AS1
2,24086,ACF,29974.0,ENSG00000148584,A1CF
3,24086,ASP,29974.0,ENSG00000148584,A1CF
4,24086,ACF64,29974.0,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,29027,KIAA0399,23140.0,ENSG00000074755,ZZEF1
67579,29027,ZZZ4,23140.0,ENSG00000074755,ZZEF1
67580,29027,FLJ10821,23140.0,ENSG00000074755,ZZEF1
67581,24523,DKFZP564I052,26009.0,ENSG00000036549,ZZZ3


In [1067]:
# groupby name and return a boolean of whether each has more than 1 unique Country
multi_primary = mini_hgnc_df.groupby(["gene_symbol"]).HGNC_ID.nunique().gt(1)

# use loc to only see those values that have `True` in `multi_primary`:
mini_hgnc_df.loc[mini_hgnc_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol').head(50)

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol


In [1068]:
subset_genes_hgnc_df = pd.read_csv(
    "../output/subset_genes_hgnc_df.csv", index_col=[0])
subset_genes_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
0,5,,1.0,ENSG00000121410,A1BG
1,37133,FLJ23569,503538.0,ENSG00000268895,A1BG-AS1
2,24086,ACF,29974.0,ENSG00000148584,A1CF
3,24086,ASP,29974.0,ENSG00000148584,A1CF
4,24086,ACF64,29974.0,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,29027,KIAA0399,23140.0,ENSG00000074755,ZZEF1
67579,29027,ZZZ4,23140.0,ENSG00000074755,ZZEF1
67580,29027,FLJ10821,23140.0,ENSG00000074755,ZZEF1
67581,24523,DKFZP564I052,26009.0,ENSG00000036549,ZZZ3


In [1069]:
subset_genes_hgnc_df.loc[
    subset_genes_hgnc_df["gene_symbol"] == "ARK2C"
]

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
2708,31696,ARKL2,494470.0,ENSG00000141622,ARK2C
2709,31696,RNF111L2,494470.0,ENSG00000141622,ARK2C
2710,31696,Ark2C,494470.0,ENSG00000141622,ARK2C
2711,31696,lncAMPC,494470.0,ENSG00000141622,ARK2C


In [1070]:
merged_alias_ap_collision_hgnc_df = pd.read_csv(
    "../output/merged_alias_ap_collision_hgnc_df.csv", index_col=[0])
merged_alias_ap_collision_hgnc_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,PPP1R12C,"DKFZP434D0412,p84,MBS85,p85,AAVS1",ENSG00000125503,14947,54776,AAVS1,HGNC
1,SOAT1,"ACAT,ACAT1",ENSG00000057252,11177,6646,ACAT1,HGNC
2,SOAT2,ACAT2,ENSG00000167780,11178,8435,ACAT2,HGNC
3,GLI3,"PAP-A,PAPA,PAPA1,PAPB,ACLS,PPDIV",ENSG00000106571,4319,2737,ACLS,HGNC
4,NDUFAB1,"SDAP,FASN2A,ACP,ACP1",ENSG00000004779,7694,4706,ACP1,HGNC
...,...,...,...,...,...,...,...
655,ZNF121,"ZHC32,ZNF20",ENSG00000197961,12904,7675,ZNF20,HGNC
656,RNF141,"ZFP26,ZNF230",ENSG00000110315,21159,50862,ZNF230,HGNC
657,ZNF106,"ZNF474,SH3BP3",ENSG00000103994,12886,64397,ZNF474,HGNC
658,ZFP1,"FLJ34243,ZNF475",ENSG00000184517,23328,162239,ZNF475,HGNC


### Make a set of the primary gene symbols

In [1071]:
hgnc_gene_symbol_set = set(mini_hgnc_df["gene_symbol"])
# all_gene_symbols_set

In [1072]:
total_number_hgnc_gene_symbols = len(hgnc_gene_symbol_set)
total_number_hgnc_gene_symbols

45646

How many collisions are there? How many records are involved in one?

In [1073]:
hgnc_alias_primary_collision_set = set(merged_alias_ap_collision_hgnc_df["collision"])
len(hgnc_alias_primary_collision_set)

568

In [1074]:
hgnc_alias_primary_collision_primary_symbol_set = set(merged_alias_ap_collision_hgnc_df["gene_symbol"])
len(hgnc_alias_primary_collision_primary_symbol_set)

644

# NCBI Info

In [1075]:
file_path = "../input/Homo_sapiens.gene_info20250625"

mini_ncbi_df = pd.read_csv(file_path, sep="\t")
# Drop all columns besides ENSG_ID, gene_symbol, and alias_symbol
mini_ncbi_df = mini_ncbi_df[
["GeneID", "Symbol", "Synonyms", "dbXrefs"]
]
mini_ncbi_df = mini_ncbi_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Synonyms": "alias_symbol"})


Split dbXrefs into individual columns

In [1076]:
mini_ncbi_df = mini_ncbi_df.assign(
    MIM=np.nan,
    HGNC_ID=np.nan,
    ENSG_ID=np.nan,
    AllianceGenome=np.nan,
    MIRbase=np.nan,
    IMGTgene_db=np.nan,
    dash=np.nan,
    unknown=np.nan,
)

In [1077]:
index_pos = 0

print(len(mini_ncbi_df))
while index_pos < len(mini_ncbi_df):
    xrefs = mini_ncbi_df["dbXrefs"][index_pos].split("|")

    for xref in xrefs:
        xref = xref.lower()
        if xref.startswith("mim:"):
            xref = xref.replace("mim:", "")
            mini_ncbi_df["MIM"][index_pos] = xref
        elif xref.startswith("hgnc:hgnc:"):
            xref = xref.replace("hgnc:hgnc:", "")
            mini_ncbi_df["HGNC_ID"][index_pos] = xref
        elif xref.startswith("ensembl:"):
            xref = xref.replace("ensembl:", "")
            mini_ncbi_df["ENSG_ID"][index_pos] = xref
        elif xref.startswith("alliancegenome:"):
            xref = xref.replace("alliancegenome:", "")
            mini_ncbi_df["AllianceGenome"][index_pos] = xref
        elif xref.startswith("mirbase"):
            xref = xref.replace("mirbase:", "")
            mini_ncbi_df["MIRbase"][index_pos] = xref
        elif xref.startswith("imgt/gene-db:"):
            xref = xref.replace("imgt/gene-db:", "")
            mini_ncbi_df["IMGTgene_db"][index_pos] = xref
        elif xref.startswith("-"):
            mini_ncbi_df["dash"][index_pos] = xref
        else:
            mini_ncbi_df["unknown"][index_pos] = xref

    index_pos += 1
    pass

print(index_pos)

193580


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  mini_ncbi_df["MIM"][index_pos] = xref
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["MIM"][index

193580


In [1078]:
mini_ncbi_df["ENSG_ID"] = mini_ncbi_df["ENSG_ID"].str.replace("ensg", "ENSG", 1)

In [1079]:
mini_ncbi_df = mini_ncbi_df.drop(
    [
        "AllianceGenome",
        "MIRbase",
        "IMGTgene_db",
        "dash",
        "unknown",
        "dbXrefs",
        "MIM",
    ],
    axis=1,
)
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B|ABG|GAB|HYST2477,5,ENSG00000121410
1,2,A2M,A2MD|CPAMD5|FWP007|S863-7,7,ENSG00000175899
2,9,NAT1,AAC1|MNAT|NAT-1|NATI,7645,ENSG00000171428
3,10,NAT2,AAC2|NAT-2|PNAT,7646,ENSG00000156006
4,11,NATP,AACP|NATP1,15,
...,...,...,...,...,...
193575,8923215,trnD,-,,
193576,8923216,trnP,-,,
193577,8923217,trnA,-,,
193578,8923218,COX1,-,,


In [1080]:
len(set(mini_ncbi_df['NCBI_ID']))

193580

In [1081]:
len(set(mini_ncbi_df['gene_symbol']))

193427

In [1082]:
len(set(mini_ncbi_df['alias_symbol']))

27600

In [1083]:
len(set(mini_ncbi_df['gene_symbol']) | set(mini_ncbi_df['alias_symbol']))

220799

In [1084]:
mini_ncbi_df = mini_ncbi_df.replace(" ", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("-", np.nan)

unique_ncbi_df = mini_ncbi_df[mini_ncbi_df['NCBI_ID'].duplicated(keep=False) == False]

# Count rows with missing gene symbols before dropping
no_symbol_count = mini_ncbi_df["gene_symbol"].isna().sum()
no_symbol_count

0

In [1085]:
no_alias_symbol_count = unique_ncbi_df["alias_symbol"].isna().sum()
no_alias_symbol_count

165885

In [1086]:
orf_rows = mini_ncbi_df[
    mini_ncbi_df['gene_symbol']
    .str.contains(r'^C.*ORF', case=False, na=False) &
    ~mini_ncbi_df['gene_symbol']
    .str.contains('-', na=False)
]
len(orf_rows)

248

In [1087]:
count_loc = mini_ncbi_df['gene_symbol'].str.startswith('LOC', na=False).sum()
count_loc

147700

In [1088]:
count_FAM = mini_ncbi_df['gene_symbol'].str.startswith('FAM', na=False).sum()
count_FAM

396

In [1089]:
count_KIAA = mini_ncbi_df['gene_symbol'].str.startswith('KIAA', na=False).sum()
count_KIAA

37

In [1090]:
# groupby name and return a boolean of whether each has more than 1 unique Country
multi_primary = mini_ncbi_df.groupby(["gene_symbol"]).NCBI_ID.nunique().gt(1)

# use loc to only see those values that have `True` in `multi_country`:
mini_ncbi_df.loc[mini_ncbi_df.gene_symbol.isin(multi_primary[multi_primary].index)].sort_values(by='gene_symbol').head(60)

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
193532,6775087,12S rRNA,,,
193573,8923213,12S rRNA,,,
193548,8923188,ATP6,,,
3522,4508,ATP6,ATPase6|MTATP6,7414.0,
193519,6775074,ATP6,,,
3523,4509,ATP8,ATPase8|MTATP8,7415.0,
193522,6775077,ATP8,,,
193571,8923211,ATP8,,,
193528,6775083,COX1,,,
193578,8923218,COX1,,,


In [1091]:
mini_ncbi_df['alias_symbol'] = mini_ncbi_df['alias_symbol'].str.split('|')
mini_ncbi_df = mini_ncbi_df.explode('alias_symbol')
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5,ENSG00000121410
0,1,A1BG,ABG,5,ENSG00000121410
0,1,A1BG,GAB,5,ENSG00000121410
0,1,A1BG,HYST2477,5,ENSG00000121410
1,2,A2M,A2MD,7,ENSG00000175899
...,...,...,...,...,...
193575,8923215,trnD,,,
193576,8923216,trnP,,,
193577,8923217,trnA,,,
193578,8923218,COX1,,,


In [1092]:
create_ap_collision_df(mini_ncbi_df, "NCBI")

All collisions are present in gene alias lists.


  ap_collision_xxxx_df = ap_collision_xxxx_df.applymap(lambda x: ', '.join(map(str, x)) if isinstance(x, set) else x)


(   NCBI_ID gene_symbol alias_symbol HGNC_ID          ENSG_ID
 0        1        A1BG          A1B       5  ENSG00000121410
 0        1        A1BG          ABG       5  ENSG00000121410
 0        1        A1BG          GAB       5  ENSG00000121410
 0        1        A1BG     HYST2477       5  ENSG00000121410
 1        2         A2M         A2MD       7  ENSG00000175899,
    NCBI_ID gene_symbol alias_symbol HGNC_ID          ENSG_ID
 0        1        A1BG          A1B       5  ENSG00000121410
 0        1        A1BG          ABG       5  ENSG00000121410
 0        1        A1BG          GAB       5  ENSG00000121410
 0        1        A1BG     HYST2477       5  ENSG00000121410
 1        2         A2M         A2MD       7  ENSG00000175899,
        NCBI_ID gene_symbol alias_symbol HGNC_ID          ENSG_ID collision  \
 2809      3494       IGHA2          A2M    5479  ENSG00000211890       A2M   
 24213   404744   NPSR1-AS1         AAA1   22128  ENSG00000197085      AAA1   
 16372    89876  

In [1093]:
mini_ncbi_df= pd.read_csv(
    "../output/mini_ncbi_df.csv", index_col=[0])
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5.0,ENSG00000121410
0,1,A1BG,ABG,5.0,ENSG00000121410
0,1,A1BG,GAB,5.0,ENSG00000121410
0,1,A1BG,HYST2477,5.0,ENSG00000121410
1,2,A2M,A2MD,7.0,ENSG00000175899
...,...,...,...,...,...
193575,8923215,trnD,,,
193576,8923216,trnP,,,
193577,8923217,trnA,,,
193578,8923218,COX1,,,


In [1094]:
mini_ncbi_df[mini_ncbi_df['gene_symbol'].str.startswith('LOC')]

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID


In [1095]:
subset_genes_ncbi_df = pd.read_csv(
    "../output/subset_genes_ncbi_df.csv", index_col=[0])
subset_genes_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B,5.0,ENSG00000121410
0,1,A1BG,ABG,5.0,ENSG00000121410
0,1,A1BG,GAB,5.0,ENSG00000121410
0,1,A1BG,HYST2477,5.0,ENSG00000121410
1,2,A2M,A2MD,7.0,ENSG00000175899
...,...,...,...,...,...
193575,8923215,trnD,,,
193576,8923216,trnP,,,
193577,8923217,trnA,,,
193578,8923218,COX1,,,


In [1096]:
merged_alias_ap_collision_ncbi_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ncbi_df.csv", index_col=[0])
merged_alias_ap_collision_ncbi_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,IGHA2,A2M,ENSG00000211890,5479.0,3494,A2M,NCBI
1,NPSR1-AS1,AAA1,ENSG00000197085,22128.0,404744,AAA1,NCBI
2,CFAP91,"AAT1,AAT1alpha,C3orf15,CaM-IP2,MAATS1,SPATA26,...",ENSG00000183833,24010.0,89876,AAT1,NCBI
3,GPT,"AAT1,ALT,ALT1,GPT1,SGPT",ENSG00000167701,4552.0,2875,AAT1,NCBI
4,PPP1R12C,"AAVS1,LENG3,MBS85,p84,p85",ENSG00000125503,14947.0,54776,AAVS1,NCBI
...,...,...,...,...,...,...,...
2167,CCDC106,"HSU79303,ZNF581",ENSG00000173581,30181.0,29903,ZNF581,NCBI
2168,ZNF785,ZNF688,ENSG00000197162,26496.0,146540,ZNF688,NCBI
2169,ZP4,"ZBP,ZP1,ZP1B,ZPB,ZPB2,Zp-4",ENSG00000116996,15770.0,57829,ZP1,NCBI
2171,ZNF446,"ZKSCAN20,ZSCAN30,ZSCAN52",ENSG00000083838,21036.0,55663,ZSCAN30,NCBI


### Make a set of primary gene symbols

In [1097]:
ncbi_gene_symbol_set = set(mini_ncbi_df["gene_symbol"])

In [1098]:
total_number_ncbi_gene_symbols = len(ncbi_gene_symbol_set)
total_number_ncbi_gene_symbols

45727

How many collisions are there? How many records are involved in one?

In [1099]:
ncbi_alias_primary_collision_set = set(
    merged_alias_ap_collision_ncbi_df["collision"]
)
len(ncbi_alias_primary_collision_set)

1490

TODO: make the records set based on ID not symbol

In [1100]:
ncbi_alias_primary_collision_primary_symbol_set = set(
    merged_alias_ap_collision_ncbi_df["gene_symbol"]
)
len(ncbi_alias_primary_collision_primary_symbol_set)

1735

# Merge 3 sets together

In [1101]:
merged_alias_primary_collisions_df = pd.concat(
    [
        merged_alias_ap_collision_hgnc_df[
            ["gene_symbol", "alias_symbol", "collision", "source"]
        ],
        merged_alias_ap_collision_ncbi_df[
            ["gene_symbol", "alias_symbol", "collision", "source"]
        ],
        merged_alias_ap_collision_ensg_df[
            ["gene_symbol", "alias_symbol", "collision", "source"]
        ],
    ]
)
merged_alias_primary_collisions_df

Unnamed: 0,gene_symbol,alias_symbol,collision,source
0,PPP1R12C,"DKFZP434D0412,p84,MBS85,p85,AAVS1",AAVS1,HGNC
1,SOAT1,"ACAT,ACAT1",ACAT1,HGNC
2,SOAT2,ACAT2,ACAT2,HGNC
3,GLI3,"PAP-A,PAPA,PAPA1,PAPB,ACLS,PPDIV",ACLS,HGNC
4,NDUFAB1,"SDAP,FASN2A,ACP,ACP1",ACP1,HGNC
...,...,...,...,...
807,ZNF121,"D19S204,ZHC32,ZNF20",ZNF20,ENSG
808,RNF141,"ZFP26,ZNF230",ZNF230,ENSG
809,ZNF322P1,"ZNF322,ZNF322B",ZNF322,ENSG
810,ZNF106,"SH3BP3,ZFP106,ZNF474",ZNF474,ENSG


In [1102]:
merged_alias_primary_collisions_df.loc[
    merged_alias_primary_collisions_df["collision"] == "CFM1"
]

Unnamed: 0,gene_symbol,alias_symbol,collision,source
87,RFLNB,"MGC45871,RefilinB,Cfm1",CFM1,HGNC
215,RFLNB,"CFM1,FAM101B",CFM1,NCBI


In [1103]:
duplicate_rows = merged_alias_primary_collisions_df[merged_alias_primary_collisions_df.duplicated(subset=['collision','source'], keep=False)]
duplicate_rows

Unnamed: 0,gene_symbol,alias_symbol,collision,source
13,UBE3A,"AS,ANCR,E6-AP,FLJ26981",ANCR,HGNC
14,DANCR,"ANCR,AGU2,lncRNA-ANCR",ANCR,HGNC
15,SLC25A23,"FLJ30339,MGC2615,APC2",APC2,HGNC
16,ANAPC2,"APC2,KIAA1406",APC2,HGNC
19,AKR1B1,AR,AR,HGNC
...,...,...,...,...
796,VARS1,"VARS,VARS2",VARS2,ENSG
797,VARS1,"VARS,VARS2",VARS2,ENSG
798,VARS1,"VARS,VARS2",VARS2,ENSG
804,WDR82P1,"HCG26824,SW2,WDR82,WDR82B",WDR82,ENSG


In [1104]:
duplicate_rows = merged_alias_primary_collisions_df[merged_alias_primary_collisions_df.duplicated(subset=['gene_symbol','source'], keep=False)]
duplicate_rows

Unnamed: 0,gene_symbol,alias_symbol,collision,source
5,TADA2A,"ADA2,ADA2A,HADA2,TADA2L",ADA2,ENSG
6,TADA2A,"ADA2,ADA2A,HADA2,TADA2L",ADA2,ENSG
36,SERPINA2,"ARGS,ATR,PIL,SERPINA2P",ATR,ENSG
37,SERPINA2,"ARGS,ATR,PIL,SERPINA2P",ATR,ENSG
41,B3GNTL1,"B3GNT8,B3GNT8",B3GNT8,ENSG
...,...,...,...,...
796,VARS1,"VARS,VARS2",VARS2,ENSG
797,VARS1,"VARS,VARS2",VARS2,ENSG
798,VARS1,"VARS,VARS2",VARS2,ENSG
804,WDR82P1,"HCG26824,SW2,WDR82,WDR82B",WDR82,ENSG


# Convert to csv

In [1105]:
merged_alias_primary_collisions_df.to_csv(
    "../output/merged_alias_primary_collisions_df.csv", index=False
)

In [1106]:
common_ap_collisions = (
    ncbi_alias_primary_collision_primary_symbol_set
    & hgnc_alias_primary_collision_primary_symbol_set
    & ensg_alias_primary_collision_primary_symbol_set
)
common_ap_collisions

{'ABCD1',
 'ACD',
 'ACKR2',
 'ACOD1',
 'ACTBP8',
 'ADRA1D',
 'AGXT',
 'AIFM2',
 'AKR1B1',
 'AKR1B10',
 'ALPK3',
 'AMH',
 'ANAPC2',
 'ANKRD37',
 'ANTXR1',
 'AOC1',
 'APEX1',
 'AREG',
 'ARHGAP21',
 'ARHGEF7',
 'ARID4A',
 'ART4',
 'ARTN',
 'ASIC2',
 'AURKAIP1',
 'AZIN2',
 'BANF1P1',
 'BCAT2',
 'BRIP1',
 'BTF3P11',
 'BTN3A3',
 'C1D',
 'C1QTNF1',
 'C6orf89',
 'CACNA1A',
 'CADPS',
 'CADPS2',
 'CAPN5',
 'CARD16',
 'CCL13',
 'CCL14',
 'CCL15',
 'CCM2',
 'CD200R1',
 'CDH19',
 'CDH20',
 'CDPF1',
 'CELSR1',
 'CES1',
 'CFAP73',
 'CFH',
 'CHAF1B',
 'CHAMP1',
 'CHD6',
 'CHEK2',
 'CHORDC1',
 'CLASP1',
 'CLCF1',
 'CNGB1',
 'CNKSR2',
 'CNOT6',
 'CNRIP1',
 'CNTN1',
 'COASY',
 'COPS2',
 'COPS3',
 'CORIN',
 'COX7A2L',
 'CPA4',
 'CPAMD8',
 'CPNE1',
 'CPNE2',
 'CPPED1',
 'CREB3L4',
 'CSNK2A2',
 'CSTB',
 'CTDSP2',
 'CXCL10',
 'CXXC1',
 'CYCSP5',
 'CYP11B1',
 'CYP11B2',
 'CYP21A2',
 'CYP2A6',
 'DCAF5',
 'DCBLD2',
 'DDOST',
 'DDR2',
 'DDX11',
 'DDX18',
 'DEAF1',
 'DEFA6',
 'DEPDC1B',
 'DFFB',
 'DHX8',
 'DIDO1'

In [1107]:
len(common_ap_collisions)

488