**Table of contents**<a id='toc0_'></a>    
- [What is the set of ambiguous symbols per source?](#toc1_1_)    
    - [Define Functions](#toc1_1_1_)    
    - [ENSG](#toc1_1_2_)    
    - [HGNC](#toc1_1_3_)    
    - [NCBI](#toc1_1_4_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

## <a id='toc1_1_'></a>[What is the set of ambiguous symbols per source?](#toc0_)

Need this to create a better histogram figure. Current one only shows the distribution of ambiguous symbols created by alias-alias collisions by source. I want to be able to illustrate how the ambiguous symbols in each source (created by alias-alias and alias-primary collisions) are shared. 

In [96]:
import pandas as pd
import plotly.express as px

### <a id='toc1_1_1_'></a>[Define Functions](#toc0_)

In [97]:
def combine_columns(df, columns_to_combine, columns_to_keep, new_name, columns_to_drop):
    """Combine multiple columns into one while keeping associated data attached.
    Use this function when the columns to combine are easier to list 
    than the columns not to combine.
    
    :param df: The DataFrame containing the columns to be combined
    :param columns_to_combine: List of column names to combine into one
    :param columns_to_keep: List of columns to keep in the final DataFrame
    :param new_name: The name of the new combined column
    :param columns_to_drop: List of columns to drop from the final DataFrame
    :return: A new DataFrame with combined columns and selected columns retained
    """
    og_df = df.copy()

    combined_dfs = []

    # Loop through each column in columns_to_combine and create a new DataFrame
    for col in columns_to_combine:
        temp_df = og_df[list(set([col] + columns_to_keep))].copy()
        temp_df[new_name] = temp_df[col]
        combined_dfs.append(temp_df)

    df_combined = pd.concat(combined_dfs, ignore_index=True)
    df_combined.drop(columns_to_drop, axis=1, inplace=True)
    df_combined.drop_duplicates(inplace=True)
    
    return df_combined

In [98]:
def create_ambiguous_symbol_histogram(XXXX_ambiguous_symbol_counts: pd.DataFrame, source: str, XXXX_ambiguous_symbol_set_count: int):
    """Create a histogram of the frequencies at which aliases are shared

    :param aa_collision_gene_xxxx_df: A df of alias-alias collisions organized by primary gene symbol
    :param source: Representation of the source of the gene records
    :param xxxx_alias_count: Number of aliases total in the source
    :return: A histogram of the percentage of aliases that are shared between 2 genes, 3 genes, and so on
    """

    #Count the frequency at which aliases are shared 
    XXXX_ambiguous_symbol_counts_distribution_df = XXXX_ambiguous_symbol_counts.pivot_table(
    index=["symbol_counts"], aggfunc="size"
    )
    XXXX_ambiguous_symbol_counts_distribution_df = XXXX_ambiguous_symbol_counts_distribution_df.reset_index()
    XXXX_ambiguous_symbol_counts_distribution_df.rename(columns={0: "num_collision_symbol"}, inplace=True)
    XXXX_ambiguous_symbol_counts_distribution_df["percent_collision_symbol"] = (
        XXXX_ambiguous_symbol_counts_distribution_df["num_collision_symbol"] / XXXX_ambiguous_symbol_set_count
    ) * 100

    #Convert to csv
    XXXX_ambiguous_symbol_counts_distribution_df.to_csv(f'../output/ambiguous_symbol_{source}_distribution_df.csv', index=True)

    #Create histogram df 
    xxxx_symbol_count_histogram_df = XXXX_ambiguous_symbol_counts_distribution_df.drop(
    "num_collision_symbol", axis=1)

    #Convert to csv
    xxxx_symbol_count_histogram_df.to_csv(f'../output/{source}_ambiguous_symbol_count_histogram_df.csv', index=True)

    return px.bar(xxxx_symbol_count_histogram_df, x="symbol_counts", y="percent_collision_symbol")



### <a id='toc1_1_2_'></a>[ENSG](#toc0_)

import file with ambiguous symbols resulting from alias-alias collisions

In [99]:
merged_alias_aa_collision_ensg_df = pd.read_csv(
    "../output/merged_alias_aa_collision_ensg_df.csv", index_col=[0])
merged_alias_aa_collision_ensg_df

Unnamed: 0,ENSG_ID,collision,gene_symbol,source,NCBI_ID,HGNC_ID,alias_symbol
0,ENSG00000139187,2F1,KLRG1,ENSG,GENE ID:10219,HGNC:6380,"2F1,CLEC15A,MAFA,MAFA-L"
1,ENSG00000005022,2F1,SLC25A5,ENSG,GENE ID:292,HGNC:10991,"2F1,ANT2,T2,T3"
2,ENSG00000163220,60B8AG,S100A9,ENSG,GENE ID:6280,HGNC:10499,"60B8AG,CAGB,CFAG,CGLB,LIAG,MAC387,MIF,MRP-14,M..."
3,ENSG00000143546,60B8AG,S100A8,ENSG,GENE ID:6279,HGNC:10498,"60B8AG,CAGA,CFAG,CGLA,MRP-8,MRP8,P8,S100-A8"
4,ENSG00000065135,87U6,GNAI3,ENSG,GENE ID:2773,HGNC:4387,87U6
...,...,...,...,...,...,...,...
4721,ENSG00000164756,ZNT8,SLC30A8,ENSG,GENE ID:169026,HGNC:20303,"ZNT-8,ZNT8"
4722,ENSG00000196660,ZRC1,SLC30A10,ENSG,GENE ID:55532,HGNC:25355,"DKFZP547M236,ZNT-10,ZNT10,ZNT8,ZRC1"
4723,ENSG00000170385,ZRC1,SLC30A1,ENSG,GENE ID:7779,HGNC:11012,"ZNT1,ZRC1"
4724,ENSG00000162378,ZYG11,ZYG11B,ENSG,GENE ID:79699,HGNC:25820,"FLJ13456,ZYG11"


In [100]:
aa_collision_alias_ensg_set = set(
    merged_alias_aa_collision_ensg_df["collision"]
)
len(aa_collision_alias_ensg_set)

1614

import file with ambiguous symbols resulting from alias-primary collisions

In [101]:
ap_collision_ensg_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ensg_df.csv", index_col=[0])
ap_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,collision,source,NCBI_ID,HGNC_ID,alias_symbol
0,ENSG00000283293,RN7SK,7SK,ENSG,GENE ID:125050,HGNC:10037,7SK
1,ENSG00000057252,SOAT1,ACAT1,ENSG,GENE ID:6646,HGNC:11177,"ACAT,ACAT1,SOAT,STAT"
2,ENSG00000167780,SOAT2,ACAT2,ENSG,GENE ID:8435,HGNC:11178,ACAT2
3,ENSG00000004779,NDUFAB1,ACP1,ENSG,GENE ID:4706,HGNC:7694,"ACP,ACP1,FASN2A,SDAP"
4,ENSG00000220267,ACTBP8,ACTBP2,ENSG,,HGNC:141,ACTBP2
...,...,...,...,...,...,...,...
806,ENSG00000197961,ZNF121,ZNF20,ENSG,GENE ID:7675,HGNC:12904,"D19S204,ZHC32,ZNF20"
807,ENSG00000110315,RNF141,ZNF230,ENSG,GENE ID:50862,HGNC:21159,"ZFP26,ZNF230"
808,ENSG00000188801,ZNF322P1,ZNF322,ENSG,,HGNC:14003,"ZNF322,ZNF322B"
809,ENSG00000103994,ZNF106,ZNF474,ENSG,GENE ID:64397,HGNC:12886,"SH3BP3,ZFP106,ZNF474"


In [102]:
ap_collision_ensg_df.loc[ap_collision_ensg_df["collision"] == "SKI"]

Unnamed: 0,ENSG_ID,gene_symbol,collision,source,NCBI_ID,HGNC_ID,alias_symbol
662,ENSG00000054392,HHAT,SKI,ENSG,GENE ID:55733,HGNC:18270,"FLJ10724,GUP2,MART-2,MART2,RASP,SIT,SKI,SKN"
663,ENSG00000280680,HHAT,SKI,ENSG,GENE ID:55733,HGNC:18270,"FLJ10724,GUP2,MART-2,MART2,RASP,SIT,SKI,SKN"


In [103]:
ap_collision_alias_ensg_set = set(
    ap_collision_ensg_df["collision"]
)
len(ap_collision_alias_ensg_set)

606

combine lists without duplicates to create list of ambiguous symbols for ENSG

In [104]:
ensg_ambiguous_symbol_set = aa_collision_alias_ensg_set.union(
    ap_collision_alias_ensg_set
)
ensg_ambiguous_symbol_set_count = len(ensg_ambiguous_symbol_set)
ensg_ambiguous_symbol_set_count

2162

In [105]:
both_collision_ensg_set = aa_collision_alias_ensg_set & ap_collision_alias_ensg_set
len(both_collision_ensg_set)

58

In [106]:
only_aa_collision_alias_ensg_set = aa_collision_alias_ensg_set - both_collision_ensg_set
len(only_aa_collision_alias_ensg_set)

1556

In [107]:
only_ap_collision_alias_ensg_set = ap_collision_alias_ensg_set - both_collision_ensg_set
len(only_ap_collision_alias_ensg_set)

548

import mini_ensg_df from symbol_capture_generation.ipynb

In [108]:
mini_ensg_df = pd.read_csv(
        ("../output/mini_ensg_df.csv"), index_col=[0],dtype={"NCBI_ID": str,"HGNC_ID":str})
ensg_df = mini_ensg_df[mini_ensg_df['gene_symbol'] != mini_ensg_df['alias_symbol']]
ensg_df['primary_gene_symbol'] = ensg_df['gene_symbol'].str.upper()
ensg_df.drop(["gene_symbol"], axis=1, inplace=True)
ensg_df['alias_symbol'] = ensg_df['alias_symbol'].str.upper()
ensg_df = ensg_df.drop_duplicates(subset=['primary_gene_symbol', 'alias_symbol'], keep='first')
ensg_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,primary_gene_symbol
0,ENSG00000210049,,HGNC:7481,MTTF,MT-TF
1,ENSG00000210049,,HGNC:7481,TRNF,MT-TF
2,ENSG00000211459,,HGNC:7470,12S,MT-RNR1
3,ENSG00000211459,,HGNC:7470,MOTS-C,MT-RNR1
4,ENSG00000211459,,HGNC:7470,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,GENE ID:85028,HGNC:30062,LINC00100,SNHG12
133059,ENSG00000197989,GENE ID:85028,HGNC:30062,PNAS-123,SNHG12
133060,ENSG00000229388,,HGNC:52502,LINC01715,TAF12-DT
133062,ENSG00000274978,GENE ID:26824,HGNC:10108,RNU11-1,RNU11


In [109]:
ensg_df.loc[ensg_df["alias_symbol"] == "MTTF"]

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,primary_gene_symbol
0,ENSG00000210049,,HGNC:7481,MTTF,MT-TF


make a general symbol column with primary and alias symbols

In [110]:
ensg_df = combine_columns(ensg_df, ["primary_gene_symbol", "alias_symbol"], ["primary_gene_symbol", "ENSG_ID", "HGNC_ID","NCBI_ID"], "symbol", "alias_symbol")
ensg_df

Unnamed: 0,NCBI_ID,primary_gene_symbol,ENSG_ID,HGNC_ID,symbol
0,,MT-TF,ENSG00000210049,HGNC:7481,MT-TF
2,,MT-RNR1,ENSG00000211459,HGNC:7470,MT-RNR1
5,,MT-TV,ENSG00000210077,HGNC:7500,MT-TV
7,,MT-RNR2,ENSG00000210082,HGNC:7471,MT-RNR2
10,,MT-TL1,ENSG00000209082,HGNC:7490,MT-TL1
...,...,...,...,...,...
147803,GENE ID:85028,SNHG12,ENSG00000197989,HGNC:30062,LINC00100
147804,GENE ID:85028,SNHG12,ENSG00000197989,HGNC:30062,PNAS-123
147805,,TAF12-DT,ENSG00000229388,HGNC:52502,LINC01715
147806,GENE ID:26824,RNU11,ENSG00000274978,HGNC:10108,RNU11-1


In [111]:
ensg_df.loc[ensg_df["primary_gene_symbol"] == "MT-TF"]

Unnamed: 0,NCBI_ID,primary_gene_symbol,ENSG_ID,HGNC_ID,symbol
0,,MT-TF,ENSG00000210049,HGNC:7481,MT-TF
73904,,MT-TF,ENSG00000210049,HGNC:7481,MTTF
73905,,MT-TF,ENSG00000210049,HGNC:7481,TRNF


use list of ambiguous symbols and ensg_combined_concept_ids_df to create a histogram of how many genes share ambiguous symbols

In [112]:
ensg_symbol_value_counts = ensg_df["symbol"].value_counts()

ensg_ambiguous_symbol_counts = ensg_symbol_value_counts[ensg_symbol_value_counts.index.isin(ensg_ambiguous_symbol_set)]
ensg_ambiguous_symbol_counts = ensg_ambiguous_symbol_counts.reset_index()
ensg_ambiguous_symbol_counts.columns = ["symbols", "symbol_counts"]
ensg_ambiguous_symbol_counts

Unnamed: 0,symbols,symbol_counts
0,MT1,11
1,HOX1,10
2,P40,10
3,HOX2,9
4,P14,9
...,...,...
2157,IBP1,2
2158,HLA-H,2
2159,MIA2,2
2160,INT2,2


In [113]:
create_ambiguous_symbol_histogram(ensg_ambiguous_symbol_counts, "ENSG", ensg_ambiguous_symbol_set_count)

### <a id='toc1_1_3_'></a>[HGNC](#toc0_)

import file with ambiguous symbols resulting from alias-alias collisions

In [114]:
merged_alias_aa_collision_hgnc_df = pd.read_csv(
    "../output/merged_alias_aa_collision_hgnc_df.csv", index_col=[0])
merged_alias_aa_collision_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,collision,source,NCBI_ID,ENSG_ID,alias_symbol
0,HGNC:6380,KLRG1,2F1,HGNC,GENE ID:10219,ENSG00000139187,"2F1,CLEC15A,MAFA,MAFA-L"
1,HGNC:10991,SLC25A5,2F1,HGNC,GENE ID:292,ENSG00000005022,"2F1,T2,T3"
2,HGNC:10498,S100A8,60B8AG,HGNC,GENE ID:6279,ENSG00000143546,"60B8AG,CGLA,MRP-8,MRP8,P8,S100-A8"
3,HGNC:10499,S100A9,60B8AG,HGNC,GENE ID:6280,ENSG00000163220,"60B8AG,CGLB,LIAG,MAC387,MIF,MRP-14,MRP14,NIF,P..."
4,HGNC:10230,RNU6V,87U6,HGNC,GENE ID:6071,ENSG00000206832,"87U6,LH87"
...,...,...,...,...,...,...,...
2443,HGNC:33357,TEX28P2,pTEX,HGNC,GENE ID:653363,ENSG00000277008,"CXorf2B,pTEX"
2444,HGNC:33146,PPP4R3C,smk1,HGNC,GENE ID:139420,ENSG00000224960,"FLFL3P,FLJ32867,smk1"
2445,HGNC:20219,PPP4R3A,smk1,HGNC,GENE ID:55671,ENSG00000100796,"FLFL1,FLJ20707,MSTP033,PP4R3,smk-1,smk1"
2446,HGNC:28393,SPATA2L,tamo,HGNC,GENE ID:124044,ENSG00000158792,"MGC26885,tamo"


In [115]:
aa_collision_alias_hgnc_set = set(
    merged_alias_aa_collision_hgnc_df["collision"]
)
len(aa_collision_alias_hgnc_set)

1083

import file with ambiguous symbols resulting from alias-primary collisions

In [116]:
ap_collision_hgnc_df = pd.read_csv(
    "../output/merged_alias_ap_collision_hgnc_df.csv", index_col=[0])
ap_collision_hgnc_df

Unnamed: 0,HGNC_ID,gene_symbol,collision,source,NCBI_ID,ENSG_ID,alias_symbol
0,HGNC:14947,PPP1R12C,AAVS1,HGNC,GENE ID:54776,ENSG00000125503,"AAVS1,DKFZP434D0412,MBS85,p84,p85"
1,HGNC:11177,SOAT1,ACAT1,HGNC,GENE ID:6646,ENSG00000057252,"ACAT,ACAT1"
2,HGNC:11178,SOAT2,ACAT2,HGNC,GENE ID:8435,ENSG00000167780,ACAT2
3,HGNC:1612,CCRL2,ACKR5,HGNC,GENE ID:9034,ENSG00000121797,"ACKR5,CKRX,CRAM-A,CRAM-B,HCR"
4,HGNC:7694,NDUFAB1,ACP1,HGNC,GENE ID:4706,ENSG00000004779,"ACP,ACP1,FASN2A,SDAP"
...,...,...,...,...,...,...,...
556,HGNC:3147,ECEL1,XCE,HGNC,GENE ID:9427,ENSG00000171551,"DINE,XCE"
557,HGNC:12904,ZNF121,ZNF20,HGNC,GENE ID:7675,ENSG00000197961,"ZHC32,ZNF20"
558,HGNC:21159,RNF141,ZNF230,HGNC,GENE ID:50862,ENSG00000110315,"ZFP26,ZNF230"
559,HGNC:12886,ZNF106,ZNF474,HGNC,GENE ID:64397,ENSG00000103994,"SH3BP3,ZNF474"


In [117]:
ap_collision_alias_hgnc_set = set(
    ap_collision_hgnc_df["collision"]
)
len(ap_collision_alias_hgnc_set)

492

combine lists without duplicates to create list of ambiguous symbols for HGNC

In [118]:
hgnc_ambiguous_symbol_set = aa_collision_alias_hgnc_set.union(
    ap_collision_alias_hgnc_set
)
hgnc_ambiguous_symbol_set_count = len(hgnc_ambiguous_symbol_set)
hgnc_ambiguous_symbol_set_count

1534

In [119]:
both_collision_hgnc_set = aa_collision_alias_hgnc_set & ap_collision_alias_hgnc_set
len(both_collision_hgnc_set)

41

In [120]:
only_aa_collision_alias_hgnc_set = aa_collision_alias_hgnc_set - both_collision_hgnc_set
len(only_aa_collision_alias_hgnc_set)

1042

In [121]:
only_ap_collision_alias_hgnc_set = ap_collision_alias_hgnc_set - both_collision_hgnc_set
len(only_ap_collision_alias_hgnc_set)

451

import mini_hgnc_df from symbol_capture_generation.ipynb

In [122]:
mini_hgnc_df = pd.read_csv(
        ("../output/mini_hgnc_df.csv"), index_col=[0],dtype={"NCBI_ID": str,"HGNC_ID":str})
hgnc_df = mini_hgnc_df[mini_hgnc_df['gene_symbol'] != mini_hgnc_df['alias_symbol']]
hgnc_df['primary_gene_symbol'] = hgnc_df['gene_symbol'].str.upper()
hgnc_df.drop(["gene_symbol"], axis=1, inplace=True)
hgnc_df['alias_symbol'] = hgnc_df['alias_symbol'].str.upper()
hgnc_df = hgnc_df.drop_duplicates(subset=['primary_gene_symbol', 'alias_symbol'], keep='first')
hgnc_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,primary_gene_symbol
0,HGNC:100,BNAC2,GENE ID:41,ENSG00000110881,ASIC1
0,HGNC:100,HBNAC2,GENE ID:41,ENSG00000110881,ASIC1
1,HGNC:10000,,GENE ID:5999,ENSG00000117152,RGS4
2,HGNC:10001,,GENE ID:8490,ENSG00000143248,RGS5
3,HGNC:10002,,GENE ID:9628,ENSG00000182732,RGS6
...,...,...,...,...,...
44232,HGNC:9997,RGS-R,GENE ID:6004,ENSG00000143333,RGS16
44233,HGNC:9998,,GENE ID:5997,ENSG00000116741,RGS2
44234,HGNC:9999,C2PA,GENE ID:5998,ENSG00000138835,RGS3
44234,HGNC:9999,FLJ20370,GENE ID:5998,ENSG00000138835,RGS3


make a general column with primary and alias symbols

In [123]:
hgnc_df = combine_columns(hgnc_df, ["primary_gene_symbol", "alias_symbol"], ["primary_gene_symbol", "ENSG_ID", "HGNC_ID","NCBI_ID"], "symbol", "alias_symbol")
hgnc_df

Unnamed: 0,NCBI_ID,primary_gene_symbol,ENSG_ID,HGNC_ID,symbol
0,GENE ID:41,ASIC1,ENSG00000110881,HGNC:100,ASIC1
2,GENE ID:5999,RGS4,ENSG00000117152,HGNC:10000,RGS4
3,GENE ID:8490,RGS5,ENSG00000143248,HGNC:10001,RGS5
4,GENE ID:9628,RGS6,ENSG00000182732,HGNC:10002,RGS6
5,GENE ID:6000,RGS7,ENSG00000182901,HGNC:10003,RGS7
...,...,...,...,...,...
132327,GENE ID:6004,RGS16,ENSG00000143333,HGNC:9997,RGS-R
132328,GENE ID:5997,RGS2,ENSG00000116741,HGNC:9998,
132329,GENE ID:5998,RGS3,ENSG00000138835,HGNC:9999,C2PA
132330,GENE ID:5998,RGS3,ENSG00000138835,HGNC:9999,FLJ20370


use list of ambiguous symbols and hgnc_combined_concept_ids_df to create a histogram of how many genes share ambiguous symbols

In [124]:
hgnc_symbol_value_counts = hgnc_df["symbol"].value_counts()

hgnc_ambiguous_symbol_counts = hgnc_symbol_value_counts[hgnc_symbol_value_counts.index.isin(hgnc_ambiguous_symbol_set)]
hgnc_ambiguous_symbol_counts = hgnc_ambiguous_symbol_counts.reset_index()
hgnc_ambiguous_symbol_counts.columns = ["symbols", "symbol_counts"]
hgnc_ambiguous_symbol_counts

Unnamed: 0,symbols,symbol_counts
0,P40,10
1,P14,9
2,U4,7
3,P18,7
4,PAP,7
...,...,...
1433,MIO,2
1434,NMI,2
1435,SCAF1,2
1436,POF,2


In [125]:
create_ambiguous_symbol_histogram(hgnc_ambiguous_symbol_counts, "HGNC", hgnc_ambiguous_symbol_set_count)

### <a id='toc1_1_4_'></a>[NCBI](#toc0_)

import file with ambiguous symbols resulting from alias-alias collisions

In [126]:
merged_alias_aa_collision_ncbi_df = pd.read_csv(
    "../output/merged_alias_aa_collision_ncbi_df.csv", index_col=[0])
merged_alias_aa_collision_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,collision,source,HGNC_ID,ENSG_ID,alias_symbol
0,GENE ID:5728,PTEN,10q23del,NCBI,HGNC:9588,ENSG00000171862,"10q23del,BZS,CWS1,DEC,GLM2,MHAM,MMAC1,PTEN1,PT..."
1,GENE ID:657,BMPR1A,10q23del,NCBI,HGNC:1076,ENSG00000107779,"10q23del,ACVRLK3,ALK-3,ALK3,BMPR-1A,CD292,SKR5"
2,GENE ID:239,ALOX12,12-LOX,NCBI,HGNC:429,ENSG00000108839,"12-LOX,12S-LOX,LOG12"
3,GENE ID:246,ALOX15,12-LOX,NCBI,HGNC:433,ENSG00000161905,"12-LOX,15-LOX,15-LOX-1,LOG15"
4,GENE ID:10219,KLRG1,2F1,NCBI,HGNC:6380,ENSG00000139187,"2F1,CLEC15A,MAFA,MAFA-2F1,MAFA-L,MAFA-LIKE"
...,...,...,...,...,...,...,...
8180,GENE ID:55671,PPP4R3A,smk1,NCBI,HGNC:20219,ENSG00000100796,"FLFL1,KIAA2010,MSTP033,PP4R3,PP4R3A,SMEK1,smk-..."
8181,GENE ID:57223,PPP4R3B,smk1,NCBI,HGNC:29267,ENSG00000275052,"FLFL2,PP4R3B,PSY2,SMEK2,smk1"
8182,GENE ID:139420,PPP4R3C,smk1,NCBI,HGNC:33146,ENSG00000224960,"FLFL3P,PPP4R3CP,SMEK3P,smk1"
8183,GENE ID:9825,SPATA2,tamo,NCBI,HGNC:14681,ENSG00000158480,"PD1,PPP1R145,tamo"


In [127]:
aa_collision_alias_ncbi_set = set(
    merged_alias_aa_collision_ncbi_df["collision"]
)
len(aa_collision_alias_ncbi_set)

3414

import file with ambiguous symbols resulting from alias-primary collisions

In [128]:
ap_collision_ncbi_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ncbi_df.csv", index_col=[0])
ap_collision_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,collision,source,HGNC_ID,ENSG_ID,alias_symbol
0,GENE ID:3494,IGHA2,A2M,NCBI,HGNC:5479,ENSG00000211890,A2M
1,GENE ID:54776,PPP1R12C,AAVS1,NCBI,HGNC:14947,ENSG00000125503,"AAVS1,LENG3,MBS85,p84,p85"
2,GENE ID:6646,SOAT1,ACAT1,NCBI,HGNC:11177,ENSG00000057252,"ACACT,ACAT,ACAT-1,ACAT1,SOAT,STAT"
3,GENE ID:8435,SOAT2,ACAT2,NCBI,HGNC:11178,ENSG00000167780,"ACACT2,ACAT2,ARGP2"
4,GENE ID:9034,CCRL2,ACKR5,NCBI,HGNC:1612,ENSG00000121797,"ACKR5,CKRX,CRAM,CRAM-A,CRAM-B,HCR"
...,...,...,...,...,...,...,...
2032,GENE ID:29903,CCDC106,ZNF581,NCBI,HGNC:30181,ENSG00000173581,"HSU79303,ZNF581"
2033,GENE ID:146540,ZNF785,ZNF688,NCBI,HGNC:26496,ENSG00000197162,ZNF688
2034,GENE ID:57829,ZP4,ZP1,NCBI,HGNC:15770,ENSG00000116996,"ZBP,ZP1,ZP1B,ZPB,ZPB2,Zp-4"
2036,GENE ID:55663,ZNF446,ZSCAN30,NCBI,HGNC:21036,ENSG00000083838,"ZKSCAN20,ZSCAN30,ZSCAN52"


In [129]:
ap_collision_alias_ncbi_set = set(
    ap_collision_ncbi_df["collision"]
)
len(ap_collision_alias_ncbi_set)

1394

combine lists without duplicates to create list of ambiguous symbols for NCBI

In [130]:
ncbi_ambiguous_symbol_set = aa_collision_alias_ncbi_set.union(
    ap_collision_alias_ncbi_set
)
ncbi_ambiguous_symbol_set_count = len(ncbi_ambiguous_symbol_set)
ncbi_ambiguous_symbol_set_count

4653

In [131]:
both_collision_ncbi_set = aa_collision_alias_ncbi_set & ap_collision_alias_ncbi_set
len(both_collision_ncbi_set)

155

In [132]:
only_aa_collision_alias_ncbi_set = aa_collision_alias_ncbi_set - both_collision_ncbi_set
len(only_aa_collision_alias_ncbi_set)

3259

In [133]:
only_ap_collision_alias_ncbi_set = ap_collision_alias_ncbi_set - both_collision_ncbi_set
len(only_ap_collision_alias_ncbi_set)

1239

import mini_ncbi_df from symbol_capture_generation.ipynb

In [134]:
mini_ncbi_df = pd.read_csv(
        ("../output/mini_ncbi_df.csv"), index_col=[0],dtype={"NCBI_ID": str,"HGNC_ID":str})
ncbi_df = mini_ncbi_df[mini_ncbi_df['gene_symbol'] != mini_ncbi_df['alias_symbol']]
ncbi_df['primary_gene_symbol'] = ncbi_df['gene_symbol'].str.upper()
ncbi_df.drop(["gene_symbol"], axis=1, inplace=True)
ncbi_df['alias_symbol'] = ncbi_df['alias_symbol'].str.upper()
ncbi_df = ncbi_df.drop_duplicates(subset=['primary_gene_symbol', 'alias_symbol'], keep='first')
ncbi_df

Unnamed: 0,NCBI_ID,alias_symbol,HGNC_ID,ENSG_ID,primary_gene_symbol
0,GENE ID:1,A1B,HGNC:5,ENSG00000121410,A1BG
0,GENE ID:1,ABG,HGNC:5,ENSG00000121410,A1BG
0,GENE ID:1,GAB,HGNC:5,ENSG00000121410,A1BG
0,GENE ID:1,HYST2477,HGNC:5,ENSG00000121410,A1BG
1,GENE ID:2,A2MD,HGNC:7,ENSG00000175899,A2M
...,...,...,...,...,...
193502,GENE ID:141732005,,HGNC:40064,,ADCY2-AS1
193503,GENE ID:141732006,,HGNC:41074,,NSG2-AS1
193504,GENE ID:141732007,,HGNC:58430,,ST18-AS1
193505,GENE ID:141732008,,HGNC:58437,,MICAL2-AS1


make a general symbol column with primary and alias symbols

In [135]:
ncbi_df = combine_columns(ncbi_df, ["primary_gene_symbol", "alias_symbol"], ["primary_gene_symbol", "ENSG_ID", "HGNC_ID","NCBI_ID"], "symbol", "alias_symbol")
ncbi_df

Unnamed: 0,NCBI_ID,primary_gene_symbol,ENSG_ID,HGNC_ID,symbol
0,GENE ID:1,A1BG,ENSG00000121410,HGNC:5,A1BG
4,GENE ID:2,A2M,ENSG00000175899,HGNC:7,A2M
8,GENE ID:9,NAT1,ENSG00000171428,HGNC:7645,NAT1
12,GENE ID:10,NAT2,ENSG00000156006,HGNC:7646,NAT2
15,GENE ID:11,NATP,,HGNC:15,NATP
...,...,...,...,...,...
182553,GENE ID:141732005,ADCY2-AS1,,HGNC:40064,
182554,GENE ID:141732006,NSG2-AS1,,HGNC:41074,
182555,GENE ID:141732007,ST18-AS1,,HGNC:58430,
182556,GENE ID:141732008,MICAL2-AS1,,HGNC:58437,


use list of ambiguous symbols and ncbi_combined_concept_ids_df to create a histogram of how many genes share ambiguous symbols

In [136]:
ncbi_symbol_value_counts = ncbi_df["symbol"].value_counts()

ncbi_ambiguous_symbol_counts = ncbi_symbol_value_counts[ncbi_symbol_value_counts.index.isin(ncbi_ambiguous_symbol_set)]
ncbi_ambiguous_symbol_counts = ncbi_ambiguous_symbol_counts.reset_index()
ncbi_ambiguous_symbol_counts.columns = ["symbols", "symbol_counts"]
ncbi_ambiguous_symbol_counts

Unnamed: 0,symbols,symbol_counts
0,VH,37
1,H4C6,14
2,H4C3,14
3,H4C14,14
4,H4C13,14
...,...,...
4434,GLS,2
4435,KIR2DS5,2
4436,HEBP1,2
4437,TRAD,2


In [137]:
create_ambiguous_symbol_histogram(ncbi_ambiguous_symbol_counts, "NCBI", ncbi_ambiguous_symbol_set_count)

In [138]:
ncbi_ensg_ambiguous_symbol_set = ncbi_ambiguous_symbol_set.union(
    ensg_ambiguous_symbol_set
)

In [139]:
ncbi_ensg_hgnc_ambiguous_symbol_set = ncbi_ensg_ambiguous_symbol_set.union(
    hgnc_ambiguous_symbol_set
)

In [140]:
ambiguous_symbol_set = ncbi_ensg_hgnc_ambiguous_symbol_set
len(ambiguous_symbol_set)

4969