**Table of contents**<a id='toc0_'></a>    
- [What is the set of ambiguous symbols per source?](#toc1_1_)    
    - [Define Functions](#toc1_1_1_)    
    - [ENSG](#toc1_1_2_)    
    - [HGNC](#toc1_1_3_)    
    - [NCBI](#toc1_1_4_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

## <a id='toc1_1_'></a>[What is the set of ambiguous symbols per source?](#toc0_)

Need this to create a better histogram figure. Current one only shows the distribution of ambiguous symbols created by alias-alias collisions by source. I want to be able to illustrate how the ambiguous symbols in each source (created by alias-alias and alias-primary collisions) are shared. 

In [52]:
import pandas as pd
import plotly.express as px

### <a id='toc1_1_1_'></a>[Define Functions](#toc0_)

In [53]:
def combine_columns(df, columns_to_combine, columns_to_keep, new_name, columns_to_drop):
    """Combine multiple columns into one while keeping associated data attached.
    Use this function when the columns to combine are easier to list 
    than the columns not to combine.
    
    :param df: The DataFrame containing the columns to be combined
    :param columns_to_combine: List of column names to combine into one
    :param columns_to_keep: List of columns to keep in the final DataFrame
    :param new_name: The name of the new combined column
    :param columns_to_drop: List of columns to drop from the final DataFrame
    :return: A new DataFrame with combined columns and selected columns retained
    """
    og_df = df.copy()

    combined_dfs = []

    # Loop through each column in columns_to_combine and create a new DataFrame
    for col in columns_to_combine:
        temp_df = og_df[list(set([col] + columns_to_keep))].copy()
        temp_df[new_name] = temp_df[col]
        combined_dfs.append(temp_df)

    df_combined = pd.concat(combined_dfs, ignore_index=True)
    df_combined.drop(columns_to_drop, axis=1, inplace=True)
    df_combined.drop_duplicates(inplace=True)
    
    return df_combined

In [54]:
def create_ambiguous_symbol_histogram(XXXX_ambiguous_symbol_counts: pd.DataFrame, source: str, XXXX_ambiguous_symbol_set_count: int):
    """Create a histogram of the frequencies at which aliases are shared

    :param aa_collision_gene_xxxx_df: A df of alias-alias collisions organized by primary gene symbol
    :param source: Representation of the source of the gene records
    :param xxxx_alias_count: Number of aliases total in the source
    :return: A histogram of the percentage of aliases that are shared between 2 genes, 3 genes, and so on
    """

    #Count the frequency at which aliases are shared 
    XXXX_ambiguous_symbol_counts_distribution_df = XXXX_ambiguous_symbol_counts.pivot_table(
    index=["symbol_counts"], aggfunc="size"
    )
    XXXX_ambiguous_symbol_counts_distribution_df = XXXX_ambiguous_symbol_counts_distribution_df.reset_index()
    XXXX_ambiguous_symbol_counts_distribution_df.rename(columns={0: "num_collision_symbol"}, inplace=True)
    XXXX_ambiguous_symbol_counts_distribution_df["percent_collision_symbol"] = (
        XXXX_ambiguous_symbol_counts_distribution_df["num_collision_symbol"] / XXXX_ambiguous_symbol_set_count
    ) * 100

    #Convert to csv
    XXXX_ambiguous_symbol_counts_distribution_df.to_csv(f'../output/ambiguous_symbol_{source}_distribution_df.csv', index=True)

    #Create histogram df 
    xxxx_symbol_count_histogram_df = XXXX_ambiguous_symbol_counts_distribution_df.drop(
    "num_collision_symbol", axis=1)

    #Convert to csv
    xxxx_symbol_count_histogram_df.to_csv(f'../output/{source}_ambiguous_symbol_count_histogram_df.csv', index=True)

    return px.bar(xxxx_symbol_count_histogram_df, x="symbol_counts", y="percent_collision_symbol")



### <a id='toc1_1_2_'></a>[ENSG](#toc0_)

import file with ambiguous symbols resulting from alias-alias collisions

In [55]:
aa_collision_alias_ensg_df = pd.read_csv(
    "../output/aa_collision_alias_ensg_df.csv", index_col=[0])
aa_collision_alias_ensg_df

Unnamed: 0,collision,ENSG_ID,gene_symbol,source
0,2F1,"ENSG00000005022, ENSG00000139187","SLC25A5, KLRG1",ENSG
1,60B8AG,"ENSG00000143546, ENSG00000163220","S100A8, S100A9",ENSG
2,87U6,"ENSG00000206832, ENSG00000065135","RNU6V, GNAI3",ENSG
3,9G8,"ENSG00000164609, ENSG00000115875","SLU7, SRSF7",ENSG
4,A1,"ENSG00000033627, ENSG00000163918, ENSG00000035...","ATP6V0A1, RFC4, RFC1, RFC2",ENSG
...,...,...,...,...
1610,ZIP4,"ENSG00000285243, ENSG00000120498","SLC39A4, TEX11",ENSG
1611,ZNF422,"ENSG00000165512, ENSG00000172943","ZNF22, PHF8",ENSG
1612,ZNT8,"ENSG00000196660, ENSG00000164756","SLC30A10, SLC30A8",ENSG
1613,ZRC1,"ENSG00000170385, ENSG00000196660","SLC30A1, SLC30A10",ENSG


In [56]:
aa_collision_alias_ensg_set = set(
    aa_collision_alias_ensg_df["collision"]
)
len(aa_collision_alias_ensg_set)

1615

import file with ambiguous symbols resulting from alias-primary collisions

In [57]:
ap_collision_ensg_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ensg_df.csv", index_col=[0])
ap_collision_ensg_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,RN7SK,7SK,ENSG00000283293,10037.0,125050,7SK,ENSG
1,SOAT1,"ACAT,ACAT1,SOAT,STAT",ENSG00000057252,11177.0,6646,ACAT1,ENSG
2,SOAT2,ACAT2,ENSG00000167780,11178.0,8435,ACAT2,ENSG
3,NDUFAB1,"ACP,ACP1,FASN2A,SDAP",ENSG00000004779,7694.0,4706,ACP1,ENSG
4,ACTBP8,ACTBP2,ENSG00000220267,141.0,0,ACTBP2,ENSG
...,...,...,...,...,...,...,...
807,ZNF121,"D19S204,ZHC32,ZNF20",ENSG00000197961,12904.0,7675,ZNF20,ENSG
808,RNF141,"ZFP26,ZNF230",ENSG00000110315,21159.0,50862,ZNF230,ENSG
809,ZNF322P1,"ZNF322,ZNF322B",ENSG00000188801,14003.0,0,ZNF322,ENSG
810,ZNF106,"SH3BP3,ZFP106,ZNF474",ENSG00000103994,12886.0,64397,ZNF474,ENSG


In [58]:
ap_collision_ensg_df.loc[ap_collision_ensg_df["collision"] == "SKI"]

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
663,HHAT,"FLJ10724,GUP2,MART-2,MART2,RASP,SIT,SKI,SKN",ENSG00000054392,18270.0,55733,SKI,ENSG
664,HHAT,"FLJ10724,GUP2,MART-2,MART2,RASP,SIT,SKI,SKN",ENSG00000280680,18270.0,55733,SKI,ENSG


In [59]:
ap_collision_alias_ensg_set = set(
    ap_collision_ensg_df["collision"]
)
len(ap_collision_alias_ensg_set)

607

combine lists without duplicates to create list of ambiguous symbols for ENSG

In [60]:
ensg_ambiguous_symbol_set = aa_collision_alias_ensg_set.union(
    ap_collision_alias_ensg_set
)
ensg_ambiguous_symbol_set_count = len(ensg_ambiguous_symbol_set)
ensg_ambiguous_symbol_set_count

2164

In [61]:
both_collision_ensg_set = aa_collision_alias_ensg_set & ap_collision_alias_ensg_set
len(both_collision_ensg_set)

58

In [62]:
only_aa_collision_alias_ensg_set = aa_collision_alias_ensg_set - both_collision_ensg_set
len(only_aa_collision_alias_ensg_set)

1557

In [63]:
only_ap_collision_alias_ensg_set = ap_collision_alias_ensg_set - both_collision_ensg_set
len(only_ap_collision_alias_ensg_set)

549

import mini_ensg_df from symbol_capture_generation.ipynb

In [64]:
mini_ensg_df = pd.read_csv(
        ("../output/mini_ensg_df.csv"), index_col=[0],dtype={"NCBI_ID": str,"HGNC_ID":str})
ensg_df = mini_ensg_df[mini_ensg_df['gene_symbol'] != mini_ensg_df['alias_symbol']]
ensg_df['primary_gene_symbol'] = ensg_df['gene_symbol'].str.upper()
ensg_df.drop(["gene_symbol"], axis=1, inplace=True)
ensg_df['alias_symbol'] = ensg_df['alias_symbol'].str.upper()
ensg_df = ensg_df.drop_duplicates(subset=['primary_gene_symbol', 'alias_symbol'], keep='first')
ensg_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,primary_gene_symbol
0,ENSG00000210049,,7481,MTTF,MT-TF
1,ENSG00000210049,,7481,TRNF,MT-TF
2,ENSG00000211459,,7470,12S,MT-RNR1
3,ENSG00000211459,,7470,MOTS-C,MT-RNR1
4,ENSG00000211459,,7470,MTRNR1,MT-RNR1
...,...,...,...,...,...
133058,ENSG00000197989,85028,30062,LINC00100,SNHG12
133059,ENSG00000197989,85028,30062,PNAS-123,SNHG12
133060,ENSG00000229388,,52502,LINC01715,TAF12-DT
133062,ENSG00000274978,26824,10108,RNU11-1,RNU11


In [65]:
ensg_df.loc[ensg_df["alias_symbol"] == "MTTF"]

Unnamed: 0,ENSG_ID,NCBI_ID,HGNC_ID,alias_symbol,primary_gene_symbol
0,ENSG00000210049,,7481,MTTF,MT-TF


make a general symbol column with primary and alias symbols

In [66]:
ensg_df = combine_columns(ensg_df, ["primary_gene_symbol", "alias_symbol"], ["primary_gene_symbol", "ENSG_ID", "HGNC_ID","NCBI_ID"], "symbol", "alias_symbol")
ensg_df

Unnamed: 0,ENSG_ID,HGNC_ID,NCBI_ID,primary_gene_symbol,symbol
0,ENSG00000210049,7481,,MT-TF,MT-TF
2,ENSG00000211459,7470,,MT-RNR1,MT-RNR1
5,ENSG00000210077,7500,,MT-TV,MT-TV
7,ENSG00000210082,7471,,MT-RNR2,MT-RNR2
10,ENSG00000209082,7490,,MT-TL1,MT-TL1
...,...,...,...,...,...
147803,ENSG00000197989,30062,85028,SNHG12,LINC00100
147804,ENSG00000197989,30062,85028,SNHG12,PNAS-123
147805,ENSG00000229388,52502,,TAF12-DT,LINC01715
147806,ENSG00000274978,10108,26824,RNU11,RNU11-1


In [67]:
ensg_df.loc[ensg_df["primary_gene_symbol"] == "MT-TF"]

Unnamed: 0,ENSG_ID,HGNC_ID,NCBI_ID,primary_gene_symbol,symbol
0,ENSG00000210049,7481,,MT-TF,MT-TF
73904,ENSG00000210049,7481,,MT-TF,MTTF
73905,ENSG00000210049,7481,,MT-TF,TRNF


use list of ambiguous symbols and ensg_combined_concept_ids_df to create a histogram of how many genes share ambiguous symbols

In [68]:
ensg_symbol_value_counts = ensg_df["symbol"].value_counts()

ensg_ambiguous_symbol_counts = ensg_symbol_value_counts[ensg_symbol_value_counts.index.isin(ensg_ambiguous_symbol_set)]
ensg_ambiguous_symbol_counts = ensg_ambiguous_symbol_counts.reset_index()
ensg_ambiguous_symbol_counts.columns = ["symbols", "symbol_counts"]
ensg_ambiguous_symbol_counts

Unnamed: 0,symbols,symbol_counts
0,MT1,11
1,HOX1,10
2,P40,10
3,HOX2,9
4,P14,9
...,...,...
2159,HLA-H,2
2160,MIA2,2
2161,INT2,2
2162,HP,2


In [69]:
create_ambiguous_symbol_histogram(ensg_ambiguous_symbol_counts, "ENSG", ensg_ambiguous_symbol_set_count)

### <a id='toc1_1_3_'></a>[HGNC](#toc0_)

import file with ambiguous symbols resulting from alias-alias collisions

In [70]:
aa_collision_alias_hgnc_df = pd.read_csv(
    "../output/aa_collision_alias_hgnc_df.csv", index_col=[0])
aa_collision_alias_hgnc_df

Unnamed: 0,collision,ENSG_ID,gene_symbol,source
0,2F1,"ENSG00000139187, ENSG00000005022","KLRG1, SLC25A5",HGNC
1,60B8AG,"ENSG00000143546, ENSG00000163220","S100A8, S100A9",HGNC
2,87U6,"ENSG00000206832, ENSG00000065135","RNU6V, GNAI3",HGNC
3,9G8,"ENSG00000115875, ENSG00000164609","SRSF7, SLU7",HGNC
4,A1,"ENSG00000033627, ENSG00000035928, ENSG00000163...","ATP6V0A1, RFC1, RFC4, RFC2",HGNC
...,...,...,...,...
1245,ZIP2,"ENSG00000165181, ENSG00000165794","SHOC1, SLC39A2",HGNC
1246,ZIP4,"ENSG00000120498, ENSG00000147804","TEX11, SLC39A4",HGNC
1247,ZNF422,"ENSG00000172943, ENSG00000165512","PHF8, ZNF22",HGNC
1248,ZNT8,"ENSG00000196660, ENSG00000164756","SLC30A10, SLC30A8",HGNC


In [71]:
aa_collision_alias_hgnc_set = set(
    aa_collision_alias_hgnc_df["collision"]
)
len(aa_collision_alias_hgnc_set)

1250

import file with ambiguous symbols resulting from alias-primary collisions

In [72]:
ap_collision_hgnc_df = pd.read_csv(
    "../output/merged_alias_ap_collision_hgnc_df.csv", index_col=[0])
ap_collision_hgnc_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,PPP1R12C,"DKFZP434D0412,p84,MBS85,p85,AAVS1",ENSG00000125503,14947,54776,AAVS1,HGNC
1,SOAT1,"ACAT,ACAT1",ENSG00000057252,11177,6646,ACAT1,HGNC
2,SOAT2,ACAT2,ENSG00000167780,11178,8435,ACAT2,HGNC
3,GLI3,"PAP-A,PAPA,PAPA1,PAPB,ACLS,PPDIV",ENSG00000106571,4319,2737,ACLS,HGNC
4,NDUFAB1,"SDAP,FASN2A,ACP,ACP1",ENSG00000004779,7694,4706,ACP1,HGNC
...,...,...,...,...,...,...,...
655,ZNF121,"ZHC32,ZNF20",ENSG00000197961,12904,7675,ZNF20,HGNC
656,RNF141,"ZFP26,ZNF230",ENSG00000110315,21159,50862,ZNF230,HGNC
657,ZNF106,"ZNF474,SH3BP3",ENSG00000103994,12886,64397,ZNF474,HGNC
658,ZFP1,"FLJ34243,ZNF475",ENSG00000184517,23328,162239,ZNF475,HGNC


In [73]:
ap_collision_alias_hgnc_set = set(
    ap_collision_hgnc_df["collision"]
)
len(ap_collision_alias_hgnc_set)

568

combine lists without duplicates to create list of ambiguous symbols for HGNC

In [74]:
hgnc_ambiguous_symbol_set = aa_collision_alias_hgnc_set.union(
    ap_collision_alias_hgnc_set
)
hgnc_ambiguous_symbol_set_count = len(hgnc_ambiguous_symbol_set)
hgnc_ambiguous_symbol_set_count

1750

In [75]:
both_collision_hgnc_set = aa_collision_alias_hgnc_set & ap_collision_alias_hgnc_set
len(both_collision_hgnc_set)

68

In [76]:
only_aa_collision_alias_hgnc_set = aa_collision_alias_hgnc_set - both_collision_hgnc_set
len(only_aa_collision_alias_hgnc_set)

1182

In [77]:
only_ap_collision_alias_hgnc_set = ap_collision_alias_hgnc_set - both_collision_hgnc_set
len(only_ap_collision_alias_hgnc_set)

500

import mini_hgnc_df from symbol_capture_generation.ipynb

In [78]:
mini_hgnc_df = pd.read_csv(
        ("../output/mini_hgnc_df.csv"), index_col=[0],dtype={"NCBI_ID": str,"HGNC_ID":str})
hgnc_df = mini_hgnc_df[mini_hgnc_df['gene_symbol'] != mini_hgnc_df['alias_symbol']]
hgnc_df['primary_gene_symbol'] = hgnc_df['gene_symbol'].str.upper()
hgnc_df.drop(["gene_symbol"], axis=1, inplace=True)
hgnc_df['alias_symbol'] = hgnc_df['alias_symbol'].str.upper()
hgnc_df = hgnc_df.drop_duplicates(subset=['primary_gene_symbol', 'alias_symbol'], keep='first')
hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,primary_gene_symbol
0,5,,1,ENSG00000121410,A1BG
1,37133,FLJ23569,503538,ENSG00000268895,A1BG-AS1
2,24086,ACF,29974,ENSG00000148584,A1CF
3,24086,ASP,29974,ENSG00000148584,A1CF
4,24086,ACF64,29974,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,29027,KIAA0399,23140,ENSG00000074755,ZZEF1
67579,29027,ZZZ4,23140,ENSG00000074755,ZZEF1
67580,29027,FLJ10821,23140,ENSG00000074755,ZZEF1
67581,24523,DKFZP564I052,26009,ENSG00000036549,ZZZ3


make a general column with primary and alias symbols

In [79]:
hgnc_df = combine_columns(hgnc_df, ["primary_gene_symbol", "alias_symbol"], ["primary_gene_symbol", "ENSG_ID", "HGNC_ID","NCBI_ID"], "symbol", "alias_symbol")
hgnc_df

Unnamed: 0,ENSG_ID,HGNC_ID,NCBI_ID,primary_gene_symbol,symbol
0,ENSG00000121410,5,1,A1BG,A1BG
1,ENSG00000268895,37133,503538,A1BG-AS1,A1BG-AS1
2,ENSG00000148584,24086,29974,A1CF,A1CF
7,ENSG00000175899,7,2,A2M,A2M
10,ENSG00000245105,27057,144571,A2M-AS1,A2M-AS1
...,...,...,...,...,...
135093,ENSG00000074755,29027,23140,ZZEF1,KIAA0399
135094,ENSG00000074755,29027,23140,ZZEF1,ZZZ4
135095,ENSG00000074755,29027,23140,ZZEF1,FLJ10821
135096,ENSG00000036549,24523,26009,ZZZ3,DKFZP564I052


use list of ambiguous symbols and hgnc_combined_concept_ids_df to create a histogram of how many genes share ambiguous symbols

In [80]:
hgnc_symbol_value_counts = hgnc_df["symbol"].value_counts()

hgnc_ambiguous_symbol_counts = hgnc_symbol_value_counts[hgnc_symbol_value_counts.index.isin(hgnc_ambiguous_symbol_set)]
hgnc_ambiguous_symbol_counts = hgnc_ambiguous_symbol_counts.reset_index()
hgnc_ambiguous_symbol_counts.columns = ["symbols", "symbol_counts"]
hgnc_ambiguous_symbol_counts

Unnamed: 0,symbols,symbol_counts
0,P40,10
1,P14,9
2,U3,9
3,PAP,7
4,U4,7
...,...,...
1745,BRDT,1
1746,ARK2N,1
1747,ARK2C,1
1748,EPIST,1


In [81]:
create_ambiguous_symbol_histogram(hgnc_ambiguous_symbol_counts, "HGNC", hgnc_ambiguous_symbol_set_count)

### <a id='toc1_1_4_'></a>[NCBI](#toc0_)

import file with ambiguous symbols resulting from alias-alias collisions

In [82]:
aa_collision_alias_ncbi_df = pd.read_csv(
    "../output/aa_collision_alias_ncbi_df.csv", index_col=[0])
aa_collision_alias_ncbi_df

Unnamed: 0,collision,ENSG_ID,gene_symbol,source
0,10Q23DEL,"ENSG00000107779, ENSG00000171862","BMPR1A, PTEN",NCBI
1,12-LOX,"ENSG00000108839, ENSG00000161905","ALOX12, ALOX15",NCBI
2,20-ALPHA-HSD,"ENSG00000187134, ENSG00000108786","AKR1C1, HSD17B1",NCBI
3,2F1,"ENSG00000139187, ENSG00000005022","KLRG1, SLC25A5",NCBI
4,3-ALPHA-HSD,"ENSG00000198610, ENSG00000073737","AKR1C4, DHRS9",NCBI
...,...,...,...,...
3693,ZNF769,"ENSG00000272968, ENSG00000146587","RBAK-RBAKDN, RBAK",NCBI
3694,ZNT8,"ENSG00000164756, ENSG00000196660","SLC30A8, SLC30A10",NCBI
3695,ZRC1,"ENSG00000170385, ENSG00000196660","SLC30A1, SLC30A10",NCBI
3696,ZWS,"ENSG00000162928, ENSG00000127980","PEX13, PEX1",NCBI


In [83]:
aa_collision_alias_ncbi_set = set(
    aa_collision_alias_ncbi_df["collision"]
)
len(aa_collision_alias_ncbi_set)

3698

import file with ambiguous symbols resulting from alias-primary collisions

In [84]:
ap_collision_ncbi_df = pd.read_csv(
    "../output/merged_alias_ap_collision_ncbi_df.csv", index_col=[0])
ap_collision_ncbi_df

Unnamed: 0,gene_symbol,alias_symbol,ENSG_ID,HGNC_ID,NCBI_ID,collision,source
0,IGHA2,A2M,ENSG00000211890,5479.0,3494,A2M,NCBI
1,NPSR1-AS1,AAA1,ENSG00000197085,22128.0,404744,AAA1,NCBI
2,CFAP91,"AAT1,AAT1alpha,C3orf15,CaM-IP2,MAATS1,SPATA26,...",ENSG00000183833,24010.0,89876,AAT1,NCBI
3,GPT,"AAT1,ALT,ALT1,GPT1,SGPT",ENSG00000167701,4552.0,2875,AAT1,NCBI
4,PPP1R12C,"AAVS1,LENG3,MBS85,p84,p85",ENSG00000125503,14947.0,54776,AAVS1,NCBI
...,...,...,...,...,...,...,...
2167,CCDC106,"HSU79303,ZNF581",ENSG00000173581,30181.0,29903,ZNF581,NCBI
2168,ZNF785,ZNF688,ENSG00000197162,26496.0,146540,ZNF688,NCBI
2169,ZP4,"ZBP,ZP1,ZP1B,ZPB,ZPB2,Zp-4",ENSG00000116996,15770.0,57829,ZP1,NCBI
2171,ZNF446,"ZKSCAN20,ZSCAN30,ZSCAN52",ENSG00000083838,21036.0,55663,ZSCAN30,NCBI


In [85]:
ap_collision_alias_ncbi_set = set(
    ap_collision_ncbi_df["collision"]
)
len(ap_collision_alias_ncbi_set)

1490

combine lists without duplicates to create list of ambiguous symbols for NCBI

In [86]:
ncbi_ambiguous_symbol_set = aa_collision_alias_ncbi_set.union(
    ap_collision_alias_ncbi_set
)
ncbi_ambiguous_symbol_set_count = len(ncbi_ambiguous_symbol_set)
ncbi_ambiguous_symbol_set_count

5001

In [87]:
both_collision_ncbi_set = aa_collision_alias_ncbi_set & ap_collision_alias_ncbi_set
len(both_collision_ncbi_set)

187

In [88]:
only_aa_collision_alias_ncbi_set = aa_collision_alias_ncbi_set - both_collision_ncbi_set
len(only_aa_collision_alias_ncbi_set)

3511

In [89]:
only_ap_collision_alias_ncbi_set = ap_collision_alias_ncbi_set - both_collision_ncbi_set
len(only_ap_collision_alias_ncbi_set)

1303

import mini_ncbi_df from symbol_capture_generation.ipynb

In [90]:
mini_ncbi_df = pd.read_csv(
        ("../output/mini_ncbi_df.csv"), index_col=[0],dtype={"NCBI_ID": str,"HGNC_ID":str})
ncbi_df = mini_ncbi_df[mini_ncbi_df['gene_symbol'] != mini_ncbi_df['alias_symbol']]
ncbi_df['primary_gene_symbol'] = ncbi_df['gene_symbol'].str.upper()
ncbi_df.drop(["gene_symbol"], axis=1, inplace=True)
ncbi_df['alias_symbol'] = ncbi_df['alias_symbol'].str.upper()
ncbi_df = ncbi_df.drop_duplicates(subset=['primary_gene_symbol', 'alias_symbol'], keep='first')
ncbi_df

Unnamed: 0,NCBI_ID,alias_symbol,HGNC_ID,ENSG_ID,primary_gene_symbol
0,1,A1B,5,ENSG00000121410,A1BG
0,1,ABG,5,ENSG00000121410,A1BG
0,1,GAB,5,ENSG00000121410,A1BG
0,1,HYST2477,5,ENSG00000121410,A1BG
1,2,A2MD,7,ENSG00000175899,A2M
...,...,...,...,...,...
193540,6775095,,,,TRNQ
193541,6775096,,,,TRNM
193542,6775097,,,,TRNI
193543,6775098,,,,ND1


make a general symbol column with primary and alias symbols

In [91]:
ncbi_df = combine_columns(ncbi_df, ["primary_gene_symbol", "alias_symbol"], ["primary_gene_symbol", "ENSG_ID", "HGNC_ID","NCBI_ID"], "symbol", "alias_symbol")
ncbi_df

Unnamed: 0,ENSG_ID,HGNC_ID,NCBI_ID,primary_gene_symbol,symbol
0,ENSG00000121410,5,1,A1BG,A1BG
4,ENSG00000175899,7,2,A2M,A2M
8,ENSG00000171428,7645,9,NAT1,NAT1
12,ENSG00000156006,7646,10,NAT2,NAT2
15,,15,11,NATP,NATP
...,...,...,...,...,...
185157,,,6775095,TRNQ,
185158,,,6775096,TRNM,
185159,,,6775097,TRNI,
185160,,,6775098,ND1,


use list of ambiguous symbols and ncbi_combined_concept_ids_df to create a histogram of how many genes share ambiguous symbols

In [92]:
ncbi_symbol_value_counts = ncbi_df["symbol"].value_counts()

ncbi_ambiguous_symbol_counts = ncbi_symbol_value_counts[ncbi_symbol_value_counts.index.isin(ncbi_ambiguous_symbol_set)]
ncbi_ambiguous_symbol_counts = ncbi_ambiguous_symbol_counts.reset_index()
ncbi_ambiguous_symbol_counts.columns = ["symbols", "symbol_counts"]
ncbi_ambiguous_symbol_counts

Unnamed: 0,symbols,symbol_counts
0,VH,37
1,H4C1,14
2,H4C3,14
3,H4C5,14
4,H4C13,14
...,...,...
4996,PGD2,2
4997,GP1BB,2
4998,C20ORF197,2
4999,MAC-1,2


In [93]:
create_ambiguous_symbol_histogram(ncbi_ambiguous_symbol_counts, "NCBI", ncbi_ambiguous_symbol_set_count)

In [94]:
ncbi_ensg_ambiguous_symbol_set = ncbi_ambiguous_symbol_set.union(
    ensg_ambiguous_symbol_set
)

In [95]:
ncbi_ensg_hgnc_ambiguous_symbol_set = ncbi_ensg_ambiguous_symbol_set.union(
    hgnc_ambiguous_symbol_set
)

In [96]:
ambiguous_symbol_set = ncbi_ensg_hgnc_ambiguous_symbol_set
len(ambiguous_symbol_set)

5191