### Import 

In [279]:
import pandas as pd
import numpy as np
from civicpy import civic as civicpy

In [280]:
civic_molprof_ids = civicpy.get_all_molecular_profiles(
    include_status=["accepted", "submitted", "rejected"]
)

In [281]:
civic_variant_ids = civicpy.get_all_variants()

In [282]:
def transform_df_mp_id(df: pd.DataFrame) -> pd.DataFrame:
    """Transform dataframe to include molecular profile ID information

    :param df: Dataframe of variants
    :return: Transformed dataframe with molecular profile ID information
    """
    tmp_df = df.copy(deep=True)

    variants_molprof_ids = []
    variant_ids = list(tmp_df["civic_ids"])

    for v_id in variant_ids:
        variant_molprof_ids = []

        for variant in civic_variant_ids:
            if int(v_id) == variant.id:
                for mp in variant.molecular_profiles:
                    if mp.id not in variant_molprof_ids:
                        variant_molprof_ids.append(mp.id)

        variants_molprof_ids.append(variant_molprof_ids or "")

    tmp_df["molecular_profile_id"] = variants_molprof_ids
    return tmp_df

In [283]:
def transform_df_mp_score(df: pd.DataFrame) -> pd.DataFrame:
    """Transform dataframe to include molecular profile score information

    :param df: Dataframe of variants
    :return: Transformed dataframe with molecular profile score information
    """
    variants_molprof_scores = []
    normalized_variant_molprof_ids = list(df["molecular_profile_id"])

    for mp_ids in normalized_variant_molprof_ids:
        variant_molprof_scores = []
        for mp_id in mp_ids:
            for molprof in civic_molprof_ids:
                if int(mp_id) == molprof.id:
                    if molprof.molecular_profile_score not in variant_molprof_scores:
                        variant_molprof_scores.append(molprof.molecular_profile_score)

        variants_molprof_scores.append(variant_molprof_scores or "")

    df["molecular_profile_score"] = variants_molprof_scores
    return df

# Ensembl

In [284]:
mini_ensg_df = pd.read_csv(
    "../downloaded_files/ensg_biomart_gene20240626.txt", sep="\t",dtype={"NCBI gene (formerly Entrezgene) ID": pd.Int64Dtype()}
)
mini_ensg_df = mini_ensg_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Gene Synonym": "alias_symbol",
        "Gene name": "gene_symbol",
        "Gene stable ID": "ENSG_ID",
        "NCBI gene (formerly Entrezgene) ID": "NCBI_ID",
    }
)
mini_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,alias_symbol,HGNC_ID,NCBI_ID
0,ENSG00000210049,MT-TF,MTTF,HGNC:7481,
1,ENSG00000210049,MT-TF,TRNF,HGNC:7481,
2,ENSG00000211459,MT-RNR1,12S,HGNC:7470,
3,ENSG00000211459,MT-RNR1,MOTS-C,HGNC:7470,
4,ENSG00000211459,MT-RNR1,MTRNR1,HGNC:7470,
...,...,...,...,...,...
117135,ENSG00000200033,RNU6-403P,,HGNC:47366,
117136,ENSG00000228437,LINC02474,LNCSLCC1,HGNC:53417,
117137,ENSG00000228437,LINC02474,RP11-400N13.2,HGNC:53417,
117138,ENSG00000229463,LYST-AS1,LYST-IT2,HGNC:41320,


In [285]:
mini_ensg_df["alias_symbol"] = mini_ensg_df["alias_symbol"].fillna("").astype(str)
mini_ensg_df = (
    mini_ensg_df.groupby(
        ["ENSG_ID", "gene_symbol", "HGNC_ID", "NCBI_ID"], dropna=False
    )["alias_symbol"]
    .apply(lambda x: ", ".join(x.dropna()))
    .reset_index()
)
mini_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,HGNC:11858,7105.0,"T245, TM4SF6, TSPAN-6"
1,ENSG00000000005,TNMD,HGNC:17757,64102.0,"BRICD4, CHM1L, MYODULIN, TEM, TENDIN"
2,ENSG00000000419,DPM1,HGNC:3005,8813.0,"CDGIE, MPDS"
3,ENSG00000000457,SCYL3,HGNC:19285,57147.0,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,HGNC:25565,55732.0,"APOLO1, C1ORF112, FLIP, FLJ10706, MEICA1"
...,...,...,...,...,...
75829,ENSG00000293596,,,105372654.0,
75830,ENSG00000293597,LINC00970,HGNC:48730,101978719.0,
75831,ENSG00000293599,,,,
75832,ENSG00000293600,,,131768270.0,


In [286]:
# mini_ensg_df["NCBI_ID"] = mini_ensg_df["NCBI_ID"].astype(str).str.removesuffix(".0")
# mini_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,HGNC:11858,7105,"T245, TM4SF6, TSPAN-6"
1,ENSG00000000005,TNMD,HGNC:17757,64102,"BRICD4, CHM1L, MYODULIN, TEM, TENDIN"
2,ENSG00000000419,DPM1,HGNC:3005,8813,"CDGIE, MPDS"
3,ENSG00000000457,SCYL3,HGNC:19285,57147,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,HGNC:25565,55732,"APOLO1, C1ORF112, FLIP, FLJ10706, MEICA1"
...,...,...,...,...,...
75829,ENSG00000293596,,,105372654,
75830,ENSG00000293597,LINC00970,HGNC:48730,101978719,
75831,ENSG00000293599,,,,
75832,ENSG00000293600,,,131768270,


In [287]:
mini_ensg_df["HGNC_ID"] = mini_ensg_df["HGNC_ID"].str.lstrip("HGNC:")
mini_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"T245, TM4SF6, TSPAN-6"
1,ENSG00000000005,TNMD,17757,64102,"BRICD4, CHM1L, MYODULIN, TEM, TENDIN"
2,ENSG00000000419,DPM1,3005,8813,"CDGIE, MPDS"
3,ENSG00000000457,SCYL3,19285,57147,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,25565,55732,"APOLO1, C1ORF112, FLIP, FLJ10706, MEICA1"
...,...,...,...,...,...
75829,ENSG00000293596,,,105372654,
75830,ENSG00000293597,LINC00970,48730,101978719,
75831,ENSG00000293599,,,,
75832,ENSG00000293600,,,131768270,


In [288]:
mini_ensg_df.to_csv("../created_files/mini_ensg_df.csv", index=False)
mini_ensg_df.head()

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"T245, TM4SF6, TSPAN-6"
1,ENSG00000000005,TNMD,17757,64102,"BRICD4, CHM1L, MYODULIN, TEM, TENDIN"
2,ENSG00000000419,DPM1,3005,8813,"CDGIE, MPDS"
3,ENSG00000000457,SCYL3,19285,57147,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,25565,55732,"APOLO1, C1ORF112, FLIP, FLJ10706, MEICA1"


One gene symbol may have multiple NCBI IDs

In [291]:
mini_ensg_df["symbol_duplicates"] = mini_ensg_df.duplicated(
    subset="gene_symbol", keep=False
)
dup_symbol_mini_ensg_df = mini_ensg_df[mini_ensg_df["symbol_duplicates"] == True]
dup_symbol_mini_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,symbol_duplicates
23,ENSG00000002586,CD99,7082,4267,"MIC2, MIC2X, MIC2Y",True
66,ENSG00000004866,ST7,11351,7982,"ETS7Q, FAM4A, FAM4A1, HELG, RAY1, SEN4, TSG7",True
67,ENSG00000004866,ST7,11351,93655,"ETS7Q, FAM4A, FAM4A1, HELG, RAY1, SEN4, TSG7",True
73,ENSG00000005001,PRSS22,14368,64063,"BSSP-4, HBSSP-4, SP001LA",True
104,ENSG00000005700,IBTK,17853,25998,"BTBD26, BTKI, DKFZP564B116",True
...,...,...,...,...,...,...
75829,ENSG00000293596,,,105372654,,True
75830,ENSG00000293597,LINC00970,48730,101978719,,True
75831,ENSG00000293599,,,,,True
75832,ENSG00000293600,,,131768270,,True


In [292]:
mini_ensg_df.loc[mini_ensg_df["gene_symbol"] == "ST7"]

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,symbol_duplicates
66,ENSG00000004866,ST7,11351,7982,"ETS7Q, FAM4A, FAM4A1, HELG, RAY1, SEN4, TSG7",True
67,ENSG00000004866,ST7,11351,93655,"ETS7Q, FAM4A, FAM4A1, HELG, RAY1, SEN4, TSG7",True


In [293]:
mini_ensg_df = mini_ensg_df.drop("symbol_duplicates", axis=1)
mini_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"T245, TM4SF6, TSPAN-6"
1,ENSG00000000005,TNMD,17757,64102,"BRICD4, CHM1L, MYODULIN, TEM, TENDIN"
2,ENSG00000000419,DPM1,3005,8813,"CDGIE, MPDS"
3,ENSG00000000457,SCYL3,19285,57147,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,25565,55732,"APOLO1, C1ORF112, FLIP, FLJ10706, MEICA1"
...,...,...,...,...,...
75829,ENSG00000293596,,,105372654,
75830,ENSG00000293597,LINC00970,48730,101978719,
75831,ENSG00000293599,,,,
75832,ENSG00000293600,,,131768270,


### Make a set of the primary gene symbols

In [289]:
ensg_gene_symbol_set = set(mini_ensg_df["gene_symbol"])

In [290]:
total_number_ensembl_gene_symbols = len(ensg_gene_symbol_set)
total_number_ensembl_gene_symbols

41068

Drop genes with no aliases

In [294]:
mini_ensg_df = mini_ensg_df[
    ~mini_ensg_df["alias_symbol"].isnull() & (mini_ensg_df["alias_symbol"] != "")
]
mini_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"T245, TM4SF6, TSPAN-6"
1,ENSG00000000005,TNMD,17757,64102,"BRICD4, CHM1L, MYODULIN, TEM, TENDIN"
2,ENSG00000000419,DPM1,3005,8813,"CDGIE, MPDS"
3,ENSG00000000457,SCYL3,19285,57147,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,25565,55732,"APOLO1, C1ORF112, FLIP, FLJ10706, MEICA1"
...,...,...,...,...,...
75796,ENSG00000293549,HCG22,,285834,PBMUCL2
75798,ENSG00000293551,PRAMEF22,34393,653606,PRAMEF3L
75801,ENSG00000293555,FAM169BP,26835,283777,"FAM169B, FLJ39743, KIAA0888L"
75828,ENSG00000293595,SLC25A3P1,26869,163742,FLJ40434


### Make each row in alias_symbol a set:
    covert to a list 
    make a set

In [295]:
mini_ensg_df["alias_symbol"] = mini_ensg_df["alias_symbol"].astype(str)
mini_ensg_df["alias_symbol"] = [x.split(",") for x in mini_ensg_df.alias_symbol]
mini_ensg_df["alias_symbol"] = np.where(
    mini_ensg_df.alias_symbol == "", "", mini_ensg_df.alias_symbol.map(set)
)
mini_ensg_df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ensg_df["alias_symbol"] = mini_ensg_df["alias_symbol"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ensg_df["alias_symbol"] = [x.split(",") for x in mini_ensg_df.alias_symbol]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ensg_df["alias_symbol"] = np.where(


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"{T245, TM4SF6, TSPAN-6}"


## Test for false positives (when a gene symbol has an alias that is exactly the same as the primary gene symbol)

In [296]:
mini_ensg_df["gene_symbol"] = mini_ensg_df["gene_symbol"].map(str)
print(mini_ensg_df.dtypes)

ENSG_ID         object
gene_symbol     object
HGNC_ID         object
NCBI_ID         object
alias_symbol    object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ensg_df["gene_symbol"] = mini_ensg_df["gene_symbol"].map(str)


In [297]:
mini_ensg_df["gene_symbol"] = [x.split(",") for x in mini_ensg_df.gene_symbol]
mini_ensg_df["gene_symbol"] = np.where(
    mini_ensg_df.gene_symbol == "", "", mini_ensg_df.gene_symbol.map(set)
)
false_pos_mini_ensg_df = mini_ensg_df[
    mini_ensg_df.alias_symbol.apply(lambda x: x & ensg_gene_symbol_set)
    == mini_ensg_df.gene_symbol
]
false_pos_mini_ensg_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ensg_df["gene_symbol"] = [x.split(",") for x in mini_ensg_df.gene_symbol]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ensg_df["gene_symbol"] = np.where(


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
3004,ENSG00000104883,{PEX11G},20208,92960.0,"{PEX11G, PEX11GAMMA}"
6532,ENSG00000131779,{PEX11B},8853,8799.0,"{ PEX11BETA, PEX11B}"
8199,ENSG00000141622,{ARK2C},31696,494470.0,"{ RNF111L2, RNF165, ARKL2, ARK2C, LNCAMPC}"
9578,ENSG00000152242,{ARK2N},28172,147339.0,"{ RNF111L1, ARKL1, ARK2N, MGC12909, C18ORF25}"
12749,ENSG00000169717,{ACTRT2},24026,140625.0,"{ ARP-T2, FLJ25424, ARPM2, ACTRT2}"
33923,ENSG00000232795,{SCAND3P1},42037,,"{ ZBED9P1, SCAND3P1}"
47393,ENSG00000257400,{LNCHR1},56254,,{LNCHR1}
58590,ENSG00000275410,{HNF1B},11630,6928.0,"{ HNF1BETA, MODY5, LFB3, HNF1B, VHNF1, TCF2}"
59459,ENSG00000276194,{HNF1B},11630,6928.0,"{ HNF1BETA, MODY5, LFB3, HNF1B, VHNF1, TCF2}"


### Need to remove aliases that match their primary key (gene symbol)

In [298]:
mini_ensg_df["alias_symbol"] = mini_ensg_df.alias_symbol - mini_ensg_df.gene_symbol
false_pos_mini_ensg_df = mini_ensg_df[
    mini_ensg_df.alias_symbol.apply(lambda x: x & ensg_gene_symbol_set)
    == mini_ensg_df.gene_symbol
]
false_pos_mini_ensg_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ensg_df["alias_symbol"] = mini_ensg_df.alias_symbol - mini_ensg_df.gene_symbol


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol


In [299]:
mini_ensg_df.loc[mini_ensg_df["NCBI_ID"] == "92960"]

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
3004,ENSG00000104883,{PEX11G},20208,92960,{ PEX11GAMMA}


## Find intersection points using alias symbol sets

In [300]:
# alias_symbol_sets_series = mini_ensg_df.alias_symbol

In [301]:
mini_ensg_df["intersect_point"] = mini_ensg_df.alias_symbol.apply(
    lambda x: x & ensg_gene_symbol_set
)
ag_collision_ensg_df = mini_ensg_df[mini_ensg_df.intersect_point != set()]
ag_collision_ensg_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ensg_df["intersect_point"] = mini_ensg_df.alias_symbol.apply(


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point
88,ENSG00000005206,{SPPL2B},30627,56928,"{ KIAA1532, PSL1, IMP4}",{IMP4}
223,ENSG00000008516,{MMP25},14246,64386,"{ MT6-MMP, MMPL1, MMP20}",{MMP20}
324,ENSG00000012983,{MAP4K5},6867,11183,"{GCKR, KHS, KHS1}",{GCKR}
340,ENSG00000013573,{DDX11},2736,1663,"{ KRG-2, WABS, CHLR1, CHL1}",{CHL1}
379,ENSG00000018236,{CNTN1},2171,1272,"{ GP135, F3}",{F3}


In [302]:
ag_collision_ensg_df["source"] = "ENSG"
ag_collision_ensg_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ensg_df["source"] = "ENSG"


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
88,ENSG00000005206,{SPPL2B},30627,56928,"{ KIAA1532, PSL1, IMP4}",{IMP4},ENSG
223,ENSG00000008516,{MMP25},14246,64386,"{ MT6-MMP, MMPL1, MMP20}",{MMP20},ENSG
324,ENSG00000012983,{MAP4K5},6867,11183,"{GCKR, KHS, KHS1}",{GCKR},ENSG
340,ENSG00000013573,{DDX11},2736,1663,"{ KRG-2, WABS, CHLR1, CHL1}",{CHL1},ENSG
379,ENSG00000018236,{CNTN1},2171,1272,"{ GP135, F3}",{F3},ENSG
...,...,...,...,...,...,...,...
74459,ENSG00000291654,{PCDHA6},8672,56142,"{CNR2, CNRS2, PCDH-ALPHA6, CRNR2}",{CNR2},ENSG
74495,ENSG00000291690,{PCDHB11},8682,56125,"{ PCDH-BETA11, ME2}",{ME2},ENSG
74578,ENSG00000291781,{TTC5},19274,91875,{STRAP},{STRAP},ENSG
74928,ENSG00000292149,{TCF7L1},11640,83439,{TCF3},{TCF3},ENSG


make a set of collisions
- some records have multiple collisions, need to explode to one per row so that when i change it back to a str and make a set it doesnt count two collisions seperated by a commma as one unique collision

In [303]:
ag_collision_ensg_df = ag_collision_ensg_df.explode(column="intersect_point")
ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
88,ENSG00000005206,{SPPL2B},30627,56928,"{ KIAA1532, PSL1, IMP4}",IMP4,ENSG
223,ENSG00000008516,{MMP25},14246,64386,"{ MT6-MMP, MMPL1, MMP20}",MMP20,ENSG
324,ENSG00000012983,{MAP4K5},6867,11183,"{GCKR, KHS, KHS1}",GCKR,ENSG
340,ENSG00000013573,{DDX11},2736,1663,"{ KRG-2, WABS, CHLR1, CHL1}",CHL1,ENSG
379,ENSG00000018236,{CNTN1},2171,1272,"{ GP135, F3}",F3,ENSG
...,...,...,...,...,...,...,...
74459,ENSG00000291654,{PCDHA6},8672,56142,"{CNR2, CNRS2, PCDH-ALPHA6, CRNR2}",CNR2,ENSG
74495,ENSG00000291690,{PCDHB11},8682,56125,"{ PCDH-BETA11, ME2}",ME2,ENSG
74578,ENSG00000291781,{TTC5},19274,91875,{STRAP},STRAP,ENSG
74928,ENSG00000292149,{TCF7L1},11640,83439,{TCF3},TCF3,ENSG


In [304]:
duplicates_ag_collision_ensg_df = ag_collision_ensg_df[
    ag_collision_ensg_df.duplicated("gene_symbol", keep=False)
]
duplicates_ag_collision_ensg_df = duplicates_ag_collision_ensg_df.sort_values(
    "gene_symbol"
)
duplicates_ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
1554,ENSG00000081842,{PCDHA6},8672,56142.0,"{CNR2, CNRS2, PCDH-ALPHA6, CRNR2}",CNR2,ENSG
57502,ENSG00000274585,{RNU2-1},10142,124907963.0,{U2},U2,ENSG
57503,ENSG00000274585,{RNU2-1},10142,124907964.0,{U2},U2,ENSG
57504,ENSG00000274585,{RNU2-1},10142,124907965.0,{U2},U2,ENSG
59227,ENSG00000275920,{KRTAP5-3},23598,387266.0,"{KRTAP5-9, KRTAP5.3}",KRTAP5-9,ENSG
59495,ENSG00000276234,{TADA2A},11531,6871.0,"{ADA2, HADA2, ADA2A, TADA2L}",ADA2,ENSG
59496,ENSG00000276238,{NANOGP7},23105,,{NANOGP3},NANOGP3,ENSG
59565,ENSG00000276333,{HS3ST6},14178,64711.0,{HS3ST5},HS3ST5,ENSG
60203,ENSG00000276876,{MYT1},7622,4661.0,"{ ZC2HC4A, PLPB1, MYTI, ZC2H2C1, NZF2, MTF1}",MTF1,ENSG
60328,ENSG00000277033,{B3GNTL1},21727,146712.0,{B3GNT8},B3GNT8,ENSG


In [305]:
ensg_alias_gene_collision_set = set(ag_collision_ensg_df["intersect_point"])
len(ensg_alias_gene_collision_set)

229

In [306]:
ag_collision_ensg_df["gene_symbol"] = ag_collision_ensg_df["gene_symbol"].str.join(", ")

In [307]:
ensg_alias_gene_collision_primary_symbol_set = set(ag_collision_ensg_df["gene_symbol"])
len(ensg_alias_gene_collision_primary_symbol_set)

242

1. Why is the alias-gene collision set not the same length as the set of primary symbols with collisions ?
2. Why is the length of the alias-gene collison set shorter?
 - A priamry gene symbol with an alias-gene collision has an alias that matches a different gene's primary gene symbol.
 - Multiple genes can share a single alias (alias-alias collision)
 - If that shared alias is an alias-gene collision, then there will be more unique gene symbols in the set of primary symbols with collisions than the set of alias-gene collisions. 

### Convert sets to str or list
    works on the first pass, gets Assertion Error on passes after

In [308]:
ag_collision_ensg_df["gene_symbol"] = ag_collision_ensg_df["gene_symbol"].str.replace(
    r"[{}]", ""
)
ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
88,ENSG00000005206,SPPL2B,30627,56928,"{ KIAA1532, PSL1, IMP4}",IMP4,ENSG
223,ENSG00000008516,MMP25,14246,64386,"{ MT6-MMP, MMPL1, MMP20}",MMP20,ENSG
324,ENSG00000012983,MAP4K5,6867,11183,"{GCKR, KHS, KHS1}",GCKR,ENSG
340,ENSG00000013573,DDX11,2736,1663,"{ KRG-2, WABS, CHLR1, CHL1}",CHL1,ENSG
379,ENSG00000018236,CNTN1,2171,1272,"{ GP135, F3}",F3,ENSG
...,...,...,...,...,...,...,...
74459,ENSG00000291654,PCDHA6,8672,56142,"{CNR2, CNRS2, PCDH-ALPHA6, CRNR2}",CNR2,ENSG
74495,ENSG00000291690,PCDHB11,8682,56125,"{ PCDH-BETA11, ME2}",ME2,ENSG
74578,ENSG00000291781,TTC5,19274,91875,{STRAP},STRAP,ENSG
74928,ENSG00000292149,TCF7L1,11640,83439,{TCF3},TCF3,ENSG


### Alphabetize alias_symbol

In [309]:
ag_collision_ensg_df["alias_symbol"] = ag_collision_ensg_df["alias_symbol"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
88,ENSG00000005206,SPPL2B,30627,56928,"[ KIAA1532, PSL1, IMP4]",IMP4,ENSG
223,ENSG00000008516,MMP25,14246,64386,"[ MMPL1, MT6-MMP, MMP20]",MMP20,ENSG
324,ENSG00000012983,MAP4K5,6867,11183,"[ KHS, KHS1, GCKR]",GCKR,ENSG
340,ENSG00000013573,DDX11,2736,1663,"[ CHLR1, KRG-2, WABS, CHL1]",CHL1,ENSG
379,ENSG00000018236,CNTN1,2171,1272,"[ GP135, F3]",F3,ENSG
...,...,...,...,...,...,...,...
74459,ENSG00000291654,PCDHA6,8672,56142,"[ CNRS2, CRNR2, PCDH-ALPHA6, CNR2]",CNR2,ENSG
74495,ENSG00000291690,PCDHB11,8682,56125,"[ PCDH-BETA11, ME2]",ME2,ENSG
74578,ENSG00000291781,TTC5,19274,91875,[STRAP],STRAP,ENSG
74928,ENSG00000292149,TCF7L1,11640,83439,[TCF3],TCF3,ENSG


### Convert lists to str

In [310]:
ag_collision_ensg_df["alias_symbol"] = ag_collision_ensg_df["alias_symbol"].str.join(
    ", "
)

In [311]:
ag_collision_ensg_df = ag_collision_ensg_df.sort_values("intersect_point")
ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
65759,ENSG00000283293,RN7SK,10037,125050,7SK,7SK,ENSG
12287,ENSG00000167780,SOAT2,11178,8435,ACAT2,ACAT2,ENSG
25650,ENSG00000220267,ACTBP8,141,,ACTBP2,ACTBP2,ENSG
59495,ENSG00000276234,TADA2A,11531,6871,"ADA2A, HADA2, TADA2L, ADA2",ADA2,ENSG
60383,ENSG00000277104,TADA2A,11531,6871,"ADA2A, HADA2, TADA2L, ADA2",ADA2,ENSG
...,...,...,...,...,...,...,...
8401,ENSG00000143258,USP21,12620,27005,"USP23, USP16",USP16,ENSG
9928,ENSG00000155313,USP25,12624,29761,USP21,USP21,ENSG
54879,ENSG00000270858,VDAC1P5,12676,,"VDAC5P, VDAC3",VDAC3,ENSG
42358,ENSG00000249947,XBP1P1,12802,,"XBPP1, XBP1",XBP1,ENSG


In [312]:
ag_collision_ensg_df = ag_collision_ensg_df.drop_duplicates(
    subset=["gene_symbol", "intersect_point"], keep="first"
)
ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
65759,ENSG00000283293,RN7SK,10037,125050,7SK,7SK,ENSG
12287,ENSG00000167780,SOAT2,11178,8435,ACAT2,ACAT2,ENSG
25650,ENSG00000220267,ACTBP8,141,,ACTBP2,ACTBP2,ENSG
59495,ENSG00000276234,TADA2A,11531,6871,"ADA2A, HADA2, TADA2L, ADA2",ADA2,ENSG
594,ENSG00000042980,ADAM28,206,10863,"EMDCII, MDC-LM, MDC-LS, ADAM23",ADAM23,ENSG
...,...,...,...,...,...,...,...
8401,ENSG00000143258,USP21,12620,27005,"USP23, USP16",USP16,ENSG
9928,ENSG00000155313,USP25,12624,29761,USP21,USP21,ENSG
54879,ENSG00000270858,VDAC1P5,12676,,"VDAC5P, VDAC3",VDAC3,ENSG
42358,ENSG00000249947,XBP1P1,12802,,"XBPP1, XBP1",XBP1,ENSG


## Add civic_gene_id to table

In [313]:
genes = civicpy.get_all_genes()

In [314]:
ensg_NCBI_ID_list = list(ag_collision_ensg_df["NCBI_ID"])

In [315]:
civic_ids = []
for id in ensg_NCBI_ID_list:
    found_civic_id = ""
    for gene in genes:
        if id == str(gene.entrez_id):
            found_civic_id = gene.id
    civic_ids.append(found_civic_id)

ag_collision_ensg_df["civic_ids"] = civic_ids
ag_collision_ensg_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ensg_df["civic_ids"] = civic_ids


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids
65759,ENSG00000283293,RN7SK,10037,125050,7SK,7SK,ENSG,
12287,ENSG00000167780,SOAT2,11178,8435,ACAT2,ACAT2,ENSG,
25650,ENSG00000220267,ACTBP8,141,,ACTBP2,ACTBP2,ENSG,
59495,ENSG00000276234,TADA2A,11531,6871,"ADA2A, HADA2, TADA2L, ADA2",ADA2,ENSG,
594,ENSG00000042980,ADAM28,206,10863,"EMDCII, MDC-LM, MDC-LS, ADAM23",ADAM23,ENSG,
...,...,...,...,...,...,...,...,...
8401,ENSG00000143258,USP21,12620,27005,"USP23, USP16",USP16,ENSG,
9928,ENSG00000155313,USP25,12624,29761,USP21,USP21,ENSG,
54879,ENSG00000270858,VDAC1P5,12676,,"VDAC5P, VDAC3",VDAC3,ENSG,
42358,ENSG00000249947,XBP1P1,12802,,"XBPP1, XBP1",XBP1,ENSG,


In [316]:
ag_collision_ensg_df["civic_ids"].replace("", np.nan, inplace=True)
ag_collision_ensg_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ag_collision_ensg_df["civic_ids"].replace("", np.nan, inplace=True)
  ag_collision_ensg_df["civic_ids"].replace("", np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ensg_df["civic_ids"].replace("", np.nan, inplace=True)


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids
65759,ENSG00000283293,RN7SK,10037,125050.0,7SK,7SK,ENSG,
12287,ENSG00000167780,SOAT2,11178,8435.0,ACAT2,ACAT2,ENSG,
25650,ENSG00000220267,ACTBP8,141,,ACTBP2,ACTBP2,ENSG,
59495,ENSG00000276234,TADA2A,11531,6871.0,"ADA2A, HADA2, TADA2L, ADA2",ADA2,ENSG,
594,ENSG00000042980,ADAM28,206,10863.0,"EMDCII, MDC-LM, MDC-LS, ADAM23",ADAM23,ENSG,


In [317]:
civic_ag_collision_ensg_df = ag_collision_ensg_df.copy()
civic_ag_collision_ensg_df.dropna(subset=["civic_ids"], inplace=True)
civic_ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids
3677,ENSG00000109321,AREG,651,374,"AREGB, CRDGF, SDGF, AR",AR,ENSG,389.0
7325,ENSG00000136492,BRIP1,20473,83990,"FANCJ, OF, BACH1",BACH1,ENSG,15955.0
12645,ENSG00000169245,CXCL10,10637,3627,"CRG-2, GIP-10, IFI10, INP10, IP-10, MOB-...",C7,ENSG,3017.0
6208,ENSG00000129682,FGF13,3670,2258,"FHF2, FLJ30672, LINC00889, FGF2",FGF2,ENSG,1883.0
4305,ENSG00000114353,GNAI2,4385,2771,"GNAI2B, GIP",GIP,ENSG,2312.0
6137,ENSG00000129071,MBD4,6919,8930,MED1,MED1,ENSG,7084.0


## Add MP ID and score to table

In [318]:
civic_ag_collision_ensg_df_add_molprof_df = transform_df_mp_id(
    civic_ag_collision_ensg_df
)
civic_ag_collision_ensg_df_add_molprof_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,molecular_profile_id
3677,ENSG00000109321,AREG,651,374,"AREGB, CRDGF, SDGF, AR",AR,ENSG,389.0,[385]
7325,ENSG00000136492,BRIP1,20473,83990,"FANCJ, OF, BACH1",BACH1,ENSG,15955.0,
12645,ENSG00000169245,CXCL10,10637,3627,"CRG-2, GIP-10, IFI10, INP10, IP-10, MOB-...",C7,ENSG,3017.0,
6208,ENSG00000129682,FGF13,3670,2258,"FHF2, FLJ30672, LINC00889, FGF2",FGF2,ENSG,1883.0,[1759]
4305,ENSG00000114353,GNAI2,4385,2771,"GNAI2B, GIP",GIP,ENSG,2312.0,[2185]
6137,ENSG00000129071,MBD4,6919,8930,MED1,MED1,ENSG,7084.0,


In [319]:
civic_true_mini_ensg_add_molprof_score_df = transform_df_mp_score(
    civic_ag_collision_ensg_df_add_molprof_df
)
civic_true_mini_ensg_add_molprof_score_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,molecular_profile_id,molecular_profile_score
3677,ENSG00000109321,AREG,651,374,"AREGB, CRDGF, SDGF, AR",AR,ENSG,389.0,[385],[12.0]
7325,ENSG00000136492,BRIP1,20473,83990,"FANCJ, OF, BACH1",BACH1,ENSG,15955.0,,
12645,ENSG00000169245,CXCL10,10637,3627,"CRG-2, GIP-10, IFI10, INP10, IP-10, MOB-...",C7,ENSG,3017.0,,
6208,ENSG00000129682,FGF13,3670,2258,"FHF2, FLJ30672, LINC00889, FGF2",FGF2,ENSG,1883.0,[1759],[10.0]
4305,ENSG00000114353,GNAI2,4385,2771,"GNAI2B, GIP",GIP,ENSG,2312.0,[2185],[0.0]
6137,ENSG00000129071,MBD4,6919,8930,MED1,MED1,ENSG,7084.0,,


## Add evidence_ids column

In [320]:
civic_variant_ids = civicpy.get_all_variants()

In [321]:
ensg_civic_id_list = list(civic_ag_collision_ensg_df["civic_ids"])

In [322]:
civic_ag_collision_ensg_df.civic_ids.astype(int)

3677       389
7325     15955
12645     3017
6208      1883
4305      2312
6137      7084
Name: civic_ids, dtype: int64

In [323]:
civic_eids = []
for civic_id in ensg_civic_id_list:
    gene_civic_eids = []
    for variant in civic_variant_ids:
        # print(civic_id, type(civic_id))
        # print(   = =variant.gene_id, type(variant.gene_id))
        if int(civic_id) == variant.gene_id:
            for mp in variant.molecular_profiles:
                for e in mp.evidence_items:
                    if e.id not in gene_civic_eids:
                        gene_civic_eids.append(e.id)
            gene_civic_eids = gene_civic_eids or ""
    civic_eids.append(gene_civic_eids)
civic_ag_collision_ensg_df["civic_eids"] = civic_eids
civic_ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,civic_eids
3677,ENSG00000109321,AREG,651,374,"AREGB, CRDGF, SDGF, AR",AR,ENSG,389.0,"[1020, 788, 847, 781, 846, 780]"
7325,ENSG00000136492,BRIP1,20473,83990,"FANCJ, OF, BACH1",BACH1,ENSG,15955.0,[11208]
12645,ENSG00000169245,CXCL10,10637,3627,"CRG-2, GIP-10, IFI10, INP10, IP-10, MOB-...",C7,ENSG,3017.0,[8026]
6208,ENSG00000129682,FGF13,3670,2258,"FHF2, FLJ30672, LINC00889, FGF2",FGF2,ENSG,1883.0,[1062]
4305,ENSG00000114353,GNAI2,4385,2771,"GNAI2B, GIP",GIP,ENSG,2312.0,"[11943, 11839]"
6137,ENSG00000129071,MBD4,6919,8930,MED1,MED1,ENSG,7084.0,[7596]


In [324]:
civic_evidence_list = civicpy.get_all_evidence()

In [325]:
ensg_civic_eid_list = list(civic_ag_collision_ensg_df["civic_eids"])
ensg_civic_eid_list[0]

[1020, 788, 847, 781, 846, 780]

In [326]:
evidence_ratings = []
for eid_list in ensg_civic_eid_list:
    found_rating = []
    for eid in eid_list:
        for evidence in civic_evidence_list:
            if eid == evidence.id:
                found_rating.append(evidence.rating)
    evidence_ratings.append(found_rating)
civic_ag_collision_ensg_df["evidence_ratings"] = evidence_ratings
civic_ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,civic_eids,evidence_ratings
3677,ENSG00000109321,AREG,651,374,"AREGB, CRDGF, SDGF, AR",AR,ENSG,389.0,"[1020, 788, 847, 781, 846, 780]","[4, 3, 3, 2, 2, 3]"
7325,ENSG00000136492,BRIP1,20473,83990,"FANCJ, OF, BACH1",BACH1,ENSG,15955.0,[11208],[2]
12645,ENSG00000169245,CXCL10,10637,3627,"CRG-2, GIP-10, IFI10, INP10, IP-10, MOB-...",C7,ENSG,3017.0,[8026],[4]
6208,ENSG00000129682,FGF13,3670,2258,"FHF2, FLJ30672, LINC00889, FGF2",FGF2,ENSG,1883.0,[1062],[3]
4305,ENSG00000114353,GNAI2,4385,2771,"GNAI2B, GIP",GIP,ENSG,2312.0,"[11943, 11839]","[4, 3]"
6137,ENSG00000129071,MBD4,6919,8930,MED1,MED1,ENSG,7084.0,[7596],[4]


In [327]:
average_evidence_rating = []
for alist in civic_ag_collision_ensg_df["evidence_ratings"]:
    avg_rating = sum(alist) / len(alist)
    average_evidence_rating.append(avg_rating)
civic_ag_collision_ensg_df["average_evidence_ratings"] = average_evidence_rating
civic_ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,civic_eids,evidence_ratings,average_evidence_ratings
3677,ENSG00000109321,AREG,651,374,"AREGB, CRDGF, SDGF, AR",AR,ENSG,389.0,"[1020, 788, 847, 781, 846, 780]","[4, 3, 3, 2, 2, 3]",2.833333
7325,ENSG00000136492,BRIP1,20473,83990,"FANCJ, OF, BACH1",BACH1,ENSG,15955.0,[11208],[2],2.0
12645,ENSG00000169245,CXCL10,10637,3627,"CRG-2, GIP-10, IFI10, INP10, IP-10, MOB-...",C7,ENSG,3017.0,[8026],[4],4.0
6208,ENSG00000129682,FGF13,3670,2258,"FHF2, FLJ30672, LINC00889, FGF2",FGF2,ENSG,1883.0,[1062],[3],3.0
4305,ENSG00000114353,GNAI2,4385,2771,"GNAI2B, GIP",GIP,ENSG,2312.0,"[11943, 11839]","[4, 3]",3.5
6137,ENSG00000129071,MBD4,6919,8930,MED1,MED1,ENSG,7084.0,[7596],[4],4.0


In [328]:
sum_evidence_rating = []
for alist in civic_ag_collision_ensg_df["evidence_ratings"]:
    sum_rating = sum(alist)
    sum_evidence_rating.append(sum_rating)
civic_ag_collision_ensg_df["sum_evidence_rating"] = sum_evidence_rating
civic_ag_collision_ensg_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,civic_eids,evidence_ratings,average_evidence_ratings,sum_evidence_rating
3677,ENSG00000109321,AREG,651,374,"AREGB, CRDGF, SDGF, AR",AR,ENSG,389.0,"[1020, 788, 847, 781, 846, 780]","[4, 3, 3, 2, 2, 3]",2.833333,17
7325,ENSG00000136492,BRIP1,20473,83990,"FANCJ, OF, BACH1",BACH1,ENSG,15955.0,[11208],[2],2.0,2
12645,ENSG00000169245,CXCL10,10637,3627,"CRG-2, GIP-10, IFI10, INP10, IP-10, MOB-...",C7,ENSG,3017.0,[8026],[4],4.0,4
6208,ENSG00000129682,FGF13,3670,2258,"FHF2, FLJ30672, LINC00889, FGF2",FGF2,ENSG,1883.0,[1062],[3],3.0,3
4305,ENSG00000114353,GNAI2,4385,2771,"GNAI2B, GIP",GIP,ENSG,2312.0,"[11943, 11839]","[4, 3]",3.5,7
6137,ENSG00000129071,MBD4,6919,8930,MED1,MED1,ENSG,7084.0,[7596],[4],4.0,4


# HGNC

## Set up table

In [329]:
mini_hgnc_df = pd.read_csv(
    "../downloaded_files/hgnc_biomart_gene20240626.txt", sep="\t",dtype={"NCBI gene ID": pd.Int64Dtype()}
)
mini_hgnc_df = mini_hgnc_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Approved symbol": "gene_symbol",
        "Alias symbol": "alias_symbol",
        "Ensembl gene ID": "ENSG_ID",
        "NCBI gene ID": "NCBI_ID",
    }
)
mini_hgnc_df

Unnamed: 0,HGNC_ID,alias_symbol,NCBI_ID,ENSG_ID,gene_symbol
0,HGNC:5,,1.0,ENSG00000121410,A1BG
1,HGNC:37133,FLJ23569,503538.0,ENSG00000268895,A1BG-AS1
2,HGNC:24086,ACF,29974.0,ENSG00000148584,A1CF
3,HGNC:24086,ASP,29974.0,ENSG00000148584,A1CF
4,HGNC:24086,ACF64,29974.0,ENSG00000148584,A1CF
...,...,...,...,...,...
67578,HGNC:29027,KIAA0399,23140.0,ENSG00000074755,ZZEF1
67579,HGNC:29027,ZZZ4,23140.0,ENSG00000074755,ZZEF1
67580,HGNC:29027,FLJ10821,23140.0,ENSG00000074755,ZZEF1
67581,HGNC:24523,DKFZP564I052,26009.0,ENSG00000036549,ZZZ3


In [330]:
mini_hgnc_df["alias_symbol"] = mini_hgnc_df["alias_symbol"].fillna("").astype(str)
mini_hgnc_df = (
    mini_hgnc_df.groupby(
        ["ENSG_ID", "gene_symbol", "HGNC_ID", "NCBI_ID"], dropna=False
    )["alias_symbol"]
    .apply(lambda x: ", ".join(x.dropna()))
    .reset_index()
)
mini_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,HGNC:11858,7105.0,"T245, TSPAN-6"
1,ENSG00000000005,TNMD,HGNC:17757,64102.0,"myodulin, ChM1L, tendin, TEM, BRICD4"
2,ENSG00000000419,DPM1,HGNC:3005,8813.0,"MPDS, CDGIE"
3,ENSG00000000457,SCYL3,HGNC:19285,57147.0,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,HGNC:25565,55732.0,"FLJ10706, Apolo1, FLIP, MEICA1"
...,...,...,...,...,...
45641,,ZNF97,HGNC:13173,,
45642,,ZNFP1,HGNC:13181,,
45643,,ZPAXP,HGNC:51635,105373450.0,ZPX1P
45644,,ZRK,HGNC:13193,,


In [331]:
# mini_hgnc_df["NCBI_ID"] = mini_hgnc_df["NCBI_ID"].astype(str).str.removesuffix(".0")
# mini_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,HGNC:11858,7105,"T245, TSPAN-6"
1,ENSG00000000005,TNMD,HGNC:17757,64102,"myodulin, ChM1L, tendin, TEM, BRICD4"
2,ENSG00000000419,DPM1,HGNC:3005,8813,"MPDS, CDGIE"
3,ENSG00000000457,SCYL3,HGNC:19285,57147,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,HGNC:25565,55732,"FLJ10706, Apolo1, FLIP, MEICA1"
...,...,...,...,...,...
45641,,ZNF97,HGNC:13173,,
45642,,ZNFP1,HGNC:13181,,
45643,,ZPAXP,HGNC:51635,105373450,ZPX1P
45644,,ZRK,HGNC:13193,,


In [332]:
mini_hgnc_df["HGNC_ID"] = mini_hgnc_df["HGNC_ID"].str.lstrip("HGNC:")
mini_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"T245, TSPAN-6"
1,ENSG00000000005,TNMD,17757,64102,"myodulin, ChM1L, tendin, TEM, BRICD4"
2,ENSG00000000419,DPM1,3005,8813,"MPDS, CDGIE"
3,ENSG00000000457,SCYL3,19285,57147,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,25565,55732,"FLJ10706, Apolo1, FLIP, MEICA1"
...,...,...,...,...,...
45641,,ZNF97,13173,,
45642,,ZNFP1,13181,,
45643,,ZPAXP,51635,105373450,ZPX1P
45644,,ZRK,13193,,


In [333]:
mini_hgnc_df.to_csv("../created_files/mini_hgnc_df.csv", index=False)
mini_hgnc_df.head()

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"T245, TSPAN-6"
1,ENSG00000000005,TNMD,17757,64102,"myodulin, ChM1L, tendin, TEM, BRICD4"
2,ENSG00000000419,DPM1,3005,8813,"MPDS, CDGIE"
3,ENSG00000000457,SCYL3,19285,57147,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,25565,55732,"FLJ10706, Apolo1, FLIP, MEICA1"


### Make a set of the primary gene symbols

In [334]:
hgnc_gene_symbol_set = set(mini_hgnc_df["gene_symbol"])
# all_gene_symbols_set

In [335]:
total_number_hgnc_gene_symbols = len(hgnc_gene_symbol_set)
total_number_hgnc_gene_symbols

45646

Drop genes with no aliases

In [336]:
mini_hgnc_df = mini_hgnc_df[
    ~mini_hgnc_df["alias_symbol"].isnull() & (mini_hgnc_df["alias_symbol"] != "")
]
mini_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"T245, TSPAN-6"
1,ENSG00000000005,TNMD,17757,64102,"myodulin, ChM1L, tendin, TEM, BRICD4"
2,ENSG00000000419,DPM1,3005,8813,"MPDS, CDGIE"
3,ENSG00000000457,SCYL3,19285,57147,"PACE-1, PACE1"
4,ENSG00000000460,FIRRM,25565,55732,"FLJ10706, Apolo1, FLIP, MEICA1"
...,...,...,...,...,...
45632,,ZNF78L2,13152,,pT3
45636,,ZNF88,13163,,HPF8
45638,,ZNF94,13170,,F11465
45643,,ZPAXP,51635,105373450,ZPX1P


### Make each row in alias_symbol a set:
    covert to a list 
    make a set

In [337]:
mini_hgnc_df["alias_symbol"] = mini_hgnc_df["alias_symbol"].astype(str)
type(mini_hgnc_df.alias_symbol[2].split(";"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_hgnc_df["alias_symbol"] = mini_hgnc_df["alias_symbol"].astype(str)


list

In [338]:
mini_hgnc_df["alias_symbol"] = [x.split(";") for x in mini_hgnc_df.alias_symbol]
mini_hgnc_df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_hgnc_df["alias_symbol"] = [x.split(";") for x in mini_hgnc_df.alias_symbol]


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"[T245, TSPAN-6]"


In [339]:
mini_hgnc_df["alias_symbol"] = np.where(
    mini_hgnc_df.alias_symbol == "", "", mini_hgnc_df.alias_symbol.map(set)
)
mini_hgnc_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_hgnc_df["alias_symbol"] = np.where(


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol
0,ENSG00000000003,TSPAN6,11858,7105,"{T245, TSPAN-6}"
1,ENSG00000000005,TNMD,17757,64102,"{myodulin, ChM1L, tendin, TEM, BRICD4}"
2,ENSG00000000419,DPM1,3005,8813,"{MPDS, CDGIE}"
3,ENSG00000000457,SCYL3,19285,57147,"{PACE-1, PACE1}"
4,ENSG00000000460,FIRRM,25565,55732,"{FLJ10706, Apolo1, FLIP, MEICA1}"


## Add test for false positives in the intersection points
#### (places where x in alias_smbol matches x in mini_hgnc_df.gene_symbol in the same row)

In [340]:
mini_hgnc_df["gene_symbol"] = [x.split(";") for x in mini_hgnc_df.gene_symbol]
mini_hgnc_df["gene_symbol"] = np.where(
    mini_hgnc_df.gene_symbol == "", "", mini_hgnc_df.gene_symbol.map(set)
)
false_pos_mini_hgnc_df = mini_hgnc_df[
    mini_hgnc_df.alias_symbol.apply(lambda x: x & hgnc_gene_symbol_set)
    == mini_hgnc_df.gene_symbol
]
false_pos_mini_hgnc_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_hgnc_df["gene_symbol"] = [x.split(";") for x in mini_hgnc_df.gene_symbol]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_hgnc_df["gene_symbol"] = np.where(


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol


### Need to remove aliases that match their primary key (gene symbol)

In [341]:
mini_hgnc_df["alias_symbol"] = mini_hgnc_df.alias_symbol - mini_hgnc_df.gene_symbol

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_hgnc_df["alias_symbol"] = mini_hgnc_df.alias_symbol - mini_hgnc_df.gene_symbol


## Find intersection points using alias symbol sets

In [342]:
# alias_symbol_sets_series = mini_hgnc_df.alias_symbol

In [343]:
mini_hgnc_df["intersect_point"] = mini_hgnc_df.alias_symbol.apply(
    lambda x: x & hgnc_gene_symbol_set
)
ag_collision_hgnc_df = mini_hgnc_df[mini_hgnc_df.intersect_point != set()]
ag_collision_hgnc_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_hgnc_df["intersect_point"] = mini_hgnc_df.alias_symbol.apply(


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point
25,ENSG00000002726,{AOC1},80,26,{DAO},{DAO}
461,ENSG00000027001,{MIPEP},7104,4285,{MIP},{MIP}
691,ENSG00000051341,{POLQ},9186,10721,{POLH},{POLH}
698,ENSG00000052344,{PRSS8},9491,5652,{CAP1},{CAP1}
807,ENSG00000060069,{CTDP1},2498,9150,{FCP1},{FCP1}
...,...,...,...,...,...,...
42532,,{L1RE3},16899,,{LRE3},{LRE3}
42980,,{MT-HPR},7438,,{HPR},{HPR}
43306,,{PCDHA@},8662,56117,{CNR1},{CNR1}
44660,,{SPG32},32314,,{SPG29},{SPG29}


In [344]:
ag_collision_hgnc_df["source"] = "HGNC"
ag_collision_hgnc_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_hgnc_df["source"] = "HGNC"


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
25,ENSG00000002726,{AOC1},80,26,{DAO},{DAO},HGNC
461,ENSG00000027001,{MIPEP},7104,4285,{MIP},{MIP},HGNC
691,ENSG00000051341,{POLQ},9186,10721,{POLH},{POLH},HGNC
698,ENSG00000052344,{PRSS8},9491,5652,{CAP1},{CAP1},HGNC
807,ENSG00000060069,{CTDP1},2498,9150,{FCP1},{FCP1},HGNC


### Convert sets to str or list

In [345]:
type(ag_collision_hgnc_df["gene_symbol"][25])

set

In [346]:
ag_collision_hgnc_df["gene_symbol"]

25         {AOC1}
461       {MIPEP}
691        {POLQ}
698       {PRSS8}
807       {CTDP1}
           ...   
42532     {L1RE3}
42980    {MT-HPR}
43306    {PCDHA@}
44660     {SPG32}
45593    {ZNF159}
Name: gene_symbol, Length: 96, dtype: object

In [347]:
list

list

In [348]:
def simple_func(arg):
    # arg = list(arg)
    print(arg)
    print(type(arg))
    print(list(arg))
    return arg


ag_collision_hgnc_df["gene_symbol"].apply(simple_func)

{'AOC1'}
<class 'set'>
['AOC1']
{'MIPEP'}
<class 'set'>
['MIPEP']
{'POLQ'}
<class 'set'>
['POLQ']
{'PRSS8'}
<class 'set'>
['PRSS8']
{'CTDP1'}
<class 'set'>
['CTDP1']
{'LZTS1'}
<class 'set'>
['LZTS1']
{'KIF2A'}
<class 'set'>
['KIF2A']
{'CDH19'}
<class 'set'>
['CDH19']
{'PTPN18'}
<class 'set'>
['PTPN18']
{'CADPS2'}
<class 'set'>
['CADPS2']
{'AKR1B1'}
<class 'set'>
['AKR1B1']
{'MLF2'}
<class 'set'>
['MLF2']
{'TOX4'}
<class 'set'>
['TOX4']
{'HOOK2'}
<class 'set'>
['HOOK2']
{'SEC14L3'}
<class 'set'>
['SEC14L3']
{'E2F1'}
<class 'set'>
['E2F1']
{'TIMP1'}
<class 'set'>
['TIMP1']
{'ACOD1'}
<class 'set'>
['ACOD1']
{'CRYM'}
<class 'set'>
['CRYM']
{'PPY'}
<class 'set'>
['PPY']
{'SMOC2'}
<class 'set'>
['SMOC2']
{'LAMA4'}
<class 'set'>
['LAMA4']
{'GNAI2'}
<class 'set'>
['GNAI2']
{'INSL6'}
<class 'set'>
['INSL6']
{'PMEPA1'}
<class 'set'>
['PMEPA1']
{'SOX9'}
<class 'set'>
['SOX9']
{'CPA4'}
<class 'set'>
['CPA4']
{'MBD4'}
<class 'set'>
['MBD4']
{'RNASE1'}
<class 'set'>
['RNASE1']
{'RIN2'}
<class 'set'>

25         {AOC1}
461       {MIPEP}
691        {POLQ}
698       {PRSS8}
807       {CTDP1}
           ...   
42532     {L1RE3}
42980    {MT-HPR}
43306    {PCDHA@}
44660     {SPG32}
45593    {ZNF159}
Name: gene_symbol, Length: 96, dtype: object

In [349]:
# Assertion error if passed more than once
ag_collision_hgnc_df["gene_symbol"] = ag_collision_hgnc_df["gene_symbol"].apply(list)
ag_collision_hgnc_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_hgnc_df["gene_symbol"] = ag_collision_hgnc_df["gene_symbol"].apply(list)


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
25,ENSG00000002726,[AOC1],80,26,{DAO},{DAO},HGNC
461,ENSG00000027001,[MIPEP],7104,4285,{MIP},{MIP},HGNC
691,ENSG00000051341,[POLQ],9186,10721,{POLH},{POLH},HGNC
698,ENSG00000052344,[PRSS8],9491,5652,{CAP1},{CAP1},HGNC
807,ENSG00000060069,[CTDP1],2498,9150,{FCP1},{FCP1},HGNC
...,...,...,...,...,...,...,...
42532,,[L1RE3],16899,,{LRE3},{LRE3},HGNC
42980,,[MT-HPR],7438,,{HPR},{HPR},HGNC
43306,,[PCDHA@],8662,56117,{CNR1},{CNR1},HGNC
44660,,[SPG32],32314,,{SPG29},{SPG29},HGNC


In [350]:
# ag_collision_hgnc_df["ENSG_ID"] = ag_collision_hgnc_df["ENSG_ID"].astype(str)
# ag_collision_hgnc_df

### Alphabetize alias_symbol

In [351]:
ag_collision_hgnc_df["alias_symbol"] = ag_collision_hgnc_df["alias_symbol"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_hgnc_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_hgnc_df["alias_symbol"] = ag_collision_hgnc_df["alias_symbol"].apply(


Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
25,ENSG00000002726,[AOC1],80,26,[DAO],{DAO},HGNC
461,ENSG00000027001,[MIPEP],7104,4285,[MIP],{MIP},HGNC
691,ENSG00000051341,[POLQ],9186,10721,[POLH],{POLH},HGNC
698,ENSG00000052344,[PRSS8],9491,5652,[CAP1],{CAP1},HGNC
807,ENSG00000060069,[CTDP1],2498,9150,[FCP1],{FCP1},HGNC


make a set of collisions
- some records have multiple collisions, need to explode to one per row so that when i change it back to a str and make a set it doesnt count two collisions seperated by a commma as one unique collision

In [352]:
ag_collision_hgnc_df = ag_collision_hgnc_df.explode(column="intersect_point")
ag_collision_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
25,ENSG00000002726,[AOC1],80,26,[DAO],DAO,HGNC
461,ENSG00000027001,[MIPEP],7104,4285,[MIP],MIP,HGNC
691,ENSG00000051341,[POLQ],9186,10721,[POLH],POLH,HGNC
698,ENSG00000052344,[PRSS8],9491,5652,[CAP1],CAP1,HGNC
807,ENSG00000060069,[CTDP1],2498,9150,[FCP1],FCP1,HGNC
...,...,...,...,...,...,...,...
42532,,[L1RE3],16899,,[LRE3],LRE3,HGNC
42980,,[MT-HPR],7438,,[HPR],HPR,HGNC
43306,,[PCDHA@],8662,56117,[CNR1],CNR1,HGNC
44660,,[SPG32],32314,,[SPG29],SPG29,HGNC


In [353]:
hgnc_alias_gene_collision_set = set(ag_collision_hgnc_df["intersect_point"])
len(hgnc_alias_gene_collision_set)

94

In [354]:
ag_collision_hgnc_df["gene_symbol"] = ag_collision_hgnc_df["gene_symbol"].str.join(", ")

In [355]:
hgnc_alias_gene_collision_primary_symbol_set = set(ag_collision_hgnc_df["gene_symbol"])
len(hgnc_alias_gene_collision_primary_symbol_set)

96

### Alphabetize alias_symbol

In [356]:
ag_collision_hgnc_df["alias_symbol"] = ag_collision_hgnc_df["alias_symbol"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_hgnc_df.head()

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
25,ENSG00000002726,AOC1,80,26,[DAO],DAO,HGNC
461,ENSG00000027001,MIPEP,7104,4285,[MIP],MIP,HGNC
691,ENSG00000051341,POLQ,9186,10721,[POLH],POLH,HGNC
698,ENSG00000052344,PRSS8,9491,5652,[CAP1],CAP1,HGNC
807,ENSG00000060069,CTDP1,2498,9150,[FCP1],FCP1,HGNC


In [357]:
print(type(ag_collision_hgnc_df.loc[25, "gene_symbol"]))

<class 'str'>


### Convert lists to str

In [358]:
ag_collision_hgnc_df["alias_symbol"] = ag_collision_hgnc_df["alias_symbol"].str.join(
    ", "
)

In [359]:
ag_collision_hgnc_df = ag_collision_hgnc_df.sort_values("intersect_point")
ag_collision_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
12161,ENSG00000167780,SOAT2,11178,8435,ACAT2,ACAT2,HGNC
22389,ENSG00000220267,ACTBP8,141,68,ACTBP2,ACTBP2,HGNC
10075,ENSG00000157500,APPL1,24035,26060,APPL,APPL,HGNC
1663,ENSG00000085662,AKR1B1,381,231,AR,AR,HGNC
13884,ENSG00000175711,B3GNTL1,21727,146712,B3GNT8,B3GNT8,HGNC
...,...,...,...,...,...,...,...
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC
9908,ENSG00000156096,UGT2B4,12553,7363,UGT2B11,UGT2B11,HGNC
8327,ENSG00000143258,USP21,12620,27005,USP16,USP16,HGNC
9829,ENSG00000155313,USP25,12624,29761,USP21,USP21,HGNC


In [360]:
ag_collision_hgnc_df = ag_collision_hgnc_df.drop_duplicates(
    subset=["gene_symbol", "intersect_point"], keep="first"
)
ag_collision_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
12161,ENSG00000167780,SOAT2,11178,8435,ACAT2,ACAT2,HGNC
22389,ENSG00000220267,ACTBP8,141,68,ACTBP2,ACTBP2,HGNC
10075,ENSG00000157500,APPL1,24035,26060,APPL,APPL,HGNC
1663,ENSG00000085662,AKR1B1,381,231,AR,AR,HGNC
13884,ENSG00000175711,B3GNTL1,21727,146712,B3GNT8,B3GNT8,HGNC
...,...,...,...,...,...,...,...
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC
9908,ENSG00000156096,UGT2B4,12553,7363,UGT2B11,UGT2B11,HGNC
8327,ENSG00000143258,USP21,12620,27005,USP16,USP16,HGNC
9829,ENSG00000155313,USP25,12624,29761,USP21,USP21,HGNC


## Add civic_gene_id to table

In [361]:
genes = civicpy.get_all_genes()

In [362]:
hgnc_NCBI_ID_list = list(ag_collision_hgnc_df["NCBI_ID"])

In [363]:
ag_collision_hgnc_df["NCBI_ID"] = ag_collision_hgnc_df["NCBI_ID"].str.replace(
    "nan", "000", regex=False
)
ag_collision_hgnc_df.head()

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source
12161,ENSG00000167780,SOAT2,11178,8435,ACAT2,ACAT2,HGNC
22389,ENSG00000220267,ACTBP8,141,68,ACTBP2,ACTBP2,HGNC
10075,ENSG00000157500,APPL1,24035,26060,APPL,APPL,HGNC
1663,ENSG00000085662,AKR1B1,381,231,AR,AR,HGNC
13884,ENSG00000175711,B3GNTL1,21727,146712,B3GNT8,B3GNT8,HGNC


In [364]:
ag_collision_hgnc_df["NCBI_ID"] = ag_collision_hgnc_df["NCBI_ID"].astype(int)

In [365]:
civic_ids = []
for entrez_id in hgnc_NCBI_ID_list:
    found_civic_id = ""
    for gene in genes:
        if entrez_id == str(gene.entrez_id):
            found_civic_id = gene.id
    civic_ids.append(found_civic_id)

ag_collision_hgnc_df["civic_ids"] = civic_ids
ag_collision_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids
12161,ENSG00000167780,SOAT2,11178,8435,ACAT2,ACAT2,HGNC,
22389,ENSG00000220267,ACTBP8,141,68,ACTBP2,ACTBP2,HGNC,
10075,ENSG00000157500,APPL1,24035,26060,APPL,APPL,HGNC,
1663,ENSG00000085662,AKR1B1,381,231,AR,AR,HGNC,
13884,ENSG00000175711,B3GNTL1,21727,146712,B3GNT8,B3GNT8,HGNC,
...,...,...,...,...,...,...,...,...
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC,12422
9908,ENSG00000156096,UGT2B4,12553,7363,UGT2B11,UGT2B11,HGNC,
8327,ENSG00000143258,USP21,12620,27005,USP16,USP16,HGNC,
9829,ENSG00000155313,USP25,12624,29761,USP21,USP21,HGNC,


In [366]:
ag_collision_hgnc_df["civic_ids"].replace("", np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ag_collision_hgnc_df["civic_ids"].replace("", np.nan, inplace=True)
  ag_collision_hgnc_df["civic_ids"].replace("", np.nan, inplace=True)


In [367]:
civic_ag_collision_hgnc_df = ag_collision_hgnc_df.copy()
civic_ag_collision_hgnc_df.dropna(subset=["civic_ids"], inplace=True)
civic_ag_collision_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids
2649,ENSG00000102265,TIMP1,11820,7076,EPO,EPO,HGNC,5777.0
4266,ENSG00000114353,GNAI2,4385,2771,GIP,GIP,HGNC,2312.0
6084,ENSG00000129071,MBD4,6919,8930,MED1,MED1,HGNC,7084.0
10780,ENSG00000162733,DDR2,2731,4921,TKT,TKT,HGNC,51.0
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC,12422.0


In [368]:
ag_collision_hgnc_df.to_csv(
    "../created_files/ag_collision_hgnc_df_w_entrez_id.csv", index=False
)

## Add MP ID & score

In [369]:
civic_ag_collision_hgnc_df_add_molprof_df = transform_df_mp_id(
    civic_ag_collision_hgnc_df
)
civic_ag_collision_hgnc_df_add_molprof_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,molecular_profile_id
2649,ENSG00000102265,TIMP1,11820,7076,EPO,EPO,HGNC,5777.0,
4266,ENSG00000114353,GNAI2,4385,2771,GIP,GIP,HGNC,2312.0,[2185]
6084,ENSG00000129071,MBD4,6919,8930,MED1,MED1,HGNC,7084.0,
10780,ENSG00000162733,DDR2,2731,4921,TKT,TKT,HGNC,51.0,
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC,12422.0,


In [370]:
civic_ag_collision_hgnc_df_add_molprof_score_df = transform_df_mp_score(
    civic_ag_collision_hgnc_df_add_molprof_df
)
civic_ag_collision_hgnc_df_add_molprof_score_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,molecular_profile_id,molecular_profile_score
2649,ENSG00000102265,TIMP1,11820,7076,EPO,EPO,HGNC,5777.0,,
4266,ENSG00000114353,GNAI2,4385,2771,GIP,GIP,HGNC,2312.0,[2185],[0.0]
6084,ENSG00000129071,MBD4,6919,8930,MED1,MED1,HGNC,7084.0,,
10780,ENSG00000162733,DDR2,2731,4921,TKT,TKT,HGNC,51.0,,
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC,12422.0,,


## Add evidence_ids column

In [371]:
civic_variant_ids = civicpy.get_all_variants()

In [372]:
hgnc_civic_id_list = list(civic_ag_collision_hgnc_df["civic_ids"])

In [373]:
civic_ag_collision_hgnc_df.civic_ids.astype(int)

2649      5777
4266      2312
6084      7084
10780       51
30806    12422
Name: civic_ids, dtype: int64

In [374]:
civic_eids = []
for civic_id in hgnc_civic_id_list:
    gene_civic_eids = []

    for variant in civic_variant_ids:
        if int(civic_id) == variant.gene_id:
            for mp in variant.molecular_profiles:
                for e in mp.evidence_items:
                    if e.id not in gene_civic_eids:
                        gene_civic_eids.append(e.id)

            gene_civic_eids = gene_civic_eids or ""
    civic_eids.append(gene_civic_eids)
civic_ag_collision_hgnc_df["civic_eids"] = civic_eids
civic_ag_collision_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,civic_eids
2649,ENSG00000102265,TIMP1,11820,7076,EPO,EPO,HGNC,5777.0,"[927, 901]"
4266,ENSG00000114353,GNAI2,4385,2771,GIP,GIP,HGNC,2312.0,"[11943, 11839]"
6084,ENSG00000129071,MBD4,6919,8930,MED1,MED1,HGNC,7084.0,[7596]
10780,ENSG00000162733,DDR2,2731,4921,TKT,TKT,HGNC,51.0,"[266, 267, 268, 9853, 269, 270, 271, 9854, 272]"
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC,12422.0,"[1792, 1795]"


In [375]:
civic_evidence_list = civicpy.get_all_evidence()

In [376]:
hgnc_civic_eid_list = list(civic_ag_collision_hgnc_df["civic_eids"])
hgnc_civic_eid_list[0]

[927, 901]

In [377]:
evidence_ratings = []
for eid_list in hgnc_civic_eid_list:
    found_rating = []
    for eid in eid_list:
        for evidence in civic_evidence_list:
            if eid == evidence.id:
                found_rating.append(evidence.rating)
    evidence_ratings.append(found_rating)
civic_ag_collision_hgnc_df["evidence_ratings"] = evidence_ratings
civic_ag_collision_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,civic_eids,evidence_ratings
2649,ENSG00000102265,TIMP1,11820,7076,EPO,EPO,HGNC,5777.0,"[927, 901]","[2, 2]"
4266,ENSG00000114353,GNAI2,4385,2771,GIP,GIP,HGNC,2312.0,"[11943, 11839]","[4, 3]"
6084,ENSG00000129071,MBD4,6919,8930,MED1,MED1,HGNC,7084.0,[7596],[4]
10780,ENSG00000162733,DDR2,2731,4921,TKT,TKT,HGNC,51.0,"[266, 267, 268, 9853, 269, 270, 271, 9854, 272]","[4, 4, 4, 2, 4, 4, 4, 3, 3]"
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC,12422.0,"[1792, 1795]","[5, 4]"


In [378]:
average_evidence_rating = []
for alist in civic_ag_collision_hgnc_df["evidence_ratings"]:
    avg_rating = sum(alist) / len(alist)
    average_evidence_rating.append(avg_rating)
civic_ag_collision_hgnc_df["average_evidence_ratings"] = average_evidence_rating
civic_ag_collision_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,civic_eids,evidence_ratings,average_evidence_ratings
2649,ENSG00000102265,TIMP1,11820,7076,EPO,EPO,HGNC,5777.0,"[927, 901]","[2, 2]",2.0
4266,ENSG00000114353,GNAI2,4385,2771,GIP,GIP,HGNC,2312.0,"[11943, 11839]","[4, 3]",3.5
6084,ENSG00000129071,MBD4,6919,8930,MED1,MED1,HGNC,7084.0,[7596],[4],4.0
10780,ENSG00000162733,DDR2,2731,4921,TKT,TKT,HGNC,51.0,"[266, 267, 268, 9853, 269, 270, 271, 9854, 272]","[4, 4, 4, 2, 4, 4, 4, 3, 3]",3.555556
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC,12422.0,"[1792, 1795]","[5, 4]",4.5


In [379]:
sum_evidence_rating = []
for alist in civic_ag_collision_hgnc_df["evidence_ratings"]:
    sum_rating = sum(alist)
    sum_evidence_rating.append(sum_rating)
civic_ag_collision_hgnc_df["sum_evidence_rating"] = sum_evidence_rating
civic_ag_collision_hgnc_df

Unnamed: 0,ENSG_ID,gene_symbol,HGNC_ID,NCBI_ID,alias_symbol,intersect_point,source,civic_ids,civic_eids,evidence_ratings,average_evidence_ratings,sum_evidence_rating
2649,ENSG00000102265,TIMP1,11820,7076,EPO,EPO,HGNC,5777.0,"[927, 901]","[2, 2]",2.0,4
4266,ENSG00000114353,GNAI2,4385,2771,GIP,GIP,HGNC,2312.0,"[11943, 11839]","[4, 3]",3.5,7
6084,ENSG00000129071,MBD4,6919,8930,MED1,MED1,HGNC,7084.0,[7596],[4],4.0,4
10780,ENSG00000162733,DDR2,2731,4921,TKT,TKT,HGNC,51.0,"[266, 267, 268, 9853, 269, 270, 271, 9854, 272]","[4, 4, 4, 2, 4, 4, 4, 3, 3]",3.555556,32
30806,ENSG00000241635,UGT1A1,12530,54658,UGT1A,UGT1A,HGNC,12422.0,"[1792, 1795]","[5, 4]",4.5,9


# NCBI Info

In [380]:
mini_ncbi_df = pd.read_csv("../downloaded_files/Homo_sapiens.gene_info20240627", sep="\t")

### Drop all columns besides ENSG_ID, gene_symbol, and alias_symbol

In [381]:
mini_ncbi_df = mini_ncbi_df[
["GeneID", "Symbol", "Synonyms", "dbXrefs"]
]
mini_ncbi_df = mini_ncbi_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Synonyms": "alias_symbol"}
)
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,dbXrefs
0,1,A1BG,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...
1,2,A2M,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...
2,3,A2MP1,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...
3,9,NAT1,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...
4,10,NAT2,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...


Split dbXrefs into individual columns

In [382]:
mini_ncbi_df = mini_ncbi_df.assign(
    MIM=np.nan,
    HGNC_ID=np.nan,
    ENSG_ID=np.nan,
    AllianceGenome=np.nan,
    MIRbase=np.nan,
    IMGTgene_db=np.nan,
    dash=np.nan,
    unknown=np.nan,
)

In [383]:
index_pos = 0

print(len(mini_ncbi_df))
while index_pos < len(mini_ncbi_df):
    xrefs = mini_ncbi_df["dbXrefs"][index_pos].split("|")

    for xref in xrefs:
        xref = xref.lower()
        if xref.startswith("mim:"):
            xref = xref.replace("mim:", "")
            mini_ncbi_df["MIM"][index_pos] = xref
        elif xref.startswith("hgnc:hgnc:"):
            xref = xref.replace("hgnc:hgnc:", "")
            mini_ncbi_df["HGNC_ID"][index_pos] = xref
        elif xref.startswith("ensembl:"):
            xref = xref.replace("ensembl:", "")
            mini_ncbi_df["ENSG_ID"][index_pos] = xref
        elif xref.startswith("alliancegenome:"):
            xref = xref.replace("alliancegenome:", "")
            mini_ncbi_df["AllianceGenome"][index_pos] = xref
        elif xref.startswith("mirbase"):
            xref = xref.replace("mirbase:", "")
            mini_ncbi_df["MIRbase"][index_pos] = xref
        elif xref.startswith("imgt/gene-db:"):
            xref = xref.replace("imgt/gene-db:", "")
            mini_ncbi_df["IMGTgene_db"][index_pos] = xref
        elif xref.startswith("-"):
            mini_ncbi_df["dash"][index_pos] = xref
        else:
            mini_ncbi_df["unknown"][index_pos] = xref

    index_pos += 1
    pass

print(index_pos)

193456


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  mini_ncbi_df["MIM"][index_pos] = xref
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["MIM"][index

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  mini_ncbi_df["MIRbase"][index_pos] = xref
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["MIRbase

193456


In [384]:
mini_ncbi_df["ENSG_ID"] = mini_ncbi_df["ENSG_ID"].str.replace("ensg", "ENSG", 1)

In [385]:
mini_ncbi_df = mini_ncbi_df.drop(
    [
        "AllianceGenome",
        "MIRbase",
        "IMGTgene_db",
        "dash",
        "unknown",
        "dbXrefs",
        "MIM",
    ],
    axis=1,
)
mini_ncbi_df = mini_ncbi_df.rename(columns={"HGNC_ID": "HGNC_ID"})
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B|ABG|GAB|HYST2477,5,ENSG00000121410
1,2,A2M,A2MD|CPAMD5|FWP007|S863-7,7,ENSG00000175899
2,3,A2MP1,A2MP,8,ENSG00000291190
3,9,NAT1,AAC1|MNAT|NAT-1|NATI,7645,ENSG00000171428
4,10,NAT2,AAC2|NAT-2|PNAT,7646,ENSG00000156006
...,...,...,...,...,...
193451,8923215,trnD,-,,
193452,8923216,trnP,-,,
193453,8923217,trnA,-,,
193454,8923218,COX1,-,,


In [386]:
mini_ncbi_df.to_csv("../created_files/mini_ncbi_df.csv", index=False)
mini_ncbi_df.head()

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B|ABG|GAB|HYST2477,5,ENSG00000121410
1,2,A2M,A2MD|CPAMD5|FWP007|S863-7,7,ENSG00000175899
2,3,A2MP1,A2MP,8,ENSG00000291190
3,9,NAT1,AAC1|MNAT|NAT-1|NATI,7645,ENSG00000171428
4,10,NAT2,AAC2|NAT-2|PNAT,7646,ENSG00000156006


One gene symbol may have multiple NCBI IDs

In [387]:
mini_ncbi_df["symbol_duplicates"] = mini_ncbi_df.duplicated(
    subset="gene_symbol", keep=False
)
dup_symbol_mini_ncbi_df = mini_ncbi_df[mini_ncbi_df["symbol_duplicates"] == True]

In [388]:
dup_symbol_mini_ncbi_df = dup_symbol_mini_ncbi_df.sort_values("gene_symbol")
dup_symbol_mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,symbol_duplicates
193408,6775087,12S rRNA,-,,,True
193449,8923213,12S rRNA,-,,,True
193424,8923188,ATP6,-,,,True
3541,4508,ATP6,ATPase6|MTATP6,7414,,True
193395,6775074,ATP6,-,,,True
...,...,...,...,...,...,...
193411,6775090,trnV,-,,,True
193430,8923194,trnW,-,,,True
193413,6775092,trnW,-,,,True
193403,6775082,trnY,-,,,True


In [389]:
mini_ncbi_df = mini_ncbi_df.drop(["symbol_duplicates"], axis=1)

### Make a set of primary gene symbols

In [390]:
ncbi_gene_symbol_set = set(mini_ncbi_df["gene_symbol"])

In [391]:
total_number_ncbi_gene_symbols = len(ncbi_gene_symbol_set)
total_number_ncbi_gene_symbols

193303

Drop genes with no aliases

In [392]:
mini_ncbi_df = mini_ncbi_df.replace("-", np.nan)
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B|ABG|GAB|HYST2477,5,ENSG00000121410
1,2,A2M,A2MD|CPAMD5|FWP007|S863-7,7,ENSG00000175899
2,3,A2MP1,A2MP,8,ENSG00000291190
3,9,NAT1,AAC1|MNAT|NAT-1|NATI,7645,ENSG00000171428
4,10,NAT2,AAC2|NAT-2|PNAT,7646,ENSG00000156006
...,...,...,...,...,...
193451,8923215,trnD,,,
193452,8923216,trnP,,,
193453,8923217,trnA,,,
193454,8923218,COX1,,,


In [393]:
mini_ncbi_df = mini_ncbi_df.dropna(subset=["alias_symbol"])
mini_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,A1B|ABG|GAB|HYST2477,5,ENSG00000121410
1,2,A2M,A2MD|CPAMD5|FWP007|S863-7,7,ENSG00000175899
2,3,A2MP1,A2MP,8,ENSG00000291190
3,9,NAT1,AAC1|MNAT|NAT-1|NATI,7645,ENSG00000171428
4,10,NAT2,AAC2|NAT-2|PNAT,7646,ENSG00000156006
...,...,...,...,...,...
190958,131696449,LOC131696449,PKD1P1-NPIPA5L,,
190961,131840634,GLTC1,GLTC,56861,
193342,132532400,GABRA6-AS1,ARBAG,40248,
193377,133395150,LNCARGI,ARGI,56890,


### Make each row in alias_symbol a set:
    covert to a list 
    make a set

In [394]:
# alias_symbol_sets = mini_ncbi_df.alias_symbol

In [395]:
mini_ncbi_df["alias_symbol"] = mini_ncbi_df["alias_symbol"].astype(str)
mini_ncbi_df["alias_symbol"] = [x.split("|") for x in mini_ncbi_df.alias_symbol]
mini_ncbi_df["alias_symbol"] = np.where(
    mini_ncbi_df.alias_symbol == " ", " ", mini_ncbi_df.alias_symbol.map(set)
)
mini_ncbi_df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["alias_symbol"] = mini_ncbi_df["alias_symbol"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["alias_symbol"] = [x.split("|") for x in mini_ncbi_df.alias_symbol]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["alias_symbol"] = np.where(


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID
0,1,A1BG,"{HYST2477, ABG, GAB, A1B}",5,ENSG00000121410


### Add test for false positives in the intersection points
#### (places where x in alias_symbol matches x in mini_hgnc_df.gene_symbol in the same row)

In [396]:
mini_ncbi_df["gene_symbol"] = [x.split(";") for x in mini_ncbi_df.gene_symbol]
mini_ncbi_df["gene_symbol"] = np.where(
    mini_ncbi_df.gene_symbol == "", "", mini_ncbi_df.gene_symbol.map(set)
)
false_pos_mini_ncbi_df = mini_ncbi_df[
    mini_ncbi_df.alias_symbol.apply(lambda x: x & ncbi_gene_symbol_set)
    == mini_ncbi_df.gene_symbol
]
false_pos_mini_ncbi_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["gene_symbol"] = [x.split(";") for x in mini_ncbi_df.gene_symbol]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["gene_symbol"] = np.where(


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID


In [397]:
mini_ncbi_df["alias_symbol"] = mini_ncbi_df.alias_symbol - mini_ncbi_df.gene_symbol
display(mini_ncbi_df.iloc[7805])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["alias_symbol"] = mini_ncbi_df.alias_symbol - mini_ncbi_df.gene_symbol


NCBI_ID                       10944
gene_symbol              {C11orf58}
alias_symbol    {SMAP, IMAGE145052}
HGNC_ID                       16990
ENSG_ID             ENSG00000110696
Name: 8575, dtype: object

### Find intersection points using alias symbol sets and gene_symbol

In [398]:
mini_ncbi_df["intersect_point"] = mini_ncbi_df.alias_symbol.apply(
    lambda x: x & ncbi_gene_symbol_set
)
ag_collision_ncbi_df = mini_ncbi_df[mini_ncbi_df.intersect_point != set()]
ag_collision_ncbi_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_ncbi_df["intersect_point"] = mini_ncbi_df.alias_symbol.apply(


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,intersect_point
9,15,{AANAT},"{DSPS, SNAT}",19,ENSG00000129673,{DSPS}
20,26,{AOC1},"{DAO, ABP1, ABP, DAO1, KAO, KDAO}",80,ENSG00000002726,{DAO}
34,40,{ASIC2},"{ACCN, BNaC1, BNC1, ASIC2a, MDEG, ACCN1, hBNaC1}",99,ENSG00000108684,{BNC1}
44,53,{ACP2},{LAP},123,ENSG00000134575,{LAP}
56,68,{ACTBP8},{ACTBP2},141,,{ACTBP2}
...,...,...,...,...,...,...
70125,124905743,{LOC124905743},{FCGR3B},,,{FCGR3B}
70798,124906461,{LOC124906461},{DUX4},,,{DUX4}
74259,125316803,{POLGARF},"{ORF-Y, POLG}",56246,ENSG00000291307,{POLG}
140700,127898561,{MIURF},"{MIEF1-MP, AltMIEF1, AltMiD51, MIEF1}",,ENSG00000285025,{MIEF1}


### Replace NAN in ENSG_ID

In [399]:
ag_collision_ncbi_df["ENSG_ID"] = ag_collision_ncbi_df["ENSG_ID"].replace("NAN", "nan")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ncbi_df["ENSG_ID"] = ag_collision_ncbi_df["ENSG_ID"].replace("NAN", "nan")


In [400]:
ag_collision_ncbi_df["source"] = "NCBI Info"
ag_collision_ncbi_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ncbi_df["source"] = "NCBI Info"


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,intersect_point,source
9,15,{AANAT},"{DSPS, SNAT}",19,ENSG00000129673,{DSPS},NCBI Info
20,26,{AOC1},"{DAO, ABP1, ABP, DAO1, KAO, KDAO}",80,ENSG00000002726,{DAO},NCBI Info
34,40,{ASIC2},"{ACCN, BNaC1, BNC1, ASIC2a, MDEG, ACCN1, hBNaC1}",99,ENSG00000108684,{BNC1},NCBI Info
44,53,{ACP2},{LAP},123,ENSG00000134575,{LAP},NCBI Info
56,68,{ACTBP8},{ACTBP2},141,,{ACTBP2},NCBI Info


### Convert sets to str or list

In [401]:
# Assertion error when when run more than once
ag_collision_ncbi_df["gene_symbol"] = ag_collision_ncbi_df["gene_symbol"].apply(list)
ag_collision_ncbi_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ncbi_df["gene_symbol"] = ag_collision_ncbi_df["gene_symbol"].apply(list)


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,intersect_point,source
9,15,[AANAT],"{DSPS, SNAT}",19,ENSG00000129673,{DSPS},NCBI Info
20,26,[AOC1],"{DAO, ABP1, ABP, DAO1, KAO, KDAO}",80,ENSG00000002726,{DAO},NCBI Info
34,40,[ASIC2],"{ACCN, BNaC1, BNC1, ASIC2a, MDEG, ACCN1, hBNaC1}",99,ENSG00000108684,{BNC1},NCBI Info
44,53,[ACP2],{LAP},123,ENSG00000134575,{LAP},NCBI Info
56,68,[ACTBP8],{ACTBP2},141,,{ACTBP2},NCBI Info


In [402]:
ag_collision_ncbi_df["ENSG_ID"] = ag_collision_ncbi_df["ENSG_ID"].astype(str)
ag_collision_ncbi_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ncbi_df["ENSG_ID"] = ag_collision_ncbi_df["ENSG_ID"].astype(str)


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,intersect_point,source
9,15,[AANAT],"{DSPS, SNAT}",19,ENSG00000129673,{DSPS},NCBI Info
20,26,[AOC1],"{DAO, ABP1, ABP, DAO1, KAO, KDAO}",80,ENSG00000002726,{DAO},NCBI Info
34,40,[ASIC2],"{ACCN, BNaC1, BNC1, ASIC2a, MDEG, ACCN1, hBNaC1}",99,ENSG00000108684,{BNC1},NCBI Info
44,53,[ACP2],{LAP},123,ENSG00000134575,{LAP},NCBI Info
56,68,[ACTBP8],{ACTBP2},141,,{ACTBP2},NCBI Info


### Alphabetize alias_symbol

In [403]:
ag_collision_ncbi_df["alias_symbol"] = ag_collision_ncbi_df["alias_symbol"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_ncbi_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ncbi_df["alias_symbol"] = ag_collision_ncbi_df["alias_symbol"].apply(


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,intersect_point,source
9,15,[AANAT],"[DSPS, SNAT]",19,ENSG00000129673,{DSPS},NCBI Info
20,26,[AOC1],"[ABP, ABP1, DAO, DAO1, KAO, KDAO]",80,ENSG00000002726,{DAO},NCBI Info
34,40,[ASIC2],"[ACCN, ACCN1, ASIC2a, BNaC1, BNC1, hBNaC1, MDEG]",99,ENSG00000108684,{BNC1},NCBI Info
44,53,[ACP2],[LAP],123,ENSG00000134575,{LAP},NCBI Info
56,68,[ACTBP8],[ACTBP2],141,,{ACTBP2},NCBI Info


make a set of collisions
- some records have multiple collisions, need to explode to one per row so that when i change it back to a str and make a set it doesnt count two collisions seperated by a commma as one unique collision

In [404]:
intersect_explode_ag_collision_ncbi_df = ag_collision_ncbi_df.explode(
    column="intersect_point"
)
intersect_explode_ag_collision_ncbi_df

Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,intersect_point,source
9,15,[AANAT],"[DSPS, SNAT]",19,ENSG00000129673,DSPS,NCBI Info
20,26,[AOC1],"[ABP, ABP1, DAO, DAO1, KAO, KDAO]",80,ENSG00000002726,DAO,NCBI Info
34,40,[ASIC2],"[ACCN, ACCN1, ASIC2a, BNaC1, BNC1, hBNaC1, MDEG]",99,ENSG00000108684,BNC1,NCBI Info
44,53,[ACP2],[LAP],123,ENSG00000134575,LAP,NCBI Info
56,68,[ACTBP8],[ACTBP2],141,,ACTBP2,NCBI Info
...,...,...,...,...,...,...,...
70125,124905743,[LOC124905743],[FCGR3B],,,FCGR3B,NCBI Info
70798,124906461,[LOC124906461],[DUX4],,,DUX4,NCBI Info
74259,125316803,[POLGARF],"[ORF-Y, POLG]",56246,ENSG00000291307,POLG,NCBI Info
140700,127898561,[MIURF],"[AltMiD51, AltMIEF1, MIEF1, MIEF1-MP]",,ENSG00000285025,MIEF1,NCBI Info


In [405]:
ncbi_alias_gene_collision_set = set(
    intersect_explode_ag_collision_ncbi_df["intersect_point"]
)
len(ncbi_alias_gene_collision_set)

1554

In [406]:
intersect_explode_ag_collision_ncbi_df["gene_symbol"] = (
    intersect_explode_ag_collision_ncbi_df["gene_symbol"].str.join(", ")
)

In [407]:
ncbi_alias_gene_collision_primary_symbol_set = set(
    intersect_explode_ag_collision_ncbi_df["gene_symbol"]
)
len(ncbi_alias_gene_collision_primary_symbol_set)

1688

### Alphabetize intersect_point

In [408]:
ag_collision_ncbi_df["intersect_point"] = ag_collision_ncbi_df["intersect_point"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_ncbi_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ncbi_df["intersect_point"] = ag_collision_ncbi_df["intersect_point"].apply(


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,intersect_point,source
9,15,[AANAT],"[DSPS, SNAT]",19,ENSG00000129673,[DSPS],NCBI Info
20,26,[AOC1],"[ABP, ABP1, DAO, DAO1, KAO, KDAO]",80,ENSG00000002726,[DAO],NCBI Info
34,40,[ASIC2],"[ACCN, ACCN1, ASIC2a, BNaC1, BNC1, hBNaC1, MDEG]",99,ENSG00000108684,[BNC1],NCBI Info
44,53,[ACP2],[LAP],123,ENSG00000134575,[LAP],NCBI Info
56,68,[ACTBP8],[ACTBP2],141,,[ACTBP2],NCBI Info


### Convert lists to str

In [409]:
ag_collision_ncbi_df["gene_symbol"] = ag_collision_ncbi_df["gene_symbol"].str.join(", ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ncbi_df["gene_symbol"] = ag_collision_ncbi_df["gene_symbol"].str.join(", ")


In [410]:
ag_collision_ncbi_df["alias_symbol"] = ag_collision_ncbi_df["alias_symbol"].str.join(
    ", "
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ncbi_df["alias_symbol"] = ag_collision_ncbi_df["alias_symbol"].str.join(


In [411]:
ag_collision_ncbi_df["intersect_point"] = ag_collision_ncbi_df[
    "intersect_point"
].str.join(", ")
ag_collision_ncbi_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ag_collision_ncbi_df["intersect_point"] = ag_collision_ncbi_df[


Unnamed: 0,NCBI_ID,gene_symbol,alias_symbol,HGNC_ID,ENSG_ID,intersect_point,source
9,15,AANAT,"DSPS, SNAT",19,ENSG00000129673,DSPS,NCBI Info
20,26,AOC1,"ABP, ABP1, DAO, DAO1, KAO, KDAO",80,ENSG00000002726,DAO,NCBI Info
34,40,ASIC2,"ACCN, ACCN1, ASIC2a, BNaC1, BNC1, hBNaC1, MDEG",99,ENSG00000108684,BNC1,NCBI Info
44,53,ACP2,LAP,123,ENSG00000134575,LAP,NCBI Info
56,68,ACTBP8,ACTBP2,141,,ACTBP2,NCBI Info
...,...,...,...,...,...,...,...
70125,124905743,LOC124905743,FCGR3B,,,FCGR3B,NCBI Info
70798,124906461,LOC124906461,DUX4,,,DUX4,NCBI Info
74259,125316803,POLGARF,"ORF-Y, POLG",56246,ENSG00000291307,POLG,NCBI Info
140700,127898561,MIURF,"AltMiD51, AltMIEF1, MIEF1, MIEF1-MP",,ENSG00000285025,MIEF1,NCBI Info


# Merge 3 sets together

In [412]:
merged_alias_gene_intersections_df = pd.concat(
    [
        ag_collision_hgnc_df[
            ["gene_symbol", "alias_symbol", "intersect_point", "source"]
        ],
        ag_collision_ncbi_df[
            ["gene_symbol", "alias_symbol", "intersect_point", "source"]
        ],
        ag_collision_ensg_df[
            ["gene_symbol", "alias_symbol", "intersect_point", "source"]
        ],
    ]
)
merged_alias_gene_intersections_df

Unnamed: 0,gene_symbol,alias_symbol,intersect_point,source
12161,SOAT2,ACAT2,ACAT2,HGNC
22389,ACTBP8,ACTBP2,ACTBP2,HGNC
10075,APPL1,APPL,APPL,HGNC
1663,AKR1B1,AR,AR,HGNC
13884,B3GNTL1,B3GNT8,B3GNT8,HGNC
...,...,...,...,...
8401,USP21,"USP23, USP16",USP16,ENSG
9928,USP25,USP21,USP21,ENSG
54879,VDAC1P5,"VDAC5P, VDAC3",VDAC3,ENSG
42358,XBP1P1,"XBPP1, XBP1",XBP1,ENSG


In [413]:
merged_alias_gene_intersections_df.loc[
    merged_alias_gene_intersections_df["intersect_point"] == "CFM1"
]

Unnamed: 0,gene_symbol,alias_symbol,intersect_point,source
22292,RFLNB,"CFM1, FAM101B",CFM1,NCBI Info


# Convert to csv

In [414]:
merged_alias_gene_intersections_df.to_csv(
    "../created_files/merged_alias_gene_intersections.csv", index=False
)

In [415]:
print(merged_alias_gene_intersections_df["source"].value_counts())

source
NCBI Info    1688
ENSG          242
HGNC           96
Name: count, dtype: int64


In [416]:
common_ag_collisions = (
    ncbi_alias_gene_collision_primary_symbol_set
    & hgnc_alias_gene_collision_primary_symbol_set
    & ensg_alias_gene_collision_primary_symbol_set
)
common_ag_collisions

{'ACOD1',
 'ACTBP8',
 'B3GNTL1',
 'CADPS2',
 'CDH19',
 'CPA4',
 'CPNE1',
 'CPNE2',
 'CYCSP5',
 'FABP5P1',
 'GAL3ST3',
 'GALK2',
 'GALP',
 'GET1',
 'GLB1',
 'GNAI2',
 'HOOK1',
 'HOOK2',
 'HOOK3',
 'INSL6',
 'KIF2A',
 'LAMA4',
 'LGALS12',
 'LZTS1',
 'MB',
 'MBD4',
 'MGAM',
 'MIPEP',
 'MLF2',
 'NANOGP2',
 'NANOGP4',
 'NANOGP7',
 'NELL2',
 'NPPC',
 'NUDT19',
 'PAK6',
 'PDPK1',
 'PMEPA1',
 'PMS2CL',
 'PPIAP10',
 'PPY',
 'PRSS8',
 'PTPN18',
 'RGPD1',
 'RGS18',
 'RHEBP1',
 'RIN2',
 'RNASE1',
 'RNASE12',
 'RNASE7',
 'RNASEH1P1',
 'RSC1A1',
 'SEC14L3',
 'SEMA3F-AS1',
 'SERPINB8',
 'SMOC2',
 'SOAT2',
 'SYCP2',
 'UGT2B4',
 'USP21',
 'USP25'}

In [417]:
len(common_ag_collisions)

61