### Import 

In [9]:
import pandas as pd
import numpy as np
from civicpy import civic as civicpy

from civicpy import LOCAL_CACHE_PATH

In [10]:
# Load civicpy's cache from the LOCAL_CACHE_PATH exported by the civicpy package.
# NOTE(review): on_stale="ignore" presumably skips refreshing a stale cache — confirm against the civicpy docs.
civicpy.load_cache(local_cache_path=LOCAL_CACHE_PATH, on_stale="ignore")

# Molecular profiles for all three listed statuses; read later by transform_df_mp_score.
civic_molprof_ids = civicpy.get_all_molecular_profiles(
    include_status=["accepted", "submitted", "rejected"]
)

UnpicklingError: pickle data was truncated

In [None]:
civic_variant_ids = civicpy.get_all_variants()

In [None]:
def transform_df_mp_id(df: pd.DataFrame) -> pd.DataFrame:
    """Transform dataframe to include molecular profile ID information

    Every value of the ``civic_ids`` column is cast to ``int`` and matched
    against ``variant.id`` in the module-level ``civic_variant_ids`` collection.
    The input dataframe is not modified.

    NOTE(review): elsewhere in this notebook ``civic_ids`` is filled with CIViC
    *gene* IDs (``gene.id``), while this function matches *variant* IDs —
    confirm which identifier is intended.

    :param df: Dataframe of variants; must contain an int-convertible
        ``civic_ids`` column
    :return: Copy of ``df`` with a ``molecular_profile_id`` column holding, per
        row, the list of unique molecular profile IDs (first-seen order) or
        ``""`` when nothing matched
    """
    tmp_df = df.copy(deep=True)

    # Index the variants once so each row is a dict lookup instead of a full
    # scan of every CIViC variant.
    molprof_ids_by_variant = {}
    for variant in civic_variant_ids:
        known_ids = molprof_ids_by_variant.setdefault(variant.id, [])
        for mp in variant.molecular_profiles:
            if mp.id not in known_ids:
                known_ids.append(mp.id)

    # list(...) gives every row its own list; "" marks rows without any match.
    tmp_df["molecular_profile_id"] = [
        list(molprof_ids_by_variant.get(int(v_id), [])) or ""
        for v_id in tmp_df["civic_ids"]
    ]
    return tmp_df

In [None]:
def transform_df_mp_score(df: pd.DataFrame) -> pd.DataFrame:
    """Transform dataframe to include molecular profile score information

    Each entry of the ``molecular_profile_id`` column (an iterable of IDs, or
    ``""``) is resolved against the module-level ``civic_molprof_ids``
    collection. Like ``transform_df_mp_id``, this works on a copy so the
    caller's dataframe is left untouched.

    :param df: Dataframe of variants; must contain a ``molecular_profile_id``
        column
    :return: Copy of ``df`` with a ``molecular_profile_score`` column holding,
        per row, the list of distinct scores (first-seen order) or ``""`` when
        no profile matched
    """
    tmp_df = df.copy(deep=True)

    # Index the profiles once instead of rescanning them for every ID.
    scores_by_molprof_id = {}
    for molprof in civic_molprof_ids:
        scores_by_molprof_id.setdefault(molprof.id, []).append(
            molprof.molecular_profile_score
        )

    variants_molprof_scores = []
    for mp_ids in tmp_df["molecular_profile_id"]:
        row_scores = []
        for mp_id in mp_ids:
            for score in scores_by_molprof_id.get(int(mp_id), ()):
                # Scores (not profiles) are de-duplicated, as before.
                if score not in row_scores:
                    row_scores.append(score)
        variants_molprof_scores.append(row_scores or "")

    tmp_df["molecular_profile_score"] = variants_molprof_scores
    return tmp_df

# Ensembl

In [None]:
mini_ensg_df = pd.read_csv(
    "../downloaded_files/ensg_biomart_gene20240626.txt", sep="\t",dtype={"NCBI gene (formerly Entrezgene) ID": pd.Int64Dtype()}
)
mini_ensg_df = mini_ensg_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Gene Synonym": "alias_symbol",
        "Gene name": "gene_symbol",
        "Gene stable ID": "ENSG_ID",
        "NCBI gene (formerly Entrezgene) ID": "NCBI_ID",
    }
)
mini_ensg_df

In [None]:
mini_ensg_df["alias_symbol"] = mini_ensg_df["alias_symbol"].fillna("").astype(str)
mini_ensg_df = (
    mini_ensg_df.groupby(
        ["ENSG_ID", "gene_symbol", "HGNC_ID", "NCBI_ID"], dropna=False
    )["alias_symbol"]
    .apply(lambda x: ",".join(x.dropna()))
    .reset_index()
)
mini_ensg_df

In [None]:
# Remove only the literal "HGNC:" prefix. str.lstrip("HGNC:") strips any
# leading run of the characters H, G, N, C and ":" rather than the prefix, so
# it only gave the right answer because the remainder starts with a digit.
mini_ensg_df["HGNC_ID"] = mini_ensg_df["HGNC_ID"].str.replace(
    r"^HGNC:", "", regex=True
)
mini_ensg_df

In [None]:
mini_ensg_df.to_csv("../created_files/mini_ensg_df.csv", index=False)
mini_ensg_df.head()

One gene symbol may have multiple NCBI IDs

In [None]:
mini_ensg_df["symbol_duplicates"] = mini_ensg_df.duplicated(
    subset="gene_symbol", keep=False
)
dup_symbol_mini_ensg_df = mini_ensg_df[mini_ensg_df["symbol_duplicates"]]
dup_symbol_mini_ensg_df

In [None]:
mini_ensg_df.loc[mini_ensg_df["gene_symbol"] == "ST7"]

In [None]:
mini_ensg_df = mini_ensg_df.drop("symbol_duplicates", axis=1)
mini_ensg_df

### Make a set of the primary gene symbols

In [None]:
ensg_gene_symbol_set = set(mini_ensg_df["gene_symbol"])

In [None]:
len(ensg_gene_symbol_set)

In [None]:
total_number_ensembl_gene_symbols = len(ensg_gene_symbol_set)
total_number_ensembl_gene_symbols

Drop genes with no aliases

In [None]:
mini_ensg_df = mini_ensg_df.replace(" ", np.nan)
mini_ensg_df = mini_ensg_df.replace("", np.nan)
mini_ensg_df = mini_ensg_df.replace("-", np.nan)
mini_ensg_df = mini_ensg_df.dropna(subset=["alias_symbol"])
mini_ensg_df

### Make each row in alias_symbol a set:
    convert to a list
    make a set

In [None]:
# Turn each comma-joined alias string into a set of aliases.
# Every value is a list after the split, so it never equals "" and the set is
# always taken.
mini_ensg_df["alias_symbol"] = (
    mini_ensg_df["alias_symbol"]
    .astype(str)
    .map(lambda joined_aliases: set(joined_aliases.split(",")))
)
mini_ensg_df

## Test for false positives (when a gene symbol has an alias that is exactly the same as the primary gene symbol)

In [None]:
mini_ensg_df["gene_symbol"] = mini_ensg_df["gene_symbol"].map(str)
print(mini_ensg_df.dtypes)

In [None]:
mini_ensg_df["gene_symbol"] = [x.split(",") for x in mini_ensg_df.gene_symbol]
mini_ensg_df["gene_symbol"] = np.where(
    mini_ensg_df.gene_symbol == "", "", mini_ensg_df.gene_symbol.map(set)
)
false_pos_mini_ensg_df = mini_ensg_df[
    mini_ensg_df.alias_symbol.apply(lambda x: x & ensg_gene_symbol_set)
    == mini_ensg_df.gene_symbol
]
false_pos_mini_ensg_df

### Need to remove aliases that match their primary key (gene symbol)

In [None]:
mini_ensg_df["alias_symbol"] = mini_ensg_df.alias_symbol - mini_ensg_df.gene_symbol
false_pos_mini_ensg_df = mini_ensg_df[
    mini_ensg_df.alias_symbol.apply(lambda x: x & ensg_gene_symbol_set)
    == mini_ensg_df.gene_symbol
]
false_pos_mini_ensg_df

In [None]:
mini_ensg_df.loc[mini_ensg_df["NCBI_ID"] == 92960]

## Find intersection points using alias symbol sets

In [None]:
# alias_symbol_sets_series = mini_ensg_df.alias_symbol

In [None]:
# An alias that is also some gene's primary symbol is an alias-gene collision.
mini_ensg_df["intersect_point"] = mini_ensg_df.alias_symbol.apply(
    lambda x: x & ensg_gene_symbol_set
)
# .copy() detaches the filtered rows from mini_ensg_df, so the column added to
# ag_collision_ensg_df in the next cell is a plain assignment rather than a
# write to a slice (SettingWithCopyWarning).
ag_collision_ensg_df = mini_ensg_df[
    mini_ensg_df["intersect_point"].apply(lambda x: len(x) > 0)
].copy()
ag_collision_ensg_df.head()

In [None]:
ag_collision_ensg_df["source"] = "ENSG"
ag_collision_ensg_df

In [None]:
# Serialise every set as a comma-joined string so the frame survives a CSV
# round trip. DataFrame.applymap is deprecated in recent pandas, so map column
# by column instead; only object columns can hold sets, which also leaves the
# nullable-integer NCBI_ID column untouched.
ag_collision_ensg_df = ag_collision_ensg_df.apply(
    lambda column: column.map(
        lambda value: ",".join(map(str, value)) if isinstance(value, set) else value
    )
    if column.dtype == object
    else column
)
# Genes without an NCBI ID get the placeholder 0 so the column can be plain int.
ag_collision_ensg_df["NCBI_ID"] = ag_collision_ensg_df["NCBI_ID"].fillna(0).astype(int)

In [None]:
ag_collision_ensg_df

In [None]:
ag_collision_ensg_df.to_csv('../created_files/ag_collision_ensg_df.csv', index=True)

In [None]:
ag_collision_ensg_df = pd.read_csv(
    "../created_files/ag_collision_ensg_df.csv", index_col=[0])
ag_collision_ensg_df

make a set of collisions
- Some records have multiple collisions. They need to be exploded to one collision per row so that, when the column is converted back to a string and a set is built, two collisions separated by a comma are not counted as one unique collision.

In [None]:
ag_collision_ensg_df['intersect_point'] = ag_collision_ensg_df['intersect_point'].str.split(',')
ag_collision_ensg_df = ag_collision_ensg_df.explode(column="intersect_point")
ag_collision_ensg_df

In [None]:
duplicates_ag_collision_ensg_df = ag_collision_ensg_df[
    ag_collision_ensg_df.duplicated("gene_symbol", keep=False)
]
duplicates_ag_collision_ensg_df = duplicates_ag_collision_ensg_df.sort_values(
    "gene_symbol"
)
duplicates_ag_collision_ensg_df

In [None]:
len(ensg_gene_symbol_set)

In [None]:
ensg_alias_gene_collision_set = set(ag_collision_ensg_df["intersect_point"])
len(ensg_alias_gene_collision_set)

In [None]:
ensg_alias_gene_collision_primary_symbol_set = set(ag_collision_ensg_df["gene_symbol"])
len(ensg_alias_gene_collision_primary_symbol_set)

1. Why is the alias-gene collision set not the same length as the set of primary symbols with collisions?
2. Why is the alias-gene collision set shorter?
 - A primary gene symbol with an alias-gene collision has an alias that matches a different gene's primary gene symbol.
 - Multiple genes can share a single alias (alias-alias collision).
 - If that shared alias is an alias-gene collision, then there will be more unique gene symbols in the set of primary symbols with collisions than in the set of alias-gene collisions.

### Alphabetize alias_symbol

In [None]:
ag_collision_ensg_df['alias_symbol'] = ag_collision_ensg_df['alias_symbol'].str.split(',')
ag_collision_ensg_df["alias_symbol"] = ag_collision_ensg_df["alias_symbol"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_ensg_df

### Convert lists to str

In [None]:
ag_collision_ensg_df["alias_symbol"] = ag_collision_ensg_df["alias_symbol"].str.join(
    ", "
)

In [None]:
ag_collision_ensg_df = ag_collision_ensg_df.sort_values("intersect_point")
ag_collision_ensg_df

In [None]:
ag_collision_ensg_df = ag_collision_ensg_df.drop_duplicates(
    subset=["gene_symbol", "intersect_point"], keep="first"
)
ag_collision_ensg_df

## Add civic_gene_id to table

In [None]:
genes = civicpy.get_all_genes()

In [None]:
ensg_NCBI_ID_list = list(ag_collision_ensg_df["NCBI_ID"])

In [None]:
# Map Entrez/NCBI gene ID (as text) -> CIViC gene ID once. If several genes
# share an Entrez ID the last one wins, as in the original scan.
civic_id_by_entrez = {str(gene.entrez_id): gene.id for gene in genes}

civic_ids = []
for ncbi_id in ensg_NCBI_ID_list:
    # NCBI_ID holds integers here, so the previous `id == str(gene.entrez_id)`
    # compared int with str and never matched. Normalise to text first.
    # (The loop variable is renamed so it no longer shadows builtin `id`.)
    entrez_key = "" if pd.isna(ncbi_id) else str(int(ncbi_id))
    civic_ids.append(civic_id_by_entrez.get(entrez_key, ""))

ag_collision_ensg_df["civic_ids"] = civic_ids
ag_collision_ensg_df

In [None]:
# Assign the result back: an inplace replace on a column selection may act on
# a temporary copy (and does under pandas Copy-on-Write), leaving the frame
# unchanged.
ag_collision_ensg_df["civic_ids"] = ag_collision_ensg_df["civic_ids"].replace(
    "", np.nan
)
ag_collision_ensg_df.head()

In [None]:
civic_ag_collision_ensg_df = ag_collision_ensg_df.copy()
civic_ag_collision_ensg_df.dropna(subset=["civic_ids"], inplace=True)
civic_ag_collision_ensg_df

## Add MP ID and score to table

In [None]:
civic_ag_collision_ensg_df_add_molprof_df = transform_df_mp_id(
    civic_ag_collision_ensg_df
)
civic_ag_collision_ensg_df_add_molprof_df

In [None]:
civic_true_mini_ensg_add_molprof_score_df = transform_df_mp_score(
    civic_ag_collision_ensg_df_add_molprof_df
)
civic_true_mini_ensg_add_molprof_score_df

## Add evidence_ids column

In [None]:
civic_variant_ids = civicpy.get_all_variants()

In [None]:
ensg_civic_id_list = list(civic_ag_collision_ensg_df["civic_ids"])

In [None]:
civic_ag_collision_ensg_df.civic_ids.astype(int)

In [None]:
# Collect the unique evidence item IDs per CIViC gene in a single pass over the
# variants (first-seen order), instead of rescanning all variants per gene.
eids_by_gene_id = {}
for variant in civic_variant_ids:
    gene_eids = eids_by_gene_id.setdefault(variant.gene_id, [])
    for mp in variant.molecular_profiles:
        for e in mp.evidence_items:
            if e.id not in gene_eids:
                gene_eids.append(e.id)

civic_eids = []
for civic_id in ensg_civic_id_list:
    # "" marks genes without evidence. The fallback is applied once per gene:
    # doing it inside the variant loop turned the list into a str, so the next
    # matching variant with evidence failed on .append.
    civic_eids.append(list(eids_by_gene_id.get(int(civic_id), [])) or "")
civic_ag_collision_ensg_df["civic_eids"] = civic_eids
civic_ag_collision_ensg_df

In [None]:
civic_evidence_list = civicpy.get_all_evidence()

In [None]:
ensg_civic_eid_list = list(civic_ag_collision_ensg_df["civic_eids"])
ensg_civic_eid_list[0]

In [None]:
# Look ratings up by evidence ID instead of scanning every evidence item for
# every ID.
rating_by_eid = {evidence.id: evidence.rating for evidence in civic_evidence_list}

evidence_ratings = []
for eid_list in ensg_civic_eid_list:
    # eid_list is "" for genes without evidence, which yields an empty list.
    found_rating = [rating_by_eid[eid] for eid in eid_list if eid in rating_by_eid]
    evidence_ratings.append(found_rating)
civic_ag_collision_ensg_df["evidence_ratings"] = evidence_ratings
civic_ag_collision_ensg_df

In [None]:
average_evidence_rating = []
for alist in civic_ag_collision_ensg_df["evidence_ratings"]:
    # Defensive: ignore missing ratings (None), which cannot be summed.
    rated = [rating for rating in alist if rating is not None]
    # Genes without any rating get NaN instead of raising ZeroDivisionError.
    avg_rating = sum(rated) / len(rated) if rated else np.nan
    average_evidence_rating.append(avg_rating)
civic_ag_collision_ensg_df["average_evidence_ratings"] = average_evidence_rating
civic_ag_collision_ensg_df

In [None]:
# Total evidence rating per gene. Missing ratings (None) are skipped so that
# one unrated evidence item does not raise TypeError; an empty list sums to 0.
sum_evidence_rating = [
    sum(rating for rating in alist if rating is not None)
    for alist in civic_ag_collision_ensg_df["evidence_ratings"]
]
civic_ag_collision_ensg_df["sum_evidence_rating"] = sum_evidence_rating
civic_ag_collision_ensg_df

# HGNC

## Set up table

In [None]:
mini_hgnc_df = pd.read_csv(
    "../downloaded_files/hgnc_biomart_gene20240626.txt", sep="\t",dtype={"NCBI gene ID": pd.Int64Dtype()}
)
mini_hgnc_df = mini_hgnc_df.rename(
    columns={
        "HGNC ID": "HGNC_ID",
        "Approved symbol": "gene_symbol",
        "Alias symbol": "alias_symbol",
        "Ensembl gene ID": "ENSG_ID",
        "NCBI gene ID": "NCBI_ID",
    }
)
mini_hgnc_df

In [None]:
mini_hgnc_df["alias_symbol"] = mini_hgnc_df["alias_symbol"].fillna("").astype(str)
mini_hgnc_df = (
    mini_hgnc_df.groupby(
        ["ENSG_ID", "gene_symbol", "HGNC_ID", "NCBI_ID"], dropna=False
    )["alias_symbol"]
    .apply(lambda x: ", ".join(x.dropna()))
    .reset_index()
)
mini_hgnc_df

In [None]:
# Remove only the literal "HGNC:" prefix. str.lstrip("HGNC:") strips any
# leading run of the characters H, G, N, C and ":" rather than the prefix, so
# it only gave the right answer because the remainder starts with a digit.
mini_hgnc_df["HGNC_ID"] = mini_hgnc_df["HGNC_ID"].str.replace(
    r"^HGNC:", "", regex=True
)
mini_hgnc_df

In [None]:
mini_hgnc_df.to_csv("../created_files/mini_hgnc_df.csv", index=False)
mini_hgnc_df.head()

### Make a set of the primary gene symbols

In [None]:
hgnc_gene_symbol_set = set(mini_hgnc_df["gene_symbol"])
# all_gene_symbols_set

In [None]:
total_number_hgnc_gene_symbols = len(hgnc_gene_symbol_set)
total_number_hgnc_gene_symbols

Drop genes with no aliases

In [None]:
mini_hgnc_df = mini_hgnc_df.replace(" ", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("", np.nan)
mini_hgnc_df = mini_hgnc_df.replace("-", np.nan)
mini_hgnc_df = mini_hgnc_df.dropna(subset=["alias_symbol"])
mini_hgnc_df

### Make each row in alias_symbol a set:
    convert to a list
    make a set

In [None]:
# Turn each joined alias string into a set of aliases.
# The aliases were joined with ", " when the rows were grouped, so a bare
# split(",") left a leading space on every alias after the first. Those aliases
# could then never equal a primary gene symbol. Strip each piece and drop
# empty ones.
mini_hgnc_df["alias_symbol"] = (
    mini_hgnc_df["alias_symbol"]
    .astype(str)
    .map(
        lambda joined_aliases: {
            alias.strip() for alias in joined_aliases.split(",") if alias.strip()
        }
    )
)
mini_hgnc_df.head(1)

## Add test for false positives in the intersection points
#### (places where x in alias_symbol matches x in mini_hgnc_df.gene_symbol in the same row)

In [None]:
mini_hgnc_df["gene_symbol"] = [x.split(";") for x in mini_hgnc_df.gene_symbol]
mini_hgnc_df["gene_symbol"] = np.where(
    mini_hgnc_df.gene_symbol == "", "", mini_hgnc_df.gene_symbol.map(set)
)
false_pos_mini_hgnc_df = mini_hgnc_df[
    mini_hgnc_df.alias_symbol.apply(lambda x: x & hgnc_gene_symbol_set)
    == mini_hgnc_df.gene_symbol
]
false_pos_mini_hgnc_df

### Need to remove aliases that match their primary key (gene symbol)

In [None]:
mini_hgnc_df["alias_symbol"] = mini_hgnc_df.alias_symbol - mini_hgnc_df.gene_symbol

## Find intersection points using alias symbol sets

In [None]:
# alias_symbol_sets_series = mini_hgnc_df.alias_symbol

In [None]:
# An alias that is also some gene's primary symbol is an alias-gene collision.
mini_hgnc_df["intersect_point"] = mini_hgnc_df.alias_symbol.apply(
    lambda x: x & hgnc_gene_symbol_set
)
# Keep rows with at least one collision. The mask uses the set length (as the
# Ensembl section does) rather than comparing a Series with a set object, and
# .copy() detaches the result so later column assignments do not write to a
# slice of mini_hgnc_df.
ag_collision_hgnc_df = mini_hgnc_df[
    mini_hgnc_df["intersect_point"].map(len) > 0
].copy()
ag_collision_hgnc_df

In [None]:
ag_collision_hgnc_df["source"] = "HGNC"
ag_collision_hgnc_df.head()

### Convert sets to str or list

In [None]:
type(ag_collision_hgnc_df["gene_symbol"][25])

In [None]:
ag_collision_hgnc_df["gene_symbol"]

In [None]:
list

In [None]:
def simple_func(arg):
    """Debug helper: print a value, its type and its list form, then return it.

    :param arg: Any iterable value (e.g. one cell of a dataframe column)
    :return: ``arg`` itself, unchanged
    """
    for shown in (arg, type(arg)):
        print(shown)
    print(list(arg))
    return arg


ag_collision_hgnc_df["gene_symbol"].apply(simple_func)

In [None]:
# Assertion error if passed more than once
ag_collision_hgnc_df["gene_symbol"] = ag_collision_hgnc_df["gene_symbol"].apply(list)
ag_collision_hgnc_df

In [None]:
# ag_collision_hgnc_df["ENSG_ID"] = ag_collision_hgnc_df["ENSG_ID"].astype(str)
# ag_collision_hgnc_df

### Alphabetize alias_symbol

In [None]:
ag_collision_hgnc_df["alias_symbol"] = ag_collision_hgnc_df["alias_symbol"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_hgnc_df.head()

make a set of collisions
- Some records have multiple collisions. They need to be exploded to one collision per row so that, when the column is converted back to a string and a set is built, two collisions separated by a comma are not counted as one unique collision.

In [None]:
ag_collision_hgnc_df = ag_collision_hgnc_df.explode(column="intersect_point")
ag_collision_hgnc_df

In [None]:
hgnc_alias_gene_collision_set = set(ag_collision_hgnc_df["intersect_point"])
len(hgnc_alias_gene_collision_set)

In [None]:
ag_collision_hgnc_df["gene_symbol"] = ag_collision_hgnc_df["gene_symbol"].str.join(", ")

In [None]:
hgnc_alias_gene_collision_primary_symbol_set = set(ag_collision_hgnc_df["gene_symbol"])
len(hgnc_alias_gene_collision_primary_symbol_set)

### Alphabetize alias_symbol

In [None]:
ag_collision_hgnc_df["alias_symbol"] = ag_collision_hgnc_df["alias_symbol"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_hgnc_df.head()

In [None]:
print(type(ag_collision_hgnc_df.loc[25, "gene_symbol"]))

### Convert lists to str

In [None]:
ag_collision_hgnc_df["alias_symbol"] = ag_collision_hgnc_df["alias_symbol"].str.join(
    ", "
)

In [None]:
ag_collision_hgnc_df = ag_collision_hgnc_df.sort_values("intersect_point")
ag_collision_hgnc_df

In [None]:
ag_collision_hgnc_df = ag_collision_hgnc_df.drop_duplicates(
    subset=["gene_symbol", "intersect_point"], keep="first"
)
ag_collision_hgnc_df

## Add civic_gene_id to table

In [None]:
genes = civicpy.get_all_genes()

In [None]:
hgnc_NCBI_ID_list = list(ag_collision_hgnc_df["NCBI_ID"])

In [None]:
# Give genes without an NCBI ID the placeholder 0 so the column can be cast to
# int in the next cell. NCBI_ID is read as a nullable integer column, which has
# no .str accessor; to_numeric also copes with a text column containing "nan".
ag_collision_hgnc_df["NCBI_ID"] = pd.to_numeric(
    ag_collision_hgnc_df["NCBI_ID"], errors="coerce"
).fillna(0)
ag_collision_hgnc_df.head()

In [None]:
ag_collision_hgnc_df["NCBI_ID"] = ag_collision_hgnc_df["NCBI_ID"].astype(int)

In [None]:
# Map Entrez/NCBI gene ID (as text) -> CIViC gene ID once. If several genes
# share an Entrez ID the last one wins, as in the original scan.
civic_id_by_entrez = {str(gene.entrez_id): gene.id for gene in genes}

civic_ids = []
for entrez_id in hgnc_NCBI_ID_list:
    # The list holds numeric (possibly missing) IDs, so comparing them directly
    # with str(gene.entrez_id) never matched. Normalise to text first.
    entrez_key = "" if pd.isna(entrez_id) else str(int(entrez_id))
    civic_ids.append(civic_id_by_entrez.get(entrez_key, ""))

ag_collision_hgnc_df["civic_ids"] = civic_ids
ag_collision_hgnc_df

In [None]:
# Assign the result back: an inplace replace on a column selection may act on
# a temporary copy (and does under pandas Copy-on-Write), leaving the frame
# unchanged.
ag_collision_hgnc_df["civic_ids"] = ag_collision_hgnc_df["civic_ids"].replace(
    "", np.nan
)

In [None]:
civic_ag_collision_hgnc_df = ag_collision_hgnc_df.copy()
civic_ag_collision_hgnc_df.dropna(subset=["civic_ids"], inplace=True)
civic_ag_collision_hgnc_df

In [None]:
ag_collision_hgnc_df.to_csv(
    "../created_files/ag_collision_hgnc_df_w_entrez_id.csv", index=False
)

## Add MP ID & score

In [None]:
civic_ag_collision_hgnc_df_add_molprof_df = transform_df_mp_id(
    civic_ag_collision_hgnc_df
)
civic_ag_collision_hgnc_df_add_molprof_df

In [None]:
civic_ag_collision_hgnc_df_add_molprof_score_df = transform_df_mp_score(
    civic_ag_collision_hgnc_df_add_molprof_df
)
civic_ag_collision_hgnc_df_add_molprof_score_df

## Add evidence_ids column

In [None]:
civic_variant_ids = civicpy.get_all_variants()

In [None]:
hgnc_civic_id_list = list(civic_ag_collision_hgnc_df["civic_ids"])

In [None]:
civic_ag_collision_hgnc_df.civic_ids.astype(int)

In [None]:
# Collect the unique evidence item IDs per CIViC gene in a single pass over the
# variants (first-seen order), instead of rescanning all variants per gene.
eids_by_gene_id = {}
for variant in civic_variant_ids:
    gene_eids = eids_by_gene_id.setdefault(variant.gene_id, [])
    for mp in variant.molecular_profiles:
        for e in mp.evidence_items:
            if e.id not in gene_eids:
                gene_eids.append(e.id)

civic_eids = []
for civic_id in hgnc_civic_id_list:
    # "" marks genes without evidence. The fallback is applied once per gene:
    # doing it inside the variant loop turned the list into a str, so the next
    # matching variant with evidence failed on .append.
    civic_eids.append(list(eids_by_gene_id.get(int(civic_id), [])) or "")
civic_ag_collision_hgnc_df["civic_eids"] = civic_eids
civic_ag_collision_hgnc_df

In [None]:
civic_evidence_list = civicpy.get_all_evidence()

In [None]:
hgnc_civic_eid_list = list(civic_ag_collision_hgnc_df["civic_eids"])
hgnc_civic_eid_list[0]

In [None]:
# Look ratings up by evidence ID instead of scanning every evidence item for
# every ID.
rating_by_eid = {evidence.id: evidence.rating for evidence in civic_evidence_list}

evidence_ratings = []
for eid_list in hgnc_civic_eid_list:
    # eid_list is "" for genes without evidence, which yields an empty list.
    found_rating = [rating_by_eid[eid] for eid in eid_list if eid in rating_by_eid]
    evidence_ratings.append(found_rating)
civic_ag_collision_hgnc_df["evidence_ratings"] = evidence_ratings
civic_ag_collision_hgnc_df

In [None]:
average_evidence_rating = []
for alist in civic_ag_collision_hgnc_df["evidence_ratings"]:
    # Defensive: ignore missing ratings (None), which cannot be summed.
    rated = [rating for rating in alist if rating is not None]
    # Genes without any rating get NaN instead of raising ZeroDivisionError.
    avg_rating = sum(rated) / len(rated) if rated else np.nan
    average_evidence_rating.append(avg_rating)
civic_ag_collision_hgnc_df["average_evidence_ratings"] = average_evidence_rating
civic_ag_collision_hgnc_df

In [None]:
# Total evidence rating per gene. Missing ratings (None) are skipped so that
# one unrated evidence item does not raise TypeError; an empty list sums to 0.
sum_evidence_rating = [
    sum(rating for rating in alist if rating is not None)
    for alist in civic_ag_collision_hgnc_df["evidence_ratings"]
]
civic_ag_collision_hgnc_df["sum_evidence_rating"] = sum_evidence_rating
civic_ag_collision_hgnc_df

# NCBI Info

In [None]:
mini_ncbi_df = pd.read_csv("../downloaded_files/Homo_sapiens.gene_info20240627", sep="\t")

### Keep only the GeneID, Symbol, Synonyms, and dbXrefs columns (renamed to NCBI_ID, gene_symbol, and alias_symbol)

In [None]:
mini_ncbi_df = mini_ncbi_df[
["GeneID", "Symbol", "Synonyms", "dbXrefs"]
]
mini_ncbi_df = mini_ncbi_df.rename(
    columns={"GeneID": "NCBI_ID", "Symbol": "gene_symbol", "Synonyms": "alias_symbol"}
)
mini_ncbi_df

Split dbXrefs into individual columns

In [None]:
mini_ncbi_df = mini_ncbi_df.assign(
    MIM=np.nan,
    HGNC_ID=np.nan,
    ENSG_ID=np.nan,
    AllianceGenome=np.nan,
    MIRbase=np.nan,
    IMGTgene_db=np.nan,
    dash=np.nan,
    unknown=np.nan,
)

In [None]:
# (prefix to match, text removed from the value, destination column), tested in
# this order. "mirbase" is matched without the colon but "mirbase:" is what gets
# removed, and the "-" placeholder is stored as-is — both as in the original loop.
XREF_RULES = (
    ("mim:", "mim:", "MIM"),
    ("hgnc:hgnc:", "hgnc:hgnc:", "HGNC_ID"),
    ("ensembl:", "ensembl:", "ENSG_ID"),
    ("alliancegenome:", "alliancegenome:", "AllianceGenome"),
    ("mirbase", "mirbase:", "MIRbase"),
    ("imgt/gene-db:", "imgt/gene-db:", "IMGTgene_db"),
    ("-", "", "dash"),
)
XREF_COLUMNS = [rule[2] for rule in XREF_RULES] + ["unknown"]


def _parse_dbxrefs(dbxrefs):
    """Split one pipe-delimited dbXrefs value into a ``{column: value}`` dict.

    Values are lower-cased. When several xrefs map to the same column the last
    one wins. Anything that matches no known prefix is stored under "unknown".
    A non-string input (e.g. NaN) yields an empty dict.
    """
    parsed = {}
    if not isinstance(dbxrefs, str):
        return parsed
    for xref in dbxrefs.lower().split("|"):
        for prefix, removed_text, column in XREF_RULES:
            if xref.startswith(prefix):
                parsed[column] = xref.replace(removed_text, "") if removed_text else xref
                break
        else:
            parsed["unknown"] = xref
    return parsed


print(len(mini_ncbi_df))

# Parse every row once and assign whole columns. The previous row loop wrote
# through chained indexing (df[col][i] = ...), which is slow, depends on a
# default RangeIndex and does not update the frame under pandas Copy-on-Write.
parsed_xrefs = pd.DataFrame(
    [_parse_dbxrefs(value) for value in mini_ncbi_df["dbXrefs"]],
    index=mini_ncbi_df.index,
    columns=XREF_COLUMNS,
)
for xref_column in XREF_COLUMNS:
    mini_ncbi_df[xref_column] = parsed_xrefs[xref_column]

# Kept for parity with the original loop counter: number of rows processed.
index_pos = len(mini_ncbi_df)
print(index_pos)

In [None]:
mini_ncbi_df["ENSG_ID"] = mini_ncbi_df["ENSG_ID"].str.replace("ensg", "ENSG", 1)

In [None]:
# Keep only the cross-references used downstream (HGNC_ID, ENSG_ID); drop the
# raw dbXrefs column and the other parsed xref columns.
mini_ncbi_df = mini_ncbi_df.drop(
    [
        "AllianceGenome",
        "MIRbase",
        "IMGTgene_db",
        "dash",
        "unknown",
        "dbXrefs",
        "MIM",
    ],
    axis=1,
)
# The former rename of "HGNC_ID" to "HGNC_ID" changed nothing and was removed.
mini_ncbi_df

In [None]:
mini_ncbi_df.to_csv("../created_files/mini_ncbi_df.csv", index=False)
mini_ncbi_df.head()

One gene symbol may have multiple NCBI IDs

In [None]:
# Flag every row whose gene symbol occurs more than once (keep=False marks all
# occurrences, not just the repeats).
mini_ncbi_df["symbol_duplicates"] = mini_ncbi_df.duplicated(
    subset="gene_symbol", keep=False
)
# The column is already boolean, so use it directly as the mask (as the Ensembl
# section does) instead of comparing with True.
dup_symbol_mini_ncbi_df = mini_ncbi_df[mini_ncbi_df["symbol_duplicates"]]

In [None]:
dup_symbol_mini_ncbi_df = dup_symbol_mini_ncbi_df.sort_values("gene_symbol")
dup_symbol_mini_ncbi_df

In [None]:
mini_ncbi_df = mini_ncbi_df.drop(["symbol_duplicates"], axis=1)

### Make a set of primary gene symbols

In [None]:
ncbi_gene_symbol_set = set(mini_ncbi_df["gene_symbol"])

In [None]:
total_number_ncbi_gene_symbols = len(ncbi_gene_symbol_set)
total_number_ncbi_gene_symbols

Drop genes with no aliases

In [None]:
mini_ncbi_df = mini_ncbi_df.replace(" ", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("", np.nan)
mini_ncbi_df = mini_ncbi_df.replace("-", np.nan)
mini_ncbi_df = mini_ncbi_df.dropna(subset=["alias_symbol"])
mini_ncbi_df

### Make each row in alias_symbol a set:
    convert to a list
    make a set

In [None]:
# Turn each pipe-delimited synonym string into a set of aliases.
# Every value is a list after the split, so it never equals " " and the set is
# always taken.
mini_ncbi_df["alias_symbol"] = (
    mini_ncbi_df["alias_symbol"]
    .astype(str)
    .map(lambda joined_aliases: set(joined_aliases.split("|")))
)
mini_ncbi_df.head(1)

### Add test for false positives in the intersection points
#### (places where x in alias_symbol matches x in mini_ncbi_df.gene_symbol in the same row)

In [None]:
mini_ncbi_df["gene_symbol"] = [x.split(";") for x in mini_ncbi_df.gene_symbol]
mini_ncbi_df["gene_symbol"] = np.where(
    mini_ncbi_df.gene_symbol == "", "", mini_ncbi_df.gene_symbol.map(set)
)
false_pos_mini_ncbi_df = mini_ncbi_df[
    mini_ncbi_df.alias_symbol.apply(lambda x: x & ncbi_gene_symbol_set)
    == mini_ncbi_df.gene_symbol
]
false_pos_mini_ncbi_df.head()

In [None]:
mini_ncbi_df["alias_symbol"] = mini_ncbi_df.alias_symbol - mini_ncbi_df.gene_symbol
display(mini_ncbi_df.iloc[7805])

### Find intersection points using alias symbol sets and gene_symbol

In [None]:
# An alias that is also some gene's primary symbol is an alias-gene collision.
mini_ncbi_df["intersect_point"] = mini_ncbi_df.alias_symbol.apply(
    lambda x: x & ncbi_gene_symbol_set
)
# Keep rows with at least one collision. The mask uses the set length (as the
# Ensembl section does) rather than comparing a Series with a set object, and
# .copy() detaches the result so later column assignments do not write to a
# slice of mini_ncbi_df.
ag_collision_ncbi_df = mini_ncbi_df[
    mini_ncbi_df["intersect_point"].map(len) > 0
].copy()
ag_collision_ncbi_df

In [None]:
ag_collision_ncbi_df["source"] = "NCBI Info"
ag_collision_ncbi_df.head()

### Convert sets to str or list

In [None]:
# Convert each gene_symbol set into a list.
# NOTE: per the original author, running this cell more than once raises an
# AssertionError, so run it only once per session.
ag_collision_ncbi_df["gene_symbol"] = ag_collision_ncbi_df["gene_symbol"].apply(list)
ag_collision_ncbi_df.head()

In [None]:
ag_collision_ncbi_df["ENSG_ID"] = ag_collision_ncbi_df["ENSG_ID"].astype(str)
ag_collision_ncbi_df.head()

### Alphabetize alias_symbol

In [None]:
ag_collision_ncbi_df["alias_symbol"] = ag_collision_ncbi_df["alias_symbol"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_ncbi_df.head()

make a set of collisions
- Some records have multiple collisions. They need to be exploded to one collision per row so that, when the column is converted back to a string and a set is built, two collisions separated by a comma are not counted as one unique collision.

In [None]:
intersect_explode_ag_collision_ncbi_df = ag_collision_ncbi_df.explode(
    column="intersect_point"
)
intersect_explode_ag_collision_ncbi_df

In [None]:
ncbi_alias_gene_collision_set = set(
    intersect_explode_ag_collision_ncbi_df["intersect_point"]
)
len(ncbi_alias_gene_collision_set)

In [None]:
intersect_explode_ag_collision_ncbi_df["gene_symbol"] = (
    intersect_explode_ag_collision_ncbi_df["gene_symbol"].str.join(", ")
)

In [None]:
ncbi_alias_gene_collision_primary_symbol_set = set(
    intersect_explode_ag_collision_ncbi_df["gene_symbol"]
)
len(ncbi_alias_gene_collision_primary_symbol_set)

### Alphabetize intersect_point

In [None]:
ag_collision_ncbi_df["intersect_point"] = ag_collision_ncbi_df["intersect_point"].apply(
    lambda x: sorted(list(x), key=str.casefold)
)
ag_collision_ncbi_df.head()

### Convert lists to str

In [None]:
ag_collision_ncbi_df["gene_symbol"] = ag_collision_ncbi_df["gene_symbol"].str.join(", ")

In [None]:
ag_collision_ncbi_df["alias_symbol"] = ag_collision_ncbi_df["alias_symbol"].str.join(
    ", "
)

In [None]:
ag_collision_ncbi_df["intersect_point"] = ag_collision_ncbi_df[
    "intersect_point"
].str.join(", ")
ag_collision_ncbi_df

# Merge 3 sets together

In [None]:
# Stack the alias-gene collision tables of the three sources, keeping only the
# four columns they share. The original row indexes are kept (no ignore_index).
# NOTE(review): the HGNC and Ensembl frames were exploded to one intersect_point
# per row, whereas the NCBI frame used here has its intersect_point values
# joined with ", " into one string per gene — confirm this mix is intended,
# since equality filters on a single symbol will miss multi-collision NCBI rows.
merged_alias_gene_intersections_df = pd.concat(
    [
        ag_collision_hgnc_df[
            ["gene_symbol", "alias_symbol", "intersect_point", "source"]
        ],
        ag_collision_ncbi_df[
            ["gene_symbol", "alias_symbol", "intersect_point", "source"]
        ],
        ag_collision_ensg_df[
            ["gene_symbol", "alias_symbol", "intersect_point", "source"]
        ],
    ]
)
merged_alias_gene_intersections_df

In [None]:
merged_alias_gene_intersections_df.loc[
    merged_alias_gene_intersections_df["intersect_point"] == "CFM1"
]

# Convert to csv

In [None]:
merged_alias_gene_intersections_df.to_csv(
    "../created_files/merged_alias_gene_intersections.csv", index=False
)

In [None]:
print(merged_alias_gene_intersections_df["source"].value_counts())

In [None]:
common_ag_collisions = (
    ncbi_alias_gene_collision_primary_symbol_set
    & hgnc_alias_gene_collision_primary_symbol_set
    & ensg_alias_gene_collision_primary_symbol_set
)
common_ag_collisions

In [None]:
len(common_ag_collisions)