#### Here we investigate the relationship between:
    - mRNA level predictability of a landmark gene 
    and 
    - its known organelle level biological function using GO annotations

In [17]:
import pandas as pd
import numpy as np


def locations_of_substring(string, substring):
    """Return the start indices of all non-overlapping occurrences of a substring.

    The search runs left to right and resumes right after each match, so
    overlapping occurrences are not reported (``"aaaa"`` / ``"aa"`` gives
    ``[0, 2]``).

    Args:
        string: Text to search in.
        substring: Text to look for.

    Returns:
        List of 0-based start positions in ascending order; empty when there
        is no match or when ``substring`` is empty.
    """
    substring_length = len(substring)
    if substring_length == 0:
        # An empty pattern matches everywhere and never advances the cursor;
        # treat it as "no occurrences" instead of looping forever.
        return []

    # Iterative scan: no recursion-depth limit on the number of matches and
    # no list copy per match.
    locations_found = []
    location = string.find(substring)
    while location != -1:
        locations_found.append(location)
        location = string.find(substring, location + substring_length)
    return locations_found

#### For LUAD dataset:

1 - Read predictability map of categorical features (using MLP model)

2 - Assign the feature categories to compartments/stains

3 - Read the functional annotations of the reference gene set from DAVID's output and add a column for each channel
   - Add the channel-specific annotations to each channel's column
   

In [18]:
########### 1 ###########
# Load every sheet of the score workbook (sheet_name=None returns a dict of
# DataFrames keyed by sheet name) and pick the sheet for one dataset/model.
filename = "../results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx"
saved_scores = pd.read_excel(filename, sheet_name=None)
which_ds_model = "LUAD-9-MLP-ht"
dfcats = saved_scores[which_ds_model].rename(columns={"Unnamed: 0": "ID"})

# Keep only the columns that have no missing value in any row.
complete_columns = dfcats.columns[dfcats.notna().all()]
dfcats = dfcats[complete_columns.tolist()]

########### 2 ###########
# Channel names and, position by position, the pattern selecting the feature
# category columns assigned to that channel. `str.contains` treats the
# pattern as a regular expression, so "|" means "any of these substrings".
Channelss = ["DNA", "RNA", "AGP", "Mito", "ER"]
Channelss_cats = [
    "DNA|Nuclei_AreaShape",
    "RNA",
    "AGP|Cytoplasm_AreaShape|Cells_AreaShape",
    "Mito",
    "ER",
]

# One "max_<channel>" column per channel: the row-wise maximum over all
# category columns whose name matches the channel's pattern.
for channel_name, column_pattern in zip(Channelss, Channelss_cats):
    belongs_to_channel = dfcats.columns.str.contains(column_pattern)
    dfcats["max_" + channel_name] = dfcats.loc[:, belongs_to_channel].max(axis=1)


########### 3 ###########
# Tab-separated annotation table; the code below reads its GOTERM_*_DIRECT
# columns (and, later in this cell, its "ID" column).
gene_cats_bpcc = pd.read_csv("./GO/go_BP_CC_MF_DIRECT_921.txt", delimiter="\t")

# Keywords searched for inside the GO term text.
comps = [
    "mitochondri",
    "Golgi",
    "membrane",
    "cytoskeleton",
    "actin",
    "endoplasmic",
    "RNA",
    "nucleol",
    "cell division",
    "mitosis",
    "mitotic",
    "cell cycle",
]

# Annotation columns that are scanned for every keyword. UP_SEQ_FEATURE is
# deliberately left out: its concatenation was already disabled in the
# previous version, which still evaluated it and threw the result away.
go_term_columns = ["GOTERM_BP_DIRECT", "GOTERM_CC_DIRECT", "GOTERM_MF_DIRECT"]


def _matching_go_terms(annotation, keyword):
    """Return the text surrounding every occurrence of ``keyword``.

    For each match, the snippet runs from just after the last "~" before the
    match up to the next "GO" after it; all snippets are concatenated without
    a separator. Returns "" when ``keyword`` does not occur.
    NOTE(review): presumably entries look like "GO:<id>~<term name>," so a
    snippet is one term name -- confirm against the DAVID export.
    """
    return "".join(
        annotation[:si].split("~")[-1] + annotation[si:].split("GO")[0]
        for si in locations_of_substring(annotation, keyword)
    )


# One column per keyword: the matching term text from the BP, CC and MF
# columns concatenated. Downstream code only tests these for emptiness.
for c in comps:
    bp_terms, cc_terms, mf_terms = (
        gene_cats_bpcc[column].astype(str).apply(_matching_go_terms, args=(c,))
        for column in go_term_columns
    )
    gene_cats_bpcc[c] = bp_terms + cc_terms + mf_terms

# Composite annotation columns: each one is the string concatenation of the
# listed keyword columns, so it is non-empty when any member matched.
composite_members = {
    "RNA_nucleoli": ["RNA", "nucleol"],
    "DNA": ["cell division", "mitosis", "mitotic", "cell cycle"],
    "cytoskeleton-actin": ["cytoskeleton", "actin", "Golgi", "membrane"],
}
for composite_name, member_columns in composite_members.items():
    joined_text = gene_cats_bpcc[member_columns[0]]
    for member in member_columns[1:]:
        joined_text = joined_text + gene_cats_bpcc[member]
    gene_cats_bpcc[composite_name] = joined_text


# Attach the per-channel scores; only IDs present in both tables are kept.
gene_cats_bpcc = gene_cats_bpcc.merge(dfcats, how="inner", on="ID")

#########################
# Annotation column -> imaging channel, plus the inverse lookup.
Channelss_dict = {
    "DNA": "DNA",
    "RNA_nucleoli": "RNA",
    "cytoskeleton-actin": "AGP",
    "mitochondri": "Mito",
    "endoplasmic": "ER",
}
Chan_rev_dict = {channel: annotation for annotation, channel in Channelss_dict.items()}

comps2 = ["mitochondri", "cytoskeleton-actin", "endoplasmic", "RNA_nucleoli", "DNA"]

# "any_comps" is non-empty when the gene carries at least one of the
# annotations listed in comps2.
combined_annotation = pd.Series("", index=gene_cats_bpcc.index)
for co in comps2:
    combined_annotation = combined_annotation + gene_cats_bpcc[co]
gene_cats_bpcc["any_comps"] = combined_annotation

from sklearn.metrics import confusion_matrix
from scipy.stats import fisher_exact

# For every imaging channel, flag genes by their per-channel max score and
# test with Fisher's exact test whether the flag is associated with each
# compartment annotation.
#   table  : list of the per-channel `table1` frames, in Channelss order
#   table2 : odds ratio for every (annotation, channel) pair
#   table3 : per-channel summary (own annotation vs. the other annotations
#            vs. any annotation, plus the share of flagged genes)
# A prevalence-based variant of table3 existed as commented-out code in an
# earlier version; it is not computed here.

# NOTE(review): hard-coded denominator of "top-ratio"; presumably the number
# of genes left after the merge -- confirm it equals len(gene_cats_bpcc).
n_total_genes = 912

stat_columns = ["Prevalence", "p-value", "odds ratio"]
summary_columns = ["odds ratio", "restComp_oddsratio", "anyComp_oddsratio", "top-ratio"]
# Explicit labels keep the contingency table 2x2 even if a mask is constant.
bool_labels = [False, True]

table2 = pd.DataFrame(index=Channelss_dict.keys(), columns=Channelss_dict.values())
table3 = pd.DataFrame(index=Channelss, columns=summary_columns)

# Channel-independent masks: gene has at least one / none of the annotations.
any_comp_bool = (gene_cats_bpcc["any_comps"] != "").values
no_comp_bool = ~any_comp_bool

table = []
for c in Channelss:
    print(c)
    # NOTE(review): genes are flagged when the channel's max score is below
    # 0.1; the variable is called "top" -- confirm the intended direction.
    top_bool = (gene_cats_bpcc["max_" + c] < 0.1).values
    n_top = sum(top_bool)

    table1 = pd.DataFrame(
        index=comps2 + ["any comp", "no comp"],
        columns=stat_columns,
    )
    for co in comps2:
        comp_bool = (gene_cats_bpcc[co] != "").values
        # Share of annotated genes that are flagged (int / int, so an
        # annotation with no genes still raises ZeroDivisionError).
        enr_ratio = int((top_bool & comp_bool).sum()) / int(comp_bool.sum())
        oddsratio, pvalue = fisher_exact(
            confusion_matrix(top_bool, comp_bool, labels=bool_labels)
        )
        table1.loc[co, stat_columns] = (enr_ratio * 100, pvalue, oddsratio)
        table2.loc[co, c] = oddsratio

    enr_ratio = int((top_bool & any_comp_bool).sum()) / int(any_comp_bool.sum())
    any_oddsratio, any_pvalue = fisher_exact(
        confusion_matrix(top_bool, any_comp_bool, labels=bool_labels)
    )
    # Store the statistics of THIS test (previously the odds ratio of the
    # last single-annotation test was written here by mistake).
    table1.loc["any comp", stat_columns] = (enr_ratio * 100, any_pvalue, any_oddsratio)

    nocomp_enr_ratio = int((top_bool & no_comp_bool).sum()) / int(no_comp_bool.sum())
    nocomp_oddsratio, nocomp_pvalue = fisher_exact(
        confusion_matrix(top_bool, no_comp_bool, labels=bool_labels)
    )
    table1.loc["no comp", stat_columns] = (
        nocomp_enr_ratio * 100,
        nocomp_pvalue,
        nocomp_oddsratio,
    )

    # Annotations of all channels except the current one. Built as a separate
    # list so that comps2 stays complete for the next channel.
    rest_comps = [name for name in comps2 if name != Chan_rev_dict[c]]
    gene_cats_bpcc["rest_comps"] = ""
    for co in rest_comps:
        gene_cats_bpcc["rest_comps"] = gene_cats_bpcc["rest_comps"] + gene_cats_bpcc[co]

    rest_comp_bool = (gene_cats_bpcc["rest_comps"] != "").values
    rest_oddsratio, pvalue = fisher_exact(
        confusion_matrix(top_bool, rest_comp_bool, labels=bool_labels)
    )

    table.append(table1)
    table3.loc[c, summary_columns] = (
        table1.loc[Chan_rev_dict[c], "odds ratio"],
        rest_oddsratio,
        any_oddsratio,
        np.round((n_top / n_total_genes) * 100, 2),
    )

# Own-annotation odds ratio minus the odds ratio of the other annotations.
table3["dif"] = table3["odds ratio"] - table3["restComp_oddsratio"]
print(table3["dif"].min(), table3["dif"].sum())


DNA
RNA
AGP
Mito
ER
-0.5605805723875003 -1.3206249548960154


In [12]:
# Display the per-channel odds-ratio summary (one row per channel).
table3

Unnamed: 0,odds ratio,restComp_oddsratio,anyComp_oddsratio,top-ratio,dif
DNA,0.793062,1.353643,1.419434,20.5,-0.560581
RNA,1.291008,1.317432,1.339849,20.83,-0.026424
AGP,0.834736,1.322061,1.637884,18.75,-0.487325
Mito,1.174962,1.359223,1.566091,20.72,-0.184261
ER,1.050249,1.112284,1.320451,22.92,-0.062034


In [None]:
# Print table1 as a Markdown table. NOTE(review): table1 is rebound for every
# channel in the loop above, so this presumably shows only the last channel
# processed -- confirm that is the intended table.
print(table1.to_markdown())

## GO terms search for overlap of highly predictable genes (top 58)

In [15]:
# IDs of the highly predictable genes: first column of a header-less file.
top58 = pd.read_csv("./GO/top_58_common.txt", header=None)[0].tolist()
gene_cats_bpcc = pd.read_csv("./GO/go_bp_cc_D2021_each_gene_cat.txt", delimiter="\t")

# Keywords searched for inside the GO term text.
comps = [
    "mitochondri",
    "Golgi",
    "membrane",
    "cytoskeleton",
    "actin",
    "endoplasmic",
    "RNA",
    "nucleol",
    "cell division",
    "mitosis",
    "mitotic",
    "cell cycle",
]


def _terms_mentioning(annotation, keyword):
    """Concatenate the text around every occurrence of ``keyword``.

    Each snippet starts after the last "~" preceding the match and stops at
    the next "GO" following it. The result is "" when there is no match.
    """
    snippets = []
    for start in locations_of_substring(annotation, keyword):
        before_match = annotation[:start].split("~")[-1]
        from_match = annotation[start:].split("GO")[0]
        snippets.append(before_match + from_match)
    return "".join(snippets)


# One column per keyword, built from the BP and CC annotation columns only.
for c in comps:
    bp_hits = (
        gene_cats_bpcc["GOTERM_BP_DIRECT"]
        .astype(str)
        .apply(lambda text: _terms_mentioning(text, c))
    )
    cc_hits = (
        gene_cats_bpcc["GOTERM_CC_DIRECT"]
        .astype(str)
        .apply(lambda text: _terms_mentioning(text, c))
    )
    gene_cats_bpcc[c] = bp_hits + cc_hits

# Composite annotation columns: string concatenation of the listed keyword
# columns, non-empty when any member matched. Unlike the LUAD cell above,
# "cytoskeleton-actin" here does not include Golgi or membrane.
composite_members = {
    "RNA_nucleoli": ["RNA", "nucleol"],
    "DNA": ["cell division", "mitosis", "mitotic", "cell cycle"],
    "cytoskeleton-actin": ["cytoskeleton", "actin"],
}
for composite_name, member_columns in composite_members.items():
    joined_text = gene_cats_bpcc[member_columns[0]]
    for member in member_columns[1:]:
        joined_text = joined_text + gene_cats_bpcc[member]
    gene_cats_bpcc[composite_name] = joined_text


# Flag the genes listed in top58; every other row is left unset (NaN) in the
# new "top58" column, which is why later code compares with `== True`.
in_top58 = gene_cats_bpcc["ID"].isin(top58)
gene_cats_bpcc.loc[in_top58, "top58"] = True


#####################################
comps2 = [
    "mitochondri",
    "Golgi",
    "membrane",
    "cytoskeleton-actin",
    "endoplasmic",
    "RNA_nucleoli",
    "DNA",
]
from sklearn.metrics import confusion_matrix
from scipy.stats import fisher_exact

# Rows flagged as top58 (unset rows compare unequal to True).
top_bool = (gene_cats_bpcc["top58"] == True).values

stat_columns = ["Prevalence", "p-value", "odds ratio"]
table1 = pd.DataFrame(
    index=comps2 + ["any comp", "no comp"],
    columns=stat_columns,
)


def _enrichment(selected):
    """Return (share of selected genes in top58, p-value, odds ratio).

    ``selected`` is a boolean Series over the rows of gene_cats_bpcc; the
    p-value and odds ratio come from Fisher's exact test on top58 x selected.
    """
    is_top = gene_cats_bpcc["top58"] == True
    share = int((is_top & selected).sum()) / int(selected.sum())
    odds, p_val = fisher_exact(confusion_matrix(top_bool, selected.values))
    return share, p_val, odds


def _store(row_label, stats):
    """Write one row of table1 (prevalence is stored as a percentage)."""
    share, p_val, odds = stats
    table1.loc[row_label, stat_columns] = (share * 100, p_val, odds)


def _show(*label_parts, stats):
    """Print one result line, rounded to two decimals."""
    share, p_val, odds = stats
    print(
        *label_parts,
        np.round(share * 100, 2),
        "%     ",
        "pvalue:",
        np.round(p_val, 2),
        "  oddsratio:",
        np.round(odds, 2),
    )


# One test per annotation column.
for co in comps2:
    co_stats = _enrichment(gene_cats_bpcc[co] != "")
    _store(co, co_stats)
    _show(co, ": ", stats=co_stats)

# Non-empty when the gene carries at least one annotation of comps2.
any_annotation = pd.Series("", index=gene_cats_bpcc.index)
for co in comps2:
    any_annotation = any_annotation + gene_cats_bpcc[co]
gene_cats_bpcc["any_comps"] = any_annotation

# Genes with at least one annotation.
any_stats = _enrichment(gene_cats_bpcc["any_comps"] != "")
_show("any_comps: ", stats=any_stats)
_store("any comp", any_stats)

# Genes with none of the annotations.
none_stats = _enrichment(gene_cats_bpcc["any_comps"] == "")
_show("no comps: ", stats=none_stats)
_store("no comp", none_stats)

# NOTE(review): the counts in this line are hard-coded, not derived from the
# data -- confirm they still match the loaded tables.
print("num top (54)/total genes (722): ", np.round((54 / 722) * 100, 2), "%")

gene_cats_bpcc.to_csv("./GO/go_bp_cc_D2021_each_gene_cat_completed.csv", index=False)


mitochondri :  8.7 %      pvalue: 0.59   oddsratio: 1.23
Golgi :  8.33 %      pvalue: 0.68   oddsratio: 1.15
membrane :  7.37 %      pvalue: 1.0   oddsratio: 0.96
cytoskeleton-actin :  7.69 %      pvalue: 1.0   oddsratio: 1.03
endoplasmic :  9.82 %      pvalue: 0.33   oddsratio: 1.44
RNA_nucleoli :  5.56 %      pvalue: 0.27   oddsratio: 0.66
DNA :  11.02 %      pvalue: 0.1   oddsratio: 1.72
any_comps:  8.06 %      pvalue: 0.16   oddsratio: 2.15
no comps:  3.92 %      pvalue: 0.16   oddsratio: 0.47
num top (54)/total genes (722):  7.48 %
