# Results intersections #

### Parameters ###
Will be overridden by results_intersections.yaml

In [18]:
#Parameters
# Defaults for a stand-alone run; the note above this cell says they are
# replaced by the values in results_intersections.yaml.
threshold = 0.3  # cut-off handed to mark_direction; also part of the R2_THRESHOLD_<n> folder name
min_main_repeats = 10  # minimum "repeats" a stage-2 row of the main trait must have
min_other_repeats = 3  # minimum "repeats" a row of any other life-history trait must have
trait = "lifespan"  # main trait whose results are summarized
life_history = ["lifespan", "mass_kg", "mtGC", "metabolic_rate", "temperature", "gestation_days"]  # every entry except `trait` is loaded as an "other" trait
debug_local = True  # when True and ../yspecies exists, the parent folder is put on sys.path

In [19]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '..', '/data/sources/yspecies/notebooks', '/data/miniconda3/envs/yspecies/lib/python38.zip', '/data/miniconda3/envs/yspecies/lib/python3.8', '/data/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/data/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/data/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/anton/.ipython']
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
from typing import *
from yspecies import *
from yspecies.workflow import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.config import *

In [21]:
from dataclasses import dataclass
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [22]:
import pprint

# Display settings: never truncate columns or rows, print floats with three decimals.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = lambda value: '%.3f' % value
# Pretty-printer with a four-space indent.
pp = pprint.PrettyPrinter(indent=4)

### Load paths ###

In [23]:
from pathlib import Path
# Use the current folder as root when it contains ./data, otherwise fall back to the parent folder.
if Path("./data").exists():
    locations: Locations = Locations("./")
else:
    locations: Locations = Locations("../")

In [24]:
# Shortcuts to the output folders used below.
# NOTE: `shap_results` is only an alias of `external` here; a later cell rebinds it to a DataFrame.
external = shap_results = locations.output.external
intersections = locations.output.intersections
shap = external.shap
linear = external.linear
causality = external.dir / "causality"
# round() instead of int(): threshold * 100 can land just below the intended
# integer (0.29 * 100 == 28.999999999999996) and int() would truncate it to 28.
folder = linear / f"R2_THRESHOLD_{round(threshold * 100)}"

In [25]:
# Result folders of the two stages; the pair is echoed as the cell output.
stage_one, stage_two = locations.output.stage_one, locations.output.stage_two
stage_one, stage_two

(PosixPath('../data/output/stage_1'), PosixPath('../data/output/stage_2'))

### Function to mark pro and anti genes ###

In [26]:
def mark_direction(row, threshold: float, column: str = "mean_kendall_tau", max_repeats: int = 10):
    """Label the direction of a gene from its repeat-weighted correlation.

    The correlation ``row[column]`` is scaled by ``row["repeats"] / max_repeats``
    and the scaled value is classified symmetrically around zero:

    * ``|cor| >= 2 * threshold`` -> "strong pro" / "strong anti"
    * ``|cor| >= threshold``     -> "pro" / "anti"
    * ``round(cor, 4) == 0``     -> "neutral"
    * anything else              -> "weak pro" / "weak anti"

    A NaN correlation carries no direction and is reported as "neutral".

    :param row: mapping (DataFrame row or dict) holding ``column`` and "repeats"
    :param threshold: cut-off between weak and regular effects; twice this value marks strong ones
    :param column: key of the correlation value inside ``row``
    :param max_repeats: number of repeats that corresponds to full weight
    :return: one of the seven direction labels listed above
    """
    cor = row[column] / max_repeats * row["repeats"]
    if cor != cor:
        # NaN fails every comparison below; without this guard it would be labelled as an anti gene.
        return "neutral"
    side = "pro" if cor >= 0.0 else "anti"
    strength = abs(cor)
    # Both signs go through the same inclusive boundaries, so +threshold and
    # -threshold land in mirrored classes.
    if strength >= threshold * 2:
        return f"strong {side}"
    if strength >= threshold:
        return side
    # Checked before the weak labels so values that round to zero are neutral on both sides.
    if round(cor, 4) == 0.0:
        return "neutral"
    return f"weak {side}"

## Summarize lifespan results by life histories ##

In [44]:
# Numeric result columns, the same list plus the direction label, and the columns read from each TSV.
value_cols = ["repeats", "mean_shap", "mean_kendall_tau"]
value_cols_directed = [*value_cols, "direction"]
cols = ["reference_gene", "symbol", *value_cols]
cols

['reference_gene', 'symbol', 'repeats', 'mean_shap', 'mean_kendall_tau']

In [28]:
def load_trait(path: Path, cols: list, min_repeats = 1):
    """Load one trait table and label the direction of every gene.

    Reads the tab-separated file at ``path``, keeps the columns ``cols``
    (which must include "reference_gene", used as the index), drops rows with
    fewer than ``min_repeats`` repeats and adds a "direction" column computed
    by ``mark_direction`` with the notebook-level ``threshold``.

    :param path: location of the tab-separated trait file
    :param cols: columns to keep, including "reference_gene"
    :param min_repeats: smallest "repeats" value a row needs to be kept
    :return: filtered frame indexed by reference_gene with the extra "direction" column
    """
    df = pd.read_csv(path, sep="\t")[cols].set_index("reference_gene")
    # .copy() so the column below is added to an independent frame and not to a
    # filtered slice, which would raise SettingWithCopyWarning.
    df = df[df.repeats >= min_repeats].copy()
    # A plain list also works when no row survives the filter; DataFrame.apply
    # returns a DataFrame for an empty frame, which cannot be assigned to one column.
    df["direction"] = [mark_direction(row, threshold=threshold) for _, row in df.iterrows()]
    return df

In [55]:
# Stage 1 keeps every row (min_repeats=1); stage 2 keeps rows with at least min_main_repeats repeats.
main_trait_stage_1 = load_trait(stage_one / f"{trait}_selected.tsv", cols, min_repeats=1)
main_trait_stage_2 = load_trait(stage_two / f"{trait}.tsv", cols, min_repeats=min_main_repeats)
# Stage-2 rows drive the join; the stage-1 values are appended with a suffix.
main_trait = main_trait_stage_2.join(
    main_trait_stage_1[value_cols_directed],
    rsuffix=f"_{trait}_stage_1",
)
print(main_trait.shape)
main_trait

(44, 9)


Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,direction,repeats_lifespan_stage_1,mean_shap_lifespan_stage_1,mean_kendall_tau_lifespan_stage_1,direction_lifespan_stage_1
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSG00000204498,NFKBIL1,10.0,25.1,-0.563,anti,10.0,7.35,-0.141,weak anti
ENSG00000010219,DYRK4,10.0,22.1,0.447,pro,10.0,8.375,0.263,weak pro
ENSG00000167515,TRAPPC2L,10.0,21.05,-0.316,anti,9.0,6.056,-0.168,weak anti
ENSG00000165501,LRR1,10.0,20.15,0.721,strong pro,3.0,2.25,0.658,weak pro
ENSG00000142002,DPP9,10.0,19.05,-0.801,strong anti,4.0,2.5,-0.703,weak anti
ENSG00000160948,VPS28,10.0,17.15,-0.571,anti,5.0,2.25,-0.329,weak anti
ENSG00000023191,RNH1,10.0,16.775,-0.605,strong anti,4.0,4.0,-0.319,weak anti
ENSG00000165555,NOXRED1,10.0,16.0,0.816,strong pro,4.0,3.938,0.799,pro
ENSG00000137343,ATAT1,10.0,15.575,-0.593,anti,2.0,2.875,-0.403,weak anti
ENSG00000170835,CEL,10.0,15.525,0.712,strong pro,8.0,3.219,0.694,pro


In [56]:
# Every life-history trait except the main one, each loaded from its stage-1 selection file.
other_life_history = [name for name in life_history if name != trait]
other_traits = OrderedDict(
    (name, load_trait(stage_one / f"{name}_selected.tsv", cols, min_other_repeats))
    for name in other_life_history
)
other_traits["gestation_days"]

Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,direction
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000006282,SPATA20,10.0,7.4,0.66,strong pro
ENSG00000066923,STAG3,10.0,6.5,0.424,pro
ENSG00000171121,KCNMB3,10.0,5.9,0.623,strong pro
ENSG00000172531,PPP1CA,9.0,5.25,-0.755,strong anti
ENSG00000105672,ETV2,9.0,4.083,0.681,strong pro
ENSG00000188747,NOXA1,9.0,3.944,0.651,pro
ENSG00000010219,DYRK4,8.0,11.219,0.474,pro
ENSG00000163584,RPL22L1,7.0,4.214,-0.586,anti
ENSG00000165568,AKR1E2,7.0,4.071,-0.519,anti
ENSG00000101190,TCFL5,7.0,2.929,0.65,pro


In [57]:
# Join every other trait onto the main-trait table; overlapping column names
# coming from the joined table receive a "_<trait>" suffix.
joined = main_trait
for trait_name, trait_frame in other_traits.items():
    joined = joined.join(trait_frame, rsuffix=f"_{trait_name}")
#joined.to_csv(locations.output.stage_two / "lifespan_with_traits.tsv", sep="\t", index_label="reference_gene")
joined.columns

Index(['symbol', 'repeats', 'mean_shap', 'mean_kendall_tau', 'direction',
       'repeats_lifespan_stage_1', 'mean_shap_lifespan_stage_1',
       'mean_kendall_tau_lifespan_stage_1', 'direction_lifespan_stage_1',
       'symbol_mass_kg', 'repeats_mass_kg', 'mean_shap_mass_kg',
       'mean_kendall_tau_mass_kg', 'direction_mass_kg', 'symbol_mtGC',
       'repeats_mtGC', 'mean_shap_mtGC', 'mean_kendall_tau_mtGC',
       'direction_mtGC', 'symbol_metabolic_rate', 'repeats_metabolic_rate',
       'mean_shap_metabolic_rate', 'mean_kendall_tau_metabolic_rate',
       'direction_metabolic_rate', 'symbol_temperature', 'repeats_temperature',
       'mean_shap_temperature', 'mean_kendall_tau_temperature',
       'direction_temperature', 'symbol_gestation_days',
       'repeats_gestation_days', 'mean_shap_gestation_days',
       'mean_kendall_tau_gestation_days', 'direction_gestation_days'],
      dtype='object')

In [59]:
def summarize_life_history(row: pd.Series):
    """Describe in one string the other life-history traits a gene is linked to.

    For every trait in ``other_life_history`` whose ``repeats_<trait>`` value in
    ``row`` reaches ``min_other_repeats``, an entry
    ``"<trait> (<direction> | <repeats> | <mean shap> | <mean kendall tau>)"``
    is produced; the entries are joined with ", ". Traits below the limit (or
    with a missing repeats value) are left out.
    """
    summaries = []
    for trait_name in other_life_history:
        repeats = row[f"repeats_{trait_name}"]
        if not repeats >= min_other_repeats:
            continue
        direction = row[f"direction_{trait_name}"]
        shap_value = round(row[f"mean_shap_{trait_name}"], 2)
        tau = round(row[f"mean_kendall_tau_{trait_name}"], 4)
        summaries.append(f"{trait_name} ({direction} | {int(repeats)} | {shap_value} | {tau})")
    return ", ".join(summaries)

# Try the summary on the first joined row.
row = joined.iloc[0]
summarize_life_history(row)

'mass_kg (weak anti | 3 | 7.08 | -0.1927)'

In [79]:
# Add the per-gene trait summary, then keep the main-trait columns plus that summary.
joined["other_life_history_traits"] = joined.apply(summarize_life_history, axis=1)
shap_results = joined[[*main_trait.columns, "other_life_history_traits"]]
shap_results.to_csv(
    locations.output.stage_two / "shap_results.tsv",
    index_label="reference_gene",
    sep="\t",
)
shap_results

Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,direction,repeats_lifespan_stage_1,mean_shap_lifespan_stage_1,mean_kendall_tau_lifespan_stage_1,direction_lifespan_stage_1,other_life_history_traits
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000204498,NFKBIL1,10.0,25.1,-0.563,anti,10.0,7.35,-0.141,weak anti,mass_kg (weak anti | 3 | 7.08 | -0.1927)
ENSG00000010219,DYRK4,10.0,22.1,0.447,pro,10.0,8.375,0.263,weak pro,"mass_kg (weak pro | 4 | 21.19 | 0.2116), gesta..."
ENSG00000167515,TRAPPC2L,10.0,21.05,-0.316,anti,9.0,6.056,-0.168,weak anti,mtGC (anti | 8 | 4.03 | -0.5141)
ENSG00000165501,LRR1,10.0,20.15,0.721,strong pro,3.0,2.25,0.658,weak pro,
ENSG00000142002,DPP9,10.0,19.05,-0.801,strong anti,4.0,2.5,-0.703,weak anti,
ENSG00000160948,VPS28,10.0,17.15,-0.571,anti,5.0,2.25,-0.329,weak anti,
ENSG00000023191,RNH1,10.0,16.775,-0.605,strong anti,4.0,4.0,-0.319,weak anti,mtGC (weak anti | 4 | 2.19 | -0.5196)
ENSG00000165555,NOXRED1,10.0,16.0,0.816,strong pro,4.0,3.938,0.799,pro,
ENSG00000137343,ATAT1,10.0,15.575,-0.593,anti,2.0,2.875,-0.403,weak anti,
ENSG00000170835,CEL,10.0,15.525,0.712,strong pro,8.0,3.219,0.694,pro,mtGC (pro | 5 | 2.4 | 0.6377)


### Load linear ###

In [61]:
def lag_linear(df: pd.DataFrame)->pd.DataFrame:
    """Collapse linear-model rows into one organ summary string per gene.

    Each row is tagged "pro" when ``maxlifespan_sign`` is positive and "anti"
    otherwise, rows are ordered by ``maxlifespan_r2_adj`` (highest first) and
    de-duplicated, and every organ is rendered as
    ``"<organ> (<pro|anti> r^2=<r2 rounded to 4 digits>)"``. The labels of one
    gene are joined with ", ".

    The input frame is left untouched.

    :param df: frame with the columns reference_gene, symbol, organ,
        maxlifespan_adjpval, maxlifespan_r2_adj and maxlifespan_sign
    :return: frame indexed by reference_gene with the columns symbol and organ
    """
    # Column-wise comparison instead of a row-wise apply; it also copes with an empty frame.
    influence = df["maxlifespan_sign"].gt(0).map({True: "pro", False: "anti"})
    # assign() returns a new frame, so the caller's frame does not gain a column.
    selected = (
        df.assign(MLS_influence_linear=influence)
        [["reference_gene", "symbol", "organ", "maxlifespan_adjpval", "maxlifespan_r2_adj", "MLS_influence_linear"]]
        .sort_values(by="maxlifespan_r2_adj", ascending=False)
        .drop_duplicates()
    )
    selected["organ"] = [
        organ + " (" + sign_label + " r^2=" + str(round(r2, 4)) + ")"
        for organ, sign_label, r2 in zip(
            selected["organ"], selected["MLS_influence_linear"], selected["maxlifespan_r2_adj"]
        )
    ]
    # Rows keep their r2 order inside each group, so organs are listed best fit first.
    return (
        selected[["reference_gene", "symbol", "organ"]]
        .groupby(["reference_gene", "symbol"], as_index=False)
        .agg({'organ': ', '.join})
        .set_index("reference_gene")
    )


In [62]:
# Read the linear-model table, name its unnamed first column and summarize it per gene.
linear_organ = (
    pd.read_csv(folder / "linear_models_on_species_vars.csv")
    .rename(columns={"Unnamed: 0": "reference_gene"})
    .pipe(lag_linear)
)
linear_organ.head()

Unnamed: 0_level_0,symbol,organ
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000000003,TSPAN6,Liver (pro r^2=0.4136)
ENSG00000001036,FUCA2,"Brain (anti r^2=0.3894), Heart (anti r^2=0.3263)"
ENSG00000001497,LAS1L,Heart (anti r^2=0.507)
ENSG00000003402,CFLAR,Heart (pro r^2=0.3485)
ENSG00000003436,TFPI,Brain (anti r^2=0.3418)


# Join with linear models #

In [80]:
# Attach the per-gene organ summary of the linear models to the SHAP results.
shap_with_linear = shap_results.join(linear_organ[["organ"]])
shap_with_linear.to_csv(
    locations.output.dir / "results" / "shap_with_linear.tsv",
    index_label="reference_gene",
    sep="\t",
)
shap_with_linear

Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,direction,repeats_lifespan_stage_1,mean_shap_lifespan_stage_1,mean_kendall_tau_lifespan_stage_1,direction_lifespan_stage_1,other_life_history_traits,organ
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000204498,NFKBIL1,10.0,25.1,-0.563,anti,10.0,7.35,-0.141,weak anti,mass_kg (weak anti | 3 | 7.08 | -0.1927),
ENSG00000010219,DYRK4,10.0,22.1,0.447,pro,10.0,8.375,0.263,weak pro,"mass_kg (weak pro | 4 | 21.19 | 0.2116), gesta...",Heart (pro r^2=0.3529)
ENSG00000167515,TRAPPC2L,10.0,21.05,-0.316,anti,9.0,6.056,-0.168,weak anti,mtGC (anti | 8 | 4.03 | -0.5141),"Brain (anti r^2=0.5246), Heart (anti r^2=0.3671)"
ENSG00000165501,LRR1,10.0,20.15,0.721,strong pro,3.0,2.25,0.658,weak pro,,"Heart (pro r^2=0.4585), Lung (pro r^2=0.4286),..."
ENSG00000142002,DPP9,10.0,19.05,-0.801,strong anti,4.0,2.5,-0.703,weak anti,,"Lung (anti r^2=0.5074), Heart (anti r^2=0.3048)"
ENSG00000160948,VPS28,10.0,17.15,-0.571,anti,5.0,2.25,-0.329,weak anti,,Lung (anti r^2=0.3473)
ENSG00000023191,RNH1,10.0,16.775,-0.605,strong anti,4.0,4.0,-0.319,weak anti,mtGC (weak anti | 4 | 2.19 | -0.5196),
ENSG00000165555,NOXRED1,10.0,16.0,0.816,strong pro,4.0,3.938,0.799,pro,,Liver (pro r^2=0.3163)
ENSG00000137343,ATAT1,10.0,15.575,-0.593,anti,2.0,2.875,-0.403,weak anti,,"Lung (anti r^2=0.6775), Heart (anti r^2=0.3218)"
ENSG00000170835,CEL,10.0,15.525,0.712,strong pro,8.0,3.219,0.694,pro,mtGC (pro | 5 | 2.4 | 0.6377),Brain (pro r^2=0.4815)


# Genage annotations #

In [64]:
genage_folder = locations.input.annotations.genage

# Read the GenAge conversion table and keep the distinct rows of the annotation columns used below.
genage_conversions = pd.read_csv(genage_folder.conversion, sep="\t")
genage = genage_conversions.loc[
    :, ["Ensembl", "Gene Symbol", "Organism", "Lifespan Effect", "Longevity Influence", "Method"]
].drop_duplicates()
genage.head(10)

Unnamed: 0,Ensembl,Gene Symbol,Organism,Lifespan Effect,Longevity Influence,Method
0,YPL174C,NIP100,Saccharomyces cerevisiae,decrease,fitness,Deletion
1,YER091C,MET6,Saccharomyces cerevisiae,decrease,fitness,Deletion
2,YDR108W,TRS85,Saccharomyces cerevisiae,decrease,fitness,Deletion
4,YMR135C,GID8,Saccharomyces cerevisiae,decrease,fitness,Deletion
6,YCR024C-A,PMP1,Saccharomyces cerevisiae,decrease,fitness,Deletion
7,YDR181C,SAS4,Saccharomyces cerevisiae,increase,anti,Deletion
8,YJL210W,PEX2,Saccharomyces cerevisiae,decrease,fitness,Deletion
9,YNL079C,TPM1,Saccharomyces cerevisiae,decrease,fitness,Deletion
10,YJR127C,RSF2,Saccharomyces cerevisiae,decrease,fitness,Deletion
11,YMR307W,GAS1,Saccharomyces cerevisiae,decrease,fitness,Deletion


In [65]:
# Ortholog table; the Homo_sapiens column becomes the reference_gene key.
genage_genes = pd.read_csv(genage_folder.orthologs.dir / "all.tsv", sep="\t")
genage_genes = genage_genes.rename(columns={"Homo_sapiens": "reference_gene"})
genage_genes.head(5)

Unnamed: 0,reference_gene,Caenorhabditis_elegans,Drosophila_melanogaster,Mus_musculus,Mesocricetus_auratus,Danio_rerio
0,ENSG00000242265,,,ENSMUSG00000092035,ENSMAUG00000018656,ENSDARG00000109342
1,ENSG00000139990,WBGene00011242,FBgn0250755,ENSMUSG00000049106,ENSMAUG00000004350,ENSDARG00000060320
2,ENSG00000073921,WBGene00006751,FBgn0086372,ENSMUSG00000039361,ENSMAUG00000011448,ENSDARG00000012866;ENSDARG00000014137
3,ENSG00000139687,WBGene00003020,,ENSMUSG00000022105,ENSMAUG00000020865,ENSDARG00000006782
4,ENSG00000119977,WBGene00017120,,ENSMUSG00000025008,ENSMAUG00000021808,


In [70]:
# Attach a human reference gene to every GenAge record via the ortholog table.
# The per-organism pieces are collected in a list and concatenated once instead
# of growing a DataFrame inside the loop. The empty GenAge frame goes first so
# the GenAge columns stay in front of reference_gene in the result.
humanized_parts = [genage.head(0)]
for c in genage_genes.columns[1:]:
    # Ortholog columns use underscores, the GenAge "Organism" values use spaces.
    col = c.replace("_", " ")
    selected_genes = genage_genes[["reference_gene", c]].rename(columns={c: "Ensembl"}).dropna()
    # One cell may hold several ";"-separated ids; split them so explode() yields one row per id.
    selected_genes["Ensembl"] = selected_genes["Ensembl"].astype(str).str.split(";")
    genage_org = genage[genage["Organism"] == col]
    merged = selected_genes.explode("Ensembl").merge(genage_org, on="Ensembl", how="inner")
    humanized_parts.append(merged)
genage_humanized = pd.concat(humanized_parts).drop_duplicates()
genage_humanized.head(10)

Unnamed: 0,Ensembl,Gene Symbol,Organism,Lifespan Effect,Longevity Influence,Method,reference_gene
0,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000095917
1,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000116176
2,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000172236
3,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000197253
4,WBGene00004481,rps-12,Caenorhabditis elegans,increase,anti,Post-developmental RNA interference,ENSG00000112306
5,WBGene00004789,sgk-1,Caenorhabditis elegans,increase,anti,Deletion,ENSG00000101049
6,WBGene00004789,sgk-1,Caenorhabditis elegans,increase,anti,Deletion,ENSG00000104205
7,WBGene00003476,mtm-3,Caenorhabditis elegans,decrease,pro,RNA interference,ENSG00000087053
8,WBGene00003476,mtm-3,Caenorhabditis elegans,decrease,pro,RNA interference,ENSG00000063601
9,WBGene00003476,mtm-3,Caenorhabditis elegans,decrease,pro,RNA interference,ENSG00000003987


### Add genage human info ###

In [71]:
# Human GenAge table (tab-separated); its shape is echoed as a quick check.
genage_human = pd.read_csv(genage_folder.human, delimiter="\t")
genage_human.shape

(307, 6)

### Check for genage genes ###

In [72]:
# Keep the result genes that also have a GenAge record (inner merge on reference_gene).
shap_with_linear_genage = shap_with_linear.merge(genage_humanized, on="reference_gene")
# Export the merged table: the previous call wrote shap_with_linear, so the file
# named genage_intersection.tsv never contained the intersection. In the displayed
# result reference_gene is a regular column, so the positional index is not written.
shap_with_linear_genage.to_csv(
    locations.output.dir / "results" / "genage_intersection.tsv",
    sep="\t",
    index=False,
)
shap_with_linear_genage

Unnamed: 0,reference_gene,symbol,repeats,mean_shap,mean_kendall_tau,direction,other_life_history_traits,organ,Ensembl,Gene Symbol,Organism,Lifespan Effect,Longevity Influence,Method
0,ENSG00000164362,TERT,10.0,14.025,-0.555,anti,"mtGC (weak anti | 3 | 5.92 | -0.4467), gestati...","Heart (anti r^2=0.4954), Liver (anti r^2=0.4836)",ENSMUSG00000021611,Tert,Mus musculus,increase,pro,Overexpression
1,ENSG00000164362,TERT,10.0,14.025,-0.555,anti,"mtGC (weak anti | 3 | 5.92 | -0.4467), gestati...","Heart (anti r^2=0.4954), Liver (anti r^2=0.4836)",ENSMUSG00000021611,Tert,Mus musculus,increase,pro,Knockin
2,ENSG00000164362,TERT,10.0,14.025,-0.555,anti,"mtGC (weak anti | 3 | 5.92 | -0.4467), gestati...","Heart (anti r^2=0.4954), Liver (anti r^2=0.4836)",ENSDARG00000042637,tert,Danio rerio,decrease,pro,Mutation
3,ENSG00000142937,RPS8,10.0,11.525,0.677,strong pro,,,WBGene00004477,rps-8,Caenorhabditis elegans,increase,anti,RNA interference
4,ENSG00000162959,MEMO1,10.0,10.25,-0.676,strong anti,gestation_days (weak anti | 4 | 3.56 | -0.257),"Heart (anti r^2=0.4534), Brain (anti r^2=0.444...",WBGene00016500,memo-1,Caenorhabditis elegans,increase,anti,Mutation


In [218]:
#all_ext_genage = all_ext.merge(genage_humanized, on="reference_gene")
#all_ext_genage.head(5)

In [219]:
#all_ext_human_genage = genage_human.merge(all_ext, on="symbol")
#all_ext_human_genage

# Causality #

### Intersection ###

In [162]:
#causal_anton_5_5 = pd.read_csv(causality / "causal_relations_5_tissues_new_set_Rodrigo.csv")
#causal_current = causal_anton_5_5 
#fathers = (causal_current[causal_current["predicate"] == "is father of"])[["node1"]].drop_duplicates().rename(columns={"node1": "symbol"})
#fathers

In [163]:
"""
causal_merge = current.merge(fathers, on="symbol", how="left", indicator = True).sort_values(by="_merge", ascending=False)
causal_merge["_merge"] = causal_merge["_merge"]=="both"
causal = causal_merge \
  .rename(columns={"_merge": "causal"}) \
  .set_index("reference_gene") \
  .sort_values(by="kendall_tau_to_max_lifespan", ascending=False)
causal.head(10)
"""

'\ncausal_merge = current.merge(fathers, on="symbol", how="left", indicator = True).sort_values(by="_merge", ascending=False)\ncausal_merge["_merge"] = causal_merge["_merge"]=="both"\ncausal = causal_merge   .rename(columns={"_merge": "causal"})   .set_index("reference_gene")   .sort_values(by="kendall_tau_to_max_lifespan", ascending=False)\ncausal.head(10)\n'

# Saving results #

In [221]:
#all.to_csv(intersections / "intersections.tsv", sep = "\t", index = True)
#all_ext.to_csv(intersections / "intersections_5_and_7.tsv", sep = "\t", index = False)

In [220]:
#all_ext_genage.to_csv(intersections / "genage_model_intersections.tsv", sep = "\t", index = False)
#all_ext_human_genage.to_csv(intersections / "genage_human_intersections.tsv", sep = "\t", index = False)