# Results intersections #

### Parameters ###
Will be overridden by results_intersections.yaml

In [7]:
#Parameters
# Correlation cutoff: read by with_lifehistory/with_direction below and passed to annotated().
threshold = 0.2
# When True (and ../yspecies exists) the next cell puts the parent directory on sys.path.
debug_local = True

In [8]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from typing import *
from yspecies import *
from yspecies.workflow import *
from yspecies.dataset import *
from yspecies.utils import *

In [10]:
from dataclasses import dataclass
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
# Display settings: no column/row truncation, floats rendered with three decimals.
import pprint

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = lambda value: '%.3f' % value

# Shared pretty-printer for nested Python structures.
pp = pprint.PrettyPrinter(indent=4)

### Load paths ###

In [12]:
from pathlib import Path

# Use the current directory as the root when it contains ./data, otherwise the parent directory.
if Path("./data").exists():
    locations: Locations = Locations("./")
else:
    locations: Locations = Locations("../")

In [13]:
# Folder handles under the output location that the rest of the notebook reads from / writes to.
external = locations.output.external
shap_results = external
intersections = locations.output.intersections
shap = external.shap
linear = external.linear
causality = external.dir / "causality"

### Load linear ###

In [62]:
def lag_linear(df: pd.DataFrame)->pd.DataFrame:
    """
    Collapse per-organ linear-model rows into one row per gene.

    Every (gene, organ) row gets a label "<organ> (<pro|anti> r^2=<r2 rounded to 4 digits>)",
    where "pro" means ``maxlifespan_sign > 0`` and anything else is "anti". Labels of the same
    gene are joined with ", " in order of decreasing ``maxlifespan_r2_adj``.

    :param df: frame with the columns reference_gene, symbol, organ, maxlifespan_adjpval,
               maxlifespan_r2_adj and maxlifespan_sign; it is not modified
    :return: frame indexed by reference_gene with the columns ``symbol`` and ``organ``
    """
    # assign() works on a copy, so the caller's frame does not gain an extra column.
    influence = df["maxlifespan_sign"].gt(0).map({True: "pro", False: "anti"})
    labelled = df.assign(MLS_influence_linear=influence)
    wanted = ["reference_gene", "symbol", "organ", "maxlifespan_adjpval", "maxlifespan_r2_adj", "MLS_influence_linear"]
    selected = labelled[wanted].sort_values(by="maxlifespan_r2_adj", ascending=False).drop_duplicates()
    # Column-wise string building also works when the frame is empty (row-wise apply does not).
    r2_text = selected["maxlifespan_r2_adj"].map(lambda r2: str(round(r2, 4)))
    organ_label = selected["organ"] + " (" + selected["MLS_influence_linear"] + " r^2=" + r2_text + ")"
    selected = selected.assign(organ=organ_label)
    return (
        selected[["reference_gene", "symbol", "organ"]]
        .groupby(["reference_gene", "symbol"], as_index=False)
        .agg({'organ': ', '.join})
        .set_index("reference_gene")
    )


In [63]:
# Read the per-organ linear-model table and collapse it to one row per gene.
linear_organ = pd.read_csv(linear / 'with_organ_indicator_columns.tsv', sep="\t").pipe(lag_linear)
linear_organ.head(10)

Unnamed: 0_level_0,symbol,organ
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000000003,TSPAN6,Liver (pro r^2=0.3542)
ENSG00000000457,SCYL3,Lung (pro r^2=0.3503)
ENSG00000001036,FUCA2,Brain (anti r^2=0.4096)
ENSG00000001084,GCLC,Kidney (anti r^2=0.5489)
ENSG00000001461,NIPAL3,Liver (pro r^2=0.3229)
ENSG00000003249,DBNDD1,Kidney (pro r^2=0.3247)
ENSG00000003402,CFLAR,Heart (pro r^2=0.3864)
ENSG00000004059,ARF5,Liver (anti r^2=0.4983)
ENSG00000004139,SARM1,Kidney (pro r^2=0.4836)
ENSG00000004799,PDK4,"Lung (pro r^2=0.6136), Brain (pro r^2=0.3881)"


### Describe life-history ###

In [65]:
def lifehistory(row, extra: list, threshold: float):
    """
    Summarise the columns of ``row`` whose absolute value reaches the cutoff.

    :param row: mapping-like object (e.g. a DataFrame row) indexed by column name
    :param extra: column names to inspect, in output order
    :param threshold: cutoff; its absolute value is compared with the absolute column value
    :return: ", "-joined entries "<column> (<value rounded to 4 digits>)" with the
             "kendall_tau_to_" prefix removed; empty string when nothing passes
    """
    cutoff = abs(threshold)
    entries = []
    for column in extra:
        value = row[column]
        if abs(value) >= cutoff:
            entry = column + " (" + str(round(value, 4)) + ")"
            entries.append(entry.replace("kendall_tau_to_", ""))
    return ', '.join(entries)

def with_lifehistory(genes: pd.DataFrame, lifehistory_threshold: Optional[float] = None):
    """
    Select the main gene columns and add a ``life_history`` summary column.

    :param genes: frame with the columns ids, name, gain_score_to_max_lifespan,
                  kendall_tau_to_max_lifespan and the four kendall_tau_to_* life-history columns
    :param lifehistory_threshold: cutoff handed to ``lifehistory``; when None the
                  notebook-level ``threshold`` parameter is used
    :return: new frame with ids/name renamed to reference_gene/symbol plus ``life_history``
    """
    # Explicit argument wins; otherwise fall back to the notebook parameter.
    cutoff = threshold if lifehistory_threshold is None else lifehistory_threshold
    main = ['ids','name', 'gain_score_to_max_lifespan', 'kendall_tau_to_max_lifespan']
    extra = ["kendall_tau_to_gestation_days", "kendall_tau_to_mass_g", "kendall_tau_to_temperature_celsius", "kendall_tau_to_metabolic_rate"]
    selected_genes = genes[main].rename(columns = {"ids": "reference_gene", "name": "symbol"})
    selected_genes["life_history"] = genes.apply(lambda row: lifehistory(row, extra, cutoff), axis=1)
    return selected_genes

### Mark pro and anti-MLS genes ###

In [66]:
# classifies a correlation value into a pro/anti direction label
def mark_direction(row, threshold: float, col: str = "kendall_tau_to_max_lifespan"):
    """
    Turn the correlation stored in ``row[col]`` into a direction label.

    Checks run in this order (first match wins):
      >= 2*threshold -> "strong pro";  >= threshold -> "pro";  > 0 -> "weak pro";
      rounds to 0 at 4 digits -> "neutral";  >= -threshold -> "weak anti";
      <= -2*threshold -> "strong anti";  anything else -> "anti".

    NOTE(review): a NaN correlation fails every comparison and ends up as "anti";
    the boundaries are not symmetric (+threshold is "pro", -threshold is "weak anti").
    """
    tau = row[col]
    strong_cutoff = threshold * 2
    if tau >= strong_cutoff:
        return "strong pro"
    if tau >= threshold:
        return "pro"
    if tau > 0.0:
        return "weak pro"
    if round(tau, 4) == 0.0:
        return "neutral"
    if tau >= -threshold:
        return "weak anti"
    return "strong anti" if tau <= -strong_cutoff else "anti"
    
# adds the gene direction field
def with_direction(df: pd.DataFrame, pro_threshold: float = 0.2, min_gain: float = 1, field: str = "MLS_influence", correlation_col: str = "kendall_tau_to_max_lifespan"):
    """
    Label every row with its direction and keep the rows with sufficient gain.

    :param df: frame with ``correlation_col`` and ``gain_score_to_max_lifespan``;
               the label column ``field`` is written into it in place
    :param pro_threshold: cutoff forwarded to ``mark_direction``
    :param min_gain: rows with a smaller ``gain_score_to_max_lifespan`` are dropped
    :param field: name of the label column
    :param correlation_col: column holding the correlation to classify
    :return: filtered frame sorted by ``gain_score_to_max_lifespan``, largest first
    """
    # The cutoff comes from the argument rather than from a notebook-level global,
    # so callers that pass pro_threshold actually get it applied.
    df[field] = df.apply(lambda row: mark_direction(row, pro_threshold, correlation_col), axis = 1)
    return df[df["gain_score_to_max_lifespan"]>=min_gain].sort_values(by="gain_score_to_max_lifespan", ascending = False)

# annotations with pro/anti direction and life-history
def annotated(df: pd.DataFrame, threshold: float = 0.2, lifehistory_threshold = 0.2):
    """
    Add the life-history summary and the direction label to a selected-genes frame.

    :param df: raw selected-genes frame as expected by ``with_lifehistory``
    :param threshold: direction cutoff, forwarded to ``with_direction``
    :param lifehistory_threshold: kept for interface compatibility; it is not forwarded
           here because ``with_lifehistory`` is called with the frame only
    :return: annotated frame, filtered and sorted by ``with_direction``
    """
    return with_direction(with_lifehistory(df), pro_threshold=threshold)
    

### Processing lifehistory ###

### Load selected genes ###

In [67]:
# Current SHAP selections for the five- and seven-tissue models.
anton_5_5 = pd.read_csv(shap / "5_tissues_species_validation.csv")
anton_7_5 = pd.read_csv(shap / "7_tissues_species_validation.csv")

# Annotate the earlier bootstrap run and both current selections with the same threshold.
old = pd.read_csv(shap / "5_tissues_anton_species_5_bootstraps.csv", index_col=0).pipe(annotated, threshold=threshold) #anton_5_5
current = anton_5_5.pipe(annotated, threshold=threshold)
current_7 = anton_7_5.pipe(annotated, threshold=threshold)
current

Unnamed: 0,reference_gene,symbol,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,life_history,MLS_influence
63,ENSG00000185880,TRIM69,689024.909,0.151,,weak pro
41,ENSG00000133256,PDE6B,133800.141,0.545,gestation_days (0.6518),strong pro
58,ENSG00000006282,SPATA20,93405.346,0.636,gestation_days (0.6903),strong pro
18,ENSG00000129187,DCTD,71669.584,0.656,gestation_days (0.7374),strong pro
3,ENSG00000157343,ARMC12,19732.875,0.628,gestation_days (0.6295),strong pro
65,ENSG00000107815,TWNK,19014.891,-0.715,,strong anti
9,ENSG00000164879,CA3,18519.766,-0.499,,strong anti
52,ENSG00000107551,RASSF4,16217.082,0.6,,strong pro
8,ENSG00000198663,C6orf89,15864.829,-0.681,gestation_days (-0.7211),strong anti
56,ENSG00000149577,SIDT2,12894.13,-0.534,,strong anti


# Causality #

### Intersection ###

In [68]:
causal_anton_5_5 = pd.read_csv(causality / "causal_relations_5_tissues_new_set_Rodrigo.csv")
causal_current = causal_anton_5_5
# Distinct node1 values of the "is father of" relations, exposed under the column name `symbol`.
fathers = (
    causal_current
    .loc[causal_current["predicate"] == "is father of", ["node1"]]
    .drop_duplicates()
    .rename(columns={"node1": "symbol"})
)
fathers

Unnamed: 0,symbol
0,SPATA20
2,NUCKS1
5,BRAP
10,RASSF4
11,TWNK
17,DCTD
23,PDE6B
29,METTL5
34,IGSF10
37,ARMC12


In [69]:
# Left join keeps every selected gene; the merge indicator records whether the symbol is also a "father".
causal_merge = (
    current
    .merge(fathers, on="symbol", how="left", indicator=True)
    .sort_values(by="_merge", ascending=False)
)
# Collapse the indicator to a boolean: True only when the gene was found on both sides.
causal_merge["_merge"] = causal_merge["_merge"] == "both"
causal = (
    causal_merge
    .rename(columns={"_merge": "causal"})
    .set_index("reference_gene")
    .sort_values(by="kendall_tau_to_max_lifespan", ascending=False)
)
causal.head(10)

Unnamed: 0_level_0,symbol,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,life_history,MLS_influence,causal
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000138382,METTL5,567.972,0.719,,strong pro,True
ENSG00000166436,TRIM66,1093.345,0.696,,strong pro,True
ENSG00000160323,ADAMTS13,1240.185,0.659,,strong pro,False
ENSG00000129187,DCTD,71669.584,0.656,gestation_days (0.7374),strong pro,True
ENSG00000069275,NUCKS1,577.205,0.65,,strong pro,True
ENSG00000006282,SPATA20,93405.346,0.636,gestation_days (0.6903),strong pro,True
ENSG00000157343,ARMC12,19732.875,0.628,gestation_days (0.6295),strong pro,True
ENSG00000113946,CLDN16,12825.009,0.616,gestation_days (0.5818),strong pro,False
ENSG00000168060,NAALADL1,12657.496,0.614,,strong pro,True
ENSG00000107551,RASSF4,16217.082,0.6,,strong pro,True


# Comparison with old results #

In [70]:
# Suffix the columns of the older run so they stay distinguishable after the merge.
old_ext = old.rename(columns={"gain_score_to_max_lifespan": "gain_score_to_max_lifespan_OLD", "kendall_tau_to_max_lifespan": "kendall_tau_to_max_lifespan_OLD", "life_history": "life_history_OLD"})
novel = causal.reset_index()
new_old_comparison = (
    novel
    .merge(old_ext, on=["reference_gene", "symbol"], how="outer", indicator=True)
    .sort_values(by="_merge", ascending=False)
)
# The merge indicator is categorical: rename its categories directly, because
# Series.replace on categorical data is deprecated in recent pandas releases.
new_old_comparison["_merge"] = new_old_comparison["_merge"].cat.rename_categories({"right_only": "old_results", "left_only": "new_results"})
new_old_comparison.head(10)

Unnamed: 0,reference_gene,symbol,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,life_history,MLS_influence_x,causal,gain_score_to_max_lifespan_OLD,kendall_tau_to_max_lifespan_OLD,life_history_OLD,MLS_influence_y,_merge
22,ENSG00000107815,TWNK,19014.891,-0.715,,strong anti,True,29538.657,-0.717,,strong anti,both
16,ENSG00000164879,CA3,18519.766,-0.499,,strong anti,False,11699.457,-0.257,,anti,both
2,ENSG00000160323,ADAMTS13,1240.185,0.659,,strong pro,False,4504.608,0.693,,strong pro,both
3,ENSG00000129187,DCTD,71669.584,0.656,gestation_days (0.7374),strong pro,True,83059.823,0.64,gestation_days (0.7391),strong pro,both
4,ENSG00000069275,NUCKS1,577.205,0.65,,strong pro,True,2750.898,0.715,gestation_days (0.5525),strong pro,both
5,ENSG00000006282,SPATA20,93405.346,0.636,gestation_days (0.6903),strong pro,True,67496.807,0.593,"gestation_days (0.6627), mass_g (0.5261), meta...",strong pro,both
6,ENSG00000157343,ARMC12,19732.875,0.628,gestation_days (0.6295),strong pro,True,14611.846,0.637,gestation_days (0.6541),strong pro,both
20,ENSG00000198663,C6orf89,15864.829,-0.681,gestation_days (-0.7211),strong anti,True,7371.253,-0.746,gestation_days (-0.682),strong anti,both
8,ENSG00000168060,NAALADL1,12657.496,0.614,,strong pro,True,4438.045,0.58,,strong pro,both
9,ENSG00000107551,RASSF4,16217.082,0.6,,strong pro,True,39677.401,0.542,,strong pro,both


In [71]:
# Write the new-vs-old comparison as a tab-separated file, without the positional index.
new_old_comparison.to_csv(intersections / "new_old_comparison.tsv", sep = "\t", index = False)

# Join with linear models #

In [72]:
def shap_with_linear(shap_causal: pd.DataFrame, linear_organ: pd.DataFrame):
    """
    Attach the linear-model organ summary to the SHAP/causality table.

    :param shap_causal: frame indexed by reference_gene with symbol, gain_score_to_max_lifespan,
                        kendall_tau_to_max_lifespan and life_history columns
    :param linear_organ: frame with ``symbol`` and ``organ`` columns (its index is not used by the join)
    :return: left join on ``symbol`` with the result columns renamed, indexed by reference_gene
             and sorted by MLS_gain_score, largest first
    """
    renames = {
        "organ": "organs (r^2) in linear models",
        "life_history": "life_history_kendal_tau",
        "kendall_tau_to_max_lifespan": "MLS_kendall_tau",
        "gain_score_to_max_lifespan": "MLS_gain_score",
    }
    joined = shap_causal.reset_index().merge(linear_organ, on="symbol", how="left")
    renamed = joined.rename(columns=renames)
    by_tau = renamed.sort_values(by="MLS_kendall_tau", ascending=False)
    indexed = by_tau.set_index("reference_gene")
    # The gain-score ordering is the one that the returned frame ends up with.
    return indexed.sort_values(by="MLS_gain_score", ascending=False)

In [73]:
# NOTE(review): `all` shadows the Python builtin of the same name; later cells read this
# variable, so a rename has to be applied to them as well.
all = shap_with_linear(causal, linear_organ)
all.head(10)

Unnamed: 0_level_0,symbol,MLS_gain_score,MLS_kendall_tau,life_history_kendal_tau,MLS_influence,causal,organs (r^2) in linear models
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000185880,TRIM69,689024.909,0.151,,weak pro,True,
ENSG00000133256,PDE6B,133800.141,0.545,gestation_days (0.6518),strong pro,True,
ENSG00000006282,SPATA20,93405.346,0.636,gestation_days (0.6903),strong pro,True,"Heart (pro r^2=0.6114), Lung (pro r^2=0.5763),..."
ENSG00000129187,DCTD,71669.584,0.656,gestation_days (0.7374),strong pro,True,"Liver (pro r^2=0.6644), Brain (pro r^2=0.3519)..."
ENSG00000157343,ARMC12,19732.875,0.628,gestation_days (0.6295),strong pro,True,
ENSG00000107815,TWNK,19014.891,-0.715,,strong anti,True,
ENSG00000164879,CA3,18519.766,-0.499,,strong anti,False,Liver (anti r^2=0.4193)
ENSG00000107551,RASSF4,16217.082,0.6,,strong pro,True,"Brain (pro r^2=0.5112), Kidney (pro r^2=0.3926..."
ENSG00000198663,C6orf89,15864.829,-0.681,gestation_days (-0.7211),strong anti,True,"Liver (anti r^2=0.3951), Kidney (anti r^2=0.3338)"
ENSG00000149577,SIDT2,12894.13,-0.534,,strong anti,False,Liver (anti r^2=0.4537)


In [74]:
# Re-binds `intersections` and `all` using the same expressions as the earlier cells.
intersections = locations.output.intersections
all = shap_with_linear(causal, linear_organ)

In [75]:
# Outer join of the five-tissue table with the seven-tissue selection.
all_ext = (
    all
    .merge(current_7, on=["reference_gene", "symbol"], how="outer", indicator=True)
    .sort_values(by="_merge", ascending=False)
)
# The merge indicator is categorical: rename its categories directly, because
# Series.replace on categorical data is deprecated in recent pandas releases.
all_ext["_merge"] = all_ext["_merge"].cat.rename_categories({"left_only": "five_tissues_model_only", "right_only": "seven_tissues_model_only"})
all_ext

Unnamed: 0,reference_gene,symbol,MLS_gain_score,MLS_kendall_tau,life_history_kendal_tau,MLS_influence_x,causal,organs (r^2) in linear models,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,life_history,MLS_influence_y,_merge
0,ENSG00000185880,TRIM69,689024.909,0.151,,weak pro,True,,675048.001,0.088,,weak pro,both
9,ENSG00000149577,SIDT2,12894.13,-0.534,,strong anti,False,Liver (anti r^2=0.4537),14385.231,-0.576,,strong anti,both
1,ENSG00000133256,PDE6B,133800.141,0.545,gestation_days (0.6518),strong pro,True,,128276.065,0.504,gestation_days (0.6405),strong pro,both
20,ENSG00000166436,TRIM66,1093.345,0.696,,strong pro,True,Liver (pro r^2=0.4859),360.945,0.545,,strong pro,both
19,ENSG00000160323,ADAMTS13,1240.185,0.659,,strong pro,False,Kidney (pro r^2=0.3314),1573.559,0.743,,strong pro,both
16,ENSG00000177084,POLE,1820.758,0.458,gestation_days (0.5213),strong pro,True,,4865.386,0.295,gestation_days (0.5522),pro,both
15,ENSG00000169189,NSMCE1,2700.554,0.465,gestation_days (0.6451),strong pro,False,Liver (pro r^2=0.4798),13737.792,0.743,gestation_days (0.7144),strong pro,both
12,ENSG00000214827,MTCP1,9390.777,-0.603,,strong anti,True,"Brain (anti r^2=0.4052), Kidney (anti r^2=0.3005)",7219.915,-0.618,,strong anti,both
21,ENSG00000069275,NUCKS1,577.205,0.65,,strong pro,True,Liver (pro r^2=0.4727),5379.475,0.74,,strong pro,both
8,ENSG00000198663,C6orf89,15864.829,-0.681,gestation_days (-0.7211),strong anti,True,"Liver (anti r^2=0.3951), Kidney (anti r^2=0.3338)",26612.642,-0.782,"gestation_days (-0.7173), temperature_celsius ...",strong anti,both


# Genage annotations #

In [76]:
genage_folder = locations.input.annotations.genage

# Keep the identifying and effect columns of the GenAge conversion table, without exact duplicates.
genage_conversions = pd.read_csv(genage_folder.conversion, sep="\t")
genage = (
    genage_conversions
    .loc[:, ["Ensembl", "Gene Symbol", "Organism", "Lifespan Effect", "Longevity Influence", "Method"]]
    .drop_duplicates()
)
genage.head(10)

Unnamed: 0,Ensembl,Gene Symbol,Organism,Lifespan Effect,Longevity Influence,Method
0,YPL174C,NIP100,Saccharomyces cerevisiae,decrease,fitness,Deletion
1,YER091C,MET6,Saccharomyces cerevisiae,decrease,fitness,Deletion
2,YDR108W,TRS85,Saccharomyces cerevisiae,decrease,fitness,Deletion
4,YMR135C,GID8,Saccharomyces cerevisiae,decrease,fitness,Deletion
6,YCR024C-A,PMP1,Saccharomyces cerevisiae,decrease,fitness,Deletion
7,YDR181C,SAS4,Saccharomyces cerevisiae,increase,anti,Deletion
8,YJL210W,PEX2,Saccharomyces cerevisiae,decrease,fitness,Deletion
9,YNL079C,TPM1,Saccharomyces cerevisiae,decrease,fitness,Deletion
10,YJR127C,RSF2,Saccharomyces cerevisiae,decrease,fitness,Deletion
11,YMR307W,GAS1,Saccharomyces cerevisiae,decrease,fitness,Deletion


In [77]:
# Shape of the de-duplicated Ensembl column, i.e. the number of distinct identifiers in `genage`.
genage["Ensembl"].drop_duplicates().shape

(2090,)

In [78]:
# Ortholog table; the Homo_sapiens column becomes the join key `reference_gene`.
genage_genes = (
    pd.read_csv(genage_folder.orthologs.dir / "all.tsv", sep="\t")
    .rename(columns={"Homo_sapiens": "reference_gene"})
)
genage_genes.head(5)

Unnamed: 0,reference_gene,Caenorhabditis_elegans,Drosophila_melanogaster,Mus_musculus,Mesocricetus_auratus,Danio_rerio
0,ENSG00000242265,,,ENSMUSG00000092035,ENSMAUG00000018656,ENSDARG00000109342
1,ENSG00000139990,WBGene00011242,FBgn0250755,ENSMUSG00000049106,ENSMAUG00000004350,ENSDARG00000060320
2,ENSG00000073921,WBGene00006751,FBgn0086372,ENSMUSG00000039361,ENSMAUG00000011448,ENSDARG00000012866;ENSDARG00000014137
3,ENSG00000139687,WBGene00003020,,ENSMUSG00000022105,ENSMAUG00000020865,ENSDARG00000006782
4,ENSG00000119977,WBGene00017120,,ENSMUSG00000025008,ENSMAUG00000021808,


In [79]:
# Map GenAge entries of each organism onto human reference genes through the ortholog table.
# The empty seed keeps GenAge's column order first and keeps concat valid when there are no organism columns.
humanized_parts = [genage.head(0)]
for c in genage_genes.columns[1:]:
    col = c.replace("_", " ")  # ortholog column name -> value used in genage["Organism"]
    orthologs = genage_genes[["reference_gene", c]].rename(columns={c: "Ensembl"}).dropna()
    # A cell may hold several ';'-separated ids: split column-wise (also safe when the
    # column is empty after dropna) and explode to one row per id.
    orthologs = orthologs.assign(Ensembl=orthologs["Ensembl"].astype(str).str.split(";")).explode("Ensembl")
    genage_org = genage[genage["Organism"] == col]
    humanized_parts.append(orthologs.merge(genage_org, on="Ensembl", how="inner"))
# Concatenate once instead of growing the frame inside the loop.
genage_humanized = pd.concat(humanized_parts).drop_duplicates()
genage_humanized.head(10)

Unnamed: 0,Ensembl,Gene Symbol,Organism,Lifespan Effect,Longevity Influence,Method,reference_gene
0,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000095917
1,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000116176
2,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000172236
3,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000197253
4,WBGene00004481,rps-12,Caenorhabditis elegans,increase,anti,Post-developmental RNA interference,ENSG00000112306
5,WBGene00004789,sgk-1,Caenorhabditis elegans,increase,anti,Deletion,ENSG00000101049
6,WBGene00004789,sgk-1,Caenorhabditis elegans,increase,anti,Deletion,ENSG00000104205
7,WBGene00003476,mtm-3,Caenorhabditis elegans,decrease,pro,RNA interference,ENSG00000087053
8,WBGene00003476,mtm-3,Caenorhabditis elegans,decrease,pro,RNA interference,ENSG00000063601
9,WBGene00003476,mtm-3,Caenorhabditis elegans,decrease,pro,RNA interference,ENSG00000003987


In [80]:
# Inner join: only model genes that have a humanized GenAge entry remain.
all_ext_genage = all_ext.merge(genage_humanized, how="inner", on="reference_gene")
all_ext_genage.head(5)

Unnamed: 0,reference_gene,symbol,MLS_gain_score,MLS_kendall_tau,life_history_kendal_tau,MLS_influence_x,causal,organs (r^2) in linear models,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,life_history,MLS_influence_y,_merge,Ensembl,Gene Symbol,Organism,Lifespan Effect,Longevity Influence,Method
0,ENSG00000204231,RXRB,,,,,,,3657.395,-0.413,,strong anti,seven_tissues_model_only,WBGene00007547,nhr-154,Caenorhabditis elegans,increase,anti,RNA interference
1,ENSG00000121310,ECHDC2,,,,,,,1451.325,0.299,,pro,seven_tissues_model_only,FBgn0035169,CG13890,Drosophila melanogaster,increase,pro,Overexpression


### Add genage human info ###

In [81]:
# Human GenAge table (tab-separated); the next cell joins it on its `symbol` column.
genage_human = pd.read_csv(genage_folder.human, sep='\t')
genage_human.shape

(307, 6)

In [82]:
# Inner join on the gene symbol: human GenAge entries that were also selected by a model.
all_ext_human_genage = genage_human.merge(all_ext, how="inner", on="symbol")
all_ext_human_genage

Unnamed: 0,GenAge ID,symbol,name,entrez gene id,uniprot,why,reference_gene,MLS_gain_score,MLS_kendall_tau,life_history_kendal_tau,MLS_influence_x,causal,organs (r^2) in linear models,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,life_history,MLS_influence_y,_merge
0,211,SUMO1,small ubiquitin-like modifier 1,7341,SUMO1_HUMAN,upstream,ENSG00000116030,,,,,,,8916.589,0.538,,strong pro,seven_tissues_model_only


# Saving results #

In [83]:
# `all` is indexed by reference_gene, so its index is written; `all_ext` has only a positional index, so it is dropped.
all.to_csv(intersections / "intersections.tsv", sep = "\t", index = True)
all_ext.to_csv(intersections / "intersections_5_and_7.tsv", sep = "\t", index = False)

In [84]:
# Write both GenAge intersections as tab-separated files without the positional index.
all_ext_genage.to_csv(intersections / "genage_model_intersections.tsv", sep = "\t", index = False)
all_ext_human_genage.to_csv(intersections / "genage_human_intersections.tsv", sep = "\t", index = False)