In [2]:
from yspecies import *
from yspecies.enums import *
from yspecies.dataset import *
from yspecies.misc import *

In [3]:
from typing import *
import pandas as pd
import shap
from pprint import pprint
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from scipy.stats import kendalltau
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, recall_score, precision_score, f1_score
from pathlib import Path

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# save label encoder to global scope
le = LabelEncoder()
NUMBER_OF_BOOTSTRAPS = 5

### Load pathes ###

In [8]:
from pathlib import Path
locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")

In [9]:
external = shap_results = locations.output / "external" 
shap = external / "shap"
linear = external / "linear"
causality = external / "causality"

### Load linear ###

In [164]:
def lag_linear(df: pd.DataFrame, sign: int)->pd.DataFrame:
    selected = df[df["maxlifespan_sign"]==sign][["symbol","organ","maxlifespan_adjpval","maxlifespan_r2_adj"]].sort_values(by="maxlifespan_r2_adj", ascending=False).drop_duplicates()
    return selected #.groupby("symbol","organ", as_index=False).agg({'organ': ' '.join, 'maxlifespan_r2_adj': ' '.join})


def pro_linear(df: pd.DataFrame)->pd.DataFrame:
    return lag_linear(df, 1)
   
def anti_linear(df: pd.DataFrame)->pd.DataFrame:
    return lag_linear(df, -1)

In [169]:
anti_linear_organ =  anti_linear(linear_organ).groupby(["reference_gene","symbol"]).agg({'organ': lambda value: value.to_list(), "maxlifespan_r2_adj": lambda value: value.to_list()})
pro_linear_organ =  pro_linear(linear_organ).groupby(["reference_gene","symbol"]).agg({'organ': lambda value: value.to_list(), "maxlifespan_r2_adj": lambda value: value.to_list()})
pro_linear_organ.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,organ,maxlifespan_r2_adj
reference_gene,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000003,TSPAN6,[Liver],[0.35423811520000004]
ENSG00000000457,SCYL3,[Lung],[0.35033806590000005]
ENSG00000001461,NIPAL3,[Liver],[0.322943983]
ENSG00000003249,DBNDD1,[Kidney],[0.3246580106]
ENSG00000003402,CFLAR,[Heart],[0.3863581705]


In [168]:
linear_blk = load_table(linear /'Significant in B,L,K.tsv').set_index("reference_gene")
linear_organ = load_table(linear / 'with_organ_indicator_columns.tsv').set_index("reference_gene")
pro_linear(linear_organ).head(10)

Unnamed: 0_level_0,symbol,organ,maxlifespan_adjpval,maxlifespan_r2_adj
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000136436,CALCOCO2,Liver,0.0,0.857
ENSG00000136436,CALCOCO2,Lung,0.0,0.834
ENSG00000101190,TCFL5,Lung,0.0,0.766
ENSG00000100889,PCK2,Liver,0.0,0.76
ENSG00000088992,TESC,Lung,0.0,0.753
ENSG00000141736,ERBB2,Liver,0.0,0.744
ENSG00000170190,SLC16A5,Lung,0.0,0.742
ENSG00000164649,CDCA7L,Lung,0.0,0.74
ENSG00000177000,MTHFR,Lung,0.0,0.711
ENSG00000112699,GMDS,Liver,0.0,0.704


In [114]:
linear_models = load_table(linear / 'linear_models_on_species_vars.tsv').set_index("reference_gene")
linear_all = load_table(linear / 'Significant in all organs.tsv').set_index("reference_gene")
linear_pro = linear_all[linear_all["maxlifespan_sign"]==1].sort_values(by="maxlifespan_r2_adj", ascending = False)
linear_anti = linear_all[linear_all["maxlifespan_sign"]==-1].sort_values(by="maxlifespan_r2_adj", ascending = True)
linear_pro

Unnamed: 0_level_0,symbol,organ,human_samples,maxlifespan_adjpval,maxlifespan_r2_adj,maxlifespan_n_obs,maxlifespan_sign,mass_adjpval,mass_r2_adj,mass_n_obs,mass_sign,temperature_adjpval,temperature_r2_adj,temperature_n_obs,temperature_sign,metabolicRate_adjpval,metabolicRate_r2_adj,metabolicRate_n_obs,metabolicRate_sign,gestation_adjpval,gestation_r2_adj,gestation_n_obs,gestation_sign,mtGC_adjpval,mtGC_r2_adj,mtGC_n_obs,mtGC_sign,maxlifespan_uniquely_associated,mass_uniquely_associated,temperature_uniquely_associated,metabolicRate_uniquely_associated,gestation_uniquely_associated,mtGC_uniquely_associated,is_maxls_associated_in_Brain,is_maxls_associated_in_Heart,is_maxls_associated_in_Lung,is_maxls_associated_in_Liver,is_maxls_associated_in_Kidney
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
ENSG00000136436,CALCOCO2,Liver,withoutHumans,0.0,0.857,83,1,0.116,0.068,83,1,0.216,0.03,83,-1,0.306,0.043,78.0,1,0.0,0.333,83.0,1,0.0,0.315,83,1,False,False,False,False,False,False,True,True,True,True,True
ENSG00000136436,CALCOCO2,Lung,withoutHumans,0.0,0.834,22,1,0.342,0.116,22,1,0.534,0.011,22,1,0.288,0.175,21.0,1,0.004,0.461,22.0,1,0.413,0.032,22,1,False,False,False,False,False,False,True,True,True,True,True
ENSG00000269190,FBXO17,Liver,withoutHumans,0.0,0.693,97,1,0.003,0.153,97,1,0.147,0.035,97,-1,0.079,0.078,92.0,1,0.0,0.586,96.0,1,0.0,0.233,96,1,False,False,False,False,False,False,True,True,True,True,True
ENSG00000136436,CALCOCO2,Heart,withoutHumans,0.0,0.693,29,1,0.28,0.108,29,1,0.885,-0.033,29,-1,0.262,0.146,27.0,1,0.002,0.406,29.0,1,0.013,0.297,29,1,False,False,False,False,False,False,True,True,True,True,True
ENSG00000136436,CALCOCO2,Brain,withoutHumans,0.0,0.691,89,1,0.052,0.088,89,1,0.928,-0.011,89,-1,0.227,0.054,80.0,1,0.0,0.361,89.0,1,0.0,0.31,89,1,False,False,False,False,False,False,True,True,True,True,True
ENSG00000006282,SPATA20,Heart,withoutHumans,0.0,0.611,29,1,0.36,0.08,29,1,0.866,-0.031,29,-1,0.074,0.262,27.0,1,0.0,0.691,29.0,1,0.001,0.478,29,1,False,False,False,False,False,False,True,True,True,True,True
ENSG00000006282,SPATA20,Lung,withoutHumans,0.0,0.576,23,1,0.477,0.06,23,1,0.645,-0.011,23,1,0.112,0.277,22.0,1,0.0,0.748,23.0,1,0.023,0.318,23,1,False,False,False,False,False,False,True,True,True,True,True
ENSG00000269190,FBXO17,Kidney,withoutHumans,0.0,0.562,53,1,0.937,-0.018,53,1,0.888,-0.017,53,-1,0.974,-0.02,51.0,-1,0.015,0.162,53.0,1,0.035,0.126,53,1,False,False,False,False,False,False,True,True,True,True,True
ENSG00000006282,SPATA20,Brain,withoutHumans,0.0,0.531,100,1,0.777,-0.005,100,1,0.617,-0.002,100,1,0.94,-0.01,91.0,1,0.0,0.397,98.0,1,0.0,0.558,98,1,False,False,False,False,False,False,True,True,True,True,True
ENSG00000165501,LRR1,Lung,withoutHumans,0.001,0.519,23,1,0.427,0.078,23,1,0.603,-0.003,23,1,0.407,0.117,22.0,1,0.006,0.419,23.0,1,0.271,0.077,23,1,False,False,False,False,False,False,True,True,True,True,True


In [113]:
linear_all.shape

(20, 38)

### Load shap ###

In [104]:
def pro_genes(df: pd.DataFrame, threshold:float = 0.3) -> pd.DataFrame:
    return df[df["kendall_tau_to_max_lifespan"]>threshold].sort_values(by="kendall_tau_to_max_lifespan", ascending=False)

def pro(df: pd.DataFrame, threshold:float = 0.3) -> pd.DataFrame:
    return pro_genes(df, threshold)[['ids','name', 'gain_score_to_max_lifespan', 'kendall_tau_to_max_lifespan']]

def anti_genes(df: pd.DataFrame, threshold:float = -0.3) -> pd.DataFrame:
    return df[df["kendall_tau_to_max_lifespan"]<threshold].sort_values(by="kendall_tau_to_max_lifespan", ascending=True)

def anti(df: pd.DataFrame, threshold:float = -0.3) -> pd.DataFrame:
    return anti_genes(df, threshold)[['ids','name', 'gain_score_to_max_lifespan', 'kendall_tau_to_max_lifespan']]

In [186]:
anton_5_5 = pd.read_csv(shap / "5_tissues_anton_species_5_bootstraps.csv", index_col=0)
eugen_5_5 = pd.read_csv(shap / "5_tissues_data_11_06_eugene_species.csv", index_col=0)
anton_7_4 = pd.read_csv(shap / "7_tissues_anton_species_4_bootstraps.csv", index_col=0)
current = anton_5_5
pro_short(current)

Unnamed: 0,ids,name,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan
89,ENSG00000069275,NUCKS1,2750.898,0.715
14,ENSG00000160323,ADAMTS13,4504.608,0.693
39,ENSG00000129187,DCTD,83059.823,0.64
53,ENSG00000146215,CRIP3,1580.386,0.639
6,ENSG00000157343,ARMC12,14611.846,0.637
75,ENSG00000116030,SUMO1,1250.186,0.62
65,ENSG00000148175,STOM,1011.608,0.607
43,ENSG00000006282,SPATA20,67496.807,0.593
45,ENSG00000073146,MOV10L1,4217.064,0.584
25,ENSG00000168060,NAALADL1,4438.045,0.58


In [191]:
pro_short(anton_5_5)[["ids" ,"name"]].merge(pro_short(anton_7_4)[["ids" ,"name"]], on=["ids" ,"name"], how="outer", indicator=True).sort_values("_merge", ascending=False)

Unnamed: 0,ids,name,_merge
0,ENSG00000069275,NUCKS1,both
12,ENSG00000107551,RASSF4,both
20,ENSG00000083896,YTHDC1,both
18,ENSG00000106066,CPVL,both
16,ENSG00000134308,YWHAQ,both
1,ENSG00000160323,ADAMTS13,both
14,ENSG00000133256,PDE6B,both
13,ENSG00000166436,TRIM66,both
15,ENSG00000099783,HNRNPM,both
7,ENSG00000006282,SPATA20,both


In [183]:
anti_short(current)

Unnamed: 0,ids,name,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan
45,ENSG00000170222,ADPRM,17010.513,-0.765
32,ENSG00000047230,CTPS2,11354.686,-0.717
8,ENSG00000198663,C6orf89,18147.176,-0.716
64,ENSG00000135940,COX5B,909.851,-0.687
1,ENSG00000214827,MTCP1,6574.88,-0.656
125,ENSG00000147123,NDUFB11,32642.913,-0.641
53,ENSG00000107815,TWNK,16575.278,-0.602
103,ENSG00000143771,CNIH4,94.746,-0.545
38,ENSG00000069869,NEDD4,22288.713,-0.542
62,ENSG00000163584,RPL22L1,376.824,-0.522


### Causality ###

### Intesection ###

In [107]:
causal_anton_5_5 = pd.read_csv(causality / "causal_relations_5_tissues_species_5.csv")
fathers = (causal_anton_5_5[causal_anton_5_5["predicate"] == "is father of"])[["node1"]].drop_duplicates().rename(columns={"node1": "name"})
fathers

Unnamed: 0,name
0,PICALM
8,SPATA20
14,NUCKS1
18,HNRNPM
28,DCTD
34,LIMD2
36,NDUFB11
37,STOM
39,SIDT2
41,ADAMTS13


### pro-longevity ###

In [108]:
pro_cur_merge = pro(current).merge(fathers, on="name", how="left", indicator = True).sort_values(by="_merge", ascending=False)
pro_cur_merge["_merge"] = pro_cur_merge["_merge"]=="both"
pro_cur = pro_cur_merge.rename(columns={"_merge": "causal", "ids": "reference_gene", "name": "symbol"}).set_index("reference_gene")
pro_cur

Unnamed: 0_level_0,symbol,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,causal
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000069275,NUCKS1,2750.898,0.715,True
ENSG00000006282,SPATA20,67496.807,0.593,True
ENSG00000198746,GPATCH3,632.153,0.431,True
ENSG00000099783,HNRNPM,477.721,0.458,True
ENSG00000166436,TRIM66,398.831,0.533,True
ENSG00000160323,ADAMTS13,4504.608,0.693,True
ENSG00000170417,TMEM182,623.907,0.573,True
ENSG00000148175,STOM,1011.608,0.607,True
ENSG00000129187,DCTD,83059.823,0.64,True
ENSG00000073146,MOV10L1,4217.064,0.584,False


In [173]:
pro_all = pro_cur.merge(pro_linear_organ,on="reference_gene").rename({"organ": "organs_in_linear_models", "maxlifespan_r2_adj": "r2_adjusted_in_linear_models"})
pro_all

Unnamed: 0_level_0,symbol,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,causal,organ,maxlifespan_r2_adj
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000069275,NUCKS1,2750.898,0.715,True,[Liver],[0.4726572737]
ENSG00000006282,SPATA20,67496.807,0.593,True,"[Heart, Lung, Brain, Kidney, Liver]","[0.6113832167000001, 0.5762836314, 0.531100971..."
ENSG00000166436,TRIM66,398.831,0.533,True,[Liver],[0.4858931016]
ENSG00000160323,ADAMTS13,4504.608,0.693,True,[Kidney],[0.3313540606]
ENSG00000148175,STOM,1011.608,0.607,True,"[Liver, Brain, Heart]","[0.5172275418, 0.4163506322, 0.3428729781]"
ENSG00000129187,DCTD,83059.823,0.64,True,"[Liver, Brain, Heart, Lung]","[0.6643585026, 0.3519032766, 0.3409784978, 0.3..."
ENSG00000073146,MOV10L1,4217.064,0.584,False,[Brain],[0.35540965729999996]
ENSG00000168060,NAALADL1,4438.045,0.58,False,[Lung],[0.37849600710000003]
ENSG00000088992,TESC,8600.729,0.545,False,"[Lung, Heart, Liver, Brain]","[0.7526756858, 0.5605093436, 0.410325594099999..."
ENSG00000107551,RASSF4,39677.401,0.542,False,"[Brain, Kidney, Heart, Liver]","[0.5111682018, 0.3925895854, 0.367612583500000..."


In [180]:
intersections = locations.output / "intersections"
pro_all.to_csv(intersections / "pro_intersections.tsv", sep = "\t", index = True)

### anti-longevity ###

In [109]:
anti_cur_merge = anti(current).merge(fathers, on="name", how="left", indicator = True).sort_values(by="_merge", ascending=False)
anti_cur_merge["_merge"] = anti_cur_merge["_merge"]=="both"
anti_cur = anti_cur_merge.rename(columns={"_merge": "causal", "ids": "reference_gene", "name": "symbol"}).set_index("reference_gene")
anti_cur

Unnamed: 0_level_0,symbol,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,causal
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000198663,C6orf89,7371.253,-0.746,True
ENSG00000147123,NDUFB11,7288.644,-0.703,True
ENSG00000149577,SIDT2,17588.014,-0.54,True
ENSG00000185271,KLHL33,5888.169,-0.525,True
ENSG00000214827,MTCP1,36295.636,-0.484,True
ENSG00000107815,TWNK,29538.657,-0.717,False
ENSG00000085840,ORC1,17409.994,-0.589,False
ENSG00000132646,PCNA,319.946,-0.439,False
ENSG00000152580,IGSF10,5499.678,-0.397,False


In [174]:
anti_all = anti_cur.merge(anti_linear_organ,on="reference_gene").rename({"organ": "organs_in_linear_models", "maxlifespan_r2_adj": "r2_adjusted_in_linear_models"})
anti_all

Unnamed: 0_level_0,symbol,gain_score_to_max_lifespan,kendall_tau_to_max_lifespan,causal,organ,maxlifespan_r2_adj
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000198663,C6orf89,7371.253,-0.746,True,"[Liver, Kidney]","[0.39511734670000004, 0.3338116818]"
ENSG00000149577,SIDT2,17588.014,-0.54,True,[Liver],[0.45368042049999996]
ENSG00000214827,MTCP1,36295.636,-0.484,True,"[Brain, Kidney]","[0.40521433409999996, 0.3005427027]"
ENSG00000132646,PCNA,319.946,-0.439,False,[Brain],[0.3307226882]


In [181]:
anti_all.to_csv(intersections / "anti_intersections.tsv", sep = "\t", index = True)