In [1]:
from yspecies import *
from yspecies.enums import *
from typing import *

In [2]:
import pandas as pd
import shap
from pprint import pprint
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from scipy.stats import kendalltau
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, recall_score, precision_score, f1_score
from pathlib import Path

# Notebook-wide pandas display settings:
#   * no cap on the number of columns or rows rendered,
#   * floats rendered with three decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = lambda value: '%.3f' % value

# Bootstrap count kept as a module-level constant.
NUMBER_OF_BOOTSTRAPS = 5

# One LabelEncoder instance, held in the notebook's global scope.
le = LabelEncoder()

In [3]:
from IPython.display import HTML, display
import tabulate
def tab(table: List[List[str]]):
    """Show a list of rows as an HTML table in the notebook output.

    Args:
        table: rows of cells; handed to ``tabulate.tabulate`` with
            ``tablefmt='html'`` and the resulting markup is displayed.
    """
    html_markup = tabulate.tabulate(table, tablefmt='html')
    rendered = HTML(html_markup)
    display(rendered)

In [158]:
# Read the tab-separated table of all models, then order it by
# `maxlifespan_sign` and `maxlifespan_r2_adj`, both descending.
# NOTE(review): `linear` is not defined in any cell shown here — presumably a
# pathlib.Path to the directory containing "all_models.tsv"; confirm it is set
# before this cell on a fresh kernel.
models = pd.read_csv(linear / "all_models.tsv", sep="\t")
models = models.sort_values(
    by=["maxlifespan_sign", "maxlifespan_r2_adj"],
    ascending=False,
)
models.head(10)

Unnamed: 0,reference_gene,symbol,organ,human_samples,maxlifespan_adjpval,maxlifespan_r2_adj,maxlifespan_n_obs,maxlifespan_sign,mass_adjpval,mass_r2_adj,mass_n_obs,mass_sign,temperature_adjpval,temperature_r2_adj,temperature_n_obs,temperature_sign,metabolicRate_adjpval,metabolicRate_r2_adj,metabolicRate_n_obs,metabolicRate_sign,gestation_adjpval,gestation_r2_adj,gestation_n_obs,gestation_sign,mtGC_adjpval,mtGC_r2_adj,mtGC_n_obs,mtGC_sign,maxlifespan_uniquely_associated,mass_uniquely_associated,temperature_uniquely_associated,metabolicRate_uniquely_associated,gestation_uniquely_associated,mtGC_uniquely_associated
1909,ENSG00000136436,CALCOCO2,Liver,withoutHumans,0.0,0.857,83,1,0.116,0.068,83,1,0.216,0.03,83,-1,0.306,0.043,78,1,0.0,0.333,83,1,0.0,0.315,83,1,False,False,False,False,False,False
1127,ENSG00000136436,CALCOCO2,Lung,withoutHumans,0.0,0.834,22,1,0.342,0.116,22,1,0.534,0.011,22,1,0.288,0.175,21,1,0.004,0.461,22,1,0.413,0.032,22,1,False,False,False,False,False,False
874,ENSG00000101190,TCFL5,Lung,withoutHumans,0.0,0.766,22,1,0.761,-0.017,22,1,0.434,0.036,22,1,0.743,0.005,21,1,0.002,0.515,22,1,0.017,0.356,22,1,False,False,False,False,False,False
1718,ENSG00000100889,PCK2,Liver,withoutHumans,0.0,0.76,103,1,0.092,0.061,103,1,0.86,-0.008,103,-1,0.311,0.034,98,1,0.0,0.332,102,1,0.0,0.386,102,1,False,False,False,False,False,False
843,ENSG00000088992,TESC,Lung,withoutHumans,0.0,0.753,20,1,0.017,0.469,20,1,0.108,0.213,20,1,0.07,0.368,19,1,0.001,0.604,20,1,0.415,0.035,20,1,False,False,False,False,False,False
1941,ENSG00000141736,ERBB2,Liver,withoutHumans,0.0,0.744,103,1,0.0,0.201,103,1,0.149,0.033,103,-1,0.126,0.061,98,1,0.0,0.56,102,1,0.0,0.386,102,1,False,False,False,False,False,False
1404,ENSG00000170190,SLC16A5,Lung,withoutHumans,0.0,0.742,23,1,0.233,0.162,23,1,0.901,-0.043,23,1,0.393,0.122,22,1,0.001,0.519,23,1,0.178,0.123,23,1,False,False,False,False,False,False
1342,ENSG00000164649,CDCA7L,Lung,withoutHumans,0.0,0.74,23,1,0.395,0.089,23,1,0.088,0.205,23,1,0.369,0.131,22,1,0.002,0.491,23,1,0.055,0.242,23,1,False,False,False,False,False,False
1461,ENSG00000177000,MTHFR,Lung,withoutHumans,0.0,0.711,23,1,0.452,0.068,23,1,0.119,0.175,23,1,0.12,0.27,22,1,0.0,0.711,23,1,0.014,0.36,23,1,False,False,False,False,False,False
1790,ENSG00000112699,GMDS,Liver,withoutHumans,0.0,0.704,101,1,0.014,0.111,101,1,0.04,0.066,101,-1,0.198,0.049,96,1,0.0,0.423,100,1,0.0,0.235,100,1,False,False,False,False,False,False


In [159]:
# Rows of `models` whose `maxlifespan_r2_adj` is at least 0.3, reduced to the
# identifying columns and de-duplicated.
large = models.loc[
    models["maxlifespan_r2_adj"] >= 0.3,
    ["reference_gene", "symbol", "maxlifespan_r2_adj", "maxlifespan_sign", "organ"],
].drop_duplicates()
print(large.shape)
large.head(10)

(2544, 5)


Unnamed: 0,reference_gene,symbol,maxlifespan_r2_adj,maxlifespan_sign,organ
1909,ENSG00000136436,CALCOCO2,0.857,1,Liver
1127,ENSG00000136436,CALCOCO2,0.834,1,Lung
874,ENSG00000101190,TCFL5,0.766,1,Lung
1718,ENSG00000100889,PCK2,0.76,1,Liver
843,ENSG00000088992,TESC,0.753,1,Lung
1941,ENSG00000141736,ERBB2,0.744,1,Liver
1404,ENSG00000170190,SLC16A5,0.742,1,Lung
1342,ENSG00000164649,CDCA7L,0.74,1,Lung
1461,ENSG00000177000,MTHFR,0.711,1,Lung
1790,ENSG00000112699,GMDS,0.704,1,Liver


In [167]:
# Directory (`light_gbm`) from which the significant-gene result tables are read.
# It is named `shap_dir` rather than `shap` so that the `shap` module imported
# at the top of the notebook is not overwritten by a path object.
# NOTE(review): `paper` is not defined in any cell shown here — presumably a
# pathlib.Path; confirm it is set before this cell on a fresh kernel.
shap_dir = paper / "light_gbm"
prev = pd.read_csv(shap_dir / "significant_gene_results_without_skin_blood.csv")
cur = pd.read_csv(shap_dir / "significant_gene_results_latest_5_tissues.csv")

# Cut-off applied to `kendall_tau_to_max_lifespan`: rows above +threshold go to
# `gs_pro`, rows below -threshold go to `gs_anti`.
threshold = 0.2
gs_pro = cur[cur["kendall_tau_to_max_lifespan"] > threshold]
gs_anti = cur[cur["kendall_tau_to_max_lifespan"] < -threshold]

# Rename columns so `gs` shares the `symbol` / `reference_gene` keys used by
# the linear-model table.
gs = gs_pro[["name", "kendall_tau_to_max_lifespan", "ids"]].rename(
    columns={"name": "symbol", "ids": "reference_gene"}
)

In [168]:
# Outer-join `gs` with `large` on gene id and symbol; `indicator=True` adds a
# `_merge` column recording which side(s) each row came from.
merged = gs.merge(
    large,
    how="outer",
    on=["reference_gene", "symbol"],
    indicator=True,
).drop_duplicates()

# Drop rows present only in `large`, i.e. keep rows that `gs` contributed to.
# NOTE(review): despite the name, `intersection` can still contain "left_only"
# rows (genes in `gs` with no match in `large`) — confirm this is intended.
not_only_in_large = merged["_merge"] != "right_only"
intersection = (
    merged.loc[not_only_in_large]
    .sort_values(by="_merge", ascending=False)
    .set_index("reference_gene")
)
intersection

Unnamed: 0_level_0,symbol,kendall_tau_to_max_lifespan,maxlifespan_r2_adj,maxlifespan_sign,organ,_merge
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000154328,NEIL2,0.578,0.535,1.0,Liver,both
ENSG00000107551,RASSF4,0.579,0.323,1.0,Liver,both
ENSG00000154328,NEIL2,0.578,0.477,1.0,Brain,both
ENSG00000004799,PDK4,0.677,0.614,1.0,Lung,both
ENSG00000004799,PDK4,0.677,0.388,1.0,Brain,both
ENSG00000107551,RASSF4,0.579,0.511,1.0,Brain,both
ENSG00000107551,RASSF4,0.579,0.393,1.0,Kidney,both
ENSG00000107551,RASSF4,0.579,0.368,1.0,Heart,both
ENSG00000006282,SPATA20,0.59,0.611,1.0,Heart,both
ENSG00000154328,NEIL2,0.578,0.541,1.0,Lung,both


In [169]:
# Write the table as tab-separated text, including the index
# (`reference_gene`) as a column.
# NOTE(review): the destination is an absolute path; it only works on machines
# where /data/PAPER exists and is writable.
intersection.to_csv(
    "/data/PAPER/intersection_pro.tsv",
    index=True,
    sep="\t",
)