# Results intersections #

### Parameters ###
The values below are defaults; they will be overridden by results_intersections.yaml

In [336]:
#Parameters
# NOTE(review): the markdown above says these defaults are overridden by
# results_intersections.yaml - presumably through a parameter-injection tool; confirm.
# Cut-off passed to mark_direction; also selects the R2_THRESHOLD_<n> folder of linear models.
threshold = 0.3
# Minimum "repeats" a gene needs in the stage-two table of the main trait.
min_main_repeats = 9
# Minimum "repeats" a gene needs in the stage-one tables of the other traits.
min_other_repeats = 3
# Main trait; used to build the <trait>_selected.tsv and <trait>.tsv file names.
trait = "lifespan"
# All traits; every entry other than `trait` is loaded as an "other" life-history trait.
life_history = ["lifespan", "mass_kg", "mtGC", "metabolic_rate", "temperature", "gestation_days"]
# When True and ../yspecies exists, ".." is put on sys.path and autoreload is enabled.
debug_local = True

In [337]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '..', '..', '..', '..', '..', '..', '..', '..', '..', '..', '..', '/data/sources/yspecies/notebooks', '/data/miniconda3/envs/yspecies/lib/python38.zip', '/data/miniconda3/envs/yspecies/lib/python3.8', '/data/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/data/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/data/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/anton/.ipython']
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [338]:
from typing import *
from yspecies import *
from yspecies.workflow import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.config import *

In [339]:
from dataclasses import dataclass
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [340]:
#settings
import pprint

# Never truncate displayed frames, neither by columns nor by rows.
for _option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_option, None)
# Show floats with three decimals.
pd.set_option("display.float_format", lambda value: "%.3f" % value)
pp = pprint.PrettyPrinter(indent=4)

### Load paths ###

In [341]:
from pathlib import Path
# Use the current directory as root when it contains ./data, otherwise the parent directory.
locations: Locations = Locations("./" if Path("./data").exists() else "../")

In [342]:
# Output locations used by the rest of the notebook.
external = shap_results = locations.output.external
intersections = locations.output.intersections
shap = external.shap
linear = external.linear
causality = external.dir / "causality"
# round() before int(): threshold * 100 is usually not exact in binary floating point
# (0.29 * 100 == 28.999999999999996), and a bare int() would truncate that to 28
# and point at the wrong R2_THRESHOLD_<n> folder.
folder = linear / f"R2_THRESHOLD_{int(round(threshold * 100))}"

In [343]:
# Folders holding the stage-one and stage-two result tables.
stage_one, stage_two = locations.output.stage_one, locations.output.stage_two
(stage_one, stage_two)

(PosixPath('../data/output/stage_1'), PosixPath('../data/output/stage_2'))

### Function to mark pro and anti genes ###

In [344]:
#used to evaluate the direction
def mark_direction(row, threshold: float, column: str = "mean_kendall_tau", max_repeats: int = 10):
    """Label a gene as pro/anti according to its repeat-weighted correlation.

    The value in ``row[column]`` is scaled by ``row["repeats"] / max_repeats`` and
    the result is compared against ``threshold``:

    * ``>= 2 * threshold``  -> "strong pro"
    * ``>= threshold``      -> "pro"
    * ``<= -2 * threshold`` -> "strong anti"
    * ``<= -threshold``     -> "anti"
    * rounds to 0 at four decimals -> "neutral"
    * otherwise "weak pro" (positive) or "weak anti" (negative)

    :param row: mapping-like object (e.g. a pandas row) providing ``column`` and "repeats"
    :param threshold: cut-off separating weak from regular effects; twice the
        cut-off separates regular from strong effects
    :param column: name of the correlation field to read from ``row``
    :param max_repeats: number of repeats that corresponds to full weight
    :return: one of the labels above, or ``None`` when the weighted correlation
        is NaN (there is no direction to report)
    """
    cor = row[column] / max_repeats * row["repeats"]
    # NaN fails every ordered comparison, so it used to fall through to "anti".
    if cor != cor:
        return None
    if cor >= threshold * 2:
        return "strong pro"
    if cor >= threshold:
        return "pro"
    if cor <= -threshold * 2:
        return "strong anti"
    # Inclusive bound, mirroring the "pro" branch (exactly -threshold used to be "weak anti").
    if cor <= -threshold:
        return "anti"
    # Checked before the sign test so that near-zero values are neutral on both
    # sides of zero (tiny positive values used to be reported as "weak pro").
    if round(cor, 4) == 0.0:
        return "neutral"
    return "weak pro" if cor > 0.0 else "weak anti"

## Summarize lifespan results by life histories ##

In [317]:
# Numeric columns shared by every trait table.
value_cols = ["repeats", "mean_shap", "mean_kendall_tau"]
# The same columns plus the label added by load_trait.
value_cols_directed = [*value_cols, "direction"]
# Columns read from each TSV file.
cols = ["reference_gene", "symbol", *value_cols]
cols

['reference_gene', 'symbol', 'repeats', 'mean_shap', 'mean_kendall_tau']

In [318]:
def load_trait(path: Path, cols: list, min_repeats = 1):
    """Load one trait table and label the direction of every gene.

    :param path: tab-separated file to read
    :param cols: columns to keep; must contain "reference_gene" (becomes the
        index) and "repeats", plus whatever ``mark_direction`` reads
    :param min_repeats: rows with fewer repeats than this are dropped
    :return: the filtered table with an extra "direction" column, computed by
        ``mark_direction`` with the notebook-level ``threshold``
    """
    table = pd.read_csv(path, sep="\t")[cols].set_index("reference_gene")
    # .copy(): the boolean filter returns a slice of `table`, and adding a column
    # to such a slice triggers pandas' SettingWithCopyWarning.
    selected = table[table.repeats >= min_repeats].copy()
    # Built as a plain list rather than with DataFrame.apply so that a selection
    # without rows simply gets an empty column.
    selected["direction"] = [mark_direction(row, threshold=threshold) for _, row in selected.iterrows()]
    return selected

In [319]:
# Stage one keeps every gene with at least one repeat; stage two requires min_main_repeats.
main_trait_stage_1 = load_trait(path=stage_one / f"{trait}_selected.tsv", cols=cols, min_repeats=1)
main_trait_stage_2 = load_trait(path=stage_two / f"{trait}.tsv", cols=cols, min_repeats=min_main_repeats)
# Stage-two genes form the rows; the matching stage-one values are appended with a suffix.
main_trait = main_trait_stage_2.join(
    main_trait_stage_1[value_cols_directed],
    rsuffix=f"_{trait}_stage_1",
)
print(main_trait.shape)
main_trait

(50, 9)


Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,direction,repeats_lifespan_stage_1,mean_shap_lifespan_stage_1,mean_kendall_tau_lifespan_stage_1,direction_lifespan_stage_1
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSG00000167515,TRAPPC2L,10.0,13.78,-0.414,anti,10.0,6.48,-0.279,weak anti
ENSG00000165501,LRR1,10.0,12.9,0.702,strong pro,9.0,3.244,0.688,strong pro
ENSG00000010219,DYRK4,10.0,12.8,0.406,pro,10.0,10.22,0.325,pro
ENSG00000204498,NFKBIL1,10.0,12.42,-0.466,anti,10.0,7.58,-0.128,weak anti
ENSG00000211454,AKR7L,10.0,11.98,-0.015,weak anti,,,,
ENSG00000204231,RXRB,10.0,11.96,-0.602,strong anti,4.0,4.75,-0.152,weak anti
ENSG00000142002,DPP9,10.0,11.88,-0.756,strong anti,7.0,2.371,-0.716,anti
ENSG00000170835,CEL,10.0,10.94,0.685,strong pro,10.0,4.74,0.693,strong pro
ENSG00000165555,NOXRED1,10.0,9.54,0.81,strong pro,8.0,3.25,0.775,strong pro
ENSG00000066923,STAG3,10.0,8.84,0.47,pro,9.0,6.422,0.48,pro


In [320]:
# Every life-history trait except the main one, each loaded from its stage-one table.
other_life_history = [name for name in life_history if name != trait]
other_traits = OrderedDict(
    (name, load_trait(stage_one / f"{name}_selected.tsv", cols, min_other_repeats))
    for name in other_life_history
)
other_traits["gestation_days"].head(10)

Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,direction
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000006282,SPATA20,10.0,6.92,0.671,strong pro
ENSG00000171121,KCNMB3,10.0,5.46,0.635,strong pro
ENSG00000105672,ETV2,10.0,4.0,0.681,strong pro
ENSG00000172531,PPP1CA,9.0,5.867,-0.768,strong anti
ENSG00000165568,AKR1E2,9.0,4.311,-0.504,anti
ENSG00000188747,NOXA1,9.0,3.667,0.625,pro
ENSG00000101190,TCFL5,9.0,3.289,0.66,pro
ENSG00000010219,DYRK4,8.0,9.725,0.454,pro
ENSG00000066923,STAG3,8.0,6.975,0.43,pro
ENSG00000168569,TMEM223,7.0,3.943,-0.512,anti


In [321]:
# Attach each other trait to the main table; columns that clash get a "_<trait>" suffix.
joined = main_trait
for other_name, other_frame in other_traits.items():
    joined = joined.join(other_frame, rsuffix=f"_{other_name}")
#joined.to_csv(locations.output.stage_two / "lifespan_with_traits.tsv", sep="\t", index_label="reference_gene")
joined.columns

Index(['symbol', 'repeats', 'mean_shap', 'mean_kendall_tau', 'direction',
       'repeats_lifespan_stage_1', 'mean_shap_lifespan_stage_1',
       'mean_kendall_tau_lifespan_stage_1', 'direction_lifespan_stage_1',
       'symbol_mass_kg', 'repeats_mass_kg', 'mean_shap_mass_kg',
       'mean_kendall_tau_mass_kg', 'direction_mass_kg', 'symbol_mtGC',
       'repeats_mtGC', 'mean_shap_mtGC', 'mean_kendall_tau_mtGC',
       'direction_mtGC', 'symbol_metabolic_rate', 'repeats_metabolic_rate',
       'mean_shap_metabolic_rate', 'mean_kendall_tau_metabolic_rate',
       'direction_metabolic_rate', 'symbol_temperature', 'repeats_temperature',
       'mean_shap_temperature', 'mean_kendall_tau_temperature',
       'direction_temperature', 'symbol_gestation_days',
       'repeats_gestation_days', 'mean_shap_gestation_days',
       'mean_kendall_tau_gestation_days', 'direction_gestation_days'],
      dtype='object')

In [322]:
row = joined.iloc[0]
def summarize_life_history(row: pd.Series):
    """Build a one-line summary of the other life-history traits for one gene.

    For every trait in ``other_life_history`` whose ``repeats_<trait>`` value in
    ``row`` reaches ``min_other_repeats``, an entry of the form
    ``<trait> (<direction> | <repeats> | <mean shap> | <mean kendall tau>)`` is
    produced; the entries are joined with ", ". Traits without enough repeats
    (including missing values) are left out, so the result may be empty.
    """
    def describe(t):
        rep = row[f"repeats_{t}"]
        # NaN never satisfies >=, so genes missing from a trait are skipped here too.
        if not rep >= min_other_repeats:
            return None
        direction = row[f"direction_{t}"]
        sh = row[f"mean_shap_{t}"]
        tau = row[f"mean_kendall_tau_{t}"]
        return f"{t} ({direction} | {int(rep)} | {round(sh,2)} | {round(tau,4)})"

    described = (describe(t) for t in other_life_history)
    return ", ".join(text for text in described if text is not None)
summarize_life_history(row)

'mtGC (anti | 7 | 5.4 | -0.4983)'

In [323]:
# One summary string per gene, then keep only the main-trait columns plus that summary.
joined["other_life_history_traits"] = joined.apply(summarize_life_history, axis=1)
shap_results = joined[[*main_trait.columns, "other_life_history_traits"]]
shap_results.to_csv(
    locations.output.stage_two / "shap_results.tsv",
    sep="\t",
    index_label="reference_gene",
)
shap_results

Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,direction,repeats_lifespan_stage_1,mean_shap_lifespan_stage_1,mean_kendall_tau_lifespan_stage_1,direction_lifespan_stage_1,other_life_history_traits
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000167515,TRAPPC2L,10.0,13.78,-0.414,anti,10.0,6.48,-0.279,weak anti,mtGC (anti | 7 | 5.4 | -0.4983)
ENSG00000165501,LRR1,10.0,12.9,0.702,strong pro,9.0,3.244,0.688,strong pro,
ENSG00000010219,DYRK4,10.0,12.8,0.406,pro,10.0,10.22,0.325,pro,"mass_kg (weak pro | 3 | 19.13 | 0.2595), gesta..."
ENSG00000204498,NFKBIL1,10.0,12.42,-0.466,anti,10.0,7.58,-0.128,weak anti,"mass_kg (weak anti | 4 | 20.5 | -0.1118), meta..."
ENSG00000211454,AKR7L,10.0,11.98,-0.015,weak anti,,,,,mtGC (weak anti | 4 | 5.45 | -0.3805)
ENSG00000204231,RXRB,10.0,11.96,-0.602,strong anti,4.0,4.75,-0.152,weak anti,
ENSG00000142002,DPP9,10.0,11.88,-0.756,strong anti,7.0,2.371,-0.716,anti,
ENSG00000170835,CEL,10.0,10.94,0.685,strong pro,10.0,4.74,0.693,strong pro,
ENSG00000165555,NOXRED1,10.0,9.54,0.81,strong pro,8.0,3.25,0.775,strong pro,
ENSG00000066923,STAG3,10.0,8.84,0.47,pro,9.0,6.422,0.48,pro,"mass_kg (weak pro | 4 | 23.6 | 0.2807), gestat..."


### Load linear ###

In [324]:
def lag_linear(df: pd.DataFrame)->pd.DataFrame:
    """Collapse per-organ linear-model rows into one summary string per gene.

    ``df`` must provide the columns "reference_gene", "symbol", "organ",
    "maxlifespan_adjpval", "maxlifespan_r2_adj" and "maxlifespan_sign".
    A row is "pro" when ``maxlifespan_sign`` is greater than zero and "anti"
    otherwise. Each row is rendered as ``<organ> (<pro|anti> r^2=<r2 rounded to 4>)``
    and the rows of a gene are joined with ", ", highest adjusted r^2 first.

    The input frame is left untouched (the helper column "MLS_influence_linear"
    used to be added to the caller's frame as a side effect).

    :return: frame indexed by "reference_gene" with the columns "symbol" and "organ"
    """
    influence = df["maxlifespan_sign"].map(lambda sign: "pro" if sign > 0 else "anti")
    selected = (
        df[["reference_gene", "symbol", "organ", "maxlifespan_adjpval", "maxlifespan_r2_adj"]]
        .assign(MLS_influence_linear=influence)
        .sort_values(by="maxlifespan_r2_adj", ascending=False)
        .drop_duplicates()
    )
    r2_text = selected["maxlifespan_r2_adj"].map(lambda value: str(round(value, 4)))
    selected["organ"] = selected["organ"] + " (" + selected["MLS_influence_linear"] + " r^2=" + r2_text + ")"
    return (
        selected[["reference_gene", "symbol", "organ"]]
        .groupby(["reference_gene", "symbol"], as_index=False)
        .agg({'organ': ', '.join})
        .set_index("reference_gene")
    )


In [325]:
# The CSV stores the gene id in its unnamed first column; name it before summarising.
linear_organ = lag_linear(
    pd.read_csv(folder / "linear_models_on_species_vars.csv")
    .rename(columns={"Unnamed: 0": "reference_gene"})
)
linear_organ.head(5)

Unnamed: 0_level_0,symbol,organ
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000000003,TSPAN6,Liver (pro r^2=0.4136)
ENSG00000001036,FUCA2,"Brain (anti r^2=0.3894), Heart (anti r^2=0.3263)"
ENSG00000001497,LAS1L,Heart (anti r^2=0.507)
ENSG00000003402,CFLAR,Heart (pro r^2=0.3485)
ENSG00000003436,TFPI,Brain (anti r^2=0.3418)


# Join with linear models #

In [326]:
# Add the per-organ linear-model summary to every SHAP-selected gene.
shap_with_linear = shap_results.join(linear_organ["organ"])
shap_with_linear.to_csv(
    locations.output.dir / "results" / "shap_with_linear.tsv",
    sep="\t",
    index_label="reference_gene",
)
shap_with_linear

Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,direction,repeats_lifespan_stage_1,mean_shap_lifespan_stage_1,mean_kendall_tau_lifespan_stage_1,direction_lifespan_stage_1,other_life_history_traits,organ
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000167515,TRAPPC2L,10.0,13.78,-0.414,anti,10.0,6.48,-0.279,weak anti,mtGC (anti | 7 | 5.4 | -0.4983),"Brain (anti r^2=0.5246), Heart (anti r^2=0.3671)"
ENSG00000165501,LRR1,10.0,12.9,0.702,strong pro,9.0,3.244,0.688,strong pro,,"Heart (pro r^2=0.4585), Lung (pro r^2=0.4286),..."
ENSG00000010219,DYRK4,10.0,12.8,0.406,pro,10.0,10.22,0.325,pro,"mass_kg (weak pro | 3 | 19.13 | 0.2595), gesta...",Heart (pro r^2=0.3529)
ENSG00000204498,NFKBIL1,10.0,12.42,-0.466,anti,10.0,7.58,-0.128,weak anti,"mass_kg (weak anti | 4 | 20.5 | -0.1118), meta...",
ENSG00000211454,AKR7L,10.0,11.98,-0.015,weak anti,,,,,mtGC (weak anti | 4 | 5.45 | -0.3805),
ENSG00000204231,RXRB,10.0,11.96,-0.602,strong anti,4.0,4.75,-0.152,weak anti,,
ENSG00000142002,DPP9,10.0,11.88,-0.756,strong anti,7.0,2.371,-0.716,anti,,"Lung (anti r^2=0.5074), Heart (anti r^2=0.3048)"
ENSG00000170835,CEL,10.0,10.94,0.685,strong pro,10.0,4.74,0.693,strong pro,,Brain (pro r^2=0.4815)
ENSG00000165555,NOXRED1,10.0,9.54,0.81,strong pro,8.0,3.25,0.775,strong pro,,Liver (pro r^2=0.3163)
ENSG00000066923,STAG3,10.0,8.84,0.47,pro,9.0,6.422,0.48,pro,"mass_kg (weak pro | 4 | 23.6 | 0.2807), gestat...",


# Genage annotations #

In [327]:
genage_folder = locations.input.annotations.genage

# Keep only the annotation columns used below and drop repeated rows.
genage_conversions = pd.read_csv(genage_folder.conversion, sep="\t")
genage = genage_conversions.loc[
    :, ["Ensembl", "Gene Symbol", "Organism", "Lifespan Effect", "Longevity Influence", "Method"]
].drop_duplicates()
genage.head(10)

Unnamed: 0,Ensembl,Gene Symbol,Organism,Lifespan Effect,Longevity Influence,Method
0,YPL174C,NIP100,Saccharomyces cerevisiae,decrease,fitness,Deletion
1,YER091C,MET6,Saccharomyces cerevisiae,decrease,fitness,Deletion
2,YDR108W,TRS85,Saccharomyces cerevisiae,decrease,fitness,Deletion
4,YMR135C,GID8,Saccharomyces cerevisiae,decrease,fitness,Deletion
6,YCR024C-A,PMP1,Saccharomyces cerevisiae,decrease,fitness,Deletion
7,YDR181C,SAS4,Saccharomyces cerevisiae,increase,anti,Deletion
8,YJL210W,PEX2,Saccharomyces cerevisiae,decrease,fitness,Deletion
9,YNL079C,TPM1,Saccharomyces cerevisiae,decrease,fitness,Deletion
10,YJR127C,RSF2,Saccharomyces cerevisiae,decrease,fitness,Deletion
11,YMR307W,GAS1,Saccharomyces cerevisiae,decrease,fitness,Deletion


In [328]:
# Ortholog table: the human gene id becomes the shared key "reference_gene".
genage_genes = (
    pd.read_csv(genage_folder.orthologs.dir / "all.tsv", sep="\t")
    .rename(columns={"Homo_sapiens": "reference_gene"})
)
genage_genes.head(5)

Unnamed: 0,reference_gene,Caenorhabditis_elegans,Drosophila_melanogaster,Mus_musculus,Mesocricetus_auratus,Danio_rerio
0,ENSG00000242265,,,ENSMUSG00000092035,ENSMAUG00000018656,ENSDARG00000109342
1,ENSG00000139990,WBGene00011242,FBgn0250755,ENSMUSG00000049106,ENSMAUG00000004350,ENSDARG00000060320
2,ENSG00000073921,WBGene00006751,FBgn0086372,ENSMUSG00000039361,ENSMAUG00000011448,ENSDARG00000012866;ENSDARG00000014137
3,ENSG00000139687,WBGene00003020,,ENSMUSG00000022105,ENSMAUG00000020865,ENSDARG00000006782
4,ENSG00000119977,WBGene00017120,,ENSMUSG00000025008,ENSMAUG00000021808,


In [329]:
# Translate the model-organism GenAge entries to human genes through the ortholog table.
# The empty head(0) frame goes first so that genage's columns keep leading the result.
genage_parts = [genage.head(0)]
for c in genage_genes.columns[1:]:
    # Ortholog columns are spelled "Mus_musculus", GenAge organisms "Mus musculus".
    col = c.replace("_", " ")
    selected_genes = genage_genes[["reference_gene",c]].rename(columns = {c:"Ensembl"}).dropna()
    # A human gene may list several ';'-separated orthologs; split them so that
    # explode() below yields one row per ortholog id.
    selected_genes["Ensembl"] = selected_genes["Ensembl"].map(lambda ids: ids.split(";"))
    genage_org = genage[genage["Organism"]==col]
    merged = selected_genes.explode("Ensembl").merge(genage_org, on="Ensembl", how="inner")
    genage_parts.append(merged)
# Concatenate once: calling pd.concat inside the loop re-copied the growing frame on every pass.
genage_humanized = pd.concat(genage_parts).drop_duplicates()
genage_humanized.head(10)

Unnamed: 0,Ensembl,Gene Symbol,Organism,Lifespan Effect,Longevity Influence,Method,reference_gene
0,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000095917
1,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000116176
2,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000172236
3,WBGene00006619,try-1,Caenorhabditis elegans,increase,anti,RNA interference,ENSG00000197253
4,WBGene00004481,rps-12,Caenorhabditis elegans,increase,anti,Post-developmental RNA interference,ENSG00000112306
5,WBGene00004789,sgk-1,Caenorhabditis elegans,increase,anti,Deletion,ENSG00000101049
6,WBGene00004789,sgk-1,Caenorhabditis elegans,increase,anti,Deletion,ENSG00000104205
7,WBGene00003476,mtm-3,Caenorhabditis elegans,decrease,pro,RNA interference,ENSG00000087053
8,WBGene00003476,mtm-3,Caenorhabditis elegans,decrease,pro,RNA interference,ENSG00000063601
9,WBGene00003476,mtm-3,Caenorhabditis elegans,decrease,pro,RNA interference,ENSG00000003987


### Add genage human info ###

In [330]:
# Human GenAge table (tab-separated); only its dimensions are inspected here.
genage_human = pd.read_csv(
    genage_folder.human,
    sep="\t",
)
genage_human.shape

(307, 6)

### Check for genage genes ###

In [331]:
# Genes of the SHAP + linear table that also have a GenAge entry via a model-organism ortholog.
shap_with_linear_genage = shap_with_linear.merge(genage_humanized, on="reference_gene")
# After the merge, reference_gene is an ordinary column and the index is a plain
# positional counter. Writing that counter under the label "reference_gene"
# produced a file with two "reference_gene" headers, so the index is left out.
shap_with_linear_genage.to_csv(locations.output.dir / "results" / "genage_intersection.tsv", index=False, sep="\t")
shap_with_linear_genage

Unnamed: 0,reference_gene,symbol,repeats,mean_shap,mean_kendall_tau,direction,repeats_lifespan_stage_1,mean_shap_lifespan_stage_1,mean_kendall_tau_lifespan_stage_1,direction_lifespan_stage_1,other_life_history_traits,organ,Ensembl,Gene Symbol,Organism,Lifespan Effect,Longevity Influence,Method
0,ENSG00000204231,RXRB,10.0,11.96,-0.602,strong anti,4.0,4.75,-0.152,weak anti,,,WBGene00007547,nhr-154,Caenorhabditis elegans,increase,anti,RNA interference
1,ENSG00000140398,NEIL1,10.0,7.98,0.771,strong pro,2.0,2.0,0.661,weak pro,,,ENSMUSG00000032298,Neil1,Mus musculus,decrease,pro,Knockout
2,ENSG00000164362,TERT,10.0,6.86,-0.491,anti,4.0,2.5,-0.466,weak anti,"mtGC (weak anti | 5 | 3.76 | -0.517), gestatio...","Heart (anti r^2=0.4954), Liver (anti r^2=0.4836)",ENSMUSG00000021611,Tert,Mus musculus,increase,pro,Overexpression
3,ENSG00000164362,TERT,10.0,6.86,-0.491,anti,4.0,2.5,-0.466,weak anti,"mtGC (weak anti | 5 | 3.76 | -0.517), gestatio...","Heart (anti r^2=0.4954), Liver (anti r^2=0.4836)",ENSMUSG00000021611,Tert,Mus musculus,increase,pro,Knockin
4,ENSG00000164362,TERT,10.0,6.86,-0.491,anti,4.0,2.5,-0.466,weak anti,"mtGC (weak anti | 5 | 3.76 | -0.517), gestatio...","Heart (anti r^2=0.4954), Liver (anti r^2=0.4836)",ENSDARG00000042637,tert,Danio rerio,decrease,pro,Mutation
5,ENSG00000162959,MEMO1,10.0,5.82,-0.583,anti,,,,,gestation_days (weak anti | 7 | 3.17 | -0.2135),"Heart (anti r^2=0.4534), Brain (anti r^2=0.444...",WBGene00016500,memo-1,Caenorhabditis elegans,increase,anti,Mutation


# Causality #

### Intersection ###

In [332]:
# Snapshot of the causal table left over from an earlier execution, kept for comparison.
# `shap_with_linear_causal` is only assigned in a later cell, so on a freshly
# started kernel it does not exist yet; fall back to an empty table instead of
# stopping the whole run with a NameError.
shap_with_linear_causal_old = globals().get(
    "shap_with_linear_causal",
    pd.DataFrame({"frequency": pd.Series(dtype=float)}),
)
shap_with_linear_causal_old_inner = shap_with_linear_causal_old[shap_with_linear_causal_old.frequency > 0.3]
print(shap_with_linear_causal_old_inner.shape)
shap_with_linear_causal_old_inner

(7, 12)


Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,frequency,direction,repeats_lifespan_stage_1,mean_shap_lifespan_stage_1,mean_kendall_tau_lifespan_stage_1,direction_lifespan_stage_1,other_life_history_traits,organ
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000165501,LRR1,10.0,12.9,0.702,0.34,strong pro,9.0,3.244,0.688,strong pro,,"Heart (pro r^2=0.4585), Lung (pro r^2=0.4286),..."
ENSG00000170835,CEL,10.0,10.94,0.685,0.6,strong pro,10.0,4.74,0.693,strong pro,,Brain (pro r^2=0.4815)
ENSG00000136436,CALCOCO2,10.0,5.98,0.769,0.42,strong pro,4.0,2.1,0.722,weak pro,,"Lung (pro r^2=0.5586), Brain (pro r^2=0.5369),..."
ENSG00000188747,NOXA1,10.0,5.24,0.673,1.0,strong pro,9.0,2.111,0.689,strong pro,"mtGC (pro | 8 | 6.55 | 0.649), gestation_days ...","Brain (pro r^2=0.6271), Kidney (pro r^2=0.6017..."
ENSG00000171121,KCNMB3,10.0,4.44,0.595,0.72,pro,8.0,3.375,0.61,pro,"mtGC (strong pro | 10 | 11.0 | 0.6245), gestat...","Brain (pro r^2=0.4583), Lung (pro r^2=0.4067),..."
ENSG00000198663,C6orf89,9.0,7.156,-0.8,0.94,strong anti,,,,,metabolic_rate (anti | 6 | 6.97 | -0.5674),"Heart (anti r^2=0.6159), Liver (anti r^2=0.4011)"
ENSG00000092529,CAPN3,9.0,5.889,0.471,0.32,pro,4.0,3.25,0.414,weak pro,,"Kidney (pro r^2=0.4455), Heart (pro r^2=0.4098)"


In [333]:
# Causal-selection table, renamed to the column names used elsewhere in this notebook.
causal = (
    pd.read_csv(causality / "causal_selection.tsv", sep="\t")
    .rename(columns={"Gene Code" : "reference_gene", "Gene Name": "symbol", "Relative Frequency": "frequency"})
    .set_index("reference_gene")
)
causal.head(10)

Unnamed: 0_level_0,symbol,frequency
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000188747,NOXA1,1.0
ENSG00000198663,C6orf89,0.94
ENSG00000115488,NEU2,0.94
ENSG00000184983,NDUFA6,0.9
ENSG00000151962,RBM46,0.82
ENSG00000171121,KCNMB3,0.72
ENSG00000170835,CEL,0.6
ENSG00000136436,CALCOCO2,0.42
ENSG00000129187,DCTD,0.4
ENSG00000165501,LRR1,0.34


In [335]:
# Attach the causal frequency and place it at column position 4.
shap_with_linear_causal = (
    shap_with_linear
    .join(causal["frequency"])
    .loc[:, shap_with_linear.columns.insert(4, 'frequency')]
)
# Rows whose frequency is missing do not satisfy "> 0" and are therefore left out.
shap_with_linear_causal_inner = shap_with_linear_causal.loc[shap_with_linear_causal.frequency > 0]
print(shap_with_linear_causal_inner.shape)
shap_with_linear_causal_inner

(12, 12)


Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,frequency,direction,repeats_lifespan_stage_1,mean_shap_lifespan_stage_1,mean_kendall_tau_lifespan_stage_1,direction_lifespan_stage_1,other_life_history_traits,organ
reference_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000165501,LRR1,10.0,12.9,0.702,0.34,strong pro,9.0,3.244,0.688,strong pro,,"Heart (pro r^2=0.4585), Lung (pro r^2=0.4286),..."
ENSG00000142002,DPP9,10.0,11.88,-0.756,0.3,strong anti,7.0,2.371,-0.716,anti,,"Lung (anti r^2=0.5074), Heart (anti r^2=0.3048)"
ENSG00000170835,CEL,10.0,10.94,0.685,0.6,strong pro,10.0,4.74,0.693,strong pro,,Brain (pro r^2=0.4815)
ENSG00000132436,FIGNL1,10.0,7.78,0.587,0.02,pro,9.0,3.422,0.57,pro,gestation_days (weak pro | 5 | 2.88 | 0.4068),"Lung (pro r^2=0.4682), Brain (pro r^2=0.3752)"
ENSG00000143443,C1orf56,10.0,7.14,0.68,0.04,strong pro,,,,,mtGC (weak pro | 4 | 1.75 | 0.6189),"Heart (pro r^2=0.5341), Brain (pro r^2=0.333)"
ENSG00000137343,ATAT1,10.0,7.1,-0.557,0.06,anti,6.0,2.867,-0.457,weak anti,,"Lung (anti r^2=0.6775), Heart (anti r^2=0.3218)"
ENSG00000136436,CALCOCO2,10.0,5.98,0.769,0.42,strong pro,4.0,2.1,0.722,weak pro,,"Lung (pro r^2=0.5586), Brain (pro r^2=0.5369),..."
ENSG00000188747,NOXA1,10.0,5.24,0.673,1.0,strong pro,9.0,2.111,0.689,strong pro,"mtGC (pro | 8 | 6.55 | 0.649), gestation_days ...","Brain (pro r^2=0.6271), Kidney (pro r^2=0.6017..."
ENSG00000171121,KCNMB3,10.0,4.44,0.595,0.72,pro,8.0,3.375,0.61,pro,"mtGC (strong pro | 10 | 11.0 | 0.6245), gestat...","Brain (pro r^2=0.4583), Lung (pro r^2=0.4067),..."
ENSG00000198663,C6orf89,9.0,7.156,-0.8,0.94,strong anti,,,,,metabolic_rate (anti | 6 | 6.97 | -0.5674),"Heart (anti r^2=0.6159), Liver (anti r^2=0.4011)"


# Saving results #

In [345]:
# Write the combined SHAP + linear + causal table, index included.
shap_with_linear_causal.to_csv(
    intersections / "intersections.tsv",
    sep="\t",
    index=True,
)
#all_ext.to_csv(intersections / "intersections_5_and_7.tsv", sep = "\t", index = False)

In [220]:
#all_ext_genage.to_csv(intersections / "genage_model_intersections.tsv", sep = "\t", index = False)
#all_ext_human_genage.to_csv(intersections / "genage_human_intersections.tsv", sep = "\t", index = False)