In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project's .py files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import scanpy as sc

import matplotlib.pyplot as plt 
import seaborn as sns


In [3]:
import sys, os
# Make the project's src/ directory importable. The path is relative, so it
# resolves against the kernel's working directory.
sys.path.append('../../src')

# from interaction import Interaction
# compute_auc(yhat, y) is the scoring helper used by every model cell below.
from util import compute_auc

In [4]:
# Response vector: the 'Y' column of the filtered label table, as a NumPy array.
y_path = '/ix/djishnu/Jane/SLIDESWING/jing_data/KIR+TEDDY/data/KIR+TEDDY_Yexpanded_filtered85.csv'
y_table = pd.read_csv(y_path)
y = y_table['Y'].values
y.shape

(10191,)

In [5]:
# Output directory of the SLIDE run; handed to get_genes_from_slide_outs in the next cell.
slide_outs = '/ix/djishnu/Jane/SLIDESWING/jing_data/KIR+TEDDY/KIR+TEDDY_filtered85/KIR+TEDDY_filtered85_noint_output/0.01_0.5_out'

In [6]:
from util import get_genes_from_slide_outs

# Parse the SLIDE output directory into a dict keyed by latent-factor name
# (e.g. 'Z17', 'Z5'). Presumably each value is that factor's gene list —
# verify against util.get_genes_from_slide_outs.
lf_dict = get_genes_from_slide_outs(slide_outs)
lf_dict.keys()

dict_keys(['Z17', 'Z5', 'Z18', 'Z20', 'Z30'])

In [7]:
# Pool the entries of every latent factor and keep each one once.
# np.unique also returns the pooled names in sorted order.
all_genes = np.unique(np.concatenate(list(lf_dict.values())))
len(all_genes)

55

In [8]:
from genept import GenePTEmbedder

# Look up an embedding for every gene in all_genes.
# NOTE(review): later cells pair row j of gene_embeddings with column j of the
# expression frame, which presumes get_gene_info returns rows in the order of
# all_genes — confirm against the GenePTEmbedder implementation.
genept = GenePTEmbedder()
gene_embeddings = genept.get_gene_info(all_genes)
gene_embeddings.shape

(55, 1536)

In [9]:
# Load expression for the latent-factor genes and compute the per-gene
# thresholds that the 0/1 presence/absence matrix is built from.
#
# read_csv ignores the element order of `usecols` and returns columns in file
# order, so the frame is re-indexed to the order of `all_genes`. Downstream
# cells multiply column j of this frame with row j of `gene_embeddings`
# (assumed to follow the order of `all_genes` — TODO confirm for
# GenePTEmbedder.get_gene_info), so the two orders must agree.
gene_order = list(all_genes)
gex_df = pd.read_csv('/ix/djishnu/Jane/SLIDESWING/jing_data/KIR+TEDDY/data/KIR+TEDDY_rna_filtered85.csv',
                     usecols=gene_order)[gene_order]

# Per-gene mean expression across samples, used as the on/off cut-off.
gex_threshes = gex_df.mean(axis=0)

In [10]:
# Binarise expression: 1 where a sample's value is strictly above that gene's
# threshold, 0 otherwise. The comparison keeps gex_df's index and columns.
mask_df = (gex_df > gex_threshes).astype(int)

mask_df.shape

(10191, 55)

In [11]:
# Sanity check: expression and mask should share one shape, and its second
# dimension should equal the number of rows in gene_embeddings.
gex_df.shape, mask_df.shape, gene_embeddings.shape

((10191, 55), (10191, 55), (55, 1536))

In [12]:
# Gate every gene embedding by the sample's 0/1 mask:
#   genept_df[i, j, :] = mask_df[i, j] * gene_embeddings[j, :]
gene_mask = mask_df.values[:, :, np.newaxis]
embedding_stack = np.asarray(gene_embeddings)[np.newaxis, :, :]
genept_df = gene_mask * embedding_stack
genept_df.shape

(10191, 55, 1536)

In [13]:
# Flatten the trailing axes so each row of gex_df maps to one long feature row
# in which the per-gene embedding blocks sit side by side.
n_samples = gex_df.shape[0]
genept_df = genept_df.reshape((n_samples, -1))
genept_df.shape

(10191, 84480)

In [14]:
# Expression-weighted embedding: each row is the sum of the gene embeddings,
# weighted by that row's expression values (a plain matrix product).
wgenept_df = gex_df.dot(gene_embeddings)
wgenept_df.shape

(10191, 1536)

In [15]:
# Baseline: L1-regularised linear model on the raw expression values.
# NOTE(review): the AUC is in-sample — the model is scored on the same rows
# it was fitted on.

from sklearn.linear_model import Lasso

lasso1 = Lasso(alpha=0.1).fit(gex_df.values, y)

yhat = lasso1.predict(gex_df.values)
compute_auc(yhat, y)

0.7901003346607715

In [16]:
# L1-regularised linear model on the 0/1 mean-thresholded expression mask.
# NOTE(review): scored on the training rows, so the AUC is in-sample.

lasso2 = Lasso(alpha=0.1).fit(mask_df.values, y)

yhat = lasso2.predict(mask_df.values)
compute_auc(yhat, y)

0.7811798978403961

In [17]:
# L1-regularised linear model on the mask-gated, flattened gene embeddings.
# NOTE(review): scored on the training rows, so the AUC is in-sample. The
# recorded AUC of 0.5 suggests the predictions carry no ranking signal,
# possibly because every coefficient is shrunk to zero at this alpha —
# verify by inspecting lasso3.coef_.

lasso3 = Lasso(alpha=0.1).fit(genept_df, y)

yhat = lasso3.predict(genept_df)
compute_auc(yhat, y)

0.5

In [18]:
# L1-regularised linear model on the expression-weighted embeddings
# (gex_df @ gene_embeddings).
# NOTE(review): scored on the training rows, so the AUC is in-sample.

lasso4 = Lasso(alpha=0.1).fit(wgenept_df, y)

yhat = lasso4.predict(wgenept_df)
compute_auc(yhat, y)

0.7182413959435096

In [19]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the raw expression values.
# NOTE(review): scored on the training rows, so the AUC is in-sample.
lr = LinearRegression().fit(gex_df.values, y)

yhat = lr.predict(gex_df.values)
compute_auc(yhat, y)

0.8060473258768566

In [20]:
# Ordinary least squares on the 0/1 expression mask.
# NOTE(review): scored on the training rows, so the AUC is in-sample.
lr = LinearRegression().fit(mask_df.values, y)

yhat = lr.predict(mask_df.values)
compute_auc(yhat, y)

0.800227993852562

In [21]:
# Ordinary least squares on the mask-gated, flattened gene embeddings.
# NOTE(review): scored on the training rows, so the AUC is in-sample.
lr = LinearRegression().fit(genept_df, y)

yhat = lr.predict(genept_df)
compute_auc(yhat, y)

0.7978301395019409

In [22]:
# Ordinary least squares on the expression-weighted embeddings.
# NOTE(review): scored on the training rows, so the AUC is in-sample.
lr = LinearRegression().fit(wgenept_df, y)

yhat = lr.predict(wgenept_df)
compute_auc(yhat, y)

0.8043803984671098

In [29]:
from models import SimpleNN

# Project-local network sized to the number of expression columns, with a
# single output unit; weights are (re)initialised before training.
nn = SimpleNN(input_dim=gex_df.shape[1], output_dim=1)
nn.init_weights(nn.model)

# Train on the raw expression values and score on the same rows, so the
# resulting AUC is an in-sample figure.
nn.fit(gex_df.values, y, epochs=50, batch_size=100)
yhat = nn.predict(gex_df.values)
compute_auc(yhat, y)

using cuda device...


Epoch 49 loss: 0.05127400904893875: 100%|██████████| 50/50 [00:04<00:00, 10.84it/s]


0.920689087367176

In [30]:
# Fresh SimpleNN for the 0/1 expression mask (one input per mask column).
nn = SimpleNN(input_dim=mask_df.shape[1], output_dim=1)
nn.init_weights(nn.model)

# Train and score on the same rows, so the resulting AUC is in-sample.
nn.fit(mask_df.values, y, epochs=50, batch_size=100)
yhat = nn.predict(mask_df.values)
compute_auc(yhat, y)

using cuda device...


Epoch 49 loss: 0.08475366979837418: 100%|██████████| 50/50 [00:04<00:00, 10.83it/s]


0.9390547290680417

In [31]:
# Fresh SimpleNN for the mask-gated, flattened gene embeddings (one input per
# flattened embedding column).
nn = SimpleNN(input_dim=genept_df.shape[1], output_dim=1)
nn.init_weights(nn.model)

# Train and score on the same rows, so the resulting AUC is in-sample.
nn.fit(genept_df, y, epochs=50, batch_size=100)
yhat = nn.predict(genept_df)
compute_auc(yhat, y)

using cuda device...


Epoch 49 loss: 0.17152614891529083: 100%|██████████| 50/50 [00:37<00:00,  1.32it/s]


0.8240360206874437

In [35]:
# Fresh SimpleNN for the expression-weighted embeddings.
nn = SimpleNN(input_dim=wgenept_df.shape[1], output_dim=1)
nn.init_weights(nn.model)

# wgenept_df is a DataFrame (result of gex_df @ gene_embeddings), hence .values.
# Train and score on the same rows, so the resulting AUC is in-sample.
nn.fit(wgenept_df.values, y, epochs=50, batch_size=100)
yhat = nn.predict(wgenept_df.values)
compute_auc(yhat, y)

using cuda device...


Epoch 49 loss: 0.16431468725204468: 100%|██████████| 50/50 [00:05<00:00,  9.51it/s]


0.8022074640949802

In [None]:
# Re-check the embedding matrix dimensions before inspecting individual entries.
gene_embeddings.shape

In [None]:
# Row 4 of the flattened array, columns 4*1536 to 5*1536: the block belonging
# to gene index 4 (each gene occupies 1536 consecutive columns after the reshape).
genept_df[4, 4*1536:5*1536]

In [None]:
# Row 4 of the flattened array, the 1536-column block belonging to gene index 5.
genept_df[4, 5*1536:6*1536]

In [None]:
# Entry 5 of gene_embeddings, for comparison with genept_df[4, 5*1536:6*1536].
# Assumes gene_embeddings is a NumPy array so that [5] selects row 5 — TODO confirm.
gene_embeddings[5]

In [None]:
# 0/1 mask of row 4: shows which gene blocks of genept_df[4] were kept (1) or zeroed (0).
mask_df.iloc[4]