In [3]:
import os
import pandas as pd
import numpy as np
import sklearn
import h5py
import scanpy as sc
import anndata
import gc
import pickle

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, scale
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.multioutput import MultiOutputRegressor
import lightgbm as lgb
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# %pip install scanpy
# %pip install anndata
# %pip install lightgbm
# %pip install numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.


# Load data 

In [3]:
# Root directory that holds the competition files. Set the
# OPEN_MULTIOME_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original hard-coded location is used.
DATA_DIR = os.environ.get(
    "OPEN_MULTIOME_DATA_DIR",
    "/home/wuxc/data/open_multiome_compete/data/",
)

# Per-cell metadata (contains the `technology` column used to split assays).
CELL_METADATA = os.path.join(DATA_DIR, "metadata.csv")

# CITE-seq files.
CITE_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_cite_inputs.h5")
CITE_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_cite_targets.h5")
CITE_TEST_INPUTS = os.path.join(DATA_DIR, "test_cite_inputs.h5")

# Multiome files.
MULTIOME_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_multi_inputs.h5")
MULTIOME_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_multi_targets.h5")
MULTIOME_TEST_INPUTS = os.path.join(DATA_DIR, "test_multi_inputs.h5")

# Submission template and the id table that accompanies it.
SUBMISSION = os.path.join(DATA_DIR, "sample_submission.csv")
EVALUATION_IDS = os.path.join(DATA_DIR, "evaluation_ids.csv")

In [4]:
# Load the per-cell metadata once, then split it by sequencing technology.
df_cell = pd.read_csv(CELL_METADATA)
df_cell_cite = df_cell.loc[df_cell["technology"].eq("citeseq")]
df_cell_multi = df_cell.loc[df_cell["technology"].eq("multiome")]
# Display both shapes as the cell output.
(df_cell_cite.shape, df_cell_multi.shape)

((119651, 5), (161877, 5))

# Preprocess for CITEseq

In [None]:
%%time

# Number of trailing columns left out of the column window that
# PreprocessCiteseq keeps before running PCA.
col_start = 10_000

class PreprocessCiteseq(BaseEstimator, TransformerMixin):
    """Reduce a 2-D input matrix to 240 PCA components.

    Fitting drops every column that is zero in all fitted rows, keeps a
    window of the remaining columns (see ``take_column_subset``) and fits a
    PCA on that window. ``transform`` replays the same column selection and
    projects with the fitted PCA. Array shapes are printed after each step.

    Attributes set by fitting:
        all_zero_columns: boolean mask over the input columns, True where
            the fitted column was zero in every row.
        pca: the fitted ``PCA`` instance.
    """

    # Maximum width of the column window kept by ``take_column_subset``.
    columns_to_use = 13000

    @staticmethod
    def take_column_subset(X):
        """Return the column window that ends ``col_start`` columns before the end.

        The window is at most ``columns_to_use`` wide and is clamped to the
        available columns, exactly like the negative-index slice
        ``X[:, -(columns_to_use + col_start):-col_start]``. Explicit indices
        are used so that ``col_start == 0`` selects the last columns instead
        of an empty slice (``-0`` is ``0`` in a slice bound).

        Args:
            X: 2-D array indexed as ``X[rows, columns]``.

        Returns:
            The selected columns of ``X`` (a basic slice of the input).
        """
        n_columns = X.shape[1]
        stop = max(n_columns - col_start, 0)
        start = max(stop - PreprocessCiteseq.columns_to_use, 0)
        return X[:, start:stop]

    # The helper was originally defined under this misspelled name while the
    # methods called the correctly spelled one; keep the old spelling as an
    # alias so nothing that referenced it breaks.
    take_columnn_subset = take_column_subset

    def _select_columns(self, X):
        """Drop the fitted all-zero columns, then apply the column window."""
        print(X.shape)
        X = X[:, ~self.all_zero_columns]
        print(X.shape)
        X = PreprocessCiteseq.take_column_subset(X)
        print(X.shape)
        gc.collect()
        return X

    def fit(self, X, y=None):
        """Fit the column mask and the PCA on ``X``; ``y`` is ignored.

        Returns:
            ``self``, following the scikit-learn estimator convention.
        """
        self.fit_transform(X, y)
        return self

    def transform(self, X):
        """Project ``X`` with the column selection and PCA learned by fitting.

        Args:
            X: 2-D array with the same column layout as the fitted data.

        Returns:
            Array of PCA scores with 240 columns.

        Raises:
            AttributeError: if called before ``fit`` / ``fit_transform``.
        """
        X = self._select_columns(X)
        X = self.pca.transform(X)
        print(X.shape)
        return X

    def fit_transform(self, X, y=None, **fit_params):
        """Learn the column mask and PCA from ``X`` and return its PCA scores.

        Args:
            X: 2-D array of training inputs.
            y: ignored; accepted so the transformer can be called the way
                scikit-learn calls ``fit_transform(X, y)``.
            **fit_params: ignored; accepted for the same reason.

        Returns:
            Array of PCA scores with 240 columns.
        """
        gc.collect()
        # Columns that are zero in every fitted row carry no signal.
        self.all_zero_columns = (X == 0).all(axis=0)
        X = self._select_columns(X)

        # copy=False lets PCA overwrite its input. For a NumPy input that
        # input is the copy produced by the boolean-mask indexing above, not
        # the caller's array.
        self.pca = PCA(n_components=240, copy=False, random_state=1)
        X = self.pca.fit_transform(X)
        print(X.shape)
        return X

preprocesser = PreprocessCiteseq()

# Clear any matrix left over from a previous run of this cell before the new
# one is built.
cite_train_x = None
# The raw inputs are passed as a temporary so no extra name keeps them alive.
cite_train_x = preprocesser.fit_transform(
    pd.read_hdf(CITE_TRAIN_INPUTS).to_numpy()
)

cite_train_y = pd.read_hdf(CITE_TRAIN_TARGETS).to_numpy()
print(f"{cite_train_x.shape} {cite_train_y.shape}")

# Modeling & Prediction

In [None]:
# LightGBM hyper-parameters shared by every per-target regressor.
params = {
    'learning_rate': 0.5,
    'metric': 'mae',
    'seed': 42,
    'reg_alpha': 0.0014,
    'reg_lambda': 0.2,
    'colsample_bytree': 0.8,
    'subsample': 0.5,
    # LightGBM only applies row subsampling when subsample_freq > 0; with the
    # default of 0 the 'subsample' value above is silently ignored.
    'subsample_freq': 1,
    'max_depth': 12,
    'num_leaves': 722,
    'min_child_samples': 85,
}

# MultiOutputRegressor fits one independent LGBMRegressor per target column.
model = MultiOutputRegressor(lgb.LGBMRegressor(**params, n_estimators=1000))
print("fitting")
model.fit(cite_train_x, cite_train_y)
print("fit proces done")

# NOTE: despite the variable name, these are predictions on the same rows the
# model was fitted on, so the MSE printed below is an in-sample (training)
# error, not a validation score.
y_va_pred = model.predict(cite_train_x)
mse = mean_squared_error(cite_train_y, y_va_pred)
print(mse)
# Release the training matrices; the prediction cell that follows only needs
# the fitted `model` and `preprocesser`.
del cite_train_x, cite_train_y
gc.collect()

In [None]:
# Apply the preprocessing fitted on the training inputs to the test inputs,
# then predict every target with the fitted model.
cite_test_x = preprocesser.transform(
    pd.read_hdf(CITE_TEST_INPUTS).to_numpy()
)
test_pred = model.predict(cite_test_x)
# The reduced test matrix is not needed once predictions exist.
del cite_test_x
test_pred.shape

# Gene expression embedding

In [23]:
# %pip install torch
%pip install einops

Collecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 285 kB/s eta 0:00:01
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [24]:
import torch
import torch.nn as nn
from einops import rearrange
import numpy as np

In [17]:
# Two random 4-D tensors of identical shape, used as einsum operands in the
# next cell.
x = torch.randn((5, 5, 5, 5))
y = torch.randn(x.shape)

In [19]:
# Contract the shared k axis: for every (b, h, w) position, sum x * y over k.
# Note: an einsum equation cannot merge axes, so a pattern such as
# 'b c h w -> b*w c h' is not valid einsum syntax.
torch.einsum(
    'b k h w, b k h w -> b h w',
    x,
    y,
).size()

torch.Size([5, 5, 5])

In [21]:
# Batched matrix product written as an einsum: the shared last axis k is
# summed out, leaving one (i, j) matrix per batch entry b.
a = torch.randn((10, 20, 30))  # (b=10, i=20, k=30)
c = torch.randn((10, 50, 30))  # (b=10, j=50, k=30)
y1 = torch.einsum(
    'b i k, b j k -> b i j',
    a,
    c,
)  # result shape: (b, i, j) = (10, 20, 50)
print(y1.shape)

torch.Size([10, 20, 50])


In [27]:
dim = 1000  # simulated number of genes (feature width of each token)
x = torch.randn(10, 100, dim)  # (batch, tokens, dim)

# One bias-free linear layer produces the query, key and value projections in
# a single matrix multiply; x stands in for the gene-expression embedding.
to_qkv = nn.Linear(dim, 3 * dim, bias=False)
qkv = to_qkv(x)

# Factor the last axis as (d, 3), move the size-3 axis to the front, and
# unpack it into the three projections.
q, k, v = rearrange(qkv, 'b t (d k) -> k b t d', k=3).unbind(dim=0)
print(q.shape, k.shape, v.shape)

torch.Size([10, 100, 1000]) torch.Size([10, 100, 1000]) torch.Size([10, 100, 1000])
