In [1]:
import os
import glob
import time
import numpy as np
import scipy as sp
import pandas as pd
import scanpy as sc
import pickle as pk
import anndata as ad
import requests as rq
import multiprocessing as mp
from tqdm import tqdm,trange
from functools import partial
from scipy.sparse import csr_matrix as csr
from scipy.sparse import csc_matrix as csc
from multiprocessing import Process,Pool
from sklearn.neighbors import NearestNeighbors as NN
from scipy.spatial.distance import cdist
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
gene_info=pd.read_csv(f'../csv/gene_info.csv',header=0,index_col=0)
geneset=gene_info.index
genemap={j:i+1 for i,j in enumerate(gene_info.index)}
hgcn=pd.read_csv('../csv/updated_hgcn.tsv',index_col=1,header=0,sep='\t')
hgcn=hgcn[hgcn['Status']=='Approved']
map_dict={}
alias=hgcn['Alias symbols']
prev=hgcn['Previous symbols']
for i in hgcn.index:
    if alias.loc[i] is not np.nan:
        for j in alias.loc[i].split(', '):
            if j not in hgcn.index:
                map_dict[j]=i
for i in hgcn.index:
    if prev.loc[i] is not np.nan:
        for j in prev.loc[i].split(', '):
            if j not in hgcn.index:
                map_dict[j]=i
egsn=pd.read_csv('../csv/updated_hgcn.tsv',index_col=None,header=0,sep='\t')
egsn=egsn.dropna(subset=['Ensembl gene ID'])
egsn=egsn.set_index('Ensembl gene ID')

In [3]:
def recover(adata):
    raw=np.expm1(adata.X.A)
    scale=np.ones((raw.shape[0],1))
    for i in trange(raw.shape[0]):
        scale[i]=raw[i][raw[i]>0].min()
    raw=csr(raw/scale)
    raw.data+=0.499
    adata.X=raw.astype(int)
def map_gene(adata,geneset,map_dict,key=None):
    if key==None:
        adata.var['gene']=adata.var_names
        key='gene'
    adata.var[key]=adata.var[key].astype(str)
    diff=np.setdiff1d(adata.var[key],geneset)
    print(diff.shape)
    adata.var['old_name']=adata.var[key].values.astype(str)
    adata.var['new_name']=adata.var[key]
    adata.var['origin']=adata.var_names
    adata.var_names=adata.var[key]
    for i in diff:
        x=map_dict.get(i)
        if x is None:
            continue
        if x not in adata.var['new_name'].values:
            adata.var.loc[i,'new_name']=x
    diff=np.setdiff1d(adata.var['new_name'],geneset)
    adata.var_names=adata.var['new_name']
    print(diff.shape)
    return diff
def train_split(adata,celltype_key,val_rate=0,test_rate=0.3,seed=0,batch_key=None):
    np.random.seed(seed)
    adata.obs['train']=0
    if batch_key is None or batch_key not in adata.obs.columns:
        for i in adata.obs[celltype_key].unique():
            idx=adata.obs[celltype_key]==i
            size=idx.sum()
            order=np.random.permutation(size)
            num1=int(np.ceil(size*test_rate))
            num2=int(np.ceil(size*val_rate))
            test=order[:num1]
            val=order[num1:num1+num2]
            test=idx.values.nonzero()[0][test]
            val=idx.values.nonzero()[0][val]
            adata.obs['train'][test]=2
            adata.obs['train'][val]=1
    else:
        adatas=[adata[adata.obs[batch_key]==i].copy() for i in adata.obs[batch_key].unique()]
        for adatai in adatas:
            for i in adatai.obs[celltype_key].unique():
                idx=adatai.obs[celltype_key]==i
                size=idx.sum()
                order=np.random.permutation(size)
                num1=int(np.ceil(size*test_rate))
                num2=int(np.ceil(size*val_rate))
                test=order[:num1]
                val=order[num1:num1+num2]
                test=idx.values.nonzero()[0][test]
                val=idx.values.nonzero()[0][val]
                adatai.obs['train'][test]=2
                adatai.obs['train'][val]=1
        adata=sc.concat(adatas,merge='same')
    return adata

## Preprocess for Cell Annotation

In [4]:
adata=sc.read_h5ad('../datasets/Pancrm.h5ad')
adata.X=csr(adata.X)
adata

AnnData object with n_obs × n_vars = 14767 × 15558
    obs: 'cell_type', 'batch'

Check whether the dataset has already been normalized, usually the gene expression value of normalized data won't exceed 9.21\
If so, recover it to the raw expression value by uncommenting this line.

In [5]:
#recover(adata)

In [6]:
sc.pp.filter_cells(adata,min_genes=1)
sc.pp.filter_genes(adata,min_cells=1)

In [7]:
diff=map_gene(adata,geneset,map_dict,key=None)
adata.var_names=adata.var['new_name']

(1017,)
(273,)


In [8]:
# each celltype in each batch will be uniformly randomly split into trainset, valset and testset
train_split(adata,'cell_type',val_rate=0,test_rate=0.3,seed=0,batch_key='batch')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adatai.obs['train'][test]=2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adatai.obs['train'][val]=1


AnnData object with n_obs × n_vars = 14767 × 15558
    obs: 'cell_type', 'batch', 'n_genes', 'train'
    var: 'n_cells', 'gene', 'old_name', 'new_name', 'origin'

In [9]:
sc.write('../datasets/processed/Pancrm.h5ad',adata)

## Gene Function Prediction

In [None]:
adatas=[sc.read_h5ad(f'../datasets/genefunction//T{i}.h5ad') for i in range(1,4)]
for a in adatas:
    a.X=csr(a.X)

In [None]:
adata=adatas[0].copy()

In [None]:
folds=5
for i in range(3):
    np.random.seed(0)
    adata.var[f'train_t{i+1}']=-1
    for j in [0,1]:
        idx=adata.var[f't{i+1}']==j
        rate=idx.sum()
        order=np.random.permutation(rate)
        num=int(np.round(rate/folds))
        for k in range(folds):
            fold=order[num*k:num*(k+1)+(k==folds-1)*(rate%num)]
            fold=idx.values.nonzero()[0][fold]
            adata.var[f'train_t{i+1}'][fold]=k