In [1]:
from zipfile import ZipFile
import pandas as pd
import math
import numpy as np

In [7]:
# Locations of the PBMC-A data table and its label table.
data_path, label_path = (
    "./data/PBMC-A/data.csv",
    "./data/PBMC-A/label.csv",
)

In [8]:
# Both CSVs are comma separated; the first row is the header and the first
# column becomes the row index.
data = pd.read_csv(data_path, sep=',', header=0, index_col=0)
y = pd.read_csv(label_path, sep=',', header=0, index_col=0)

In [9]:
# Display the loaded data frame (the bare last expression is rendered by the notebook).
data

Unnamed: 0,AL627309.1,RP11-206L10.2,RP11-206L10.9,LINC00115,FAM41C,NOC2L,KLHL17,PLEKHN1,RP11-54O7.17,HES4,...,MT-ND4,MT-ND5,MT-ND6,MT-CYB,AC145212.1,AL592183.1,AL354822.1,KIR2DL2,PNRC2-1,SRSF10-1
GAGAAATGCCTATT-1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,4,0,0,0,0,0,0
GATTTAGATGCTAG-1,0,0,0,0,0,0,0,0,0,0,...,1,1,0,6,0,0,0,0,0,0
GATCTTTGGGACGA-1,0,0,0,0,0,0,0,0,0,0,...,8,0,0,4,0,0,0,0,0,0
AATACCCTCGAATC-1,0,0,0,0,0,0,0,0,0,0,...,6,0,0,1,0,0,0,0,0,0
GTCACAGAACCTTT-1,0,0,0,0,0,0,0,0,0,0,...,3,3,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACCGAAACTAGACC-1,0,0,0,0,0,0,0,0,0,0,...,6,0,0,2,0,0,0,0,0,0
CGGATAACGTTTCT-1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
AGCCGGTGATCAGC-1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
GAGTCAACCACTGA-1,0,0,0,0,0,0,0,0,0,0,...,2,0,0,3,0,0,0,0,0,0


In [10]:
# Display the loaded label frame (the bare last expression is rendered by the notebook).
y

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
GAGAAATGCCTATT-1,CD4 T cells
GATTTAGATGCTAG-1,CD8 T cells
GATCTTTGGGACGA-1,FCGR3A+ Monocytes
AATACCCTCGAATC-1,B cells
GTCACAGAACCTTT-1,B cells
...,...
ACCGAAACTAGACC-1,CD4 T cells
CGGATAACGTTTCT-1,CD4 T cells
AGCCGGTGATCAGC-1,FCGR3A+ Monocytes
GAGTCAACCACTGA-1,CD4 T cells


In [6]:
import scanpy as sc
def Selecting_highly_variable_genes(X, highly_genes):
    """Return the log-normalised matrix restricted to highly variable genes.

    Steps, all delegated to scanpy: wrap ``X`` in an AnnData, keep genes
    with ``min_cells=3``, normalise every cell to 1e4 counts, apply
    ``log1p``, flag highly variable genes and keep only the flagged columns.

    Args:
        X: Matrix handed to ``sc.AnnData`` (presumably cells x genes --
            confirm against the caller).
        highly_genes: Forwarded as ``n_top_genes`` to
            ``sc.pp.highly_variable_genes``.

    Returns:
        The ``.X`` matrix of the AnnData after subsetting to the columns
        flagged in ``var['highly_variable']``.
    """
    ann = sc.AnnData(X)
    ann.var_names_make_unique()

    # Gene filtering followed by per-cell normalisation and log transform.
    sc.pp.filter_genes(ann, min_cells=3)
    sc.pp.normalize_per_cell(ann, counts_per_cell_after=1e4)
    sc.pp.log1p(ann)
    ann.raw = ann  # keep the pre-subset state on .raw

    sc.pp.highly_variable_genes(
        ann,
        min_mean=0.0125,
        max_mean=3,
        min_disp=0.5,
        n_top_genes=highly_genes,
    )
    hvg_mask = ann.var['highly_variable']
    return ann[:, hvg_mask].copy().X


def binomial_deviance(data,p,n):
    """Compute one binomial-deviance score per column of ``data``.

    For column ``x`` the score is the sum over rows ``y`` of::

        data[y, x] * log(data[y, x] / (n[y] * p[x]) + 1e-5)
        + (n[y] - data[y, x])
          * log((n[y] - data[y, x]) / (n[y] * (1 - p[x])) + 1e-5)

    The first ratio divides by the product ``n[y] * p[x]``. Writing it as
    ``data / n * p`` multiplies by ``p`` instead (operator precedence) and
    is inconsistent with the second term, so the product is parenthesised
    explicitly here.

    Args:
        data: 2-D array indexed ``[row, column]``; the leading
            ``len(n)`` x ``len(p)`` part is used.
        p: Sequence with one value per column.
        n: Sequence with one value per row.

    Returns:
        1-D float array of length ``len(p)``.
    """
    col_prob = np.asarray(p, dtype=float)
    row_total = np.asarray(n, dtype=float)
    # Restrict to the len(n) x len(p) region, matching the index ranges the
    # element-wise loop version iterated over.
    counts = np.asarray(data, dtype=float)[:row_total.size, :col_prob.size]
    row_total = row_total[:, np.newaxis]  # broadcast along columns
    eps = 1e-5  # keeps log() finite when a ratio is exactly zero

    expected = row_total * col_prob
    remainder = row_total - counts
    expected_remainder = row_total * (1.0 - col_prob)

    terms = (
        counts * np.log(counts / expected + eps)
        + remainder * np.log(remainder / expected_remainder + eps)
    )
    return terms.sum(axis=0)


def UMI_cell(data):
    """Return the sum of ``data`` along axis 1 (one total per row)."""
    return np.sum(data, axis=1)

def abundant(data):
    """Return each column's share of the grand total of ``data``.

    Args:
        data: 2-D array-like of numbers (anything ``np.sum`` accepts with an
            ``axis`` argument, including nested lists).

    Returns:
        Column sums divided by the sum of all per-row totals; one value per
        column. If the grand total is zero the division follows NumPy's
        usual divide-by-zero behaviour.
    """
    column_totals = np.sum(data, axis=0)
    # Grand total expressed as the sum of the per-row totals.
    grand_total = np.sum(np.sum(data, axis=1))
    return column_totals / grand_total

def selection_gene_ftest(path="./data/PBMC-A/finall.csv"):
    """Load a gene matrix from CSV and log-normalise it with scanpy.

    The file is presumably the output of an F-test based gene selection
    done outside this notebook -- confirm.

    Args:
        path: CSV file read with ``sc.read_csv``. Defaults to the PBMC-A
            file this notebook was written against.

    Returns:
        The ``.X`` matrix after per-cell normalisation to 1e4 counts and
        ``log1p``.
    """
    adata = sc.read_csv(path)
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    sc.pp.log1p(adata)
    return adata.X

def selection_gene(data,highly_genes):
    """Rank genes by binomial deviance, log-normalise, keep the top ones.

    Columns of ``data`` are reordered by decreasing binomial deviance
    (computed from the per-row totals and per-column abundances). The
    reordered matrix then goes through scanpy gene filtering
    (``min_cells=3``), per-cell normalisation to 1e4 counts and ``log1p``,
    and the first ``highly_genes`` columns of the result are returned.

    NOTE(review): the earlier code built a column ordering with
    ``np.lexsort`` but threw the sorted matrix away, so the ranking never
    took effect and the first columns in input order were returned. That
    discarded ordering was ascending; descending is used here on the
    assumption that the highest-deviance genes are the ones wanted --
    confirm.

    Args:
        data: 2-D count array (presumably cells x genes -- confirm).
        highly_genes: Number of leading columns to keep after ranking.

    Returns:
        The processed ``.X`` matrix truncated to ``highly_genes`` columns.
    """
    ni = UMI_cell(data)
    pai = abundant(data)
    de = binomial_deviance(data, pai, ni)

    # Put the deviance on top of the counts so the matrix dtype is promoted
    # together with it, then order columns by that first row, largest first.
    # A stable sort keeps the input order among equal scores.
    stacked = np.vstack((de, data))
    order = np.argsort(-stacked[0], kind="stable")
    data = stacked[1:, order]  # drop the helper deviance row

    adata = sc.AnnData(data)
    adata.var_names_make_unique()
    sc.pp.filter_genes(adata, min_cells=3)
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    sc.pp.log1p(adata)
    adata.raw = adata  # keep the processed state on .raw
    return adata.X[:, 0:highly_genes]


def reshapeX(data, side=40):
    """Turn each row of ``data`` into a single-channel square image.

    The matrix is cast to float32, the first ``side * side`` columns are
    kept, and every row is reshaped to ``(1, side, side)``.

    Args:
        data: 2-D array-like with at least ``side * side`` columns.
        side: Edge length of the square; the default 40 uses 1600 columns.

    Returns:
        float32 array of shape ``(rows, 1, side, side)``. Zero rows give an
        empty array of shape ``(0, 1, side, side)``.

    Raises:
        ValueError: If fewer than ``side * side`` columns are available.
    """
    data = np.array(data).astype('float32')
    n_features = side * side
    data = data[:, 0:n_features]
    n_samples, n_kept = data.shape
    # Same diagnostic as before: the number of features kept per row.
    print(n_kept)
    if n_kept < n_features:
        raise ValueError(
            "reshapeX needs at least %d features per row, got %d"
            % (n_features, n_kept)
        )
    return data.reshape((n_samples, 1, side, side))



def reshapeY(y):
    """Shift labels down by one and flatten a 2-D label array to 1-D.

    Args:
        y: 2-D array-like of numeric labels whose total size equals its
            number of rows (e.g. shape ``(rows, 1)``).

    Returns:
        1-D array of length ``rows`` holding ``y - 1``.
    """
    labels = np.array(y) - 1
    n_rows, _ = labels.shape
    return labels.reshape((n_rows,))






In [115]:
def preprocessing(a, y, highly_genes, method="ftest"):
    """Build image-shaped features and zero-based labels.

    Args:
        a: Count matrix. Used by the ``"hvg"`` and ``"deviance"`` methods;
            ignored by ``"ftest"``, which reads its own CSV file.
        y: 2-D label array passed to ``reshapeY``.
        highly_genes: Number of genes requested from the ``"hvg"`` and
            ``"deviance"`` methods; ignored by ``"ftest"``.
        method: Gene-selection strategy:
            ``"ftest"`` (default, the previously hard-wired choice) calls
            ``selection_gene_ftest``; ``"hvg"`` calls
            ``Selecting_highly_variable_genes``; ``"deviance"`` calls
            ``selection_gene``.

    Returns:
        Tuple ``(X, y)`` with ``X`` from ``reshapeX`` and ``y`` from
        ``reshapeY``.

    Raises:
        ValueError: If ``method`` is not one of the three names above.
    """
    if method == "ftest":
        data = selection_gene_ftest()
    elif method == "hvg":
        data = Selecting_highly_variable_genes(a, highly_genes)
    elif method == "deviance":
        data = selection_gene(a, highly_genes)
    else:
        raise ValueError(
            "method must be 'ftest', 'hvg' or 'deviance', got %r" % (method,)
        )

    # reshapeX already casts to float32 and keeps the leading columns, so no
    # separate cast/slice is needed here.
    X = reshapeX(data)
    y = reshapeY(y)

    return X, y


In [116]:
# Convert the loaded frames to plain ndarrays before preprocessing.
data = np.array(data)
y = np.array(y)

print(np.shape(data))
X, y = preprocessing(data, y, highly_genes=1601)

(704, 2000)
1600


  This is separate from the ipykernel package so we can avoid doing imports until


In [117]:
import pickle

# Save [X, y] to a .pkl file. The context manager closes the handle so the
# data is flushed to disk even if pickling raises.
with open('PBMC-A.pkl', 'wb') as fh:
    pickle.dump([X, y], fh)