In [6]:
import gc
import collections
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Load the full single-cell dataset from disk.
adata = sc.read_h5ad(
    '../data/scLevyAll.h5ad'
)
# Single-letter labels 'a'..'h'; a later cell maps names carrying one of these
# labels to the 'Control' condition.
batch = list('abcdefgh')

In [3]:
# Read the tab-separated per-cell metadata table (first row is the header,
# no column is used as the index) and attach it to `adata.obs`.
metadata = pd.read_csv('../data/scLevyAll_metadata.txt',sep='\t',header=0,index_col=False)

# Cell identifiers have the form '<PREFIX>_<CELL_BARCODE>'. Build them with one
# vectorised string concatenation instead of a Python loop with two Series
# lookups per row, which is very slow on large tables.
index = (metadata['PREFIX'] + '_' + metadata['CELL_BARCODE']).tolist()
metadata.index = index

# Select and reorder the metadata rows to match the cells in `adata`.
# `.loc` raises KeyError if any cell has no metadata row, so a mismatch fails
# loudly here rather than producing misaligned annotations.
metadata = metadata.loc[adata.obs.index]
adata.obs = metadata

In [4]:
# Load the donor cohort table (comma-separated; first row is the header and
# no column is used as the index). A later cell looks donors up in it by the
# 'Linking Donor ID' column.
cohort_info = pd.read_csv('../data/McleanLevy_Dropulation_Cohort.csv',header=0,index_col=False)

In [7]:
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects found. Presumably done to release memory held by
# temporaries from the loading cells above.
gc.collect()

472

In [8]:
# Derive per-cell annotations from the cell names. Each name is split on '_':
# the first field is taken as the cell type; the second field is either one of
# the labels in `batch` (recorded as condition 'Control') or is used verbatim
# as the condition.
names = adata.obs.index
control_batches = set(batch)  # hoisted so membership tests are O(1)
name_fields = [cell_name.split("_") for cell_name in names]
celltype = [fields[0] for fields in name_fields]
condition = [
    'Control' if fields[1] in control_batches else fields[1]
    for fields in name_fields
]

adata.obs['cell_type']=celltype
adata.obs['condition']=condition

##Downsample to 20%
# random_state=0 is scanpy's default; it is spelled out so the seed that makes
# this step reproducible is visible in the notebook.
sc.pp.subsample(adata, fraction=0.2, random_state=0)

##Only keep neuron cells
# Drop all non-neuronal cell types with a single mask and materialise the
# result with .copy(). The previous three chained slices left `adata` as a
# view of a view, so the layer assignment below wrote into a view and forced
# an implicit copy (with a warning).
non_neuronal = ['Astrocyte', 'iPSC', 'NPC']
adata = adata[~adata.obs['cell_type'].isin(non_neuronal)].copy()

gc.collect()
# Keep an untouched copy of the current matrix (still un-normalised at this
# point) in the "counts" layer for steps that need raw counts later.
adata.layers["counts"] = adata.X.copy()

In [9]:
##Select HVGs for training model
# Normalise each cell to a total of 1e4 and log1p-transform adata.X in place.
# The "counts" layer is not passed to these calls and is left as stored.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Drop genes detected in fewer than 500 cells, then snapshot the current
# (log-normalised, gene-filtered) object in .raw before subsetting to HVGs.
sc.pp.filter_genes(adata, min_cells=500)
adata.raw = adata

# The 'seurat_v3' flavor is run on the "counts" layer rather than on the
# log-normalised X; subset=True keeps only the 3000 selected genes in adata.
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', layer='counts', n_top_genes=3000, subset=True)

In [10]:
# Annotate every cell with its donor's clinical diagnosis, sex and age.
#
# Build a one-row-per-donor lookup table once and map donors through it,
# instead of rescanning `cohort_info` for every cell and indexing
# adata.obs['DONOR'] by integer position (a deprecated fallback on a
# string-labelled Series).
#   * rows without a donor ID are dropped so a missing DONOR can never match;
#   * keep='first' reproduces the previous "take the first matching row";
#   * donors absent from the cohort table map to NaN, as before.
donor_lookup = (
    cohort_info
    .dropna(subset=['Linking Donor ID'])
    .drop_duplicates(subset='Linking Donor ID', keep='first')
    .set_index('Linking Donor ID')
)

# Positional (0..n-1) Series of donor IDs, one entry per cell, in obs order.
donors = pd.Series(adata.obs['DONOR'].to_numpy())

cohort = pd.DataFrame({
    'Clinical': donors.map(donor_lookup['Clinical Diagnosis']),
    'Sex': donors.map(donor_lookup['Sex']),
    'Age': donors.map(donor_lookup['Age']),
})

# Assign raw arrays so values are placed by position, not aligned on index
# (`cohort` has a 0..n-1 index while adata.obs is indexed by cell name).
adata.obs['Clinical'] = cohort['Clinical'].values
adata.obs['Sex'] = cohort['Sex'].values
adata.obs['Age'] = cohort['Age'].values

In [11]:
# Integer indicator column: 1 for cells whose condition is 'Control', else 0.
is_control = (adata.obs['condition'] == 'Control').to_numpy()
controls = is_control.astype(int)
adata.obs['control'] = controls

In [12]:
# Write the processed object (subsampled, non-neuronal cell types removed,
# restricted to the selected HVGs, with the added obs columns) to disk.
adata.write("../data/scLevyAll_neuron20.h5ad")