In [1]:
# Import packages 
import pandas as pd
import numpy as np
from scipy import stats
import random
import scanpy as sc
from tqdm import tqdm

In [2]:
# Load PFC data
adata = sc.read_h5ad('RNA_240201_PFC_13samples_normalized_doublet_82510cells.h5ad')

In [41]:
# Create list of cell types in the dataset
cell_type_list = adata.obs.subclass_name.unique().tolist()
cell_type_list

['Oligodendrocyte',
 'Astrocyte',
 'Granule',
 'OPC',
 'Microglia',
 'Molecular Layer Interneuron Type 1',
 'Molecular Layer Neuron Type 2',
 'OPC? or Granule',
 'Parenchymal Astrocyte',
 'Interneuron',
 'Oligodendrocyte/White matter',
 'Doublets?']

In [46]:
# Function to create psuedobulk files to read into R 

# Change depending on brain region for file save purposes
brain_region = 'PFC'

for cell_type in tqdm(cell_type_list):
    print('Collecting data for cell type '+cell_type+'...')
    # Create cell type subset
    cell_subset = adata[adata.obs['subclass_name'] == cell_type]
    # print(cell_subset)

    # Get cell count per donor
    cell_counts = cell_subset.obs.groupby('donor').size()
    include_list = list(cell_counts[cell_counts >= 3].index)
    # Only include donors with at least 3 cells of that type
    cell_subset = cell_subset[cell_subset.obs['donor'].isin(include_list)]

    # Create psueodbulk for that cell type
    pbs = [] # psuedobulk
    for sample in cell_subset.obs.donor.unique():
        samp_cell_subset = cell_subset[cell_subset.obs['donor'] == sample]
    
        samp_cell_subset.X = samp_cell_subset.layers['RNA_raw'] #using raw data
    
        rep_adata = sc.AnnData(X = samp_cell_subset.X.sum(axis = 0),
                               var = samp_cell_subset.var['features'].values)
    
        rep_adata.obs_names = [sample]
        rep_adata.obs['condition'] = samp_cell_subset.obs['condition'].iloc[0]
        rep_adata.obs['age'] = samp_cell_subset.obs['age'].iloc[0]
        rep_adata.obs['sex'] = samp_cell_subset.obs['sex'].iloc[0]
        rep_adata.obs['donor'] = samp_cell_subset.obs['donor'].iloc[0]
        rep_adata.obs['cell_count'] = cell_counts
    
        pbs.append(rep_adata)
        # print(pbs)
        
    pb = sc.concat(pbs)
    counts_array = np.asarray(pb.X)

    counts_df = pd.DataFrame(counts_array.T, columns=pb.obs['donor'].values, index=cell_subset.var['features'].values)
    # display(counts_df)
    metadata = pb.obs
    # display(metadata)

    # Save files
    # Need to modify because some cell types have '/' in their names
    modified_cell_name = cell_type.replace('/', '-')
    
    counts_df.to_csv('pb_files_2/'+str(modified_cell_name)+'_psuedobulk_'+brain_region+'_022224.csv')
    metadata.to_csv('pb_files_2/'+str(modified_cell_name)+'_metadata_'+brain_region+'_022224.csv')
    

  0%|          | 0/12 [00:00<?, ?it/s]

Collecting data for cell type Oligodendrocyte...


  cell_counts = cell_subset.obs.groupby('donor').size()
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_

KeyError: 'features'