In [13]:
import scanpy as sc
import pandas as pd 
import numpy as np
import anndata
import re
import h5py
import scipy.sparse as scs
import concurrent.futures
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csc_matrix
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

import umap
import random
import multiprocessing
random.seed(123)
from joblib import Parallel, delayed
import warnings
#warnings.filterwarnings('ignore')
import os


In [2]:
def read_mat(h5_con):
    mat = scs.csc_matrix(
        (h5_con['matrix']['data'][:], # Count values
         h5_con['matrix']['indices'][:], # Row indices
         h5_con['matrix']['indptr'][:]), # Pointers for column positions
        shape = tuple(h5_con['matrix']['shape'][:]) # Matrix dimensions
    )
    return mat


def read_obs(h5con):
    bc = h5con['matrix']['barcodes'][:]
    bc = [x.decode('UTF-8') for x in bc]

    # Initialized the DataFrame with cell barcodes
    obs_df = pd.DataFrame({ 'barcodes' : bc })

    # Get the list of available metadata columns
    obs_columns = h5con['matrix']['observations'].keys()

    # For each column
    for col in obs_columns:
        # Read the values
        values = h5con['matrix']['observations'][col][:]
        # Check for byte storage
        if(isinstance(values[0], (bytes, bytearray))):
            # Decode byte strings
            values = [x.decode('UTF-8') for x in values]
        # Add column to the DataFrame
        obs_df[col] = values
    
    return obs_df
# define a function to construct anndata object from a h5 file
def read_h5_anndata(h5_file):
    h5_con = h5py.File(h5_file, mode = 'r')
    # extract the expression matrix
    mat = read_mat(h5_con)
    # extract gene names
    genes = h5_con['matrix']['features']['name'][:]
    genes = [x.decode('UTF-8') for x in genes]
    # extract metadata
    obs_df = read_obs(h5_con)
    # construct anndata
    adata = anndata.AnnData(mat.T,
                             obs = obs_df)
    # make sure the gene names aligned
    adata.var_names = genes

    adata.var_names_make_unique()
    return adata
def get_last_pattern(inputstr):
    pattern = r"[^/]+(?=$)"
    match = re.search(pattern, inputstr)
    if match:
        return match.group(0)
    else:
        return ""
def process_file(file_name):
    adata = read_h5_anndata(file_name)
    output_file = 'h5ad_flu_after_qc/'+adata.obs['pbmc_sample_id'][0]+'.h5ad'
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"File {output_file} already exists and is not empty. Skipping processing.")
        return
    adata.obs=adata.obs.merge(meta_data[col_list], on='pbmc_sample_id', how='left')
    doublet_scores=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Doublet_Scores/'+adata.obs['pbmc_sample_id'][0]+'.csv', index_col=0)
    L1_labels=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Labels/'+adata.obs['pbmc_sample_id'][0]+'_L1_predicted_labels.csv', index_col=0)
    L2_labels=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Labels/'+adata.obs['pbmc_sample_id'][0]+'_L2_predicted_labels.csv', index_col=0)
    L3_labels=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Labels/'+adata.obs['pbmc_sample_id'][0]+'_L3_predicted_labels.csv', index_col=0)
    L3_5_labels=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Labels/'+adata.obs['pbmc_sample_id'][0]+'_L3.5_predicted_labels.csv', index_col=0)
    L1_labels.columns=[ 'barcodes', 'AIFI_L1']
    L2_labels.columns=[ 'barcodes', 'AIFI_L2']
    L3_labels.columns=[ 'barcodes', 'AIFI_L3']
    L3_5_labels.columns=[ 'barcodes', 'AIFI_L3.5']
    print(doublet_scores['barcodes'].tolist()==adata.obs['barcodes'].tolist(),
          L1_labels['barcodes'].tolist()==adata.obs['barcodes'].tolist(),
          L2_labels['barcodes'].tolist()==adata.obs['barcodes'].tolist(),
          L3_labels['barcodes'].tolist()==adata.obs['barcodes'].tolist(),
          L3_5_labels['barcodes'].tolist()==adata.obs['barcodes'].tolist())
    adata.obs= pd.merge(adata.obs, doublet_scores, on='barcodes', how='left')
    adata.obs= pd.merge(adata.obs, L1_labels, on='barcodes', how='left')
    adata.obs= pd.merge(adata.obs, L2_labels, on='barcodes', how='left')
    adata.obs= pd.merge(adata.obs, L3_labels, on='barcodes', how='left')
    adata.obs= pd.merge(adata.obs, L3_5_labels, on='barcodes', how='left')
    adata.obs.index=adata.obs['barcodes']
    adata.var["mito"] = adata.var_names.str.startswith("MT-")
    adata=adata[adata.obs['predicted_doublet']==False]
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)
    adata=adata[(adata.obs["pct_counts_mito"] <10) & (adata.obs["n_genes"] <5000) &(adata.obs["n_genes"] >200) ]
    adata.write_h5ad('h5ad_flu_after_qc/'+adata.obs['pbmc_sample_id'][0]+'.h5ad')

In [3]:
col_list=['subject.biologicalSex','subject.ethnicity', 'subject.partnerCode',
          'subject.race', 'subject.subjectGuid', 'cohort.cohortGuid', 'sample.visitName', 
          'sample.visitDetails', 'subject.birthYear','CMV.IgG.Serology.Result.Interpretation', 
          'BMI','pbmc_sample_id']

In [2]:
meta_data=pd.read_csv('/home/jupyter/IH-A-Aging-Analysis-Notebooks/Qiuyu-Notebooks/scRNA/hise_meta_data_2024-01-23_fixed.csv')

# Read all h5 and split into individual H5

In [5]:
def load_file(file_name):
    try:
        result = sc.read_h5ad("/home/jupyter/BRI_Analysis/h5_cleaned_by_sample/" + file_name + '.h5ad')
        return result
    except Exception as e:
        print(f'Error reading {file_name}: {e}')
        return None

In [6]:
%%time
file_names= meta_data["pbmc_sample_id"].tolist()
h5_list = []
with ThreadPoolExecutor(max_workers=60) as executor:
    future_to_file = {executor.submit(load_file, file_name): file_name for file_name in file_names}
    for future in tqdm(as_completed(future_to_file), total=len(file_names)):
        result = future.result()
        if result is not None:
            h5_list.append(result)

100% 868/868 [32:34<00:00,  2.25s/it]  

CPU times: user 2min 40s, sys: 3min 28s, total: 6min 9s
Wall time: 32min 35s





In [7]:
def subset_adata(adata, celltype):
    adata_subset = adata[adata.obs['AIFI_L3'] == celltype].copy()
    return adata_subset

In [34]:
if h5_list[9].obs['AIFI_L3'].nunique() == 71:
    for i in range(0,71):
        celltypes_to_process = [h5_list[9].obs['AIFI_L3'].unique()[i]]
        adata_list = []
        print(celltypes_to_process[0])

        with ThreadPoolExecutor(max_workers=60) as executor:
            future_to_adata = {executor.submit(subset_adata, adata_file, celltype): adata_file for celltype in celltypes_to_process for adata_file in h5_list}
            for future in tqdm(as_completed(future_to_adata), total=len(future_to_adata)):
                result = future.result()
                if result is not None:
                    adata_list.append(result)
        combined= anndata.concat(adata_list)
        combined.write_h5ad('/home/jupyter/BRI_Analysis/h5_by_celltype/'+celltypes_to_process[0]+'.h5ad')

cDC1


  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  if not is

FileNotFoundError: [Errno 2] Unable to create file (unable to open file: name = '/home/jupyter/BRI_Analysis/h5_by_celltype/cDC1.h5ad', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 242)

In [20]:
h5_list[22].obs['AIFI_L3'].nunique()

70

In [30]:
h5_list[25].obs['AIFI_L3'].nunique() == 71

False

In [24]:
len(h5_list)

868

In [31]:
for i, item in enumerate(h5_list):
    if item.obs['AIFI_L3'].nunique() == 71:
        print(f"Index {i} satisfies the condition.")


Index 9 satisfies the condition.
Index 21 satisfies the condition.
Index 32 satisfies the condition.
Index 60 satisfies the condition.
Index 72 satisfies the condition.
Index 79 satisfies the condition.
Index 122 satisfies the condition.
Index 132 satisfies the condition.
Index 159 satisfies the condition.
Index 175 satisfies the condition.
Index 191 satisfies the condition.
Index 199 satisfies the condition.
Index 259 satisfies the condition.
Index 267 satisfies the condition.
Index 277 satisfies the condition.
Index 297 satisfies the condition.
Index 318 satisfies the condition.
Index 322 satisfies the condition.
Index 362 satisfies the condition.
Index 370 satisfies the condition.
Index 379 satisfies the condition.
Index 380 satisfies the condition.
Index 404 satisfies the condition.
Index 405 satisfies the condition.
Index 439 satisfies the condition.
Index 447 satisfies the condition.
Index 458 satisfies the condition.
Index 460 satisfies the condition.
Index 464 satisfies the con

# Other method

In [14]:
import scanpy as sc
import pandas as pd 
import numpy as np
import anndata
import re
import h5py
import scipy.sparse as scs
import concurrent.futures
import seaborn as sns
from scipy.sparse import csc_matrix
import random
random.seed(123)
from joblib import Parallel, delayed
from concurrent.futures import ThreadPoolExecutor, as_completed


In [5]:
meta_data.columns.tolist()

['Unnamed: 0',
 'X',
 'lastUpdated',
 'sample.id',
 'sample.bridgingControl',
 'sample.sampleKitGuid',
 'sample.visitName',
 'sample.visitDetails',
 'sample.drawDate',
 'sample.daysSinceFirstVisit',
 'file.id',
 'file.name',
 'file.batchID',
 'file.panel',
 'file.pool',
 'file.fileType',
 'file.majorVersion',
 'subject.id',
 'subject.biologicalSex',
 'subject.birthYear',
 'subject.ethnicity',
 'subject.partnerCode',
 'subject.race',
 'subject.subjectGuid',
 'cohort.cohortGuid',
 'sample.diseaseStatesRecordedAtVisit',
 'pbmc_sample_id',
 'Sample_ID',
 'Sample_Draw_date',
 'Covid_Dose_1',
 'Covid_Dose_2',
 'Covid_exlcusion',
 'CMV.IgG.Serology.Result.Interpretation',
 'BMI',
 'file.name.downloaded',
 'file.path']

In [6]:
meta_data['sample.visitName'].unique()

array(['Flu Year 1 Day 0', 'Flu Year 1 Day 7', 'Flu Year 1 Day 90',
       'Immune Variation Day 0', 'Immune Variation Day 7',
       'Immune Variation Day 90', 'Flu Year 2 Stand-Alone',
       'Flu Year 1 Stand-Alone', 'Flu Year 2 Day 0', 'Flu Year 2 Day 7',
       'Flu Year 2 Day 90', 'Flu Year 3 Stand-Alone'], dtype=object)

In [4]:
meta_data=pd.read_csv('/home/jupyter/IH-A-Aging-Analysis-Notebooks/Qiuyu-Notebooks/scRNA/hise_meta_data_2024-01-23_fixed.csv')
meta_data=meta_data[meta_data['Covid_exlcusion']=='no']
TimePoints=['Flu Year 1 Day 0','Flu Year 1 Day 7', 'Flu Year 1 Day 90', 'Flu Year 2 Day 0', 'Flu Year 2 Day 7', 'Flu Year 2 Day 90' ]
meta_data_subset=meta_data[meta_data['sample.visitName'].isin(TimePoints)]

In [7]:
meta_data_subset['sample.visitName'].unique()

array(['Flu Year 1 Day 0', 'Flu Year 1 Day 7', 'Flu Year 1 Day 90',
       'Flu Year 2 Day 0', 'Flu Year 2 Day 7', 'Flu Year 2 Day 90'],
      dtype=object)

In [None]:
from tqdm import tqdm
h5_list = []
for i in tqdm(range(len(meta_data_subset["pbmc_sample_id"].tolist()))):
    try:
        result = sc.read_h5ad("/home/jupyter/BRI_Analysis/h5_cleaned_by_sample/"+meta_data_subset["pbmc_sample_id"].tolist()[i]+'.h5ad')
        h5_list.append(result)
        print()
    except Exception as e:
        print(f'Error reading {file_name}: {e}')

  0% 1/512 [00:01<14:18,  1.68s/it]




  0% 2/512 [00:03<14:31,  1.71s/it]




  1% 3/512 [00:05<14:43,  1.74s/it]




  1% 4/512 [00:07<16:12,  1.91s/it]




  1% 5/512 [00:09<17:22,  2.06s/it]




  1% 6/512 [00:11<17:04,  2.02s/it]




  1% 7/512 [00:13<16:31,  1.96s/it]




  2% 8/512 [00:14<15:15,  1.82s/it]




  2% 9/512 [00:16<14:49,  1.77s/it]




  2% 10/512 [00:18<14:26,  1.73s/it]




  2% 11/512 [00:19<14:07,  1.69s/it]




  2% 12/512 [00:21<14:55,  1.79s/it]




  3% 13/512 [00:23<14:17,  1.72s/it]




  3% 14/512 [00:24<13:36,  1.64s/it]




  3% 15/512 [00:27<14:56,  1.80s/it]




  3% 16/512 [00:28<14:21,  1.74s/it]




  3% 17/512 [00:30<14:28,  1.76s/it]




  4% 18/512 [00:32<14:52,  1.81s/it]




  4% 19/512 [00:33<14:04,  1.71s/it]




  4% 20/512 [00:35<13:42,  1.67s/it]




  4% 21/512 [00:37<13:40,  1.67s/it]




  4% 22/512 [00:39<14:20,  1.76s/it]




  4% 23/512 [00:41<16:05,  1.97s/it]




  5% 24/512 [00:44<17:23,  2.14s/it]




  5% 25/512 [00:45<16:20,  2.01s/it]




  5% 26/512 [00:48<16:50,  2.08s/it]




  5% 27/512 [00:50<16:50,  2.08s/it]




  5% 28/512 [00:52<18:22,  2.28s/it]




  6% 29/512 [00:54<16:55,  2.10s/it]




  6% 30/512 [00:56<17:28,  2.17s/it]




  6% 31/512 [00:58<17:07,  2.14s/it]




  6% 32/512 [01:00<16:35,  2.07s/it]




  6% 33/512 [01:02<16:31,  2.07s/it]




  7% 34/512 [01:05<16:47,  2.11s/it]




  7% 35/512 [01:07<18:02,  2.27s/it]




  7% 36/512 [01:09<16:05,  2.03s/it]




  7% 37/512 [01:11<16:21,  2.07s/it]




  7% 38/512 [01:13<15:29,  1.96s/it]




  8% 39/512 [01:15<16:16,  2.06s/it]




  8% 40/512 [01:17<15:55,  2.02s/it]




  8% 41/512 [01:19<15:37,  1.99s/it]




  8% 42/512 [01:20<13:46,  1.76s/it]




  8% 43/512 [01:22<14:45,  1.89s/it]




  9% 44/512 [01:23<13:17,  1.70s/it]




  9% 45/512 [01:25<13:33,  1.74s/it]




  9% 46/512 [01:27<13:39,  1.76s/it]




  9% 47/512 [01:29<13:41,  1.77s/it]




  9% 48/512 [01:31<14:57,  1.93s/it]




 10% 49/512 [01:33<15:11,  1.97s/it]




 10% 50/512 [01:36<15:55,  2.07s/it]




 10% 51/512 [01:37<15:20,  2.00s/it]




 10% 52/512 [01:40<16:29,  2.15s/it]




 10% 53/512 [01:42<16:57,  2.22s/it]




 11% 54/512 [01:44<16:44,  2.19s/it]




 11% 55/512 [01:46<16:13,  2.13s/it]




 11% 56/512 [01:48<15:05,  1.98s/it]




 11% 57/512 [01:50<15:47,  2.08s/it]




 11% 58/512 [01:52<14:57,  1.98s/it]




 12% 59/512 [01:54<13:45,  1.82s/it]




 12% 60/512 [01:55<13:09,  1.75s/it]




 12% 61/512 [01:57<12:36,  1.68s/it]




 12% 62/512 [01:59<13:08,  1.75s/it]




 12% 63/512 [02:00<13:07,  1.75s/it]




 12% 64/512 [02:02<13:32,  1.81s/it]




 13% 65/512 [02:04<13:51,  1.86s/it]




 13% 66/512 [02:06<13:54,  1.87s/it]




 13% 67/512 [02:08<13:35,  1.83s/it]




 13% 68/512 [02:10<13:50,  1.87s/it]




 13% 69/512 [02:12<14:38,  1.98s/it]




 14% 70/512 [02:14<13:59,  1.90s/it]




 14% 71/512 [02:15<13:05,  1.78s/it]




 14% 72/512 [02:17<12:39,  1.73s/it]




 14% 73/512 [02:19<13:32,  1.85s/it]




 14% 74/512 [02:21<13:38,  1.87s/it]




 15% 75/512 [02:23<13:28,  1.85s/it]




 15% 76/512 [02:24<13:08,  1.81s/it]




 15% 77/512 [02:26<13:01,  1.80s/it]




 15% 78/512 [02:29<14:31,  2.01s/it]




 15% 79/512 [02:30<13:53,  1.92s/it]




 16% 80/512 [02:32<13:48,  1.92s/it]




 16% 81/512 [02:35<14:28,  2.02s/it]




 16% 82/512 [02:37<15:26,  2.15s/it]




 16% 83/512 [02:39<14:33,  2.04s/it]




 16% 84/512 [02:41<14:23,  2.02s/it]




 17% 85/512 [02:43<15:08,  2.13s/it]




 17% 86/512 [02:45<14:06,  1.99s/it]




 17% 87/512 [02:46<13:15,  1.87s/it]




 17% 88/512 [02:49<13:44,  1.94s/it]




 17% 89/512 [02:50<11:48,  1.67s/it]




 18% 90/512 [02:52<12:15,  1.74s/it]




 18% 91/512 [02:53<12:13,  1.74s/it]




 18% 92/512 [02:55<11:57,  1.71s/it]




 18% 93/512 [02:56<11:19,  1.62s/it]




 18% 94/512 [02:58<11:25,  1.64s/it]




 19% 95/512 [03:00<11:34,  1.67s/it]




 19% 96/512 [03:01<11:34,  1.67s/it]




 19% 97/512 [03:03<11:21,  1.64s/it]




 19% 98/512 [03:05<11:57,  1.73s/it]




 19% 99/512 [03:07<12:33,  1.82s/it]




 20% 100/512 [03:09<12:23,  1.80s/it]




 20% 101/512 [03:10<11:36,  1.69s/it]




 20% 102/512 [03:12<12:21,  1.81s/it]




 20% 103/512 [03:14<11:41,  1.72s/it]




 20% 104/512 [03:16<12:10,  1.79s/it]




 21% 105/512 [03:17<12:02,  1.78s/it]




 21% 106/512 [03:19<11:54,  1.76s/it]




 21% 107/512 [03:21<12:20,  1.83s/it]




 21% 108/512 [03:23<12:26,  1.85s/it]




 21% 109/512 [03:24<11:09,  1.66s/it]




 21% 110/512 [03:25<10:15,  1.53s/it]




 22% 111/512 [03:26<08:55,  1.33s/it]




 22% 112/512 [03:28<09:04,  1.36s/it]




 22% 113/512 [03:29<09:35,  1.44s/it]




 22% 114/512 [03:31<09:40,  1.46s/it]




 22% 115/512 [03:33<10:52,  1.64s/it]




 23% 116/512 [03:35<11:32,  1.75s/it]




 23% 117/512 [03:37<11:44,  1.78s/it]




 23% 118/512 [03:39<11:46,  1.79s/it]




 23% 119/512 [03:41<11:53,  1.82s/it]




 23% 120/512 [03:42<11:57,  1.83s/it]




 24% 121/512 [03:44<12:13,  1.88s/it]




 24% 122/512 [03:46<11:28,  1.77s/it]




 24% 123/512 [03:48<12:38,  1.95s/it]




 24% 124/512 [03:49<10:59,  1.70s/it]




 24% 125/512 [03:52<11:49,  1.83s/it]




 25% 126/512 [03:54<13:29,  2.10s/it]




 25% 127/512 [03:55<11:17,  1.76s/it]




 25% 128/512 [03:56<09:21,  1.46s/it]




 25% 129/512 [03:57<09:20,  1.46s/it]




 25% 130/512 [04:00<11:07,  1.75s/it]




 26% 131/512 [04:02<11:37,  1.83s/it]




 26% 132/512 [04:04<13:04,  2.06s/it]




 26% 133/512 [04:07<13:33,  2.15s/it]




 26% 134/512 [04:09<13:00,  2.06s/it]




 26% 135/512 [04:10<11:49,  1.88s/it]




 27% 136/512 [04:12<11:46,  1.88s/it]




 27% 137/512 [04:14<11:03,  1.77s/it]




 27% 138/512 [04:16<11:58,  1.92s/it]




 27% 139/512 [04:18<12:08,  1.95s/it]




 27% 140/512 [04:19<11:05,  1.79s/it]




 28% 141/512 [04:22<12:05,  1.95s/it]




 28% 142/512 [04:24<12:28,  2.02s/it]




 28% 143/512 [04:26<12:23,  2.01s/it]




 28% 144/512 [04:27<11:37,  1.89s/it]




 28% 145/512 [04:29<10:21,  1.69s/it]




 29% 146/512 [04:31<10:53,  1.79s/it]




 29% 147/512 [04:33<11:14,  1.85s/it]




 29% 148/512 [04:35<11:31,  1.90s/it]




 29% 149/512 [04:37<12:18,  2.04s/it]




 29% 150/512 [04:40<13:27,  2.23s/it]




 29% 151/512 [04:41<12:22,  2.06s/it]




 30% 152/512 [04:44<12:42,  2.12s/it]




 30% 153/512 [04:46<12:59,  2.17s/it]




 30% 154/512 [04:48<13:05,  2.19s/it]




 30% 155/512 [04:51<13:28,  2.26s/it]




 30% 156/512 [04:52<12:37,  2.13s/it]




 31% 157/512 [04:54<12:22,  2.09s/it]




 31% 158/512 [04:57<13:14,  2.24s/it]




 31% 159/512 [04:59<12:16,  2.09s/it]




 31% 160/512 [05:00<11:14,  1.91s/it]




 31% 161/512 [05:03<11:57,  2.04s/it]




 32% 162/512 [05:05<12:04,  2.07s/it]




 32% 163/512 [05:06<11:09,  1.92s/it]




 32% 164/512 [05:08<11:29,  1.98s/it]




 32% 165/512 [05:11<12:08,  2.10s/it]

In [None]:
len(h5_list)

In [None]:
%%time
combined= anndata.concat(h5_list)


In [None]:
combined

In [None]:
combined.write_h5ad('/home/jupyter/Myeloid_cells/files/All_Flu_timepoints_raw.h5ad')

In [16]:
combined=sc.read_h5ad('/home/jupyter/Myeloid_cells/files/All_Flu_timepoints_raw.h5ad')

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = '/home/jupyter/Myeloid_cells/files/All_Flu_timepoints_raw.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [10]:
combined

AnnData object with n_obs × n_vars = 1939058 × 33538
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'seurat_pbmc_type', 'seurat_pbmc_type_score', 'umap_1', 'umap_2', 'well_id', 'subject.biologicalSex', 'subject.ethnicity', 'subject.partnerCode', 'subject.race', 'subject.subjectGuid', 'cohort.cohortGuid', 'sample.visitName', 'sample.visitDetails', 'subject.birthYear', 'CMV.IgG.Serology.Result.Interpretation', 'BMI', 'predicted_doublet', 'doublet_score', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3', 'AIFI_L3.5', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito'

In [11]:
b_cells_subset = combined.obs[combined.obs['AIFI_L1'] == 'B cell']
b_cells_subset

Unnamed: 0_level_0,barcodes,batch_id,cell_name,cell_uuid,chip_id,hto_barcode,hto_category,n_genes,n_mito_umis,n_reads,...,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,total_counts_mito,log1p_total_counts_mito,pct_counts_mito
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cfa6417648b611ea8957bafe6d70929e,cfa6417648b611ea8957bafe6d70929e,B001,gasolinic_vocational_adouri,cfa6417648b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,1650,280,15119,...,7.409136,4635.0,8.441607,35.102481,49.751888,60.474649,74.822006,280.0,5.638355,6.040993
cfad979648b611ea8957bafe6d70929e,cfad979648b611ea8957bafe6d70929e,B001,subdermic_hapless_leopard,cfad979648b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,1582,171,14574,...,7.367077,4295.0,8.365439,36.437718,49.592549,60.000000,74.807916,171.0,5.147494,3.981374
cfc2364248b611ea8957bafe6d70929e,cfc2364248b611ea8957bafe6d70929e,B001,flavorous_sterling_bactrian,cfc2364248b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,1728,310,20620,...,7.455298,6155.0,8.725183,39.561332,56.669374,66.547522,78.375305,310.0,5.739793,5.036556
70a8e30448b611eaa65d9e7d578d66f2,70a8e30448b611eaa65d9e7d578d66f2,B001,crumby_mini_genet,70a8e30448b611eaa65d9e7d578d66f2,B001-P1C1,TGATGGCCTATTGGG,singlet,2583,449,37365,...,7.857094,10503.0,9.259511,38.636580,53.956013,63.362849,74.635818,449.0,6.109248,4.274969
70aff80648b611eaa65d9e7d578d66f2,70aff80648b611eaa65d9e7d578d66f2,B001,clownish_like_kiskadee,70aff80648b611eaa65d9e7d578d66f2,B001-P1C1,TGATGGCCTATTGGG,singlet,2492,338,31035,...,7.821242,8950.0,9.099521,37.977654,53.173184,62.458101,73.407821,338.0,5.826000,3.776536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9f5d30e09a2711ed8e94f29eefb82f83,9f5d30e09a2711ed8e94f29eefb82f83,B148,chosen_uncleansed_monarch,9f5d30e09a2711ed8e94f29eefb82f83,B148-P2C3,TGTCTTTCCTGCCAG,singlet,1329,108,13718,...,7.192934,4622.0,8.438799,47.490264,62.115967,71.332756,82.064042,108.0,4.691348,2.336651
9f6f44b09a2711ed8e94f29eefb82f83,9f6f44b09a2711ed8e94f29eefb82f83,B148,wool_simple_lobo,9f6f44b09a2711ed8e94f29eefb82f83,B148-P2C3,TGTCTTTCCTGCCAG,singlet,1311,103,11260,...,7.179308,3735.0,8.225771,40.240964,53.895582,63.935743,78.286479,103.0,4.644391,2.757697
9f2484249a2811edaf6b121d2657380a,9f2484249a2811edaf6b121d2657380a,B148,lionesque_ghoulish_cob,9f2484249a2811edaf6b121d2657380a,B148-P2C3,TGTCTTTCCTGCCAG,singlet,1665,338,20192,...,7.418181,5470.0,8.607217,39.451554,54.661792,64.643510,77.111517,338.0,5.826000,6.179159
1e9efd729a2b11ed977ca669b8cd6cb0,1e9efd729a2b11ed977ca669b8cd6cb0,B148,dissonant_theocratic_tragopan,1e9efd729a2b11ed977ca669b8cd6cb0,B148-P2C3,TGTCTTTCCTGCCAG,singlet,1109,118,13545,...,7.012115,3767.0,8.234300,49.243430,63.074064,72.365277,83.833289,118.0,4.779123,3.132466


In [12]:
b_cells_subset['AIFI_L1'].unique()

['B cell']
Categories (9, object): ['B cell', 'DC', 'Erythrocyte', 'ILC', ..., 'NK cell', 'Platelet', 'Progenitor cell', 'T cell']

In [15]:
b_cells_subset['AIFI_L3'].unique()

['CD95 memory B cell', 'Transitional B cell', 'Core memory B cell', 'Activated memory B cell', 'CD27- effector B cell', ..., 'Platelet', 'Memory CD4 Treg', 'DN T cell', 'CMP cell', 'CD14+ cDC2']
Length: 43
Categories (71, object): ['ASDC', 'Activated memory B cell', 'Adaptive NK cell', 'BaEoMaP cell', ..., 'Transitional B cell', 'Type 2 polarized memory B cell', 'cDC1', 'pDC']

In [21]:
x = b_cells_subset['AIFI_L3'].value_counts()
x.to_csv("test.csv")

In [5]:
combined.to.csv("/home/jupyter/Myeloid_cells/files/AIFI_fluYr1Yr2_raw.csv")

AttributeError: 'AnnData' object has no attribute 'to'

In [20]:
# Convert the file

In [12]:
combined.obs.to_parquet("/home/jupyter/Myeloid_cells/files/All_Flu_timepoints_raw.parquet")

In [15]:
import pandas as pd
df = pd.read_parquet('/home/jupyter/Myeloid_cells/files/All_Flu_timepoints_raw.parquet')
df.to_csv('/home/jupyter/Myeloid_cells/files/All_Flu_timepoints_raw.csv')

In [16]:
len(df)

8277628

In [7]:
combined.obs.index=combined.obs['barcodes']

In [8]:
fitlered_gene=pd.DataFrame()
for i in combined.obs['AIFI_L3'].unique():
    print(i)
    adata_subset=combined[combined.obs['AIFI_L3']==i]
    sc.pp.filter_genes(adata_subset, min_cells=round(np.shape(adata_subset.X)[0]*0.1))
    gene_list=pd.DataFrame(list(adata_subset.var.index))
    gene_list.columns=['gene']
    gene_list['AIFI_L3']=i
    fitlered_gene=pd.concat([fitlered_gene,gene_list])


cDC1


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


KLRF1- effector Vd1 gdT


  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ISG+ CD16 monocyte


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CD14+ cDC2


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


HLA-DRhi cDC2


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


GZMK+ Vd2 gdT


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CD8aa


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


GZMK+ CD56dim NK cell


  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


C1Q+ CD16 monocyte


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CD95 memory B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


GZMK- CD27+ EM CD8 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


KLRF1+ GZMB+ CD27- EM CD8 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


GZMK- CD56dim NK cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Memory CD8 Treg
Proliferating NK cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CD4 MAIT


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Intermediate monocyte


  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CMP cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


SOX4+ Vd1 gdT
KLRB1+ memory CD8 Treg


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


pDC


  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Erythrocyte


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Core CD14 monocyte


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


GZMB+ Vd2 gdT


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Platelet


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ILC
GZMB- CD27+ EM CD4 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


GZMK+ memory CD4 Treg


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Transitional B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CD56bright NK cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Core memory B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


KLRB1+ memory CD4 Treg


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CM CD4 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


GZMK+ CD27+ EM CD8 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Activated memory B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


IL1B+ CD14 monocyte


  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


SOX4+ naive CD4 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


BaEoMaP cell
KLRF1+ effector Vd1 gdT


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CD27- effector B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ISG+ CD56dim NK cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ISG+ naive CD8 T cell
ISG+ CD14 monocyte


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ISG+ naive B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CD8 MAIT


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CM CD8 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Core CD16 monocyte


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Core naive CD8 T cell 


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ISG+ naive CD4 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Proliferating T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Plasma cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CD27+ effector B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


SOX4+ naive CD8 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Core naive CD4 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ISG+ memory CD8 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


Naive Vd1 gdT


  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Core naive B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


CLP cell
KLRF1- GZMB+ CD27- EM CD8 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Naive CD4 Treg


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


KLRF1- GZMB+ CD27- memory CD4 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Early memory B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):


GZMB- CD27- EM CD4 T cell


  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ISG+ memory CD4 T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ISG+ MAIT
Memory CD4 Treg


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):
  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Adaptive NK cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ISG+ cDC2


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


Type 2 polarized memory B cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


DN T cell


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


ASDC


  if not is_categorical_dtype(df_full[k]):
  adata.var['n_cells'] = number
  if not is_categorical_dtype(df_full[k]):


In [9]:
fitlered_gene.to_csv('filtered_gene_list.csv')