# ALTRA data scRNA pseudobulk in Python - certPro deep clean data
## Make pseudobulk data by AIFI L3 labels for all aims

In [1]:
import pandas as pd
import os
import scanpy as sc
import decoupler as dc

In [3]:
# sc.settings.n_jobs = 58

In [4]:
# Colour palettes for plotting (hex codes with an alpha channel).
# NEJM-style palette: 8 colours.
nejm_color = [
    "#BC3C29FF",
    "#0072B5FF",
    "#E18727FF",
    "#20854EFF",
    "#7876B1FF",
    "#6F99ADFF",
    "#FFDC91FF",
    "#EE4C97FF",
]
# JAMA-style palette: 7 colours.
jama_color = [
    "#374E55FF",
    "#DF8F44FF",
    "#00A1D5FF",
    "#B24745FF",
    "#79AF97FF",
    "#6A6599FF",
    "#80796BFF",
]

In [5]:
# Project name: used as the file-name prefix for everything this notebook writes.
proj_name = "ALTRA_scRNA_AIFI_L3_certpro_pseudobulk_"

# Working directories. Each ends with a trailing slash because file names are
# appended to them by plain string concatenation further down.
data_path = "/home/jupyter/data/ra_longitudinal/scrna/cache/"
meta_path = "/home/jupyter/github/ra-longitudinal/metadata/"
fig_path = "/home/jupyter/data/ra_longitudinal/figures/deepclean/"
output_path = "/home/jupyter/data/ra_longitudinal/scrna/certPro/"
# One-time setup, left disabled (only needed if the folders do not exist yet):
# os.mkdir(fig_path)
# os.mkdir(output_path)

# scanpy figure settings: figures go to fig_path, are not auto-saved, and are
# exported at 300 dpi with vector-friendly rendering.
sc.settings.figdir = fig_path
sc.settings.autosave = False
sc.set_figure_params(vector_friendly=True, dpi_save=300)

In [6]:
# Import the project's helper functions.
# The helper module lives in the repository's scRNA folder, so that folder is
# appended to the module search path before importing from it.
import sys

sys.path.append('/home/jupyter/github/ra-longitudinal/scRNA/')

from ALTRA_scRNA_py_helper_functions import generate_pseudobulk_data

# make pseudobulk object based on the cell types and samples

In [7]:
# Load the deep-clean data: the combined single-cell AnnData object stored in
# the certPro folder.
combined_adata_file = '/home/jupyter/data/ra_longitudinal/scrna/certPro/ALTRA_certPro_scRNA_141_samples_combined_adata.h5ad'
joint_adata_fl = sc.read_h5ad(combined_adata_file)



In [8]:
# Show the AnnData summary (dimensions and available obs columns) of the loaded object.
joint_adata_fl

AnnData object with n_obs × n_vars = 2059581 × 33538
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'doublet_score', 'AIFI_L1', 'AIFI_L1_score', 'AIFI_L2', 'AIFI_L2_score', 'AIFI_L3', 'AIFI_L3_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden_harmony_2', 'doublets_manual', 'AIFI_L3_new', 'S

In [9]:
# Peek at the two sample identifiers carried by each cell: the PBMC sample ID
# and the sample kit GUID (the latter is the key used for pseudobulking below).
joint_adata_fl.obs.loc[:, ['pbmc_sample_id', 'sample.sampleKitGuid']]

Unnamed: 0,pbmc_sample_id,sample.sampleKitGuid
0,PB00118-01,KT00118
1,PB00416-01,KT00416
2,PB00103-01,KT00103
3,PB02957-001,KT02957
4,PB00118-01,KT00118
...,...,...
2059576,PB00052-02,KT00052
2059577,PB00087-01,KT00087
2059578,PB00087-01,KT00087
2059579,PB00052-02,KT00052


In [10]:
# Double-check that raw counts are stored in X by densifying a small
# 20 x 20 window (rows 100-119, columns 500-519) of the sparse matrix;
# raw counts should show up as non-negative integers.
joint_adata_fl.X[100:120, 500:520].toarray()

array([[0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 3, 0],
       [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 4, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 6, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0],
       [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0,

In [11]:
# obs column holding the cell-type labels used for pseudobulking
cell_type_col = 'AIFI_L3_new'
# Number of cells per cell-type label.
# observed=False is passed explicitly: it keeps the current behaviour (every
# category is listed) and avoids the pandas FutureWarning about the changing
# default of `observed` when grouping on a categorical column.
joint_adata_fl.obs.groupby(cell_type_col, observed=False).size()

  joint_adata_fl.obs.groupby(cell_type_col).size()


AIFI_L3_new
ASDC                                353
ASDC_uk1_B                          164
Activated memory B cell             350
Activated memory B cell_uk1          32
Adaptive NK cell                  14495
                                  ...  
T2MBC_uk1                            47
Transitional B cell               15358
Type 2 polarized memory B cell     3267
cDC1                                894
pDC                                5579
Length: 89, dtype: int64

In [12]:
# clean up the cell type names from R to python
# joint_adata_fl.obs[cell_type_col] = [ct.replace(" ", "_") for ct in joint_adata_fl.obs[cell_type_col]]

In [13]:
# Build the pseudobulk object with decoupler: counts are summed per
# (sample kit, cell type) pair. The min_cells / min_counts thresholds are
# passed straight to decoupler, which uses them to filter out low-quality
# sample/cell-type profiles (see the decoupler documentation for exact semantics).
adata_pb = dc.get_pseudobulk(
    joint_adata_fl,
    sample_col="sample.sampleKitGuid",
    groups_col=cell_type_col,
    mode='sum',
    min_cells=10,
    min_counts=1000,
)

In [14]:
# Cast every obs column of the pseudobulk object to string.
# NOTE(review): this also stringifies any numeric obs columns; presumably done
# so the table can be written to h5ad without mixed-type errors - confirm.
adata_pb.obs = adata_pb.obs.astype('str')

In [15]:
# Add a 'counts' layer: an independent copy of X (the summed pseudobulk counts),
# so the counts stay available even if X is modified later.
adata_pb.layers['counts'] = adata_pb.X.copy()

In [16]:
# Save the all-samples pseudobulk object to the output folder.
adata_pb.write_h5ad(f'{output_path}{proj_name}all_samples_data.h5ad')

In [17]:
# Echo the project name used as the file-name prefix.
proj_name

'ALTRA_scRNA_AIFI_L3_certpro_pseudobulk_'

In [18]:
# Reload the saved pseudobulk object from disk (allows resuming from this cell
# without recomputing the pseudobulk) and show its summary.
pseudobulk_file = f'{output_path}{proj_name}all_samples_data.h5ad'
adata_pb = sc.read_h5ad(pseudobulk_file)
adata_pb

AnnData object with n_obs × n_vars = 7616 × 29842
    obs: 'batch_id', 'hto_barcode', 'hto_category', 'pbmc_sample_id', 'pool_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'AIFI_L3', 'doublets_manual', 'AIFI_L3_new', 'Status_Xsec', 'Status_Long', 'anti_ccp3_finalCombined', 'days_to_conversion', 'BMI', 'CMV_Status_Subj', 'age_conv', 'bmi_conv', 'file.batchID', 'status', 'psbulk_n_cells', 'psbulk_counts'
    layers: 'counts', 'psbulk_props'

In [19]:
# Load the per-aim scRNA sample metadata tables (from the repository metadata
# folder) and the lab metadata table (from the data folder).
aim1_meta = pd.read_csv(f'{meta_path}ALTRA_RA_Aim1_ALTRA_at_risk_vs_HCs_scrna_metadata.csv')
aim2_meta = pd.read_csv(f'{meta_path}ALTRA_RA_Aim2_ALTRA_early_RA_vs_HCs_scrna_metadata.csv')
aim3_meta = pd.read_csv(f'{meta_path}ALTRA_RA_Aim3_ALTRA_converters_longitudinal_scrna_metadata.csv')
lab_meta = pd.read_csv(
    '/home/jupyter/data/ra_longitudinal/metadata/2023-11-22_ALTRA_Metadata_labs.csv'
)

In [20]:
# Separate the aims: each aim keeps the pseudobulk rows whose sample kit GUID
# appears in that aim's metadata table. `.copy()` turns each subset into an
# independent AnnData object rather than a view of adata_pb.
pb_kit_ids = adata_pb.obs['sample.sampleKitGuid']
aim1_pb = adata_pb[pb_kit_ids.isin(aim1_meta['sample.sampleKitGuid'])].copy()
aim2_pb = adata_pb[pb_kit_ids.isin(aim2_meta['sample.sampleKitGuid'])].copy()
aim3_pb = adata_pb[pb_kit_ids.isin(aim3_meta['sample.sampleKitGuid'])].copy()

# loop to make cell type specific counts table and metadata for each aim

## AIM1

In [21]:
# Show the summary of the Aim 1 pseudobulk subset.
aim1_pb

AnnData object with n_obs × n_vars = 4061 × 29842
    obs: 'batch_id', 'hto_barcode', 'hto_category', 'pbmc_sample_id', 'pool_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'AIFI_L3', 'doublets_manual', 'AIFI_L3_new', 'Status_Xsec', 'Status_Long', 'anti_ccp3_finalCombined', 'days_to_conversion', 'BMI', 'CMV_Status_Subj', 'age_conv', 'bmi_conv', 'file.batchID', 'status', 'psbulk_n_cells', 'psbulk_counts'
    layers: 'counts', 'psbulk_props'

In [22]:
# Produce the per-cell-type pseudobulk data for Aim 1.
# NOTE: this rebinds the notebook-level `output_path` to the Aim 1 counts folder.
output_path = '/home/jupyter/data/ra_longitudinal/scrna/certPro/counts/aifi_l3/aim1/'
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError whenever the folder is already there (e.g. on Restart & Run All).
os.makedirs(output_path, exist_ok=True)
cell_type_col = 'AIFI_L3_new'
# The helper prints its progress per cell type; presumably it writes the
# cell-type-specific counts and metadata tables under output_path - confirm in
# ALTRA_scRNA_py_helper_functions.
generate_pseudobulk_data(aim1_pb, output_path, cell_type_col, proj_name=proj_name)

generating pseudobulk data for ASDC
remaining 1668 genes in ASDC
generating pseudobulk data for Activated memory B cell
remaining 888 genes in Activated memory B cell
generating pseudobulk data for Activated memory B cell_uk1
remaining 763 genes in Activated memory B cell_uk1
generating pseudobulk data for Adaptive NK cell
remaining 1917 genes in Adaptive NK cell
generating pseudobulk data for Adaptive NK cell_uk1_T
remaining 1195 genes in Adaptive NK cell_uk1_T
generating pseudobulk data for C1Q+ CD16 monocyte
remaining 4371 genes in C1Q+ CD16 monocyte
generating pseudobulk data for CD14+ cDC2
remaining 6186 genes in CD14+ cDC2
generating pseudobulk data for CD27+ effector B cell
remaining 3566 genes in CD27+ effector B cell
generating pseudobulk data for CD27- effector B cell
remaining 2451 genes in CD27- effector B cell
generating pseudobulk data for CD4 MAIT
remaining 1772 genes in CD4 MAIT
generating pseudobulk data for CD56bright NK cell
remaining 4868 genes in CD56bright NK cell

## AIM2

In [23]:
# Show the summary of the Aim 2 pseudobulk subset.
aim2_pb

AnnData object with n_obs × n_vars = 2976 × 29842
    obs: 'batch_id', 'hto_barcode', 'hto_category', 'pbmc_sample_id', 'pool_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'AIFI_L3', 'doublets_manual', 'AIFI_L3_new', 'Status_Xsec', 'Status_Long', 'anti_ccp3_finalCombined', 'days_to_conversion', 'BMI', 'CMV_Status_Subj', 'age_conv', 'bmi_conv', 'file.batchID', 'status', 'psbulk_n_cells', 'psbulk_counts'
    layers: 'counts', 'psbulk_props'

In [24]:
# Re-read the Aim 2 scRNA metadata, this time indexed by sample kit GUID so the
# index can be compared directly with the pseudobulk samples.
# NOTE(review): this rebinds `aim2_meta`; 'sample.sampleKitGuid' is now the
# index instead of a column, so the aim-splitting cell above cannot be re-run
# after this one without reloading the metadata first.
aim2_meta_file = f'{meta_path}ALTRA_RA_Aim2_ALTRA_early_RA_vs_HCs_scrna_metadata.csv'
aim2_meta = pd.read_csv(aim2_meta_file, index_col='sample.sampleKitGuid')
aim2_meta.head()

Unnamed: 0_level_0,lastUpdated,sample.id,sample.bridgingControl,sample.visitName,sample.visitDetails,sample.drawDate,sample.daysSinceFirstVisit,sample.diseaseStatesRecordedAtVisit,file.id,file.name,...,subject.id,subject.biologicalSex,subject.birthYear,subject.ethnicity,subject.partnerCode,subject.race,subject.subjectGuid,cohort.cohortGuid,status,age
sample.sampleKitGuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KT02135,2023-11-09T02:52:35.494Z,fa01b4a2-b150-4ac9-9ab8-4c63861fefad,False,Flu Year 3 Stand-Alone,RA Conversion,2021-06-01T00:00:00Z,615,Rheumatoid arthritis,251b30f7-5db3-4f25-9040-6563d8855205,automated/merged/2021-10-07T16:25:55.38418956Z...,...,abfda3b3-5523-4698-a622-80b873e4de83,Female,1998,Non-Hispanic origin,CU,Caucasian,CU1003,CU1,early RA,23
KT02845,2023-11-09T02:52:35.494Z,ef72a407-8aee-4fe5-89dc-d7085a64ff42,False,Flu Year 3 Day 0,N/A - Flu-Series Timepoint Only,2021-10-01T00:00:00Z,722,Rheumatoid arthritis,520d6585-0b6c-44e2-9716-5d9a0c9afd13,automated/merged/2022-02-11T20:59:44.622315014...,...,015fe357-ae97-48d4-8f91-bd79162978e1,Female,1977,Non-Hispanic origin,CU,Caucasian,CU1007,CU1,early RA,44
KT00055,2023-11-09T02:52:35.494Z,d53fe09c-8f68-48e3-9950-23b3d81f204e,False,Flu Year 1 Day 0,N/A - Flu-Series Timepoint Only,2019-11-01T00:00:00Z,0,Rheumatoid arthritis,58e70a1e-ff15-4efc-934f-d1b2906b2b9a,automated/merged/B002/labeled/B002-P1_PB00055-...,...,fce82002-928c-4e64-880b-a287b7a67fd0,Female,1966,Non-Hispanic origin,CU,African American,CU1008,CU1,early RA,53
KT04113,2023-11-09T02:52:35.494Z,f281afbd-0a0d-4637-8b37-cd6e50079959,False,Flu Year 2 Day 90,N/A - Flu-Series Timepoint Only,2022-04-01T00:00:00Z,874,Rheumatoid arthritis,679bbca6-14ab-4718-bbe4-e3715f9ca1ff,automated/merged/2022-08-29T19:08:41.772899705...,...,e87c5a8b-79ab-44d5-a3a8-45734e671ceb,Female,1963,Non-Hispanic origin,CU,Caucasian,CU1009,CU1,early RA,59
KT04108,2023-11-09T02:52:35.494Z,5567fe1f-142d-4478-85ba-6dc4b579443c,False,RA Year 4 Visit 2,N/A - stand-alone collection,2023-01-01T00:00:00Z,1144,CCP3+ At Risk for RA,df47dbac-808e-43af-ba30-749a6e763134,automated/merged/2023-07-02T18:21:52.848666396...,...,4c12480d-b9cf-481b-a1e9-ee3902d63ecc,Female,1961,Non-Hispanic origin,CU,Caucasian,CU1010,CU1,early RA,62


In [25]:
# Check the sample match: for every Aim 2 metadata sample (one entry per index
# value), report whether its kit GUID is present in the Aim 2 pseudobulk obs.
# All True means no metadata sample is missing from the pseudobulk data.
aim2_meta.index.isin(aim2_pb.obs['sample.sampleKitGuid'])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [26]:
# Check the sample numbers: keep one pseudobulk row per sample kit, then count
# the kits in each cross-sectional status group.
# observed=False is passed explicitly: it keeps the current behaviour and
# avoids the pandas FutureWarning about the changing default of `observed`
# when grouping on a categorical column.
aim2_pb.obs.drop_duplicates(
    ['sample.sampleKitGuid']).groupby(['Status_Xsec'], observed=False).size()

  ['sample.sampleKitGuid']).groupby(['Status_Xsec']).size()


Status_Xsec
ALTRA_healthy    31
early_RA         25
dtype: int64

In [27]:
# Count the Aim 2 sample kits in each cross-sectional status group
# (one pseudobulk row is kept per kit before counting).
# NOTE(review): this cell repeats the check in the preceding cell verbatim;
# consider removing one of the two.
# observed=False is passed explicitly: it keeps the current behaviour and
# avoids the pandas FutureWarning about the changing default of `observed`
# when grouping on a categorical column.
aim2_pb.obs.drop_duplicates(
    ['sample.sampleKitGuid']).groupby(['Status_Xsec'], observed=False).size()

  ['sample.sampleKitGuid']).groupby(['Status_Xsec']).size()


Status_Xsec
ALTRA_healthy    31
early_RA         25
dtype: int64

In [28]:
# Produce the per-cell-type pseudobulk data for Aim 2.
# NOTE: this rebinds the notebook-level `output_path` to the Aim 2 counts folder.
output_path = '/home/jupyter/data/ra_longitudinal/scrna/certPro/counts/aifi_l3/aim2/'
cell_type_col = 'AIFI_L3_new'
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError whenever the folder is already there (e.g. on Restart & Run All).
os.makedirs(output_path, exist_ok=True)
# The helper prints its progress per cell type; presumably it writes the
# cell-type-specific counts and metadata tables under output_path - confirm in
# ALTRA_scRNA_py_helper_functions.
generate_pseudobulk_data(aim2_pb, output_path, cell_type_col, proj_name)

generating pseudobulk data for ASDC
remaining 1668 genes in ASDC
generating pseudobulk data for Activated memory B cell
remaining 1239 genes in Activated memory B cell
generating pseudobulk data for Adaptive NK cell
remaining 1503 genes in Adaptive NK cell
generating pseudobulk data for Adaptive NK cell_uk1_T
remaining 1135 genes in Adaptive NK cell_uk1_T
generating pseudobulk data for C1Q+ CD16 monocyte
remaining 3912 genes in C1Q+ CD16 monocyte
generating pseudobulk data for CD14+ cDC2
remaining 5755 genes in CD14+ cDC2
generating pseudobulk data for CD27+ effector B cell
remaining 3005 genes in CD27+ effector B cell
generating pseudobulk data for CD27- effector B cell
remaining 2288 genes in CD27- effector B cell
generating pseudobulk data for CD4 MAIT
remaining 1809 genes in CD4 MAIT
generating pseudobulk data for CD56bright NK cell
remaining 4737 genes in CD56bright NK cell
generating pseudobulk data for CD8 MAIT
remaining 6713 genes in CD8 MAIT
generating pseudobulk data for CD8a

In [None]:
# Show the summary of the Aim 2 pseudobulk subset again.
# NOTE(review): this cell has no execution count or output in the saved
# notebook (it was not run); consider removing it.
aim2_pb

## AIM3

In [30]:
# Show the summary of the Aim 3 pseudobulk subset.
aim3_pb

AnnData object with n_obs × n_vars = 3601 × 29842
    obs: 'batch_id', 'hto_barcode', 'hto_category', 'pbmc_sample_id', 'pool_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'AIFI_L3', 'doublets_manual', 'AIFI_L3_new', 'Status_Xsec', 'Status_Long', 'anti_ccp3_finalCombined', 'days_to_conversion', 'BMI', 'CMV_Status_Subj', 'age_conv', 'bmi_conv', 'file.batchID', 'status', 'psbulk_n_cells', 'psbulk_counts'
    layers: 'counts', 'psbulk_props'

In [31]:
# Re-read the Aim 3 scRNA metadata, this time indexed by sample kit GUID so the
# index can be compared directly with the pseudobulk samples.
# NOTE(review): this rebinds `aim3_meta`; 'sample.sampleKitGuid' is now the
# index instead of a column, so the aim-splitting cell above cannot be re-run
# after this one without reloading the metadata first.
aim3_meta_file = f'{meta_path}ALTRA_RA_Aim3_ALTRA_converters_longitudinal_scrna_metadata.csv'
aim3_meta = pd.read_csv(aim3_meta_file, index_col='sample.sampleKitGuid')
# Optional filter, currently disabled: keep only samples taken no more than
# 750 days before conversion.
# aim3_meta = aim3_meta[aim3_meta['days_to_conversion'] >= (-750)]
aim3_meta

Unnamed: 0_level_0,lastUpdated,sample.id,sample.bridgingControl,sample.visitName,sample.visitDetails,sample.drawDate,sample.daysSinceFirstVisit,sample.diseaseStatesRecordedAtVisit,file.id,file.name,...,cohort.cohortGuid,file.userTags.details,file.userTags.group,file.userTags.name,file.userTags.origin,file.userTags.other,file.userTags.version,days_to_conversion,status,age
sample.sampleKitGuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KT00493,2023-11-09 02:52:35.494,d4ea69f7-37a6-4e91-839a-e93ab80bbfc5,False,Flu Year 2 Day 90,N/A - Flu-Series Timepoint Only,2021-02-01,488,Rheumatoid arthritis,8e3f3734-d60c-4716-8c36-6bff6eb9acd9,automated/merged/2023-10-25T16:28:36.48842651Z...,...,CU1,,,,,,,-127,pre,23
KT02135,2023-11-09 02:52:35.494,fa01b4a2-b150-4ac9-9ab8-4c63861fefad,False,Flu Year 3 Stand-Alone,RA Conversion,2021-06-01,615,Rheumatoid arthritis,251b30f7-5db3-4f25-9040-6563d8855205,automated/merged/2021-10-07T16:25:55.38418956Z...,...,CU1,,,,,,,0,conversion,23
KT00440,2023-11-09 02:52:35.494,7f5a284a-b300-4bf3-ac67-8d9d52db97e6,False,Flu Year 2 Day 0,N/A - Flu-Series Timepoint Only,2020-10-01,355,Rheumatoid arthritis,0a3c3ce1-2006-4a07-a086-362a1eda303b,automated/merged/2021-02-11T20:48:07.416042815...,...,CU1,,,,,,,-260,pre,22
KT00227,2023-11-09 02:52:35.494,3d176e60-ec9e-4745-9c5b-e019b39b41d8,False,Flu Year 2 Stand-Alone,N/A - Flu-Series Timepoint Only,2020-07-01,271,Rheumatoid arthritis,84844967-a3c6-42db-b8a8-f0c255d4df60,automated/merged/2020-10-10T20:59:49.850564489...,...,CU1,,,,,,,-344,pre,22
KT00099,2023-11-09 02:52:35.494,145f6021-9749-40af-b7c5-612e477776b6,False,Flu Year 1 Day 90,N/A - Flu-Series Timepoint Only,2020-01-01,99,Rheumatoid arthritis,b7601874-9628-498a-a1a9-3d3a1d9de2e6,automated/merged/2020-08-04T01:55:19.807034625...,...,CU1,,,,,,,-516,pre,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KT00118,2023-11-09 02:52:35.494,13a662c1-87ae-475d-a8fa-686b98759a8b,False,Flu Year 1 Stand-Alone,N/A - Flu-Series Timepoint Only,2020-08-01,0,Rheumatoid arthritis,e9740f82-2c03-49cf-bd22-6665ee8d7f42,automated/merged/2021-05-28T19:13:24.536634916...,...,SD1,,,,,,,-204,pre,31
KT00105,2023-11-09 02:52:35.494,cdf80d0d-3221-43b1-a5ae-6ad9045e222b,False,Flu Year 1 Day 0,N/A - Flu-Series Timepoint Only,2020-12-01,112,Rheumatoid arthritis,15e214ab-7230-4df9-beca-fce1c8c51f66,automated/merged/2021-05-28T19:13:24.536634916...,...,SD1,,,,,,,-92,pre,31
KT02975,2023-11-20T19:04:58.033Z,dc6588a9-ef3c-45be-8308-0f96fe5b4686,False,Flu Year 1 Day 90,N/A - Flu-Series Timepoint Only,2022-02-01T00:00:00Z,279,Rheumatoid arthritis,d30ec314-b75c-4644-a1ff-f36d371a438a,automated/merged/2023-11-14T19:05:56.147636047...,...,CU1,,,,,,,0,conversion,75
KT00057,2023-11-20T19:04:58.033Z,b606576c-0bad-4756-97b7-f275137c9d64,False,Flu Year 1 Day 0,N/A - Flu-Series Timepoint Only,2019-10-01T00:00:00Z,0,Rheumatoid arthritis,5e3ce6b9-95a9-4893-8123-b9b7a5d5c9d8,automated/merged/2023-11-17T21:38:04.103392546...,...,CU1,,,,,,,-615,pre,21


In [32]:
# Check the sample match: for every Aim 3 metadata sample (one entry per index
# value), report whether its kit GUID is present in the Aim 3 pseudobulk obs.
# A False entry marks a metadata sample with no pseudobulk profile.
aim3_meta.index.isin(aim3_pb.obs['sample.sampleKitGuid'])

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [33]:
# List the distinct sample kit GUIDs present in the Aim 3 pseudobulk data.
aim3_pb.obs['sample.sampleKitGuid'].unique()

['KT00409', 'KT00443', 'KT02136', 'KT04474', 'KT04924', ..., 'KT04108', 'KT04462', 'KT00219', 'KT00416', 'KT03930']
Length: 65
Categories (65, object): ['KT00052', 'KT00056', 'KT00057', 'KT00064', ..., 'KT04108', 'KT04462', 'KT04474', 'KT04924']

In [34]:
# Check the sample numbers: keep one pseudobulk row per sample kit, then count
# the kits in each longitudinal status group.
# observed=False is passed explicitly: it keeps the current behaviour and
# avoids the pandas FutureWarning about the changing default of `observed`
# when grouping on a categorical column.
aim3_pb.obs.drop_duplicates(
    ['sample.sampleKitGuid']).groupby(['Status_Long'], observed=False).size()

  ['sample.sampleKitGuid']).groupby(['Status_Long']).size()


Status_Long
conversion    10
pre           55
dtype: int64

In [35]:
# Produce the per-cell-type pseudobulk data for Aim 3.
# NOTE: this rebinds the notebook-level `output_path` to the Aim 3 counts folder.
output_path = '/home/jupyter/data/ra_longitudinal/scrna/certPro/counts/aifi_l3/aim3/'
cell_type_col = 'AIFI_L3_new'
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError whenever the folder is already there (e.g. on Restart & Run All).
os.makedirs(output_path, exist_ok=True)
# The helper prints its progress per cell type; presumably it writes the
# cell-type-specific counts and metadata tables under output_path - confirm in
# ALTRA_scRNA_py_helper_functions.
generate_pseudobulk_data(aim3_pb, output_path, cell_type_col, proj_name)

generating pseudobulk data for Activated memory B cell
remaining 970 genes in Activated memory B cell
generating pseudobulk data for Adaptive NK cell
remaining 3528 genes in Adaptive NK cell
generating pseudobulk data for Adaptive NK cell_uk1_T
remaining 2008 genes in Adaptive NK cell_uk1_T
generating pseudobulk data for C1Q+ CD16 monocyte
remaining 4238 genes in C1Q+ CD16 monocyte
generating pseudobulk data for CD14+ cDC2
remaining 6757 genes in CD14+ cDC2
generating pseudobulk data for CD27+ effector B cell
remaining 4302 genes in CD27+ effector B cell
generating pseudobulk data for CD27- effector B cell
remaining 2354 genes in CD27- effector B cell
generating pseudobulk data for CD4 MAIT
remaining 2183 genes in CD4 MAIT
generating pseudobulk data for CD56bright NK cell
remaining 5093 genes in CD56bright NK cell
generating pseudobulk data for CD8 MAIT
remaining 6163 genes in CD8 MAIT
generating pseudobulk data for CD8aa
remaining 1348 genes in CD8aa
generating pseudobulk data for CD9