# ALTRA data scRNA psedubulk in Python - Cert Pro deep clean data
## preprocessing
<a name = "contents"></a>

### Contents

- [Importing packages](#Importing-packages)
- [Reading h5 files](#Reading-h5-files)
    - [Gene x cell matrix](#Gene-x-cell-matrix)
    - [Observation metadata](#Observation-metadata)
- [Assembling AnnData](#Assembling-AnnData)
- [Combining multiple files](#Combining-multiple-files)
- [Saving and loading AnnData](#Saving-and-loading-AnnData)
- [Basic analysis with scanpy](#Basic-analysis-with-scanpy)
- [Session Info](#Session-Info)

In [1]:
import h5py
import scipy.sparse as scs
import pandas as pd
import anndata
import os
import glob
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import median_abs_deviation
import scanpy as sc
import random
# import sc_toolbox
# import pertpy 
import decoupler as dc

In [2]:
anndata.__version__
dc.__version__

'1.8.0'

In [3]:
# sc.settings.n_jobs = 58

In [4]:
# define some color patterns for plotting
nejm_color = ["#BC3C29FF", "#0072B5FF", "#E18727FF", "#20854EFF", "#7876B1FF", "#6F99ADFF", "#FFDC91FF", "#EE4C97FF"]
jama_color = ["#374E55FF", "#DF8F44FF", "#00A1D5FF", "#B24745FF", "#79AF97FF", "#6A6599FF", "#80796BFF"]

In [5]:
# define working path
data_path = '/home/jupyter/data/ra_longitudinal/scrna/'
meta_path = '/home/jupyter/github/ra-longitudinal/metadata/'
fig_path = '/home/jupyter/data/ra_longitudinal/figures/certPro/'
output_path = '/home/jupyter/data/ra_longitudinal/output_results/certPro/'
# os.mkdir(fig_path)
# os.mkdir(output_path)
# define a project name
proj_name = 'ALTRA_scRNA_AIFI_L3_deepclean_certpro_'
# sc.set_figure_params(fig_path)
sc.settings.figdir = fig_path
sc.settings.autosave=False
sc.set_figure_params(vector_friendly=True, dpi_save=300)

# load data

In [9]:
# load the deep clean data
joint_adata_fl = sc.read_h5ad(
    '/home/jupyter/data/ra_longitudinal/scrna/cache/0bf38363-b9b7-40bf-96f8-3b9f1f958983/preRA_dc_sample_selection_combined_adata_2024-06-25.h5ad'
)

In [10]:
joint_adata_fl

AnnData object with n_obs × n_vars = 2059581 × 33538
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'doublet_score', 'AIFI_L1', 'AIFI_L1_score', 'AIFI_L2', 'AIFI_L2_score', 'AIFI_L3', 'AIFI_L3_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden_harmony_2', 'doublets_manual', 'AIFI_L3_new'
   

In [11]:
joint_adata_fl.obs['sample.sampleKitGuid'].unique()

['KT00118', 'KT00416', 'KT00103', 'KT02957', 'KT00068', ..., 'KT02974', 'KT00202', 'KT00467', 'KT04924', 'KT00055']
Length: 141
Categories (141, object): ['KT00052', 'KT00055', 'KT00056', 'KT00057', ..., 'KT04906', 'KT04924', 'KT04933', 'KT04937']

## add metadata

In [12]:
# load metadata and add to the overall dataset
lab_meta = pd.read_csv('/home/jupyter/data/ra_longitudinal/metadata/2023-11-22_ALTRA_Metadata_labs.csv')
lab_meta.columns.tolist()

['Status_Xsec',
 'Status_Long',
 'sample.sampleKitGuid',
 'sample.visitName',
 'sample.visitDetails',
 'sample.drawDate',
 'sample.daysSinceFirstVisit',
 'subject.subjectGuid',
 'subject.biologicalSex',
 'subject.birthYear',
 'subject.ethnicity',
 'subject.race',
 'cohort.cohortGuid',
 'Age2023',
 'ageAtDraw',
 'timeOnStudy',
 'timeOnStudyAsOf',
 'percent_basophils',
 'percent_eosinophils',
 'percent_immature_granulocytes',
 'percent_lymphocytes',
 'percent_monocytes',
 'percent_nucleated_red_blood_cells_nrbc',
 '0401_count',
 '0404_count',
 '0405_count',
 '0408_count',
 'absolute_basophil_count',
 'absolute_eosinophil_count_aec',
 'absolute_immature_granulocyte_count',
 'absolute_lymphocyte_count_alc',
 'absolute_monocyte_count_amc',
 'absolute_neutrophil_count_anc',
 'alanine_transaminase_alt',
 'albumin',
 'alkaline_phosphatase',
 'anion_gap',
 'anti_ccp3',
 'anti_ccp31',
 'bilirubin_total_t_bili',
 'c_reactive_protein_crp',
 'calcium',
 'carbon_dioxide_co2',
 'cholesterol_hdl',
 'c

In [13]:
lab_meta.shape

(157, 148)

In [14]:
# add metadat in if not avaiable
meta_cols = ['Status_Xsec','Status_Long',
              'anti_ccp3_finalCombined', 'days_to_conversion',
              'BMI', 'CMV_Status_Subj', 'age_conv', 'bmi_conv']
# for i in meta_cols:
#     if i not in joint_adata_fl.obs.columns:
#         print(i)
meta_cols = [x for x in meta_cols if x not in joint_adata_fl.obs.columns]
meta_cols.append('sample.sampleKitGuid')
meta_cols

['Status_Xsec',
 'Status_Long',
 'anti_ccp3_finalCombined',
 'days_to_conversion',
 'BMI',
 'CMV_Status_Subj',
 'age_conv',
 'bmi_conv',
 'sample.sampleKitGuid']

In [15]:
# add metadata to the anndata object
joint_adata_fl.obs = joint_adata_fl.obs.merge(
    lab_meta[meta_cols],
    how='left', on ='sample.sampleKitGuid')
joint_adata_fl

AnnData object with n_obs × n_vars = 2059581 × 33538
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'doublet_score', 'AIFI_L1', 'AIFI_L1_score', 'AIFI_L2', 'AIFI_L2_score', 'AIFI_L3', 'AIFI_L3_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden_harmony_2', 'doublets_manual', 'AIFI_L3_new', 'S

In [16]:
# load the scrna metadata
scrna_meta = pd.read_csv(meta_path+'ALTRA_scRNA_combined_141_samples_info.csv', 
                              index_col='sample.sampleKitGuid')

In [17]:
scrna_meta.shape

(141, 32)

In [18]:
# 5 samples are not in the original scRNA data selection
# decide to remove; stick to the original dataset
joint_adata_fl.obs.loc[~joint_adata_fl.obs['sample.sampleKitGuid'].isin(scrna_meta.index), 'sample.sampleKitGuid'].unique()

array([], dtype=object)

In [19]:
joint_adata_fl.obs.loc[~joint_adata_fl.obs['sample.sampleKitGuid'].isin(lab_meta['sample.sampleKitGuid']),
    'sample.sampleKitGuid'].unique()

array([], dtype=object)

In [34]:
# remove the 5 extra sample
# joint_adata_fl = joint_adata_fl[joint_adata_fl.obs['sample.sampleKitGuid'].isin(scrna_meta.index)].copy()

In [20]:
# add batch id back
# add metadata to the anndata object
joint_adata_fl.obs = joint_adata_fl.obs.merge(
    scrna_meta[['file.batchID']],
    how='left', left_on ='sample.sampleKitGuid', right_index=True)
joint_adata_fl

AnnData object with n_obs × n_vars = 2059581 × 33538
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'doublet_score', 'AIFI_L1', 'AIFI_L1_score', 'AIFI_L2', 'AIFI_L2_score', 'AIFI_L3', 'AIFI_L3_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden_harmony_2', 'doublets_manual', 'AIFI_L3_new', 'S

In [21]:
joint_adata_fl[joint_adata_fl.obs['file.batchID'].isna()]

View of AnnData object with n_obs × n_vars = 0 × 33538
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'doublet_score', 'AIFI_L1', 'AIFI_L1_score', 'AIFI_L2', 'AIFI_L2_score', 'AIFI_L3', 'AIFI_L3_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden_harmony_2', 'doublets_manual', 'AIFI_L3_new', 

In [22]:
# add a new column for status
joint_adata_fl.obs['status'] = joint_adata_fl.obs['Status_Xsec'].astype('str')
joint_adata_fl.obs.loc[(joint_adata_fl.obs['Status_Xsec'].isnull()) & 
    (joint_adata_fl.obs['Status_Long']=='pre'), 'status']  = 'at_risk'
joint_adata_fl.obs.loc[joint_adata_fl.obs['Status_Long']=='conversion', 'status']  = 'conversion'
joint_adata_fl.obs['status'].unique()

array(['at_risk', 'ALTRA_healthy', 'early_RA', 'conversion'], dtype=object)

In [23]:
len(joint_adata_fl.obs['sample.sampleKitGuid'].unique())

141

In [24]:
joint_adata_fl

AnnData object with n_obs × n_vars = 2059581 × 33538
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'doublet_score', 'AIFI_L1', 'AIFI_L1_score', 'AIFI_L2', 'AIFI_L2_score', 'AIFI_L3', 'AIFI_L3_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden_harmony_2', 'doublets_manual', 'AIFI_L3_new', 'S

In [25]:
# doouble check the raw counts data are stored in X
joint_adata_fl.X[1:50, 1:50].toarray()

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint16)

In [26]:
# save the deep clean data with metadata
joint_adata_fl.write_h5ad(
    '/home/jupyter/data/ra_longitudinal/scrna/certPro/ALTRA_certPro_scRNA_141_samples_combined_adata.h5ad'
)

## check the overlap between the cert pro and old data 

In [7]:
# load the deep clean data
joint_adata_fl = sc.read_h5ad(
    '/home/jupyter/data/ra_longitudinal/scrna/certPro/ALTRA_certPro_scRNA_141_samples_combined_adata.h5ad'
)



In [8]:
joint_adata_fl

AnnData object with n_obs × n_vars = 2059581 × 33538
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'doublet_score', 'AIFI_L1', 'AIFI_L1_score', 'AIFI_L2', 'AIFI_L2_score', 'AIFI_L3', 'AIFI_L3_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden_harmony_2', 'doublets_manual', 'AIFI_L3_new', 'S

## output the cell type labels for DA analysis


In [30]:
joint_adata_fl.obs.to_csv(output_path + proj_name + 'meta_data.tsv', sep="\t")

# normalization

In [45]:
# save the raw counts
joint_adata_fl.layers['counts'] = joint_adata_fl.X.copy()

In [None]:
# mitochondrial genes
joint_adata_fl.var["mt"] = joint_adata_fl.var_names.str.startswith("MT-")
# ribosomal genes
joint_adata_fl.var["ribo"] = joint_adata_fl.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
joint_adata_fl.var["hb"] = joint_adata_fl.var_names.str.contains(("^HB[^(P)]"))
sc.pp.calculate_qc_metrics(
    joint_adata_fl, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)

In [None]:
joint_adata_fl.obs

In [None]:
plt.rcParams['figure.figsize'] = [12, 4]
p1 = sc.pl.violin(joint_adata_fl, ['pct_counts_ribo'],
                  save=proj_name+'pct_counts_ribo_batch_violin.png',
                  rotation=45, groupby='file.batchID')

In [None]:
p1 = sc.pl.violin(joint_adata_fl, ['pct_counts_ribo'],
                  save=proj_name+'pct_counts_ribo_cell_type_violin.png',
                  rotation=90, groupby='predicted_labelsL2')

In [None]:
p1 = sc.pl.violin(joint_adata_fl, ['pct_counts_mt'],
                  save=proj_name+'pct_counts_mt_batch_violin.png',
                  rotation=45, groupby='file.batchID')

In [None]:
p1 = sc.pl.violin(joint_adata_fl, ['pct_counts_hb'],
                  save=proj_name+'pct_counts_hb_batch_violin.png',
                  rotation=45, groupby='file.batchID')

In [None]:
# apply some filtering
sc.pp.filter_cells(joint_adata_fl, min_genes=200)
sc.pp.filter_genes(joint_adata_fl, min_cells=3)

In [None]:
p1 = sns.histplot(joint_adata_fl.obs["total_counts"], bins=100, kde=False)

In [None]:
# Shifted logarithm transform
sc.pp.normalize_total(joint_adata_fl, target_sum=None)

In [None]:
joint_adata_fl

In [None]:
# fig, axes = plt.subplots(1, 2, figsize=(10, 5))
# p1 = sns.histplot(joint_adata_fl.obs["total_counts"], bins=100, kde=False, ax=axes[0])
# axes[0].set_title("Total counts")
# p2 = sns.histplot(joint_adata_fl.layers["log1p_norm"].sum(1), bins=100, kde=False, ax=axes[1])
# axes[1].set_title("Shifted logarithm")
# plt.show()

In [None]:
%%time
# cpm normalization
# sc.pp.normalize_total(joint_adata_fl, target_sum=1e4, inplace=True)
sc.pp.log1p(joint_adata_fl)


In [None]:
# %%time
sc.pp.highly_variable_genes(joint_adata_fl, min_mean=0.0125, max_mean=3, min_disp=0.5)

In [None]:
sc.pl.highly_variable_genes(joint_adata_fl)

In [None]:
joint_adata_fl.var.highly_variable.value_counts()

In [None]:
# joint_adata_fl.raw = joint_adata_fl

In [None]:
sc.pp.scale(joint_adata_fl, max_value=10)

In [None]:
# setting highly variable as highly deviant to use scanpy 'use_highly_variable' argument in sc.pp.pca
sc.pp.pca(joint_adata_fl, svd_solver="arpack", use_highly_variable=True)

In [None]:
# plot the principle component variance explained
sc.pl.pca_variance_ratio(joint_adata_fl, log=True)

In [None]:
sc.pl.pca_scatter(joint_adata_fl, color=["pct_counts_ribo", 'pct_counts_mt'])

In [None]:
# run harmony
import scanpy.external as sce
sce.pp.harmony_integrate(joint_adata_fl, 'file.batchID', adjusted_basis='X_pca_harmony')


In [None]:
sc.pp.neighbors( joint_adata_fl, n_pcs=30, use_rep='X_pca_harmony')
sc.tl.umap(joint_adata_fl)

In [None]:
joint_adata_fl.obsm['X_harmony_umap'] = joint_adata_fl.obsm['X_umap'].copy()

In [None]:
sc.tl.leiden(joint_adata_fl, key_added="leiden_0_5", resolution=0.5, n_iterations=2)

In [None]:
sc.pl.umap(
    joint_adata_fl,
    color=['file.batchID',  "subject.subjectGuid", 'status'],
    ncols=3,
    frameon=False,wspace=0.5,
    save=  proj_name+'_rna_umap.png'
)

In [None]:
sc.pl.umap(
    joint_adata_fl, #legend_loc='on data',
    color=['pred_manual'],
    save= proj_name+'_immunehealth_l3_TBX21_umap.png'
)