In [27]:
import pandas as pd
import numpy as np
import re
import scanpy as sc
import anndata
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import scanpy.external as sce
import scyan
from concurrent.futures import ProcessPoolExecutor
import copy
import os
import gc
# Let scanpy use up to 60 parallel jobs (hard-coded; adjust to the cores available).
sc.settings.n_jobs = 60

In [30]:
def leiden_new(
    adata,
    resolution: float = 1,
    key_added: str = "leiden",
    n_neighbors: int = 15,
    use_rep: str = "X_pca_harmony",
    n_pcs: int = 10,
    seed: "int | None" = None,
) -> None:
    """Leiden clustering on a k-nearest-neighbour graph of an embedding.

    The graph is built from the first `n_pcs` columns of `adata.obsm[use_rep]`
    and cached in `adata.obsp["knn_graph"]`. If that entry already exists it is
    reused as is, in which case `n_neighbors`, `use_rep` and `n_pcs` have no
    effect.

    Args:
        adata: AnnData object. Modified in place.
        resolution: Resolution of the clustering.
        key_added: Name of the key of adata.obs where clusters will be saved.
        n_neighbors: Number of neighbors.
        use_rep: Key of adata.obsm holding the embedding used to build the graph.
        n_pcs: Number of leading columns of the embedding to use (None uses all).
        seed: Optional random seed forwarded to leidenalg for a reproducible
            partition. When None, leidenalg's own default is used.

    Raises:
        ImportError: If 'leidenalg' is not installed.
    """
    try:
        import leidenalg
    except ImportError as err:
        # Only a failed import is translated; any other error propagates untouched.
        raise ImportError(
            """To run leiden, you need to have 'leidenalg' installed. You can install the population discovery extra with "pip install 'scyan[discovery]'", or directly install leidenalg with "conda install -c conda-forge leidenalg"."""
        ) from err

    import igraph as ig
    from sklearn.neighbors import kneighbors_graph

    if "knn_graph" not in adata.obsp:
        adata.obsp["knn_graph"] = kneighbors_graph(
            adata.obsm[use_rep][:, :n_pcs],
            n_neighbors=n_neighbors,
            metric="euclidean",
            include_self=False,
        )

    # TODO (improvement): add weights according to euclidean distance
    # (no `weights` argument is passed to find_partition below).
    graph = ig.Graph.Weighted_Adjacency(adata.obsp["knn_graph"], mode="DIRECTED")

    # Forward the seed only when requested so the default call is unchanged.
    extra_kwargs = {} if seed is None else {"seed": seed}
    partition = leidenalg.find_partition(
        graph,
        leidenalg.RBConfigurationVertexPartition,
        resolution_parameter=resolution,
        **extra_kwargs,
    )
    # Cluster ids are stored as strings in a categorical column.
    adata.obs[key_added] = pd.Categorical([str(x) for x in partition.membership])

def run_leiden(adata, resolution, key_added):
    """Cluster a private copy of `adata` and return that copy's `obs` table.

    The input object is deep-copied first, so the caller's `adata` is left
    untouched; the clusters are found in column `key_added` of the returned
    table.
    """
    isolated = copy.deepcopy(adata)
    leiden_new(isolated, resolution=resolution, key_added=key_added)
    return isolated.obs
 
def run_leiden_parallel(adata, tasks, max_workers: int = 5):
    """Run several Leiden clusterings in worker processes and store the results.

    Args:
        adata: AnnData object; one `obs` column per task is added to it in place.
        tasks: Iterable of `(resolution, key_added)` pairs.
        max_workers: Maximum number of worker processes.

    Returns:
        The same `adata` object, with the new `obs` columns added.
    """
    # Materialise once: `tasks` is walked twice (submission and write-back).
    tasks = list(tasks)
    if not tasks:
        return adata

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Arguments are pickled on their way to a worker process, so each task
        # already works on its own copy of `adata`; an extra deep copy per task
        # in the parent would only multiply peak memory.
        futures = [
            executor.submit(run_leiden, adata, resolution, key_added)
            for resolution, key_added in tasks
        ]
        # Collect in submission order so results line up with `tasks`.
        results = [future.result() for future in futures]

    # Assign the results back to the original AnnData object
    for result, (_, key_added) in zip(results, tasks):
        adata.obs[key_added] = result[key_added]

    return adata
def read_one(file_path):
    """Load one CSV with scyan and tag it with identifiers parsed from its path.

    The first match of each pattern in `file_path` is written to `obs`:
    "batch" gets a 'B' followed by three digits, "panel" gets one of
    PB1 / PT1 / PM1 / PS1. An IndexError is raised when a pattern does not
    occur in the path.
    """
    events = scyan.read_csv(
        file_path,
        marker_regex='^cd|^hla|tcr|ig|^ccr|klrg|^cx',
        exclude_markers=None,
    )

    tag_patterns = (
        ("batch", 'B\\d\\d\\d'),
        ("panel", 'PB1|PT1|PM1|PS1'),
    )
    for column, pattern in tag_patterns:
        events.obs[column] = re.findall(pattern, file_path)[0]

    return events

# Read flow cytometry metadata and scRNA-seq metadata

In [3]:
# Flow cytometry file metadata. Later cells use its `file.name`,
# `sample.sampleKitGuid`, `subject.subjectGuid` and `sample.visitName` columns.
meta_data=pd.read_csv('/home/jupyter/BRI_Figures/Dataset/FlowCyto_BRI_uuid.csv')

In [4]:
# Extract the PBMC sample id ("PB" plus everything up to the next underscore)
# from each file name. File names without such a token get NaN instead of
# making the cell fail on a None value.
meta_data['pbmc_sample_id'] = meta_data['file.name'].str.extract(r'(PB[^_]+)', expand=False)

# Keep the first row per sample kit, ordered by kit id, with a fresh 0..n-1 index.
meta_data = (
    meta_data
    .drop_duplicates(subset=['sample.sampleKitGuid'], keep='first')
    .sort_values(by='sample.sampleKitGuid')
    .reset_index(drop=True)
)

In [5]:
# scRNA-seq sample metadata. Later cells use its `sample.drawDate`,
# `sample.visitName` and `subject.subjectGuid` columns.
meta_data_selected_RNA=pd.read_csv('/home/jupyter/BRI_Figures/Dataset/scRNA_meta_data-2024-05-09.csv')

In [6]:
# Bucket every sample into a flu year that runs from 2 July to 1 July of the
# next calendar year, labelled by the starting year.
# NOTE(review): dates are compared as text, which orders correctly only for
# ISO 'YYYY-MM-DD' strings — confirm the format of `sample.drawDate`.
conditions = [
    (meta_data_selected_RNA['sample.drawDate'] <= '2020-07-01'),
    (meta_data_selected_RNA['sample.drawDate'] > '2020-07-01') & (meta_data_selected_RNA['sample.drawDate'] <= '2021-07-01'),
    (meta_data_selected_RNA['sample.drawDate'] > '2021-07-01') & (meta_data_selected_RNA['sample.drawDate'] <= '2022-07-01'),
    (meta_data_selected_RNA['sample.drawDate'] > '2022-07-01') & (meta_data_selected_RNA['sample.drawDate'] <= '2023-07-01')
]

choices = ['2019', '2020', '2021', '2022']

# The default must share a dtype with the string choices. A float np.nan was
# cast to the text 'nan' under NumPy 1.x and is rejected by NumPy 2, so the
# "no flu year" sentinel is spelled out as that same string.
meta_data_selected_RNA['Flu_Year'] = np.select(conditions, choices, default='nan')

In [7]:
# Keep only the 2020 and 2021 flu years, and only the Day 0 / Day 7 visits of
# flu years 1 and 2.
meta_data_selected_RNA = meta_data_selected_RNA[
    meta_data_selected_RNA['Flu_Year'].isin(['2020', '2021'])
    & meta_data_selected_RNA['sample.visitName'].isin(
        ['Flu Year 1 Day 0', 'Flu Year 1 Day 7', 'Flu Year 2 Day 0', 'Flu Year 2 Day 7']
    )
]

In [8]:
# Attach the flow metadata to each selected scRNA sample, matching on visit
# and subject. Columns present in both tables get the _x / _y suffixes.
meta_data_selected_RNA = meta_data_selected_RNA.merge(
    meta_data,
    how='left',
    on=['sample.visitName', 'subject.subjectGuid'],
)

In [9]:
# Restrict the flow metadata to samples that were matched to a selected
# scRNA sample (`pbmc_sample_id_y` is the flow-side id after the merge).
meta_data = meta_data.loc[
    meta_data['pbmc_sample_id'].isin(meta_data_selected_RNA['pbmc_sample_id_y'])
]

In [10]:
# Donor BR1026 has no Day 7 sample for this year, so the donor's
# 'Flu Year 1 Day 0' sample is dropped as well.
meta_data = meta_data[
    ~(
        (meta_data['subject.subjectGuid'] == 'BR1026')
        & (meta_data['sample.visitName'] == 'Flu Year 1 Day 0')
    )
]

# Read Data

In [11]:
%%time
# Every file is expected in the local FlowCyto folder under its base name.
file_names = [
    "/home/jupyter/BRI_Figures/Dataset/FlowCyto/" + os.path.basename(x)
    for x in meta_data["file.name"]
]

with ThreadPoolExecutor(max_workers=60) as executor:
    # executor.map yields results in the order of `file_names`, so the list
    # (and everything concatenated from it) has the same order on every run,
    # unlike collecting futures as they complete. read_one always returns an
    # object or raises, so no None filtering is needed.
    adata_list = list(
        tqdm(executor.map(read_one, file_names), total=len(file_names))
    )

100% 314/314 [05:06<00:00,  1.03it/s]

CPU times: user 9min 35s, sys: 1min 30s, total: 11min 5s
Wall time: 5min 6s





In [12]:
# Stack the per-file objects into one AnnData, then label each observation
# with its value from the `barcode` column.
adata = anndata.concat(adata_list)
adata.obs.index = adata.obs["barcode"].to_list()

  utils.warn_names_duplicates("obs")


In [13]:
# Number of observations per value of `labels` in the concatenated object.
adata.obs['labels'].value_counts()

labels
naive_cd4_t_cells    14564020
myeloid_cells        10790869
Unknown               9741378
em_cd4_t_cells        9099846
cm_cd4_t_cells        8997829
cd56mid_nk_cells      8296546
b_cells               6473858
em_cd8_t_cells        5662379
naive_cd8_t_cells     5650478
temra_cd8_t_cells     4220975
gd_t_cells            2400148
memory_treg           1979230
debris                1264353
naive_treg             985046
cm_cd8_t_cells         872910
cd56hi_nk_cells        751737
dn_t_cells             747772
temra_cd4_t_cells      565121
dp_t_cells             429865
cd103_cd8_t_cells      410426
cd103_cd4_t_cells      174497
ilc_like_cells          66160
Name: count, dtype: int64

# Subset CM CD4 T cells and preprocess

In [14]:
# Take an independent copy rather than a view: the subset is written to by the
# call below and must stay valid after the parent `adata` is deleted.
adata_sub = adata[adata.obs['labels'].isin(['cm_cd4_t_cells'])].copy()

# Flag the 15 most variable markers, selected per batch.
sc.pp.highly_variable_genes(adata_sub, n_top_genes=15, batch_key="batch")

  disp_grouped = df.groupby('mean_bin')['dispersions']
  hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  hvg = pd.concat([hvg, missing_hvg], ignore_inde

In [15]:
# Drop the full dataset and the per-file objects, then force a collection;
# only `adata_sub` is used from here on.
del adata, adata_list
gc.collect()

792

In [16]:
# Keep the full, unscaled marker set in `.raw` before reducing the object.
adata_sub.raw = adata_sub

# Restrict to the highly variable markers. The explicit copy turns the slice
# into a real object, so the in-place scaling and PCA below do not write
# through a view.
adata_sub = adata_sub[:, adata_sub.var.highly_variable].copy()
scyan.preprocess.scale(adata_sub)
sc.pp.pca(adata_sub, svd_solver="arpack")

[36;20m[INFO] (scyan.preprocess)[0m Data will be centered and standardised. This is advised only when using spectral/flow data (if this is not your case, consider running 'asinh_transform' instead of 'auto_logicle_transform').


In [None]:
%%time

# Batch-correct the PCA embedding with Harmony, using `batch` as the batch key
# and at most 20 Harmony iterations. leiden_new and the neighbors step later
# read `obsm['X_pca_harmony']` — presumably written here; confirm against the
# scanpy.external documentation.
sce.pp.harmony_integrate(adata_sub, 'batch',max_iter_harmony = 20)


2024-06-04 19:22:33,596 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7f865316bd00>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/threadpoolctl.py", line 847, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/opt/conda/lib/python3.10/site-packages/threadpoolctl.py", line 984, in _make_controller_from_path
    lib_controller = controller_class(filepath=filepath, prefix=prefix)
  File "/opt/conda/lib/python3.10/site-packages/threadpoolctl.py", line 111, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
  File "/opt/conda/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: dlopen() error
2024-06-04 19:39:13,331 - harmonypy - INFO - sklearn.KMeans initializatio

In [None]:
# Checkpoint the integrated object to disk; the next cell reloads this file.
adata_sub.write_h5ad('adata_cm_cd4_harmonized.h5ad')

In [31]:
# Reload the harmonized checkpoint so clustering can start from the saved
# file without rerunning the cells above.
adata_sub=sc.read_h5ad('adata_cm_cd4_harmonized.h5ad')

In [None]:
%%time
# One (resolution, obs column name) pair per Leiden run.
tasks = [
    (0.25, "leiden_res_0.25"),
    (0.5, "leiden_res_0.5"),
    (1.5, "leiden_res_1.5"),
    (2, "leiden_res_2"),
    (0.75, "leiden_res_0.75"),
]
# Column names of the clusterings, in the same order as `tasks`.
leiden_res = [column_name for _, column_name in tasks]

adata_sub = run_leiden_parallel(adata_sub, tasks)

In [None]:
# Build the neighbor graph on the Harmony-corrected embedding and compute a
# UMAP from it.
sc.pp.neighbors(adata_sub, use_rep='X_pca_harmony')
sc.tl.umap(adata_sub)

# NOTE(review): the output name has no '.h5ad' extension, unlike the earlier
# checkpoint — confirm what downstream readers expect before renaming.
adata_sub.write_h5ad('adata_cm_cd4_leiden_var15_umap')