# Refine labels for Naive CD4 T cells

In this notebook, we load all Naive CD4 T cells across our subjects to refine final L3 labels. We'll combine, recluster, and assign final labels by taking the most frequent AIFI_L3 label in each cluster, and back-propogate those L3 labels to L2 and L1 based on our cell type hierarcy.

We'll also generate metadata, UMAP coordinates, and marker gene summaries for review of our final labels, then store all of the outputs in HISE for later use.

Because there are many Naive CD4 T cells, we'll divide them into smaller subsets based on cohort, sex, CMV status, and visit grouping (defined below). We'll then review each subset and assemble all labeled data in later notebooks.

## Load packages

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import tarfile

In [2]:
out_dir = 'output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

In [3]:
out_dir = 'output/review'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

In [4]:
class_name = 'naive_cd4_t_cell'

## Helper functions

### HISE data
These functions make it easy to utilize files from the HISE cache

In [5]:
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

In [6]:
def read_csv_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = pd.read_csv(cache_file)
    return res

In [7]:
def read_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file)
    return res

In [8]:
def read_obs_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file, backed = 'r')
    obs = res.obs.copy()
    return obs

### Label refinement

This function uses the most frequent label within each cluster to refine label assignments.

In [9]:
def single_value(series):
    res = []
    for value in series:
        if isinstance(value, list):
            res.append(value[0])
        else:
            res.append(value)
    return res

In [10]:
def assign_most_frequent(adata, clusters, labels, keep_original = False, original_prefix = 'predicted_'):
    obs = adata.obs
    
    most_frequent_labels = (
        adata.obs
        .groupby(clusters)[labels]
        .agg(pd.Series.mode)
        .to_frame()
        .reset_index()
    )
    most_frequent_labels[labels] = single_value(most_frequent_labels[labels])
    
    if keep_original:
        obs = obs.rename({labels: original_prefix + labels}, axis = 1)
    else:
        obs = obs.drop(labels, axis = 1)
    
    obs = obs.merge(most_frequent_labels, on = clusters, how = 'left')
    
    adata.obs = obs
    
    return adata

This function back-propagates hierarchical labeling from AIFI_L3 back to AIFI_L2 and AIFI_L1 to ensure our labels agree across levels of our cell type hierarchy.

In [11]:
def propagate_hierarchy(
    adata,
    hierarchy_df,
    from_level = 'AIFI_L3',
    to_levels = ['AIFI_L2', 'AIFI_L1'],
    keep_original = True,
    original_prefix = 'predicted_'
):
    obs = adata.obs
    
    for to_level in to_levels:
        prop_df = hierarchy_df[[from_level, to_level]]
        prop_df = prop_df.drop_duplicates()
        
        if keep_original:
            obs = obs.rename({to_level: original_prefix + to_level}, axis = 1)
        else:
            obs = obs.drop(to_level, axis = 1)

        obs[from_level] = obs[from_level].astype(str)
        obs = obs.merge(prop_df, on = from_level, how = 'left')
        obs[from_level] = obs[from_level].astype('category')
        obs[to_level] = obs[to_level].astype('category')
    
    adata.obs = obs

    return adata

### Review outputs

These functions are used to assemble marker gene expression tables for review

In [12]:
def marker_frac_df(adata, markers, clusters = 'louvain_2'):
    gene_cl_frac = sc.pl.dotplot(
        adata, 
        groupby = clusters,
        var_names = markers,
        return_fig = True
    ).dot_size_df
    return gene_cl_frac

def marker_mean_df(adata, markers, log = False, clusters = 'louvain_2'):
    gene_cl_mean = sc.pl.dotplot(
        adata, 
        groupby = clusters,
        var_names = markers,
        return_fig = True,
        log = log
    ).dot_color_df
    
    return gene_cl_mean

def tidy_marker_df(adata, markers, clusters = 'louvain_2'):
    gene_cl_frac = marker_frac_df(adata, markers, clusters)
    gene_cl_frac = gene_cl_frac.reset_index(drop = False)
    gene_cl_frac = pd.melt(gene_cl_frac, id_vars = clusters, var_name = 'gene', value_name = 'gene_frac')
    
    gene_cl_mean = marker_mean_df(adata, markers, log = False, clusters = clusters)
    gene_cl_mean = gene_cl_mean.reset_index(drop = False)
    gene_cl_mean = pd.melt(gene_cl_mean, id_vars = clusters, var_name = 'gene', value_name = 'gene_mean')

    marker_df = gene_cl_frac.merge(gene_cl_mean, on = [clusters, 'gene'], how = 'left')
    return marker_df

This function will select clusters based on gene detection above a specifiec fraction of cells (cutoff).

In [13]:
def select_clusters_above_gene_frac(adata, gene, cutoff, clusters = 'louvain_2'):
    gene_cl_frac = marker_frac_df(adata, gene, clusters)
    select_cl = gene_cl_frac.index[gene_cl_frac[gene] > cutoff].tolist()

    return select_cl

This function retrieves both observations and UMAP coordinates in a single table for review.

In [14]:
def obs_with_umap(adata):
    obs = adata.obs
    
    umap_mat = adata.obsm['X_umap']
    umap_df = pd.DataFrame(umap_mat, columns = ['umap_1', 'umap_2'])
    obs['umap_1'] = umap_df['umap_1']
    obs['umap_2'] = umap_df['umap_2']

    return obs

This function applies data analysis methods to our scRNA-seq data, including normalization, HVG selection, PCA, nearest neighbors, UMAP, and Leiden clustering.

In [15]:
def process_adata(adata, resolution = 2):
    
    # Keep a copy of the raw data
    adata = adata.raw.to_adata()
    adata.raw = adata

    print('Normalizing', end = "; ")
    # Normalize and log transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

    print('Finding HVGs', end = "; ")
    # Restrict downstream steps to variable genes
    sc.pp.highly_variable_genes(adata)
    adata = adata[:, adata.var_names[adata.var['highly_variable']]].copy()

    print('Scaling', end = "; ")
    # Scale variable genes
    sc.pp.scale(adata)

    print('PCA', end = "; ")
    # Run PCA
    sc.tl.pca(adata, svd_solver = 'arpack')

    print('Harmony', end = "; ")
    # Integrate subjects
    sce.pp.harmony_integrate(
        adata, 
        'subject.subjectGuid',
        max_iter_harmony = 30,
        verbose = False
    )
    
    print('Neighbors', end = "; ")
    # Find nearest neighbors
    sc.pp.neighbors(
        adata, 
        n_neighbors = 20,
        n_pcs = 30,
        use_rep = 'X_pca_harmony'
    )

    print('Leiden', end = "; ")
    # Find clusters
    sc.tl.leiden(
        adata, 
        resolution = resolution, 
        key_added = 'leiden_{r}'.format(r = resolution),
        n_iterations = 5
    )

    print('UMAP', end = "; ")
    # Run UMAP
    sc.tl.umap(adata, min_dist = 0.05)
    
    print('Renormalizing')
    adata = adata.raw.to_adata()
    adata.raw = adata

    # Normalize and log transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    
    return adata

In [16]:
def isg_hi_refinement(adata, clusters = 'leiden_2', refine_res = 2):
    isg_hi_cl = select_clusters_above_gene_frac(
        adata, 'ISG15', 0.3, clusters = clusters
    )
    print(isg_hi_cl)
    
    isg_adata = adata[adata.obs[clusters].isin(isg_hi_cl)]
    
    isg_adata = process_adata(isg_adata, resolution = refine_res)
    isg_adata = assign_most_frequent(
        isg_adata, 
        'leiden_{r}'.format(r = refine_res), 'AIFI_L3',
        keep_original = True,
        original_prefix = 'predicted_'
    )

    isg_df = isg_adata.obs[['barcodes', 'AIFI_L3', 'predicted_AIFI_L3']]
    return isg_df

## Cell Type Markers

These are the set of marker genes that we'll use to review our cell type labels.

In [17]:
l2_markers = [
    'CD3E', 'CD4', 'CD8A', 'TRAC', 'TRDC', 
    'LEF1', 'TCF7', 'CCR7', 'SELL', 'ITGB1', 
    'FAS', 'CD27', 'CD28', 'FOXP3', 'RTKN2', 
    'IKZF2', 'ZNF683', 'NKG7', 'KLRC2', 'KLRF1', 
    'KLRB1', 'KLRD1', 'GZMA', 'GZMK', 'GZMB', 
    'PRF1', 'GNLY', 'IFI44L', 'IRF7', 'MKI67', 
    'TRDV1', 'TRDV2', 'TRDV3', 'MME', 'SOX4'
]

l3_markers = [
    'CD27', 'CD28', 'KLRF1', 'KLRB1', 'KLRD1', 
    'KLRC1', 'KLRC2', 'GZMA', 'GZMB', 'GZMK', 
    'GZMH', 'PRF1', 'CCL5', 'GNLY', 'GATA3', 
    'TBX21', 'EOMES', 'LEF1', 'TCF7', 'IKZF2',
    'SOX4', 'IFI44L'
]

## Read cell type hierarchy from HISE

As part of label refinement, we'll back-propagate our cell type labels from refined AIFI_L3 labels to their parent cell classes at AIFI_L2 and AIFI_L1. To do this, we need the hierarchical relationships between these levels, which have been generated for our cell type reference dataset.

In [18]:
hierarchy_uuid = '1a44252c-8cab-4c8f-92c9-d8f3af633790'
hierarchy_df = read_csv_uuid(hierarchy_uuid)

downloading fileID: 1a44252c-8cab-4c8f-92c9-d8f3af633790
Files have been successfully downloaded!


## Read sample metadata from HISE

We previously assembled sample metadata and CMV status for each subject. We'll retrieve and combine these to utilize for selecting subsets of samples.

In [19]:
sample_meta_uuid = 'd82c5c42-ae5f-4e67-956e-cd3b7bf88105'
sample_meta = read_csv_uuid(sample_meta_uuid)

downloading fileID: d82c5c42-ae5f-4e67-956e-cd3b7bf88105
Files have been successfully downloaded!


In [20]:
sample_meta.shape

(868, 32)

In [21]:
cmv_meta_uuid = '9469f67c-b09a-454d-9fb9-f50ff3494d69'
cmv_path = cache_uuid_path(cmv_meta_uuid)
cmv_meta = pd.read_csv(cmv_path, index_col = 0)
cmv_meta = cmv_meta.drop_duplicates()

downloading fileID: 9469f67c-b09a-454d-9fb9-f50ff3494d69
Files have been successfully downloaded!


In [22]:
cmv_meta.shape

(96, 4)

In [23]:
sample_meta = sample_meta.merge(cmv_meta, on = 'subject.subjectGuid', how = 'left')

In [24]:
sample_meta.shape

(868, 35)

## Assign sample groups

To subdivide the full set of cells, we'll use groups that include cohort, sex, CMV status, and a subset of visits. To group our visit data, we'll define 3 visit groups, and use those together with the other metadata to group samples.

In [25]:
visit_group_dict = {
    'Flu Year 1 Day 0': 'Year 1',
    'Flu Year 1 Day 7': 'Year 1',
    'Flu Year 1 Day 90': 'Year 1',
    'Flu Year 1 Stand-Alone': 'Year 1',
    'Flu Year 2 Day 0': 'Year 2',
    'Flu Year 2 Day 7': 'Year 2',
    'Flu Year 2 Day 90': 'Year 2',
    'Flu Year 2 Stand-Alone': 'Year 2',
    'Immune Variation Day 0': 'Immune Variation',
    'Immune Variation Day 7': 'Immune Variation',
    'Immune Variation Day 90': 'Immune Variation',
    'Flu Year 3 Stand-Alone': 'Immune Variation',
}

In [26]:
visit_groups = list(set(visit_group_dict.values()))

In [27]:
sample_meta['sample.visitGroup'] = [visit_group_dict[v] for v in sample_meta['sample.visitName']]

In [28]:
group_samples_by = ['cohort.cohortGuid', 'subject.biologicalSex']

In [29]:
grouped_meta = sample_meta.groupby(group_samples_by)

In [30]:
split_meta = {}
for group_tuple, meta in grouped_meta:
    split_name = '_'.join(group_tuple)
    split_meta[split_name] = meta

## Identify files in HISE

For this analysis, we'll read in these files from HISE storage from previous steps. We'll group these into "large" files, which are from the set of 5 very large cell type assignments, and "small" files, which are from other cell types.

In [31]:
large_uuids = {
    'BR1_Female': {
        'BR1_Female_Negative_Naive-CD4-T-cell': 'b55301f1-2289-45a6-b14d-b1ee31a7f11c',
        'BR1_Female_Positive_Naive-CD4-T-cell': '5a50a26e-1a56-4239-ba8e-dc0b8f3ef91d'
    },
    'BR1_Male': {
        'BR1_Male_Negative_Naive-CD4-T-cell': '9997103a-4f8a-4e00-abc1-61a9674cf01b',
        'BR1_Male_Positive_Naive-CD4-T-cell': '2aaa8f67-c64b-450c-9037-7a7cbde2c3e0'
    },
    'BR2_Female': {
        'BR2_Female_Negative_Naive-CD4-T-cell': '82a127b5-7025-41db-8d88-5347055a5268',
        'BR2_Female_Positive_Naive-CD4-T-cell': '56630d0b-cdd9-43b0-8da4-9f7227b35190'
    },
    'BR2_Male': {
        'BR2_Male_Negative_Naive-CD4-T-cell': '22d42b15-1ae2-4b3f-8b6d-39dce427f765',
        'BR2_Male_Positive_Naive-CD4-T-cell': '87c3b749-c177-4fa1-8747-c8faa4e4859e'
    }
}

## Read and process data per group

Here, we'll step through each group based on cohort, sex, and CMV, assemble all related data across our selected files for this cell type, then split the results back out to multiple files based on the Visit Groups defined above.

This way, we can combine across L2 cell classes without generating enormous datasets (almost 6M cells for CD4 T cells alone, for example).

Once split up, we'll reprocess each subset of data to generate nice clusters and UMAP projections, then refine L3 cell type labels by taking the most frequent label within each cluster. After refining labels, we'll propagate labels back to L2 and L1 so they're consistent.

Finally, we'll output these refined results per group, as well as a .csv file with updated labels to enable review of our analysis.

In [32]:
resolution = 2
clusters = 'leiden_{r}'.format(r = resolution)

In [33]:
for grouping, meta in split_meta.items():
    print(grouping)

    # Check if output files have been generated previously
    out_files = {}
    out_exists = []
    for visit_group in visit_groups:
        out_file = 'output/diha_{c}_{g}_{v}_AIFI_L3_review_{d}.h5ad'.format(
            c = class_name,
            g = grouping,
            v = visit_group,
            d = date.today()
        )
        out_files[visit_group] = [out_file]
        out_exists.append(os.path.isfile(out_file))
    
    if sum(out_exists) == len(visit_groups):
        print('{g} Previously processed. Skipping.'.format(g = grouping))
    else:
        
        # Read Large Files
        large_group_uuids = large_uuids[grouping]
        for group_name, uuid in large_group_uuids.items():
            group_adata = read_adata_uuid(uuid)
            print('{g}: {n} cells'.format(g = group_name, n = group_adata.shape[0]))
            if not "all_adata" in globals():
                all_adata = group_adata
            else:
                all_adata = sc.concat([all_adata, group_adata])
            del group_adata
        print('Total: {n} cells'.format(n = all_adata.shape[0]))
        
        visit_group_meta = meta.groupby('sample.visitGroup')

        for (visit_group, visit_meta) in visit_group_meta:
            out_file = out_files[visit_group][0]
            keep_cells = all_adata.obs['sample.sampleKitGuid'].isin(visit_meta['sample.sampleKitGuid'])
            visit_group_adata = all_adata[keep_cells]
            print('>>> Processing {v}: {n} cells'.format(v = visit_group, n = visit_group_adata.shape[0]))
            
            visit_group_adata = process_adata(visit_group_adata)
            
            # Refine labels based on in-group clustering
            isg_hi_labels = isg_hi_refinement(visit_group_adata, refine_res = 3)
            
            visit_group_adata = assign_most_frequent(
                visit_group_adata, 
                clusters, 'AIFI_L3', 
                keep_original = True, 
                original_prefix = 'predicted_'
            )

            # Correct ISG-high labels
            isg_idx = visit_group_adata.obs['barcodes'].isin(isg_hi_labels['barcodes'])
            isg_adata = visit_group_adata[isg_idx]
            visit_group_adata = visit_group_adata[[not x for x in isg_idx]]
            isg_obs = isg_adata.obs.drop(['AIFI_L3', 'predicted_AIFI_L3'], axis = 1)
            isg_obs = isg_obs.reset_index(drop = True)
            isg_obs = isg_obs.merge(isg_hi_labels, on = 'barcodes', how = 'left')
            isg_obs = isg_obs.set_index('barcodes', drop = False)
            isg_adata.obs = isg_obs
            
            visit_group_adata = sc.concat([visit_group_adata, isg_adata])
            
            ## Manual overrides
            type_name = 'SOX4+ naive CD4 T cell'
            sel_cl = select_clusters_above_gene_frac(
                visit_group_adata, 'SOX4', 0.2, clusters
            )
            #if not type_name in visit_group_adata.obs['AIFI_L3'].cat.categories:
            #    visit_group_adata.obs['AIFI_L3'] = adata.obs['AIFI_L3'].cat.add_categories([type_name])
            visit_group_adata.obs['AIFI_L3'] = visit_group_adata.obs['AIFI_L3'].astype(str)
            visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name

            visit_group_adata.obs['AIFI_L3'] = visit_group_adata.obs['AIFI_L3'].astype('category')
            
            # Ensure levels are consistent
            visit_group_adata = propagate_hierarchy(
                visit_group_adata,
                hierarchy_df,
                from_level = 'AIFI_L3',
                to_levels = ['AIFI_L2', 'AIFI_L1'],
                keep_original = True,
                original_prefix = 'predicted_'
            )
            
            # Save results for this visit group
            visit_group_adata.write_h5ad(out_file)

            # Save metadata and UMAP
            meta_csv = 'output/review/diha_{c}_{g}_{v}_AIFI_L3_review_meta_{d}.csv'.format(
                c = class_name,
                g = grouping,
                v = visit_group,
                d = date.today()
            )
            visit_group_obs = obs_with_umap(visit_group_adata)
            visit_group_obs.to_csv(meta_csv)
            out_files[visit_group].append(meta_csv)

            # Save marker expression summaries
            out_l2_markers = 'output/review/diha_{c}_{g}_{v}_AIFI_L2_review_markers_{d}.csv'.format(
                c = class_name,
                g = grouping,
                v = visit_group,
                d = date.today()
            )
            l2_marker_df = tidy_marker_df(
                visit_group_adata,
                l2_markers,
                'AIFI_L2'
            )
            l2_marker_df.to_csv(out_l2_markers)
            out_files[visit_group].append(out_l2_markers)

            out_l3_markers = 'output/review/diha_{c}_{g}_{v}_AIFI_L3_review_markers_{d}.csv'.format(
                c = class_name,
                g = grouping,
                v = visit_group,
                d = date.today()
            )
            l3_marker_df = tidy_marker_df(
                visit_group_adata,
                l3_markers,
                'AIFI_L3'
            )
            l3_marker_df.to_csv(out_l3_markers)
            out_files[visit_group].append(out_l3_markers)

        del all_adata

BR1_Female
downloading fileID: b55301f1-2289-45a6-b14d-b1ee31a7f11c
Files have been successfully downloaded!
BR1_Female_Negative_Naive-CD4-T-cell: 627635 cells
downloading fileID: 5a50a26e-1a56-4239-ba8e-dc0b8f3ef91d
Files have been successfully downloaded!
BR1_Female_Positive_Naive-CD4-T-cell: 236008 cells
Total: 863643 cells
>>> Processing Immune Variation: 286697 cells
Normalizing; Finding HVGs; Scaling; PCA; Harmony; 

2024-03-25 00:30:01,528 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-03-25 00:32:13,843 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['11', '21', '24', '26']
Finding HVGs; Scaling; PCA; 

2024-03-25 00:53:04,742 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 00:53:12,213 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


>>> Processing Year 1: 278990 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 00:55:50,006 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 00:58:06,542 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['12', '22', '23']
Finding HVGs; Scaling; PCA; 

2024-03-25 01:18:02,125 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 01:18:06,847 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


>>> Processing Year 2: 297956 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 01:20:22,394 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 01:22:47,948 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['14', '24']
Finding HVGs; Scaling; PCA; 

2024-03-25 01:40:59,363 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 01:41:04,039 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


BR1_Male
downloading fileID: 9997103a-4f8a-4e00-abc1-61a9674cf01b
Files have been successfully downloaded!
BR1_Male_Negative_Naive-CD4-T-cell: 336079 cells
downloading fileID: 2aaa8f67-c64b-450c-9037-7a7cbde2c3e0
Files have been successfully downloaded!
BR1_Male_Positive_Naive-CD4-T-cell: 217206 cells
Total: 553285 cells
>>> Processing Immune Variation: 217945 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 01:44:31,864 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 01:46:27,479 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['11', '25']
Finding HVGs; Scaling; PCA; 

2024-03-25 02:02:31,274 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 02:02:37,788 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


>>> Processing Year 1: 183064 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 02:04:34,021 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 02:06:14,428 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['16', '22']
Finding HVGs; Scaling; PCA; 

2024-03-25 02:18:38,246 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 02:18:41,938 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


>>> Processing Year 2: 152276 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 02:20:06,205 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 02:21:26,563 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['12']
Finding HVGs; Scaling; PCA; 

2024-03-25 02:29:58,708 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 02:30:04,155 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


BR2_Female
downloading fileID: 82a127b5-7025-41db-8d88-5347055a5268
Files have been successfully downloaded!
BR2_Female_Negative_Naive-CD4-T-cell: 323334 cells
downloading fileID: 56630d0b-cdd9-43b0-8da4-9f7227b35190
Files have been successfully downloaded!
BR2_Female_Positive_Naive-CD4-T-cell: 545052 cells
Total: 868386 cells
>>> Processing Immune Variation: 326162 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 02:33:51,736 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 02:36:27,354 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['12', '23', '24', '25']
Finding HVGs; Scaling; PCA; 

2024-03-25 03:05:15,385 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 03:05:22,096 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


>>> Processing Year 1: 275417 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 03:08:02,298 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 03:10:16,853 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['13', '23', '25']
Finding HVGs; Scaling; PCA; 

2024-03-25 03:29:43,857 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 03:29:48,359 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


>>> Processing Year 2: 266807 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 03:31:55,984 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 03:34:12,769 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['10', '19']
Finding HVGs; Scaling; PCA; 

2024-03-25 03:54:24,823 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 03:54:33,850 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


BR2_Male
downloading fileID: 22d42b15-1ae2-4b3f-8b6d-39dce427f765
Files have been successfully downloaded!
BR2_Male_Negative_Naive-CD4-T-cell: 319309 cells
downloading fileID: 87c3b749-c177-4fa1-8747-c8faa4e4859e
Files have been successfully downloaded!
BR2_Male_Positive_Naive-CD4-T-cell: 235131 cells
Total: 554440 cells
>>> Processing Immune Variation: 189772 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 03:59:03,004 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 04:00:40,785 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['15']
Finding HVGs; Scaling; PCA; 

2024-03-25 04:15:04,557 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 04:15:09,006 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


>>> Processing Year 1: 193181 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 04:16:43,000 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 04:18:24,077 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['13']
Finding HVGs; Scaling; PCA; 

2024-03-25 04:31:08,867 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 04:31:14,658 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


>>> Processing Year 2: 171487 cells
Normalizing; Finding HVGs; Scaling; PCA; 

2024-03-25 04:33:07,114 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 04:34:35,969 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing
['12']
Finding HVGs; Scaling; PCA; 

2024-03-25 04:46:14,008 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Harmony; 

2024-03-25 04:46:19,444 - harmonypy - INFO - sklearn.KMeans initialization complete.


Neighbors; Leiden; UMAP; Renormalizing


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visit_group_adata.obs['AIFI_L3'][visit_group_adata.obs[clusters].isin(sel_cl)] = type_name


## Assemble output files for upload

### h5ad files

In [34]:
output_files = os.listdir('output')
h5ad_files = []
for output_file in output_files:
    if class_name in output_file:
        h5ad_files.append('{d}/{f}'.format(d = 'output', f = output_file))

### Review files

In [35]:
rev_files = os.listdir('output/review')
review_files = []
for rev_file in rev_files:
    if class_name in rev_file:
        review_files.append('{d}/{f}'.format(d = 'output/review', f = rev_file))

### Combine metadata files to assemble a full set

In [36]:
meta_files = []
for review_file in review_files:
    if 'meta' in review_file:
        meta_files.append(review_file)

meta_list = []
for meta_file in meta_files:
    meta_list.append(pd.read_csv(meta_file, index_col = 0))
all_meta = pd.concat(meta_list)

In [37]:
meta_csv = 'output/diha_{c}_AIFI_L3_refinement_meta_{d}.csv'.format(c = class_name, d = date.today())
all_meta.to_csv(meta_csv)
meta_parquet = 'output/diha_{c}_AIFI_L3_refinement_meta_{d}.parquet'.format(c = class_name, d = date.today())
all_meta.to_parquet(meta_parquet)

### Bundle review files into a .tar for later use

In [38]:
review_tar = 'output/diha_{c}_AIFI_L3_refinement_review_{d}.tar.gz'.format(c = class_name, d = date.today())
tar = tarfile.open(review_tar, 'w:gz')
for review_file in review_files:
    tar.add(review_file)
tar.close()

## Upload assembled results to HISE

In [39]:
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA Naive CD4 T Label Refinement AIFI_L3 {d}'.format(d = date.today())

In [40]:
in_files = []
for group_name, file_dict in large_uuids.items():
    for type_group, uuid in file_dict.items():
        in_files.append(uuid)
in_files

['b55301f1-2289-45a6-b14d-b1ee31a7f11c',
 '5a50a26e-1a56-4239-ba8e-dc0b8f3ef91d',
 '9997103a-4f8a-4e00-abc1-61a9674cf01b',
 '2aaa8f67-c64b-450c-9037-7a7cbde2c3e0',
 '82a127b5-7025-41db-8d88-5347055a5268',
 '56630d0b-cdd9-43b0-8da4-9f7227b35190',
 '22d42b15-1ae2-4b3f-8b6d-39dce427f765',
 '87c3b749-c177-4fa1-8747-c8faa4e4859e']

In [41]:
out_files = h5ad_files + [meta_csv, meta_parquet, review_tar]

In [42]:
out_files

['output/diha_naive_cd4_t_cell_BR1_Male_Year 1_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR2_Female_Year 2_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR2_Female_Year 1_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR2_Male_Year 1_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR1_Female_Year 1_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR2_Female_Immune Variation_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR2_Male_Year 2_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR1_Female_Immune Variation_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR1_Female_Year 2_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR1_Male_Year 2_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR2_Male_Immune Variation_AIFI_L3_review_2024-03-25.h5ad',
 'output/diha_naive_cd4_t_cell_BR1_Male_Immune Variation_AIFI_L3_review_2024-0

In [43]:
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = 'refinement_t_cd4_naive'
)

output/diha_naive_cd4_t_cell_BR1_Male_Year 1_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR2_Female_Year 2_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR2_Female_Year 1_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR2_Male_Year 1_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR1_Female_Year 1_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR2_Female_Immune Variation_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR2_Male_Year 2_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR1_Female_Immune Variation_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR1_Female_Year 2_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR1_Male_Year 2_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR2_Male_Immune Variation_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_BR1_Male_Immune Variation_AIFI_L3_review_2024-03-25.h5ad
output/diha_naive_cd4_t_cell_AIFI_L3

(y/n) y


{'trace_id': '41b407a5-8637-4c19-9efc-1e163ef4704a',
 'files': ['output/diha_naive_cd4_t_cell_BR1_Male_Year 1_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR2_Female_Year 2_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR2_Female_Year 1_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR2_Male_Year 1_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR1_Female_Year 1_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR2_Female_Immune Variation_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR2_Male_Year 2_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR1_Female_Immune Variation_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR1_Female_Year 2_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR1_Male_Year 2_AIFI_L3_review_2024-03-25.h5ad',
  'output/diha_naive_cd4_t_cell_BR2_Male_Immune Variation_AIFI_L3_review_2024-03-25.h5ad',
  'ou

In [44]:
import session_info
session_info.show()