# 2025-02-24: Alluvial Plot Input
#### By [Aishwarya Chander](aishwarya.chander@alleninstitute.org), High Resolution Translational Immunology, Allen Institute for Immunology

**Main aim**: Compute cell-type frequencies per condition and log-normalize them, to use as input for the alluvial plot.

In [1]:
import pandas as pd
import numpy as np 

In [2]:
def calculate_frequencies(
    ndmm_meta,
    categories,
    output_file,
    *,
    label_col='manual.label_l3',
    visit_col='sample.visitDetails',
    treatment_col='manual.treatment_dara',
    treatment_value='non_dara',
):
    """Compute per-visit label frequencies, normalize them per label, and save them.

    Rows of ``ndmm_meta`` are kept when their label is in ``categories`` and
    (unless ``treatment_col`` is ``None``) their treatment equals
    ``treatment_value``. For the kept rows the function computes:

    * ``count``: rows per (visit, label) pair;
    * ``total_count``: kept rows per visit;
    * ``frequency``: ``count / total_count``;
    * ``normalized_frequency``: ``frequency`` divided by the sum of that
      label's frequencies over all visits;
    * ``log_normalized_frequency``: ``log1p`` of ``normalized_frequency``.

    Args:
        ndmm_meta: DataFrame containing the label, visit and (optionally)
            treatment columns.
        categories: Labels to keep. Labels that do not occur in the data
            simply match no rows.
        output_file: Destination passed to ``DataFrame.to_csv``; the table
            is written without the index.
        label_col: Name of the column holding the cell labels.
        visit_col: Name of the column holding the visit / condition.
        treatment_col: Name of the column used for the treatment filter, or
            ``None`` to skip that filter.
        treatment_value: Value ``treatment_col`` must equal for a row to be
            kept.

    Returns:
        The frequency table that was written to ``output_file``.
    """
    # Build the row mask: selected labels, optionally restricted by treatment.
    keep = ndmm_meta[label_col].isin(categories)
    if treatment_col is not None:
        keep = keep & (ndmm_meta[treatment_col] == treatment_value)

    # Only the label and visit columns are needed from here on.
    data = ndmm_meta.loc[keep, [label_col, visit_col]]

    # observed=True keeps categorical columns from emitting zero-count rows
    # for labels or visits that were filtered out above.
    frequency_data = (
        data.groupby([visit_col, label_col], observed=True)
        .size()
        .reset_index(name='count')
    )
    total_counts = (
        data.groupby(visit_col, observed=True)
        .size()
        .reset_index(name='total_count')
    )
    frequency_data = frequency_data.merge(total_counts, on=visit_col)
    frequency_data['frequency'] = frequency_data['count'] / frequency_data['total_count']

    # Normalize each label's frequencies so they sum to 1 across visits.
    label_totals = (
        frequency_data.groupby(label_col, observed=True)['frequency'].transform('sum')
    )
    frequency_data['normalized_frequency'] = frequency_data['frequency'] / label_totals

    # Apply log transformation
    frequency_data['log_normalized_frequency'] = np.log1p(frequency_data['normalized_frequency'])

    # Save to CSV and hand the table back so callers need not re-read it.
    frequency_data.to_csv(output_file, index=False)
    return frequency_data

In [3]:
## Load your adata.obs export, or any table from which label counts can be derived.
ndmm_meta = pd.read_csv(
    '../../certpro-ndmm-data/ndmm-bmmc-csvs_pkls/2025-02-25-ndmm-bmmc-labelled-all-cells-obs-metadata.csv'
)

  ndmm_meta = pd.read_csv('../../certpro-ndmm-data/ndmm-bmmc-csvs_pkls/2025-02-25-ndmm-bmmc-labelled-all-cells-obs-metadata.csv')


In [4]:
## List every distinct value of 'manual.label_l3', sorted, so you can pick the cell types needed for your analysis.
sorted(ndmm_meta['manual.label_l3'].unique()) 

['b_memory_cd95',
 'b_memory_core',
 'b_memory_effector',
 'b_naive',
 'b_naive_activated',
 'b_naive_isg_pos',
 'b_precursor',
 'b_precursor_igk',
 'b_precursor_vdj',
 'b_transitional',
 'b_transitional_isg_pos',
 'dc_asdc',
 'dc_cdc1',
 'dc_cdc2',
 'dc_cdc2-cd14.pos',
 'dc_cdc2-isg.pos',
 'dc_pdc',
 'mono_cd14',
 'mono_cd14_isg_high',
 'mono_cd16',
 'mono_intermediate',
 'msc_fibroblasts',
 'msc_platelet',
 'nk_adaptive',
 'nk_cd56_bright',
 'nk_cd56_dim-gzmk_neg',
 'nk_cd56_dim-gzmk_pos',
 'nk_cd56_dim-isg_pos',
 'nk_effector',
 'nk_t_proliferating_nk_like',
 'nk_t_proliferating_t_like',
 'nk_tissue_resident',
 'plasma',
 'prog_b_proliferating',
 'prog_clp',
 'prog_cmp',
 'prog_cmp_mono',
 'prog_dc',
 'prog_dc_cdc',
 'prog_dc_pdc',
 'prog_hspc',
 'prog_hspc_proliferating',
 'prog_lmpp',
 'prog_mature_ery',
 'prog_mature_ery_polychromatic',
 'prog_mep',
 'prog_mk',
 'prog_pre_ery',
 't_cd4_central_memory',
 't_cd4_effector_1',
 't_cd4_effector_2',
 't_cd4_memory',
 't_cd4_naive',
 't

In [7]:
## If your label hierarchy is clean, a single input is enough (for example, all cell types under the L1 B cells).
## NOTE(review): 'b_precursor_proliferating', 'b_precursor_lcr' and 'b_precursor_hcr' do not appear in the
## label listing printed earlier in this notebook (it shows 'b_precursor', 'b_precursor_igk' and
## 'b_precursor_vdj') -- confirm the intended names. The `isin` filter in calculate_frequencies matches
## no rows for labels that are absent from the data, so such entries are silently ignored.

categories = [
    'b_memory_cd95', 'b_memory_core', 'b_memory_effector', 'b_naive',
    'b_naive_activated', 'b_naive_isg_pos', 'b_transitional',
    'b_transitional_isg_pos', 'prog_b_proliferating',
    'b_precursor_proliferating', 'b_precursor_lcr', 'b_precursor_hcr'
]

## Write the frequency table for the selected labels to 'l3_alu_b_cells.csv'.
calculate_frequencies(ndmm_meta, categories, 'l3_alu_b_cells.csv')

In [8]:
## The output is a table of frequencies with the different normalizations defined in the function above.
## Feel free to adjust the log scaling to your preference.
## NOTE(review): this reads from 'frequency-csvs/', whereas the calculate_frequencies call above writes
## 'l3_alu_b_cells.csv' with no directory prefix -- confirm the file is moved there, or align the two paths.

view_freq = pd.read_csv('frequency-csvs/l3_alu_b_cells.csv')
view_freq

Unnamed: 0,sample.visitDetails,manual.label_l3,count,total_count,frequency,normalized_frequency,log_normalized_frequency
0,MM End Induction 1st Draw,b_memory_cd95,188,5003,0.037577,0.407966,0.342146
1,MM End Induction 1st Draw,b_memory_core,401,5003,0.080152,0.32283,0.279773
2,MM End Induction 1st Draw,b_memory_effector,63,5003,0.012592,0.290027,0.254663
3,MM End Induction 1st Draw,b_naive,2478,5003,0.495303,0.204247,0.185855
4,MM End Induction 1st Draw,b_naive_isg_pos,35,5003,0.006996,0.095624,0.091324
5,MM End Induction 1st Draw,b_transitional,1115,5003,0.222866,0.15712,0.145934
6,MM End Induction 1st Draw,b_transitional_isg_pos,62,5003,0.012393,0.124308,0.117168
7,MM End Induction 1st Draw,prog_b_proliferating,661,5003,0.132121,0.22178,0.200309
8,MM Post Transplant 1 year,b_memory_cd95,129,8743,0.014755,0.160187,0.148581
9,MM Post Transplant 1 year,b_memory_core,212,8743,0.024248,0.097664,0.093184
