In [28]:
import os
import pandas as pd
import re  
from pathlib import Path
import numpy as np
import flowkit as fk
import bokeh
from bokeh.plotting import show
from bokeh.io import export_png
import json

In [5]:
os.getcwd()

'/gpfs/projects/b1042/MisharinLab/anna/code_folder'

Using FlowKit Python library, we are going to open FlowJo workspace. IMPORTANT! To avoid errors, specify the path to the FCS files even if they are located in the same folder. Documentation for the FlowKit library can be found here: https://flowkit.readthedocs.io/en/latest/index.html

In [7]:
import os
path = "/gpfs/projects/b1042/MisharinLab/anna/flow_code/AnnaFlowData/2023_05"
print(os.path.exists(path))  # Should print True
print(os.listdir(path))     # Should show the files

False


FileNotFoundError: [Errno 2] No such file or directory: '/gpfs/projects/b1042/MisharinLab/anna/flow_code/AnnaFlowData/2023_05'

In [9]:
#please replace this path with your own path where you have files containing workspace and other fcs files
wsp = fk.Workspace("/home/ero6410/flow_code/AnnaFlowData/2023_05/2023_05.wsp", fcs_samples = "/home/ero6410/flow_code/AnnaFlowData/2023_05")

Following tutorial FlowKit Part 6 - The Workspace Class, we run necessary commands to:
    1) Werify that the workspace has loaded all the samples
    2) Analyze workspace to retriee gating strategy and verify that it is correct
    3) Save gate_names as a unqiue list for future steps

In [10]:
sample_ids = wsp.get_sample_ids() #print out sample ids - verify that they match the directory
print(sample_ids)
wsp_groups = wsp.get_sample_groups() #print out sample groups - debugging step per tutorial
print(wsp_groups)
wsp.summary()

['20230502_1707-BAL-20_001.fcs', '20230519_1714-BAL-00_001.fcs', '20230522_1713-BAL-04_001.fcs', '20230526_1715-BAL-00_001.fcs']
['All Samples']


Unnamed: 0_level_0,samples,loaded_samples,gates,max_gate_depth
group_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All Samples,5,5,21,8


In [12]:
sample_group = 'All Samples'
wsp.analyze_samples(group_name=sample_group, verbose=False, use_mp=False) #you need to run this step to access dataframe that contains info about the workspace
results_report = wsp.get_analysis_report(sample_group) #necessary step for getting a dataframe about the workspace
results_report.head()

Unnamed: 0,sample,gate_path,gate_name,gate_type,quadrant_parent,parent,count,absolute_percent,relative_percent,level
0,20230502_1707-BAL-20_001.fcs,"(root,)",Singlets,PolygonGate,,root,260271,78.267106,78.267106,0
1,20230502_1707-BAL-20_001.fcs,"(root, Singlets)",Live,PolygonGate,,Singlets,232059,69.783366,89.160529,1
2,20230502_1707-BAL-20_001.fcs,"(root, Singlets, Live)",CD45+,PolygonGate,,Live,229465,69.003314,98.882181,2
3,20230502_1707-BAL-20_001.fcs,"(root, Singlets, Live, CD45+)",T-cells,PolygonGate,,CD45+,4609,1.38599,2.008585,3
4,20230502_1707-BAL-20_001.fcs,"(root, Singlets, Live, CD45+)",T-cells-,BooleanGate,,CD45+,224856,67.617324,97.991415,3


In [13]:
gate_names = results_report["gate_name"].unique() #necessay step for concatenating gates together
print(gate_names) 

['Singlets' 'Live' 'CD45+' 'T-cells' 'T-cells-' 'CD4+' 'CD8+'
 'Granulocytes' 'Granulocytes-' 'CD14+CD206+' 'CD14+CD206+-'
 'CD4+ not T-regs' 'T-regs' 'Macrophages' 'Monocytes' 'NK cells'
 'NK cells-' 'B-cells and Plasma cells' 'CD206 high macorphages'
 'CD206 low macorphages' 'others']


Sasha says: "Never trust a Hippie!", so we are actually going to check if the gates produced by this extraction are valid. 

In [13]:
save_folder = "flow_plots"
os.makedirs(save_folder, exist_ok=True)
for sample_id in sample_ids:
    for gate in gate_names:
        gate_obj = wsp.get_gate(sample_id, gate)

        if isinstance(gate_obj, fk.gates.BooleanGate):
            print(f"Skipping boolean gate: {gate}")
            continue  # Skip boolean gates
        
        p = wsp.plot_gate(sample_id, gate)
        filename = os.path.join(save_folder, f"{sample_id}_{gate}.png")

        export_png(p, filename=filename)
        print(f"Saved {filename}")
  

Saved flow_plots/20230502_1707-BAL-20_001.fcs_Singlets.png
Saved flow_plots/20230502_1707-BAL-20_001.fcs_Live.png
Saved flow_plots/20230502_1707-BAL-20_001.fcs_CD45+.png
Saved flow_plots/20230502_1707-BAL-20_001.fcs_T-cells.png
Skipping boolean gate: T-cells-
Saved flow_plots/20230502_1707-BAL-20_001.fcs_CD4+.png
Saved flow_plots/20230502_1707-BAL-20_001.fcs_CD8+.png
Saved flow_plots/20230502_1707-BAL-20_001.fcs_Granulocytes.png
Skipping boolean gate: Granulocytes-
Saved flow_plots/20230502_1707-BAL-20_001.fcs_CD14+CD206+.png
Skipping boolean gate: CD14+CD206+-
Skipping boolean gate: CD4+ not T-regs
Saved flow_plots/20230502_1707-BAL-20_001.fcs_T-regs.png
Skipping boolean gate: Macrophages
Saved flow_plots/20230502_1707-BAL-20_001.fcs_Monocytes.png
Saved flow_plots/20230502_1707-BAL-20_001.fcs_NK cells.png
Skipping boolean gate: NK cells-
Saved flow_plots/20230502_1707-BAL-20_001.fcs_B-cells and Plasma cells.png
Saved flow_plots/20230502_1707-BAL-20_001.fcs_CD206 high macorphages.png
S

Now we loaded the information in our space, but it is not yet usuable for the model. We need to extract information for each SAMPLE. For each sample, we iterate through all the gates associated with it. Gate names are saved as dictionary keys. For each gate, we collect and append dataframes associated with that gate. Besides that we are creating a column samole_id that stores information of the origin sample and additional binary column named after the gate currently processing. 

In [14]:
def collect_gated_events_by_gate(wsp):
    all_gated = dict()
    for sample_id in sample_ids:
        for gate in gate_names:
            try:
                df = wsp.get_gate_events(sample_id, gate_name=gate)
                df['sample_id'] = sample_id 
                df[f"{gate}"] = 1
                if gate not in all_gated:
                    all_gated[gate] = [df]
                else:
                    all_gated[gate].append(df)
            except Exception as e:
                print(f"Failed to get gate '{gate}' for sample '{sample_id}': {e}")
                continue
    return all_gated

In [15]:
all_gated = collect_gated_events_by_gate(wsp)

In [16]:
print(all_gated.keys())

dict_keys(['Singlets', 'Live', 'CD45+', 'T-cells', 'T-cells-', 'CD4+', 'CD8+', 'Granulocytes', 'Granulocytes-', 'CD14+CD206+', 'CD14+CD206+-', 'CD4+ not T-regs', 'T-regs', 'Macrophages', 'Monocytes', 'NK cells', 'NK cells-', 'B-cells and Plasma cells', 'CD206 high macorphages', 'CD206 low macorphages', 'others'])


In [17]:
print(all_gated["T-cells-"])

[                           sample_id     FSC-A     FSC-H     FSC-W     SSC-A  \
0       20230502_1707-BAL-20_001.fcs  0.208211  0.176109  0.295571  0.442216   
1       20230502_1707-BAL-20_001.fcs  0.605212  0.407520  0.371277  0.524464   
2       20230502_1707-BAL-20_001.fcs  0.400838  0.326309  0.307100  0.400897   
4       20230502_1707-BAL-20_001.fcs  0.226865  0.180531  0.314164  0.385878   
5       20230502_1707-BAL-20_001.fcs  0.343612  0.279800  0.307015  0.404774   
...                              ...       ...       ...       ...       ...   
332530  20230502_1707-BAL-20_001.fcs  0.271319  0.218079  0.311034  0.355204   
332531  20230502_1707-BAL-20_001.fcs  0.384848  0.303959  0.316529  0.187107   
332535  20230502_1707-BAL-20_001.fcs  0.453492  0.283295  0.400195  0.817773   
332539  20230502_1707-BAL-20_001.fcs  0.204157  0.168629  0.302672  0.352111   
332540  20230502_1707-BAL-20_001.fcs  0.359719  0.277390  0.324201  0.349796   

           SSC-H     SSC-W  DAPI-A BUV

Now, we need to concatenate all the dataframes associated with each gate and also create other binary columns that are named after other gate names, but fill them with NaN for now. 

In [18]:
# test = all_gated.copy()
# for gate in test:
#     df = pd.concat(test[gate], ignore_index=True)
#     for col in gate_names:
#         if col not in df.columns:
#             df[col] = np.nan
#     test[gate] = df


# print(test["CD45+"].columns)

# print(test["CD45+"]["CD45+"])

In [17]:
for gate in all_gated:
    df = pd.concat(all_gated[gate], ignore_index=True)
    for col in gate_names:
        if col not in df.columns:
            df[col] = np.nan
    all_gated[gate] = df    

In [18]:
print(wsp.get_gate_hierarchy('20230502_1707-BAL-20_001.fcs')) #this is a step that could be run earlier. I moved this cell here for 
#convenient assembly of binary gates

root
╰── Singlets
    ╰── Live
        ╰── CD45+
            ├── T-cells
            │   ├── CD4+
            │   │   ├── T-regs
            │   │   ╰── CD4+ not T-regs
            │   ╰── CD8+
            ╰── T-cells-
                ├── Granulocytes
                ╰── Granulocytes-
                    ├── CD14+CD206+
                    │   ├── Monocytes
                    │   ╰── Macrophages
                    │       ├── CD206 high macorphages
                    │       ╰── CD206 low macorphages
                    ╰── CD14+CD206+-
                        ├── NK cells
                        ╰── NK cells-
                            ├── B-cells and Plasma cells
                            ╰── others


In [21]:
ws_json = wsp.get_gate_hierarchy('20230502_1707-BAL-20_001.fcs',output='json', indent=2)
print(ws_json)

{
  "name": "root",
  "children": [
    {
      "gate_type": "PolygonGate",
      "custom_gates": {},
      "name": "Singlets",
      "children": [
        {
          "gate_type": "PolygonGate",
          "custom_gates": {},
          "name": "Live",
          "children": [
            {
              "gate_type": "PolygonGate",
              "custom_gates": {},
              "name": "CD45+",
              "children": [
                {
                  "gate_type": "PolygonGate",
                  "custom_gates": {},
                  "name": "T-cells",
                  "children": [
                    {
                      "gate_type": "PolygonGate",
                      "custom_gates": {},
                      "name": "CD4+",
                      "children": [
                        {
                          "gate_type": "PolygonGate",
                          "custom_gates": {},
                          "name": "T-regs"
                        },
                    

Now for each gate, we create an individual dataframe and start assembling the file for preprocessing. 

In [40]:
CD45 = all_gated['CD45+'].fillna(0)
Tcells = all_gated['T-cells'].fillna(0)
CD4 = all_gated['CD4+'].fillna(0)
Tregs = all_gated['T-regs'].fillna(0)
non_Tregs = all_gated['CD4+ not T-regs'].fillna(0)
CD8 = all_gated['CD8+'].fillna(0)
non_Tcells = all_gated['T-cells-'].fillna(0)
Bcells = all_gated['B-cells and Plasma cells'].fillna(0)
others = all_gated['others'].fillna(0)
NK_minus = all_gated['NK cells-'].fillna(0)
NKcells = all_gated['NK cells'].fillna(0)
notCD14 = all_gated['CD14+CD206+-'].fillna(0)
CD14 = all_gated['CD14+CD206+'].fillna(0)
CD206_high = all_gated['CD206 high macorphages'].fillna(0)
CD206_low = all_gated['CD206 low macorphages'].fillna(0)
Macrophages = all_gated['Macrophages'].fillna(0)
Monocytes = all_gated['Monocytes'].fillna(0)
notGranolucytes = all_gated["Granulocytes-"].fillna(0)
Granulocytes = all_gated["Granulocytes"].fillna(0)
notTcells = all_gated["T-cells-"].fillna(0)
live = all_gated["Live"].fillna(0)
singlets = all_gated["Singlets"].fillna(0)

What we need is to concatenate dataframes so that we create hiearchical order for each cell. Every row a.k.a the cells that are a part of subgate, should also be a part of the higher order subgate. Automating this process is quite challenging and nor necessary at this step as we are still agreeing on the uniform gating strategy, so we will have to manually audit this process. (It is very easy).

In [41]:
cols_for_rem_dupl = [col for col in CD45.columns if col not in gate_names]

print(cols_for_rem_dupl)

['sample_id', 'FSC-A', 'FSC-H', 'FSC-W', 'SSC-A', 'SSC-H', 'SSC-W', 'DAPI-A BUV395 CD4 CD19', 'Side Pop-A BUV737 CD25 CD56', 'FITC-A SYTOX Green', 'PE-A PE CD3', 'PE-Texas Red-A PECF594 CD127', 'PE-Cy7-A PECy7 CD206', 'Pacific Blue-A eFluor450 HLA-DR', 'Qdot 655-A BU786 CD15', 'APC-A APC CD8 CD14 EpCAM', 'AmCyan-A BV510 CD45', 'Time']


In [23]:
print(wsp.get_gate_hierarchy('20230502_1707-BAL-20_001.fcs'))

root
╰── Singlets
    ╰── Live
        ╰── CD45+
            ├── T-cells
            │   ├── CD4+
            │   │   ├── T-regs
            │   │   ╰── CD4+ not T-regs
            │   ╰── CD8+
            ╰── T-cells-
                ├── Granulocytes
                ╰── Granulocytes-
                    ├── CD14+CD206+
                    │   ├── Monocytes
                    │   ╰── Macrophages
                    │       ├── CD206 high macorphages
                    │       ╰── CD206 low macorphages
                    ╰── CD14+CD206+-
                        ├── NK cells
                        ╰── NK cells-
                            ├── B-cells and Plasma cells
                            ╰── others


In [42]:
Tregs_nonTregs = pd.concat([Tregs, non_Tregs], ignore_index=True)
CD4_combined = pd.concat([Tregs_nonTregs, CD4], ignore_index=True)
CD4_combined = CD4_combined.groupby(cols_for_rem_dupl, as_index=False)[gate_names].max()
CD4_CD8_combined = pd.concat([CD4_combined, CD8], ignore_index=True)
CD4_CD8_combined.drop_duplicates(subset=cols_for_rem_dupl, inplace=True)
Tcells_fully_assembled = pd.concat([CD4_CD8_combined, Tcells]).groupby(cols_for_rem_dupl, as_index=False)[gate_names].max()
Bcells_others = pd.concat([Bcells,others], ignore_index = True)
NKminus_combined = pd.concat([NK_minus,Bcells_others], ignore_index=True).groupby(cols_for_rem_dupl, as_index=False)[gate_names].max()
NKminus_plus = pd.concat([NKminus_combined, NKcells], ignore_index = True)
notCD14_combined = pd.concat([NKminus_plus, notCD14], ignore_index = True).groupby(cols_for_rem_dupl, as_index=False)[gate_names].max()
CD206low_high = pd.concat([CD206_high,CD206_low], ignore_index = True)
Macrophages_combined = pd.concat([Macrophages,CD206low_high], ignore_index = True).groupby(cols_for_rem_dupl,as_index=False)[gate_names].max()
Monocytes_macrophages = pd.concat([Macrophages_combined,Monocytes], ignore_index=True)
CD14_combined = pd.concat([Monocytes_macrophages,CD14], ignore_index = True).groupby(cols_for_rem_dupl, as_index=False)[gate_names].max()
CD14_notCD14 = pd.concat([notCD14_combined,CD14_combined], ignore_index = True)
notGranulocytes_combined = pd.concat([CD14_notCD14,notGranolucytes], ignore_index = True).groupby(cols_for_rem_dupl, as_index = False)[gate_names].max()
Granu_notGranu = pd.concat([Granulocytes,notGranulocytes_combined],ignore_index = True)
notTcells_combined = pd.concat([Granu_notGranu, notTcells], ignore_index = True).groupby(cols_for_rem_dupl, as_index = False)[gate_names].max()
Tcells_notTcells = pd.concat([Tcells_fully_assembled,notTcells_combined],ignore_index = True)
CD45_combined = pd.concat([Tcells_notTcells,CD45], ignore_index = True).groupby(cols_for_rem_dupl, as_index = False)[gate_names].max()

In [43]:
#debug and make sure your gates have 0 and 1 values
print("CD45_combined dataframe:")
# Filter only the columns with gate names
columns_cd45 = CD45_combined[gate_names]
for col in columns_cd45.columns:
    print(f"{col} has values: {set(columns_cd45[col])}")


CD45_combined dataframe:
Singlets has values: {0.0}
Live has values: {0.0}
CD45+ has values: {1.0}
T-cells has values: {0.0, 1.0}
T-cells- has values: {0.0, 1.0}
CD4+ has values: {0.0, 1.0}
CD8+ has values: {0.0, 1.0}
Granulocytes has values: {0.0, 1.0}
Granulocytes- has values: {0.0, 1.0}
CD14+CD206+ has values: {0.0, 1.0}
CD14+CD206+- has values: {0.0, 1.0}
CD4+ not T-regs has values: {0.0, 1.0}
T-regs has values: {0.0, 1.0}
Macrophages has values: {0.0, 1.0}
Monocytes has values: {0.0, 1.0}
NK cells has values: {0.0, 1.0}
NK cells- has values: {0.0, 1.0}
B-cells and Plasma cells has values: {0.0, 1.0}
CD206 high macorphages has values: {0.0, 1.0}
CD206 low macorphages has values: {0.0, 1.0}
others has values: {0.0, 1.0}


In [44]:
CD45_combined.isna().sum()

sample_id                          0
FSC-A                              0
FSC-H                              0
FSC-W                              0
SSC-A                              0
SSC-H                              0
SSC-W                              0
DAPI-A BUV395 CD4 CD19             0
Side Pop-A BUV737 CD25 CD56        0
FITC-A SYTOX Green                 0
PE-A PE CD3                        0
PE-Texas Red-A PECF594 CD127       0
PE-Cy7-A PECy7 CD206               0
Pacific Blue-A eFluor450 HLA-DR    0
Qdot 655-A BU786 CD15              0
APC-A APC CD8 CD14 EpCAM           0
AmCyan-A BV510 CD45                0
Time                               0
Singlets                           0
Live                               0
CD45+                              0
T-cells                            0
T-cells-                           0
CD4+                               0
CD8+                               0
Granulocytes                       0
Granulocytes-                      0
C

Final step is to seperate this huge MEGA dataframe into unqiue datarames(and then csvs) seperated by sample_id. 

In [34]:
path = "/mnt/c/Users/Anechka/Documents/Northwestern/files/CD45_test_gate"
os.makedirs(path, exist_ok=True)  

for sample, df in CD45_combined.groupby("sample_id"):
    out_path = os.path.join(path, f"{sample}.csv")
    df.to_csv(out_path, index=False)


In [45]:
"""Debugging"""
data_pregate = CD45_combined[CD45_combined["T-cells"] == 1]
print(data_pregate.shape)
print(data_pregate[data_pregate["CD4+"]== 1].shape[0] + data_pregate[data_pregate["CD8+"]== 1].shape[0])


(5728, 39)
5215


Alternatively, use the following function to conctatenated different gate's df to preserve hiearchicahl structure:

In [46]:
def process_gating_hierarchy(gating_json, all_gated, cols_for_rem_dupl, gate_names):
    """
    Simplified gating hierarchy processor that treats all gate types the same.
    
    Args:
        gating_json (dict): Gating hierarchy JSON structure
        all_gated (dict): Dictionary of {gate_name: dataframe}
        cols_for_rem_dupl (list): Columns for duplicate removal
        gate_names (list): Marker columns to keep
        
    Returns:
        dict: Processed dataframes for each gate
    """
    # 1. Clean all DataFrames (replace NaN with 0)
    cleaned_gates = {gate: df.fillna(0) for gate, df in all_gated.items()}
    
    # Recursive processing function
    def process_node(node):
        node_name = node['name']
        node_df = cleaned_gates.get(node_name, pd.DataFrame())
        
        # Process children if they exist
        if 'children' in node and node['children']:
            # Combine all children (recursively processed)
            children_combined = pd.concat(
                [process_node(child) for child in node['children']],
                ignore_index=True
            )
            
            # If current node has its own dataframe, combine with children
            if not node_df.empty:
                combined = pd.concat([node_df, children_combined], ignore_index=True)
                return combined.groupby(cols_for_rem_dupl, as_index=False)[gate_names].max()
            return children_combined
        return node_df
    
    # Collect results for all nodes
    result = {}
    
    def collect_results(node):
        node_name = node['name']
        result[node_name] = process_node(node)
        if 'children' in node:
            for child in node['children']:
                collect_results(child)
    
    # Start processing from root's children (skip the root itself)
    for child in gating_json['children']:
        collect_results(child)
    
    return result

In [47]:
#calling and debugging function
ws_json_dict = json.loads(ws_json)
print(type(cols_for_rem_dupl))
print(type(gate_names)) #if any of this numpy array, it will throw an error
#if any of this numpy array, decomment this line: gate_names = gate_names.tolist()
test_dfs = process_gating_hierarchy(ws_json_dict, all_gated, cols_for_rem_dupl, gate_names)

CD45_combined = test_dfs['CD45+']
print("CD45_combined dataframe:")
# Filter only the columns with gate names
columns_cd45 = CD45_combined[gate_names]
for col in columns_cd45.columns:
    print(f"{col} has values: {set(columns_cd45[col])}")
#Checking if we get the same results if we did the manual concatenation
data_pregate = CD45_combined[CD45_combined["T-cells"] == 1]
print(data_pregate.shape)
print(data_pregate[data_pregate["CD4+"]== 1].shape[0] + data_pregate[data_pregate["CD8+"]== 1].shape[0])

<class 'list'>
<class 'list'>
CD45_combined dataframe:
Singlets has values: {0.0}
Live has values: {0.0}
CD45+ has values: {1.0}
T-cells has values: {0.0, 1.0}
T-cells- has values: {0.0, 1.0}
CD4+ has values: {0.0, 1.0}
CD8+ has values: {0.0, 1.0}
Granulocytes has values: {0.0, 1.0}
Granulocytes- has values: {0.0, 1.0}
CD14+CD206+ has values: {0.0, 1.0}
CD14+CD206+- has values: {0.0, 1.0}
CD4+ not T-regs has values: {0.0, 1.0}
T-regs has values: {0.0, 1.0}
Macrophages has values: {0.0, 1.0}
Monocytes has values: {0.0, 1.0}
NK cells has values: {0.0, 1.0}
NK cells- has values: {0.0, 1.0}
B-cells and Plasma cells has values: {0.0, 1.0}
CD206 high macorphages has values: {0.0, 1.0}
CD206 low macorphages has values: {0.0, 1.0}
others has values: {0.0, 1.0}
(5728, 39)
5215


In [48]:
#sepearte into sepearte sample unique csvs
path = "./sample_csvs_test_May15"
os.makedirs(path, exist_ok=True)  

for sample, df in CD45_combined.groupby("sample_id"):
    out_path = os.path.join(path, f"{sample}.csv")
    df.to_csv(out_path, index=False)